From 06175349983f31cfdd76148e700ff7fac67a5549 Mon Sep 17 00:00:00 2001
From: bastienird <bastien.grasset@ird.fr>
Date: Tue, 4 Feb 2025 13:21:13 +0000
Subject: [PATCH] feature: effort dataset with effort from local files rather
 than in googledrive

---
 launching_jsons_creating_GTA.R                |  20 ++-
 .../create_global_tuna_atlas_dataset_v2023.R  | 155 +++++++++---------
 .../generation/get_rfmos_datasets_level0.R    |  20 +--
 3 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/launching_jsons_creating_GTA.R b/launching_jsons_creating_GTA.R
index 4c8b97e..c8cbe2b 100644
--- a/launching_jsons_creating_GTA.R
+++ b/launching_jsons_creating_GTA.R
@@ -132,9 +132,23 @@ time_Summarising_invalid_data_georef <- system.time({
 
 executeWorkflow("manu_geoflow_gta_config_model.json")
 
-tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json"))
-# tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json"))
-# tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json"))
+tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json")) # FROM DRIVE
+file.copy(list.files(file.path(tunaatlas_qa_global_datasets_catch_path, "data"), 
+                     full.names = TRUE), 
+          here::here("data"), 
+          recursive = TRUE)
+
+
+effort_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_effort.json")) # FROM DRIVE
+file.copy(list.files(file.path(effort_path, "data"),
+                     full.names = TRUE),
+          here::here("data"),
+          recursive = TRUE)
+# have to download every file: copy the effort job's data/ (not the catch job's) into the local data/ folder.
+
+tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json"))
+
+tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json"))  # FROM LOCAL IF NOT RUNNING USE DRIVE
 # tunaatlas_qa_services <- initWorkflow("tunaatlas_qa_services.json")
 # save.image()
 # tunaatlas_qa_global_datasets_catch_path <- "jobs/20241104162955/entities/global_catch_ird_level2_rf1"
diff --git a/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R b/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R
index c498cab..b8656a0 100644
--- a/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R
+++ b/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R
@@ -538,88 +538,89 @@ create_global_tuna_atlas_dataset_v2023 <- function(action, entity, config) {
       function_recap_each_step(
         paste0("Removing_duplicated_units"),
         georef_dataset,
-        "As some data is in fact duplicated, we check all the duplicated data and remove when same",
+        "As some data is in fact duplicated for catch, we check all the duplicated data and remove the data in number when same dimensions",
         ""
       )
     
-    config$logger.info(
-      "Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) "
-    )
-    if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){
-      nominal_catch <-
-        readr::read_csv(here::here(file.path("data", opts$keynominal)),
-                        guess_max = 0)
-      class(nominal_catch$measurement_value) <- "numeric"
-      #@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database
-    # } else if(!is.null(opts$doinominal)){
-    #   zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data")
-    #   nominal_catch <-
-    #     readr::read_csv(here::here(file.path("data", opts$keynominal)),
-    #                     guess_max = 0)
-    #   class(nominal_catch$measurement_value) <- "numeric"
-      
-    } else {
-      stop("Please provide a nominal catch dataset")
-      # nominal_catch <- retrieve_nominal_catch(entity, config, opts)
-    }
-    
-    class(nominal_catch$measurement_value) <- "numeric"
-    mapping_keep_src_code <- FALSE
-    
-    class(nominal_catch$measurement_unit) <- "character"
-    
-    if (any(nominal_catch$measurement_unit == "t"))
-      nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t"
-    if (any(nominal_catch$measurement_unit == "TRUE"))
-      nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t"
-    if (any(nominal_catch$measurement_unit == "no"))
-      nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- "no"
-    class(nominal_catch$measurement_value) <- "numeric"
-    
-    # Based on this analysis, the following cutoff years were determined for data completeness:
-    #   
-    # IOTC: Keep data starting from 1953.
-    # ICCAT: Keep data starting from 1957.
-    # WCPFC: Keep data starting from 1952.
-    # IATTC: Keep data starting from 1957.
-    # CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data.
-    
-    nominal_catch <- nominal_catch %>% 
-      dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>%
-      dplyr::mutate(year =lubridate::year(time_start)) %>%
-      dplyr::filter((source_authority == "WCPFC" & year >= 1952) |
-                      (source_authority == "IOTC" & year >= 1953) |
-                      (source_authority == "ICCAT" & year >= 1957) |
-                      (source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>%
-      dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier)
-    
-    nominal_catch <- nominal_catch %>% 
-      dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type))
-    
-    nominal_catch <- nominal_catch %>% 
-      dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>%
-      dplyr::ungroup() %>%
-      dplyr::group_by(across(-measurement_value)) %>% 
-      dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop')
-
-  #   georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset,  
-  # list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>%
-  #   # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>%
-  #     dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference))
-  #   # 
-  #   # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>% 
-  #   #   dplyr::select(c("species", "year", "source_authority", "gear_type"))%>%
-  #   #   dplyr::distinct()
-  #   # 
-  #   georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>%
-  #     # dplyr::select(c("species", "year", "source_authority"))%>%
-  #     dplyr::distinct()
-    # 
-    # global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>% 
-    #   dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value))
-    
     # LEVEL 1 IRD ---------------------------------------------------
     if(DATASET_LEVEL >= 1){
+      
+      config$logger.info(
+        "Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) "
+      )
+      if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){
+        nominal_catch <-
+          readr::read_csv(here::here(file.path("data", opts$keynominal)),
+                          guess_max = 0)
+        class(nominal_catch$measurement_value) <- "numeric"
+        #@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database
+        # } else if(!is.null(opts$doinominal)){
+        #   zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data")
+        #   nominal_catch <-
+        #     readr::read_csv(here::here(file.path("data", opts$keynominal)),
+        #                     guess_max = 0)
+        #   class(nominal_catch$measurement_value) <- "numeric"
+        
+      } else {
+        stop("Please provide a nominal catch dataset")
+        # nominal_catch <- retrieve_nominal_catch(entity, config, opts)
+      }
+      
+      class(nominal_catch$measurement_value) <- "numeric"
+      mapping_keep_src_code <- FALSE
+      
+      class(nominal_catch$measurement_unit) <- "character"
+      
+      if (any(nominal_catch$measurement_unit == "t"))
+        nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t"
+      if (any(nominal_catch$measurement_unit == "TRUE"))
+        nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t"
+      if (any(nominal_catch$measurement_unit == "no"))
+        nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- "no"
+      class(nominal_catch$measurement_value) <- "numeric"
+      
+      # Based on this analysis, the following cutoff years were determined for data completeness:
+      #   
+      # IOTC: Keep data starting from 1953.
+      # ICCAT: Keep data starting from 1957.
+      # WCPFC: Keep data starting from 1952.
+      # IATTC: Keep data starting from 1957.
+      # CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data.
+      
+      nominal_catch <- nominal_catch %>% 
+        dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>%
+        dplyr::mutate(year =lubridate::year(time_start)) %>%
+        dplyr::filter((source_authority == "WCPFC" & year >= 1952) |
+                        (source_authority == "IOTC" & year >= 1953) |
+                        (source_authority == "ICCAT" & year >= 1957) |
+                        (source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>%
+        dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier)
+      
+      nominal_catch <- nominal_catch %>% 
+        dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type))
+      
+      nominal_catch <- nominal_catch %>% 
+        dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>%
+        dplyr::ungroup() %>%
+        dplyr::group_by(across(-measurement_value)) %>% 
+        dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop')
+      
+      #   georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset,  
+      # list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>%
+      #   # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>%
+      #     dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference))
+      #   # 
+      #   # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>% 
+      #   #   dplyr::select(c("species", "year", "source_authority", "gear_type"))%>%
+      #   #   dplyr::distinct()
+      #   # 
+      #   georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>%
+      #     # dplyr::select(c("species", "year", "source_authority"))%>%
+      #     dplyr::distinct()
+      # 
+      # global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>% 
+      #   dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value))
+      
       #with this condition code will be run to deal with dataset level 1 and above
       config$logger.info("Level 1 start")
 
diff --git a/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R b/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R
index 83aabc5..2380959 100644
--- a/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R
+++ b/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R
@@ -22,8 +22,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
                       iotc_data <- NULL
                       if(options$include_IOTC){
                         config$logger.info(sprintf("Get %s data", rfmo))
-                        dataset_files_iotc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
-                                                              regexpr("iotc", names(dataset_files)) > 0]
+                        dataset_files_iotc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
+                                                              regexpr("iotc", names(dataset_files)) > 0])
                         iotc_data <- do.call("rbind", lapply(dataset_files_iotc, readr::read_csv, guess_max = 0))
                         iotc_data <- as.data.frame(iotc_data)
                         class(iotc_data$measurement_value) <- "numeric"
@@ -40,8 +40,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
                       wcpfc_data <- NULL
                       if(options$include_WCPFC){
                         config$logger.info(sprintf("Get %s data", rfmo))
-                        dataset_files_wcpfc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
-                                                               regexpr("wcpfc", names(dataset_files)) > 0]
+                        dataset_files_wcpfc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
+                                                               regexpr("wcpfc", names(dataset_files)) > 0])
                         wcpfc_data <- do.call("rbind", lapply(dataset_files_wcpfc, readr::read_csv, guess_max = 0))
                         wcpfc_data <- as.data.frame(wcpfc_data)
                         class(wcpfc_data$measurement_value) <- "numeric"
@@ -59,8 +59,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
                       ccsbt_data <- NULL
                       if(options$include_CCSBT){
                         config$logger.info(sprintf("Get %s data", rfmo))
-                        dataset_files_ccsbt <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
-                                                               regexpr("ccsbt", names(dataset_files)) > 0]
+                        dataset_files_ccsbt <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
+                                                               regexpr("ccsbt", names(dataset_files)) > 0])
                         ccsbt_data <- do.call("rbind", lapply(dataset_files_ccsbt, readr::read_csv, guess_max = 0))
                         ccsbt_data <- as.data.frame(ccsbt_data)
                         class(ccsbt_data$measurement_value) <- "numeric"
@@ -77,9 +77,9 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
                       iccat_data <- NULL
                       if(options$include_ICCAT){
                         config$logger.info(sprintf("Get %s data", rfmo))
-                        dataset_files_iccat <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
+                        dataset_files_iccat <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
                                                                regexpr("byschool", names(dataset_files)) < 0 &
-                                                               regexpr("iccat", names(dataset_files)) > 0]
+                                                               regexpr("iccat", names(dataset_files)) > 0])
                         iccat_data <- do.call("rbind", lapply(dataset_files_iccat, readr::read_csv, guess_max = 0))
                         iccat_data <- as.data.frame(iccat_data)
 
@@ -147,10 +147,10 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
                         if(variable == "catch") {
                           
                           config$logger.info(sprintf("Get %s data", rfmo))
-                          dataset_files_iattc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
+                          dataset_files_iattc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & 
                                                                  regexpr("ps", names(dataset_files)) < 0 & 
                                                                  regexpr("effort", names(dataset_files)) < 0 &
-                                                                 regexpr("iattc", names(dataset_files)) > 0]
+                                                                 regexpr("iattc", names(dataset_files)) > 0])
                           iattc_data <- do.call("rbind", lapply(dataset_files_iattc, readr::read_csv, guess_max = 0))
                           iattc_data <- as.data.frame(iattc_data)