From 06175349983f31cfdd76148e700ff7fac67a5549 Mon Sep 17 00:00:00 2001 From: bastienird Date: Tue, 4 Feb 2025 13:21:13 +0000 Subject: [PATCH] feature: effort dataset with effort from local files rather than in googledrive --- launching_jsons_creating_GTA.R | 20 ++- .../create_global_tuna_atlas_dataset_v2023.R | 155 +++++++++--------- .../generation/get_rfmos_datasets_level0.R | 20 +-- 3 files changed, 105 insertions(+), 90 deletions(-) diff --git a/launching_jsons_creating_GTA.R b/launching_jsons_creating_GTA.R index 4c8b97e..c8cbe2b 100644 --- a/launching_jsons_creating_GTA.R +++ b/launching_jsons_creating_GTA.R @@ -132,9 +132,23 @@ time_Summarising_invalid_data_georef <- system.time({ executeWorkflow("manu_geoflow_gta_config_model.json") -tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json")) -# tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json")) -# tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json")) +tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json")) # FROM DRIVE +file.copy(list.files(file.path(tunaatlas_qa_global_datasets_catch_path, "data"), + full.names = TRUE), + here::here("data"), + recursive = TRUE) + + +effort_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_effort.json")) # FROM DRIVE +file.copy(list.files(file.path(effort_path, "data"), + full.names = TRUE), + here::here("data"), + recursive = TRUE) +# have to download every file. 
+ +tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json")) + +tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json")) # FROM LOCAL IF NOT RUNNING USE DRIVE # tunaatlas_qa_services <- initWorkflow("tunaatlas_qa_services.json") # save.image() # tunaatlas_qa_global_datasets_catch_path <- "jobs/20241104162955/entities/global_catch_ird_level2_rf1" diff --git a/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R b/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R index c498cab..b8656a0 100644 --- a/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R +++ b/tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R @@ -538,88 +538,89 @@ create_global_tuna_atlas_dataset_v2023 <- function(action, entity, config) { function_recap_each_step( paste0("Removing_duplicated_units"), georef_dataset, - "As some data is in fact duplicated, we check all the duplicated data and remove when same", + "As some data is in fact duplicated for catch, we check all the duplicated data and remove the data in number when same dimensions", "" ) - config$logger.info( - "Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) " - ) - if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){ - nominal_catch <- - readr::read_csv(here::here(file.path("data", opts$keynominal)), - guess_max = 0) - class(nominal_catch$measurement_value) <- "numeric" - #@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database - # } else if(!is.null(opts$doinominal)){ - # zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data") - # nominal_catch <- - # readr::read_csv(here::here(file.path("data", opts$keynominal)), - # guess_max = 0) - # class(nominal_catch$measurement_value) <- "numeric" - - } else { - stop("Please provide 
a nominal catch dataset") - # nominal_catch <- retrieve_nominal_catch(entity, config, opts) - } - - class(nominal_catch$measurement_value) <- "numeric" - mapping_keep_src_code <- FALSE - - class(nominal_catch$measurement_unit) <- "character" - - if (any(nominal_catch$measurement_unit == "t")) - nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t" - if (any(nominal_catch$measurement_unit == "TRUE")) - nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t" - if (any(nominal_catch$measurement_unit == "no")) - nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- "no" - class(nominal_catch$measurement_value) <- "numeric" - - # Based on this analysis, the following cutoff years were determined for data completeness: - # - # IOTC: Keep data starting from 1953. - # ICCAT: Keep data starting from 1957. - # WCPFC: Keep data starting from 1952. - # IATTC: Keep data starting from 1957. - # CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data. 
- - nominal_catch <- nominal_catch %>% - dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>% - dplyr::mutate(year =lubridate::year(time_start)) %>% - dplyr::filter((source_authority == "WCPFC" & year >= 1952) | - (source_authority == "IOTC" & year >= 1953) | - (source_authority == "ICCAT" & year >= 1957) | - (source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>% - dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier) - - nominal_catch <- nominal_catch %>% - dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type)) - - nominal_catch <- nominal_catch %>% - dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>% - dplyr::ungroup() %>% - dplyr::group_by(across(-measurement_value)) %>% - dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop') - - # georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, - # list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>% - # # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>% - # dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference)) - # # - # # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>% - # # dplyr::select(c("species", "year", "source_authority", "gear_type"))%>% - # # dplyr::distinct() - # # - # georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>% 
- # # dplyr::select(c("species", "year", "source_authority"))%>% - # dplyr::distinct() - # - # global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>% - # dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value)) - # LEVEL 1 IRD --------------------------------------------------- if(DATASET_LEVEL >= 1){ + + config$logger.info( + "Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) " + ) + if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){ + nominal_catch <- + readr::read_csv(here::here(file.path("data", opts$keynominal)), + guess_max = 0) + class(nominal_catch$measurement_value) <- "numeric" + #@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database + # } else if(!is.null(opts$doinominal)){ + # zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data") + # nominal_catch <- + # readr::read_csv(here::here(file.path("data", opts$keynominal)), + # guess_max = 0) + # class(nominal_catch$measurement_value) <- "numeric" + + } else { + stop("Please provide a nominal catch dataset") + # nominal_catch <- retrieve_nominal_catch(entity, config, opts) + } + + class(nominal_catch$measurement_value) <- "numeric" + mapping_keep_src_code <- FALSE + + class(nominal_catch$measurement_unit) <- "character" + + if (any(nominal_catch$measurement_unit == "t")) + nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t" + if (any(nominal_catch$measurement_unit == "TRUE")) + nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t" + if (any(nominal_catch$measurement_unit == "no")) + nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- 
"no" + class(nominal_catch$measurement_value) <- "numeric" + + # Based on this analysis, the following cutoff years were determined for data completeness: + # + # IOTC: Keep data starting from 1953. + # ICCAT: Keep data starting from 1957. + # WCPFC: Keep data starting from 1952. + # IATTC: Keep data starting from 1957. + # CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data. + + nominal_catch <- nominal_catch %>% + dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>% + dplyr::mutate(year =lubridate::year(time_start)) %>% + dplyr::filter((source_authority == "WCPFC" & year >= 1952) | + (source_authority == "IOTC" & year >= 1953) | + (source_authority == "ICCAT" & year >= 1957) | + (source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>% + dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier) + + nominal_catch <- nominal_catch %>% + dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type)) + + nominal_catch <- nominal_catch %>% + dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>% + dplyr::ungroup() %>% + dplyr::group_by(across(-measurement_value)) %>% + dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop') + + # georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, + # list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>% + # # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>% + # dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference)) + # # + # # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, 
georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>% + # # dplyr::select(c("species", "year", "source_authority", "gear_type"))%>% + # # dplyr::distinct() + # # + # georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>% + # # dplyr::select(c("species", "year", "source_authority"))%>% + # dplyr::distinct() + # + # global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>% + # dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value)) + #with this condition code will be run to deal with dataset level 1 and above config$logger.info("Level 1 start") diff --git a/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R b/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R index 83aabc5..2380959 100644 --- a/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R +++ b/tunaatlas_scripts/generation/get_rfmos_datasets_level0.R @@ -22,8 +22,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){ iotc_data <- NULL if(options$include_IOTC){ config$logger.info(sprintf("Get %s data", rfmo)) - dataset_files_iotc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & - regexpr("iotc", names(dataset_files)) > 0] + dataset_files_iotc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & + regexpr("iotc", names(dataset_files)) > 0]) iotc_data <- do.call("rbind", lapply(dataset_files_iotc, readr::read_csv, guess_max = 0)) iotc_data <- as.data.frame(iotc_data) class(iotc_data$measurement_value) <- "numeric" @@ -40,8 +40,8 @@ get_rfmos_datasets_level0 <- function(rfmo, 
entity, config, options){ wcpfc_data <- NULL if(options$include_WCPFC){ config$logger.info(sprintf("Get %s data", rfmo)) - dataset_files_wcpfc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & - regexpr("wcpfc", names(dataset_files)) > 0] + dataset_files_wcpfc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & + regexpr("wcpfc", names(dataset_files)) > 0]) wcpfc_data <- do.call("rbind", lapply(dataset_files_wcpfc, readr::read_csv, guess_max = 0)) wcpfc_data <- as.data.frame(wcpfc_data) class(wcpfc_data$measurement_value) <- "numeric" @@ -59,8 +59,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){ ccsbt_data <- NULL if(options$include_CCSBT){ config$logger.info(sprintf("Get %s data", rfmo)) - dataset_files_ccsbt <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & - regexpr("ccsbt", names(dataset_files)) > 0] + dataset_files_ccsbt <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & + regexpr("ccsbt", names(dataset_files)) > 0]) ccsbt_data <- do.call("rbind", lapply(dataset_files_ccsbt, readr::read_csv, guess_max = 0)) ccsbt_data <- as.data.frame(ccsbt_data) class(ccsbt_data$measurement_value) <- "numeric" @@ -77,9 +77,9 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){ iccat_data <- NULL if(options$include_ICCAT){ config$logger.info(sprintf("Get %s data", rfmo)) - dataset_files_iccat <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & + dataset_files_iccat <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & regexpr("byschool", names(dataset_files)) < 0 & - regexpr("iccat", names(dataset_files)) > 0] + regexpr("iccat", names(dataset_files)) > 0]) iccat_data <- do.call("rbind", lapply(dataset_files_iccat, readr::read_csv, guess_max = 0)) iccat_data <- as.data.frame(iccat_data) @@ -147,10 +147,10 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){ if(variable == "catch") { 
config$logger.info(sprintf("Get %s data", rfmo)) - dataset_files_iattc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 & + dataset_files_iattc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 & regexpr("ps", names(dataset_files)) < 0 & regexpr("effort", names(dataset_files)) < 0 & - regexpr("iattc", names(dataset_files)) > 0] + regexpr("iattc", names(dataset_files)) > 0]) iattc_data <- do.call("rbind", lapply(dataset_files_iattc, readr::read_csv, guess_max = 0)) iattc_data <- as.data.frame(iattc_data)