Skip to content

Commit

Permalink
feature: effort dataset with effort from local files rather than in g…
Browse files Browse the repository at this point in the history
…oogledrive
  • Loading branch information
bastienird committed Feb 4, 2025
1 parent e0d3a8d commit 0617534
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 90 deletions.
20 changes: 17 additions & 3 deletions launching_jsons_creating_GTA.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,23 @@ time_Summarising_invalid_data_georef <- system.time({

executeWorkflow("manu_geoflow_gta_config_model.json")

tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json"))
# tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json"))
# tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json"))
tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json")) # FROM DRIVE
file.copy(list.files(file.path(tunaatlas_qa_global_datasets_catch_path, "data"),
full.names = TRUE),
here::here("data"),
recursive = TRUE)


# Run the effort workflow (marked "FROM DRIVE" by the author) and copy the
# files found in its "data" sub-directory into the project-level "data"
# directory, mirroring what is done for the catch workflow above.
effort_path <- executeWorkflow(here::here("tunaatlas_qa_global_datasets_effort.json")) # FROM DRIVE
# List the files of the effort job itself: listing the catch job directory here
# would only re-copy the catch files and leave the effort outputs behind.
file.copy(
  list.files(file.path(effort_path, "data"), full.names = TRUE),
  here::here("data"),
  recursive = TRUE
)
# have to download every file.

tunaatlas_qa_global_datasets_catch_path <- executeWorkflow(here::here("creating_dataset.json"))

tunaatlas_qa_global_datasets_effort_path <- executeWorkflow(here::here("create_effort_dataset.json")) # FROM LOCAL IF NOT RUNNING USE DRIVE
# tunaatlas_qa_services <- initWorkflow("tunaatlas_qa_services.json")
# save.image()
# tunaatlas_qa_global_datasets_catch_path <- "jobs/20241104162955/entities/global_catch_ird_level2_rf1"
Expand Down
155 changes: 78 additions & 77 deletions tunaatlas_scripts/generation/create_global_tuna_atlas_dataset_v2023.R
Original file line number Diff line number Diff line change
Expand Up @@ -538,88 +538,89 @@ create_global_tuna_atlas_dataset_v2023 <- function(action, entity, config) {
function_recap_each_step(
paste0("Removing_duplicated_units"),
georef_dataset,
"As some data is in fact duplicated, we check all the duplicated data and remove when same",
"As some data is in fact duplicated for catch, we check all the duplicated data and remove the data in number when same dimensions",
""
)

config$logger.info(
"Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) "
)
if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){
nominal_catch <-
readr::read_csv(here::here(file.path("data", opts$keynominal)),
guess_max = 0)
class(nominal_catch$measurement_value) <- "numeric"
#@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database
# } else if(!is.null(opts$doinominal)){
# zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data")
# nominal_catch <-
# readr::read_csv(here::here(file.path("data", opts$keynominal)),
# guess_max = 0)
# class(nominal_catch$measurement_value) <- "numeric"

} else {
stop("Please provide a nominal catch dataset")
# nominal_catch <- retrieve_nominal_catch(entity, config, opts)
}

class(nominal_catch$measurement_value) <- "numeric"
mapping_keep_src_code <- FALSE

class(nominal_catch$measurement_unit) <- "character"

if (any(nominal_catch$measurement_unit == "t"))
nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t"
if (any(nominal_catch$measurement_unit == "TRUE"))
nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t"
if (any(nominal_catch$measurement_unit == "no"))
nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- "no"
class(nominal_catch$measurement_value) <- "numeric"

# Based on this analysis, the following cutoff years were determined for data completeness:
#
# IOTC: Keep data starting from 1953.
# ICCAT: Keep data starting from 1957.
# WCPFC: Keep data starting from 1952.
# IATTC: Keep data starting from 1957.
# CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data.

nominal_catch <- nominal_catch %>%
dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>%
dplyr::mutate(year =lubridate::year(time_start)) %>%
dplyr::filter((source_authority == "WCPFC" & year >= 1952) |
(source_authority == "IOTC" & year >= 1953) |
(source_authority == "ICCAT" & year >= 1957) |
(source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>%
dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier)

nominal_catch <- nominal_catch %>%
dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type))

nominal_catch <- nominal_catch %>%
dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>%
dplyr::ungroup() %>%
dplyr::group_by(across(-measurement_value)) %>%
dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop')

# georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset,
# list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>%
# dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference))
# #
# # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority", "gear_type"))%>%
# # dplyr::distinct()
# #
# georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority"))%>%
# dplyr::distinct()
#
# global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>%
# dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value))

# LEVEL 1 IRD ---------------------------------------------------
if(DATASET_LEVEL >= 1){

config$logger.info(
"Extract and load FIRMS Level 0 nominal catch data input (required if raising process is asked) "
)
if(file.exists(file.path("data", opts$keynominal)) && !opts$forceuseofdoi){
nominal_catch <-
readr::read_csv(here::here(file.path("data", opts$keynominal)),
guess_max = 0)
class(nominal_catch$measurement_value) <- "numeric"
#@juldebar if not provided by Google drive line below should be used if nominal catch has to be extracted from the database
# } else if(!is.null(opts$doinominal)){
# zen4R::download_zenodo(doi = opts$doinominal, files = opts$keynominal, path = "data")
# nominal_catch <-
# readr::read_csv(here::here(file.path("data", opts$keynominal)),
# guess_max = 0)
# class(nominal_catch$measurement_value) <- "numeric"

} else {
stop("Please provide a nominal catch dataset")
# nominal_catch <- retrieve_nominal_catch(entity, config, opts)
}

class(nominal_catch$measurement_value) <- "numeric"
mapping_keep_src_code <- FALSE

class(nominal_catch$measurement_unit) <- "character"

if (any(nominal_catch$measurement_unit == "t"))
nominal_catch[nominal_catch$measurement_unit == "t",]$measurement_unit <- "t"
if (any(nominal_catch$measurement_unit == "TRUE"))
nominal_catch[nominal_catch$measurement_unit == "TRUE",]$measurement_unit <- "t"
if (any(nominal_catch$measurement_unit == "no"))
nominal_catch[nominal_catch$measurement_unit == "no",]$measurement_unit <- "no"
class(nominal_catch$measurement_value) <- "numeric"

# Based on this analysis, the following cutoff years were determined for data completeness:
#
# IOTC: Keep data starting from 1953.
# ICCAT: Keep data starting from 1957.
# WCPFC: Keep data starting from 1952.
# IATTC: Keep data starting from 1957.
# CCSBT: Keep all data, although years 2017 and 2020 have missing months, likely due to no fishing activity rather than missing data.

nominal_catch <- nominal_catch %>%
dplyr::mutate(fishing_fleet = ifelse(fishing_fleet == "UNK", "NEI", fishing_fleet))%>%
dplyr::mutate(year =lubridate::year(time_start)) %>%
dplyr::filter((source_authority == "WCPFC" & year >= 1952) |
(source_authority == "IOTC" & year >= 1953) |
(source_authority == "ICCAT" & year >= 1957) |
(source_authority == "IATTC" & year >= 1957) | source_authority == "CCSBT") %>%
dplyr::select(-year) %>% dplyr::rename(geographic_identifier_nom = geographic_identifier)

nominal_catch <- nominal_catch %>%
dplyr::mutate(gear_type = ifelse(source_authority == "WCPFC" & gear_type == "09.31", "09.32", gear_type))

nominal_catch <- nominal_catch %>%
dplyr::select(-dplyr::any_of(c("measurement", "measurement_status", "measurement_type")))%>%
dplyr::ungroup() %>%
dplyr::group_by(across(-measurement_value)) %>%
dplyr::summarise(measurement_value = sum(measurement_value, na.rm = TRUE), .groups = 'drop')

# georef_sup_nom_init <- compare_nominal_georef_corrected(nominal_catch, georef_dataset,
# list(c("species", "year", "source_authority", "gear_type", "fishing_fleet")))$`species, year, source_authority, gear_type, fishing_fleet`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority", "gear_type", "fishing_fleet")) %>%
# dplyr::distinct() %>% dplyr::group_by(source_authority, year, species) %>% dplyr::mutate(sum = sum(Difference))
# #
# # georef_sup_nom_init_species_year_gear <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority", "gear_type")))$`species, year, source_authority, gear_type`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority", "gear_type"))%>%
# # dplyr::distinct()
# #
# georef_sup_nom_init_species_year <- compare_nominal_georef_corrected(nominal_catch, georef_dataset, list(c("species", "year", "source_authority")))$`species, year, source_authority`$georef_sup_nominal %>%
# # dplyr::select(c("species", "year", "source_authority"))%>%
# dplyr::distinct()
#
# global_nominal_catch_firms_level0_nei <- nominal_catch %>% dplyr::filter(species %in% c("TUN", "TUS", "BIL")) %>% dplyr::mutate(year = as.character(year(ymd(time_start)))) %>%
# dplyr::inner_join(georef_sup_nom_init_species_year, by = c("year", "source_authority")) %>% dplyr::group_by(species.x, year) %>% dplyr::mutate(sumx = sum(measurement_value))

# Everything inside this condition runs only for dataset level 1 and above
config$logger.info("Level 1 start")

Expand Down
20 changes: 10 additions & 10 deletions tunaatlas_scripts/generation/get_rfmos_datasets_level0.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
iotc_data <- NULL
if(options$include_IOTC){
config$logger.info(sprintf("Get %s data", rfmo))
dataset_files_iotc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("iotc", names(dataset_files)) > 0]
dataset_files_iotc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("iotc", names(dataset_files)) > 0])
iotc_data <- do.call("rbind", lapply(dataset_files_iotc, readr::read_csv, guess_max = 0))
iotc_data <- as.data.frame(iotc_data)
class(iotc_data$measurement_value) <- "numeric"
Expand All @@ -40,8 +40,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
wcpfc_data <- NULL
if(options$include_WCPFC){
config$logger.info(sprintf("Get %s data", rfmo))
dataset_files_wcpfc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("wcpfc", names(dataset_files)) > 0]
dataset_files_wcpfc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("wcpfc", names(dataset_files)) > 0])
wcpfc_data <- do.call("rbind", lapply(dataset_files_wcpfc, readr::read_csv, guess_max = 0))
wcpfc_data <- as.data.frame(wcpfc_data)
class(wcpfc_data$measurement_value) <- "numeric"
Expand All @@ -59,8 +59,8 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
ccsbt_data <- NULL
if(options$include_CCSBT){
config$logger.info(sprintf("Get %s data", rfmo))
dataset_files_ccsbt <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("ccsbt", names(dataset_files)) > 0]
dataset_files_ccsbt <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("ccsbt", names(dataset_files)) > 0])
ccsbt_data <- do.call("rbind", lapply(dataset_files_ccsbt, readr::read_csv, guess_max = 0))
ccsbt_data <- as.data.frame(ccsbt_data)
class(ccsbt_data$measurement_value) <- "numeric"
Expand All @@ -77,9 +77,9 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
iccat_data <- NULL
if(options$include_ICCAT){
config$logger.info(sprintf("Get %s data", rfmo))
dataset_files_iccat <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
dataset_files_iccat <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("byschool", names(dataset_files)) < 0 &
regexpr("iccat", names(dataset_files)) > 0]
regexpr("iccat", names(dataset_files)) > 0])
iccat_data <- do.call("rbind", lapply(dataset_files_iccat, readr::read_csv, guess_max = 0))
iccat_data <- as.data.frame(iccat_data)

Expand Down Expand Up @@ -147,10 +147,10 @@ get_rfmos_datasets_level0 <- function(rfmo, entity, config, options){
if(variable == "catch") {

config$logger.info(sprintf("Get %s data", rfmo))
dataset_files_iattc <- dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
dataset_files_iattc <- basename(dataset_files[regexpr("nominal", names(dataset_files)) < 0 &
regexpr("ps", names(dataset_files)) < 0 &
regexpr("effort", names(dataset_files)) < 0 &
regexpr("iattc", names(dataset_files)) > 0]
regexpr("iattc", names(dataset_files)) > 0])
iattc_data <- do.call("rbind", lapply(dataset_files_iattc, readr::read_csv, guess_max = 0))
iattc_data <- as.data.frame(iattc_data)

Expand Down

0 comments on commit 0617534

Please sign in to comment.