From 93a6dfcf27a3410937eee9c815f0fabdbf2ccbd3 Mon Sep 17 00:00:00 2001 From: bastienird Date: Thu, 30 Jan 2025 13:34:45 +0000 Subject: [PATCH] chore: tidying launching of new datasets level0/1/2 --- launching_jsons_creating_GTA.R | 135 ++------------------------------- 1 file changed, 7 insertions(+), 128 deletions(-) diff --git a/launching_jsons_creating_GTA.R b/launching_jsons_creating_GTA.R index 67731f1..4c8b97e 100644 --- a/launching_jsons_creating_GTA.R +++ b/launching_jsons_creating_GTA.R @@ -1,6 +1,8 @@ # Load 'renv' for project-specific environments # if (!require("renv")) install.packages("renv") library(renv) +# install.packages("pak") +# pak::pak("bastienird/CWP.dataset") # Activate the project environment (if using project-specific libraries) # renv::activate() # Restore the project library (if using renv) @@ -27,15 +29,6 @@ install_and_load <- function(package) { # Apply the function to each required package sapply(required_packages, install_and_load) -executeAndRename <- function(executed_file, suffix) { - - # Derive folder and file names - folder_file <- file.path("jobs", basename(executed_file)) - - # Rename the file with the given suffix - file.rename(folder_file, paste0("jobs/", basename(executed_file), suffix)) - return(paste0("jobs/", basename(executed_file), suffix)) -} require(geoflow) # Note: This script assumes that the internet connection is available and @@ -55,17 +48,8 @@ if(file.exists(here::here("geoserver_sdi_lab.env"))){ load_dot_env(file = here::here(default_file)) # to be replaced by the one used # load_dot_env(file = "~/Documents/Tunaatlas_level1/catch_local.env") - -running_time_of_workflow <- function(folder){ - # Get the last modified times of the files - json_time <- file.info(file.path(folder, "job.json"))$mtime - txt_time <- file.info(file.path(folder, "job-logs.txt"))$mtime - - # Calculate the difference - time_difference <- txt_time - json_time - - return(time_difference) -} +source(here::here("~/firms-gta/geoflow-tunaatlas/R/running_time_of_workflow.R")) +source(here::here("~/firms-gta/geoflow-tunaatlas/R/executeAndRename.R")) config <- initWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json")) unlink(config$job, recursive = TRUE) @@ -76,7 +60,7 @@ action <- entity$data$actions[[1]] stop("Stop") # First step is creation of the database model and loading of the codelist (around 5 minutes) -db_model <- executeWorkflow(here("tunaatlas_qa_dbmodel+codelists.json")) +db_model <- executeWorkflow(here("tunaatlas_qa_dbmodel+codelists.json")) db_model <- executeAndRename(db_model, "_db_model") running_time_of_workflow(db_model) @@ -91,14 +75,14 @@ running_time_of_workflow(mappings) ## Nominal data: These datasets are mandatory to create the georeferenced dataset level 2. For level 0 or 1 they are not mandatory time around 2.7 minutes # Around 2.7 minutes raw_nominal_catch <- executeWorkflow(here::here("Raw_nominal_catch.json")) -raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch") +raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch_2024") running_time_of_workflow(raw_nominal_catch) ## Georeferenced catch: These datasets contains catch AND EFFORT FOR SOME DATA as effort are used to raise catch data for level 0 to 2 # Around 1.2 hours raw_data_georef <- executeWorkflow(here::here("All_raw_data_georef.json")) -raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef") +raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef_2024") running_time_of_workflow(raw_data_georef) ## Goereferenced effort: These datasets are used to create the georeferenced effort @@ -159,106 +143,6 @@ tunaatlas_qa_global_datasets_catch_path <- executeAndRename(tunaatlas_qa_global_ running_time_of_workflow(tunaatlas_qa_global_datasets_catch_path) create_materialized_view <- "" -compare_nominal_georef_corrected <- function(nominal, georef_mapped, list_strata = list(c("species", "year", "source_authority", "gear_type", "fishing_fleet", "geographic_identifier_nom"))) { - # Convertir les data.frames en data.tables - setDT(nominal) - setDT(georef_mapped) - - # Créer la colonne "year" à partir de time_start - georef_mapped[, year := as.character(year(ymd(time_start)))] - nominal[, year := as.character(year(ymd(time_start)))] - - # Conserver uniquement les données en tonnes - georef_mapped_tons <- georef_mapped[measurement_unit == "t"] - - # Initialise une liste pour stocker les résultats (un résultat pour chaque liste de dimensions à conserver pour faire la comparaison) - results <- list() - - for (strata in list_strata) { - # Nom pour la catégorie actuelle de strata - name <- paste0(toString(strata)) - - # Agréger les données pour le nominal et georef sur les colonnes spécifiées dans 'strata' (ex groupper les données par années, espèces, engins, pavillon) - nominal_grouped <- nominal[, .(measurement_value_nominal = sum(measurement_value, na.rm = TRUE)), by = strata] - georef_mapped_grouped <- georef_mapped[, .(measurement_value_georef = sum(measurement_value, na.rm = TRUE)), by = strata] - georef_mapped_tons_grouped <- georef_mapped_tons[, .(measurement_value_georef_tons = sum(measurement_value, na.rm = TRUE)), by = strata] - - # # Retirer les valeurs des colonnes pour comparer uniquement les strates (si on veut garder que elles) - nominal_grouped_without_value <- nominal_grouped[, .SD, .SDcols = strata] - georef_grouped_without_value <- georef_mapped_grouped[, .SD, .SDcols = strata] - georef_tons_grouped_without_value <- georef_mapped_tons_grouped[, .SD, .SDcols = strata] - - - # # Assurer que les colonnes sont dans le même ordre pour la comparaison - setcolorder(georef_grouped_without_value, names(nominal_grouped_without_value)) - setcolorder(georef_tons_grouped_without_value, names(nominal_grouped_without_value)) - - # Trouver les strates présentes dans georef_mapped mais absentes de nominal - georef_no_nominal <- fsetdiff(georef_grouped_without_value, nominal_grouped_without_value, all = FALSE) - georef_no_nominal_with_value <- merge(georef_mapped_tons_grouped, georef_no_nominal, by = strata, all = FALSE) - sum_georef_no_nominal_tons <- sum(georef_no_nominal_with_value$measurement_value_georef_tons ,na.rm = TRUE) - - - # Comparer uniquement les données en tonnes - georef_tons_no_nominal <- fsetdiff(georef_tons_grouped_without_value, nominal_grouped_without_value, all = FALSE) - - # Comparer les valeurs des strates communes entre nominal et georef_mapped pour les données en tonnes - georef_sup_nominal <- merge(nominal_grouped, georef_mapped_tons_grouped, by = strata, all = FALSE) - - # Vérifier si les colonnes existent après le merge - if ("measurement_value_georef_tons" %in% names(georef_sup_nominal) && - "measurement_value_nominal" %in% names(georef_sup_nominal)) { - georef_sup_nominal[, Difference := measurement_value_georef_tons - measurement_value_nominal] - georef_sup_nominal <- georef_sup_nominal[round(Difference, 3) > 1] # Supérieur strictement à 1, on s'affranchit des petits kouaks - } else { - georef_sup_nominal <- data.table() # Retourne une table vide s'il n'y a pas de données - } - - if ("fishing_fleet" %in% colnames(georef_sup_nominal)){ - tons_nei_georef <- georef_no_nominal_with_value[ - fishing_fleet == "NEI" , - sum(measurement_value_georef_tons)] + georef_sup_nominal[ - fishing_fleet == "NEI" , - sum(measurement_value_georef_tons) - ]} else { - tons_nei_georef <- 0 - } - - tons_aggregated_georef <- georef_no_nominal_with_value[ - species %in% c("TUN", "TUS" ,"BIL"), - sum(measurement_value_georef_tons) - ] + georef_sup_nominal[ - species %in% c("TUN", "TUS" ,"BIL"), - sum(measurement_value_georef_tons) - ] - - if ("fishing_fleet" %in% colnames(nominal_grouped)){ - tons_nei_nominal <- nominal_grouped[ - fishing_fleet == "NEI", - sum(measurement_value_nominal) - ]} else {tons_nei_nominal <- 0} - - - sum_georef_sup_nom <- sum(georef_sup_nominal$Difference, na.rm = TRUE) - - suffisant <- ifelse(sum_georef_no_nominal_tons + sum_georef_sup_nom -(tons_aggregated_georef + tons_nei_georef) > 0, FALSE, TRUE) - # Stocker les résultats - results[[name]] <- list( - georef_no_nominal = georef_no_nominal, # Strates dans georef mais absentes dans nominal - georef_no_nominal_with_value = georef_no_nominal_with_value %>% dplyr::rename(measurement_value = measurement_value_georef_tons), # Strates dans georef mais absentes dans nominal avec la valeur totale - georef_tons_no_nominal = georef_tons_no_nominal, # Strates en tonnes absentes dans nominal - georef_sup_nominal = georef_sup_nominal, # Strates où georef est supérieur à nominal - tons_nei_nominal = tons_nei_nominal, # Strates nei qui pourraient expliquer les différences - tons_nei_georef = tons_nei_georef, # Strates nei qui pourraient expliquer les différences - sum_georef_no_nominal = sum_georef_no_nominal_tons, - suffisant = suffisant, - tons_aggregated_georef = tons_aggregated_georef, - sum_georef_sup_nom = sum_georef_sup_nom - ) - } - - return(results) -} source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/process_fisheries_data_by_species.R") # IRD_data <- readr::read_csv("data/IOTC_conv_fact_mapped.csv") @@ -299,11 +183,6 @@ con <- config$software$output$dbi # ) source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/Summarising_step.R") setwd("~/firms-gta/geoflow-tunaatlas") -Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "short",savestep = FALSE, usesave = FALSE, - source_authoritylist = c("WCPFC" ,"all" )) -Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE, - source_authoritylist = c("WCPFC" ,"all" )) - Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "short",savestep = FALSE, usesave = FALSE, source_authoritylist = c("all", "WCPFC", "IATTC", "ICCAT", "CCSBT", "IOTC" )) Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE,