From 93a6dfcf27a3410937eee9c815f0fabdbf2ccbd3 Mon Sep 17 00:00:00 2001
From: bastienird <bastien.grasset@ird.fr>
Date: Thu, 30 Jan 2025 13:34:45 +0000
Subject: [PATCH] chore: tidying launching of new datasets level0/1/2

---
 launching_jsons_creating_GTA.R | 135 ++-------------------------------
 1 file changed, 7 insertions(+), 128 deletions(-)

diff --git a/launching_jsons_creating_GTA.R b/launching_jsons_creating_GTA.R
index 67731f1..4c8b97e 100644
--- a/launching_jsons_creating_GTA.R
+++ b/launching_jsons_creating_GTA.R
@@ -1,6 +1,8 @@
 # Load 'renv' for project-specific environments
 # if (!require("renv")) install.packages("renv")
 library(renv)
+# install.packages("pak")
+# pak::pak("bastienird/CWP.dataset")
 # Activate the project environment (if using project-specific libraries)
 # renv::activate()
 # Restore the project library (if using renv)
@@ -27,15 +29,6 @@ install_and_load <- function(package) {
 
 # Apply the function to each required package
 sapply(required_packages, install_and_load)
-executeAndRename <- function(executed_file, suffix) {
-  
-  # Derive folder and file names
-  folder_file <- file.path("jobs", basename(executed_file))
-  
-  # Rename the file with the given suffix
-  file.rename(folder_file, paste0("jobs/", basename(executed_file), suffix))
-  return(paste0("jobs/", basename(executed_file), suffix))
-}
 require(geoflow)
 
 # Note: This script assumes that the internet connection is available and
@@ -55,17 +48,8 @@ if(file.exists(here::here("geoserver_sdi_lab.env"))){
 
 load_dot_env(file = here::here(default_file)) # to be replaced by the one used
 # load_dot_env(file = "~/Documents/Tunaatlas_level1/catch_local.env")
-
-running_time_of_workflow <- function(folder){
-  # Get the last modified times of the files
-  json_time <- file.info(file.path(folder, "job.json"))$mtime
-  txt_time <- file.info(file.path(folder, "job-logs.txt"))$mtime
-  
-  # Calculate the difference
-  time_difference <- txt_time - json_time
-  
-  return(time_difference)
-}
+source("~/firms-gta/geoflow-tunaatlas/R/running_time_of_workflow.R")
+source("~/firms-gta/geoflow-tunaatlas/R/executeAndRename.R")
 
 config <- initWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json"))
 unlink(config$job, recursive = TRUE)
@@ -76,7 +60,7 @@ action <- entity$data$actions[[1]]
 
 stop("Stop")
 # First step is creation of the database model and loading of the codelist (around 5 minutes)
-db_model <- executeWorkflow(here("tunaatlas_qa_dbmodel+codelists.json")) 
+db_model <- executeWorkflow(here::here("tunaatlas_qa_dbmodel+codelists.json"))
 db_model <- executeAndRename(db_model, "_db_model")
 running_time_of_workflow(db_model)
 
@@ -91,14 +75,14 @@ running_time_of_workflow(mappings)
 ## Nominal data: These datasets are mandatory to create the georeferenced dataset level 2. For level 0 or 1 they are not mandatory time around 2.7 minutes
 # Around 2.7 minutes
 raw_nominal_catch <- executeWorkflow(here::here("Raw_nominal_catch.json"))
-raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch")
+raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch_2024")
 running_time_of_workflow(raw_nominal_catch)
 
 
 ## Georeferenced catch: These datasets contains catch AND EFFORT FOR SOME DATA as effort are used to raise catch data for level 0 to 2
 # Around 1.2 hours
 raw_data_georef <- executeWorkflow(here::here("All_raw_data_georef.json"))
-raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef")
+raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef_2024")
 running_time_of_workflow(raw_data_georef)
 
 ## Goereferenced effort: These datasets are used to create the georeferenced effort
@@ -159,106 +143,6 @@ tunaatlas_qa_global_datasets_catch_path <- executeAndRename(tunaatlas_qa_global_
 
 running_time_of_workflow(tunaatlas_qa_global_datasets_catch_path)
 create_materialized_view <- ""
-compare_nominal_georef_corrected <- function(nominal, georef_mapped, list_strata = list(c("species", "year", "source_authority", "gear_type", "fishing_fleet", "geographic_identifier_nom"))) {
-  # Convertir les data.frames en data.tables
-  setDT(nominal)
-  setDT(georef_mapped)
-  
-  # Créer la colonne "year" à partir de time_start
-  georef_mapped[, year := as.character(year(ymd(time_start)))]
-  nominal[, year := as.character(year(ymd(time_start)))]
-  
-  # Conserver uniquement les données en tonnes
-  georef_mapped_tons <- georef_mapped[measurement_unit == "t"]
-  
-  # Initialise une liste pour stocker les résultats (un résultat pour chaque liste de dimensions à conserver pour faire la comparaison)
-  results <- list()
-  
-  for (strata in list_strata) {
-    # Nom pour la catégorie actuelle de strata
-    name <- paste0(toString(strata))
-    
-    # Agréger les données pour le nominal et georef sur les colonnes spécifiées dans 'strata' (ex groupper les données par années, espèces, engins, pavillon)
-    nominal_grouped <- nominal[, .(measurement_value_nominal = sum(measurement_value, na.rm = TRUE)), by = strata]
-    georef_mapped_grouped <- georef_mapped[, .(measurement_value_georef = sum(measurement_value, na.rm = TRUE)), by = strata]
-    georef_mapped_tons_grouped <- georef_mapped_tons[, .(measurement_value_georef_tons = sum(measurement_value, na.rm = TRUE)), by = strata]
-    
-    # # Retirer les valeurs des colonnes pour comparer uniquement les strates (si on veut garder que elles)
-    nominal_grouped_without_value <- nominal_grouped[, .SD, .SDcols = strata]
-    georef_grouped_without_value <- georef_mapped_grouped[, .SD, .SDcols = strata]
-    georef_tons_grouped_without_value <- georef_mapped_tons_grouped[, .SD, .SDcols = strata]
-    
-    
-    # # Assurer que les colonnes sont dans le même ordre pour la comparaison
-    setcolorder(georef_grouped_without_value, names(nominal_grouped_without_value))
-    setcolorder(georef_tons_grouped_without_value, names(nominal_grouped_without_value))
-    
-    # Trouver les strates présentes dans georef_mapped mais absentes de nominal
-    georef_no_nominal <- fsetdiff(georef_grouped_without_value, nominal_grouped_without_value, all = FALSE)
-    georef_no_nominal_with_value <- merge(georef_mapped_tons_grouped, georef_no_nominal, by = strata, all = FALSE)
-    sum_georef_no_nominal_tons <- sum(georef_no_nominal_with_value$measurement_value_georef_tons ,na.rm = TRUE)
-    
-    
-    # Comparer uniquement les données en tonnes
-    georef_tons_no_nominal <- fsetdiff(georef_tons_grouped_without_value, nominal_grouped_without_value, all = FALSE)
-    
-    # Comparer les valeurs des strates communes entre nominal et georef_mapped pour les données en tonnes
-    georef_sup_nominal <- merge(nominal_grouped, georef_mapped_tons_grouped, by = strata, all = FALSE)
-    
-    # Vérifier si les colonnes existent après le merge
-    if ("measurement_value_georef_tons" %in% names(georef_sup_nominal) && 
-        "measurement_value_nominal" %in% names(georef_sup_nominal)) {
-      georef_sup_nominal[, Difference := measurement_value_georef_tons - measurement_value_nominal]
-      georef_sup_nominal <- georef_sup_nominal[round(Difference, 3) > 1] # Supérieur strictement à 1, on s'affranchit des petits kouaks
-    } else {
-      georef_sup_nominal <- data.table()  # Retourne une table vide s'il n'y a pas de données
-    }
-    
-    if ("fishing_fleet" %in% colnames(georef_sup_nominal)){
-      tons_nei_georef <- georef_no_nominal_with_value[
-        fishing_fleet == "NEI" ,
-        sum(measurement_value_georef_tons)] + georef_sup_nominal[
-          fishing_fleet == "NEI" ,
-          sum(measurement_value_georef_tons) 
-        ]} else {
-          tons_nei_georef <- 0
-        }
-    
-    tons_aggregated_georef <- georef_no_nominal_with_value[
-      species %in% c("TUN", "TUS" ,"BIL"),
-      sum(measurement_value_georef_tons)
-    ] + georef_sup_nominal[
-      species %in% c("TUN", "TUS" ,"BIL"),
-      sum(measurement_value_georef_tons)
-    ]
-    
-    if ("fishing_fleet" %in% colnames(nominal_grouped)){
-      tons_nei_nominal <- nominal_grouped[
-        fishing_fleet == "NEI",
-        sum(measurement_value_nominal)
-      ]} else {tons_nei_nominal <- 0}
-    
-    
-    sum_georef_sup_nom <- sum(georef_sup_nominal$Difference, na.rm = TRUE)
-    
-    suffisant <- ifelse(sum_georef_no_nominal_tons + sum_georef_sup_nom -(tons_aggregated_georef + tons_nei_georef) > 0, FALSE, TRUE)
-    # Stocker les résultats
-    results[[name]] <- list(
-      georef_no_nominal = georef_no_nominal,           # Strates dans georef mais absentes dans nominal
-      georef_no_nominal_with_value = georef_no_nominal_with_value %>% dplyr::rename(measurement_value = measurement_value_georef_tons),           # Strates dans georef mais absentes dans nominal avec la valeur totale
-      georef_tons_no_nominal = georef_tons_no_nominal, # Strates en tonnes absentes dans nominal
-      georef_sup_nominal = georef_sup_nominal,          # Strates où georef est supérieur à nominal
-      tons_nei_nominal = tons_nei_nominal,          # Strates nei qui pourraient expliquer les différences
-      tons_nei_georef = tons_nei_georef,          # Strates nei qui pourraient expliquer les différences
-      sum_georef_no_nominal = sum_georef_no_nominal_tons, 
-      suffisant = suffisant, 
-      tons_aggregated_georef = tons_aggregated_georef,
-      sum_georef_sup_nom = sum_georef_sup_nom
-    )
-  }
-  
-  return(results)
-}
 source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/process_fisheries_data_by_species.R")
 
 # IRD_data <- readr::read_csv("data/IOTC_conv_fact_mapped.csv")
@@ -299,11 +183,6 @@ con <- config$software$output$dbi
 # )
 source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/Summarising_step.R")
 setwd("~/firms-gta/geoflow-tunaatlas")
-Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config  = config, sizepdf = "short",savestep = FALSE, usesave = FALSE, 
-                 source_authoritylist = c("WCPFC" ,"all" ))
-Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config  = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE, 
-                 source_authoritylist = c("WCPFC" ,"all" ))
-
 Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config  = config, sizepdf = "short",savestep = FALSE, usesave = FALSE, 
                  source_authoritylist = c("all", "WCPFC", "IATTC", "ICCAT", "CCSBT", "IOTC" ))
 Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config  = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE,