Skip to content

Commit

Permalink
chore: tidying launching of new datasets level0/1/2
Browse files Browse the repository at this point in the history
  • Loading branch information
bastienird committed Jan 30, 2025
1 parent 2ddf479 commit 93a6dfc
Showing 1 changed file with 7 additions and 128 deletions.
135 changes: 7 additions & 128 deletions launching_jsons_creating_GTA.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Load 'renv' for project-specific environments
# if (!require("renv")) install.packages("renv")
library(renv)
# install.packages("pak")
# pak::pak("bastienird/CWP.dataset")
# Activate the project environment (if using project-specific libraries)
# renv::activate()
# Restore the project library (if using renv)
Expand All @@ -27,15 +29,6 @@ install_and_load <- function(package) {

# Apply the function to each required package
sapply(required_packages, install_and_load)
#' Rename an entry of the "jobs" folder by appending a suffix to its name
#'
#' @param executed_file Path of the executed job; only its base name is used.
#' @param suffix Text appended to the base name to build the new name.
#' @return The new relative path, i.e. "jobs/<basename><suffix>".
executeAndRename <- function(executed_file, suffix) {
  job_name <- basename(executed_file)
  renamed_path <- paste0("jobs/", job_name, suffix)

  # The entry is looked up under "jobs", relative to the working directory.
  file.rename(from = file.path("jobs", job_name), to = renamed_path)

  # The new path is returned whether or not the rename succeeded.
  renamed_path
}
# Attach geoflow with library(): a missing package stops the script here with
# an error, whereas require() would only return FALSE with a warning.
library(geoflow)

# Note: This script assumes that the internet connection is available and
Expand All @@ -55,17 +48,8 @@ if(file.exists(here::here("geoserver_sdi_lab.env"))){

load_dot_env(file = here::here(default_file)) # to be replaced by the one used
# load_dot_env(file = "~/Documents/Tunaatlas_level1/catch_local.env")

#' Time elapsed between the last modification of "job.json" and "job-logs.txt"
#'
#' @param folder Directory containing "job.json" and "job-logs.txt".
#' @return A difftime: modification time of "job-logs.txt" minus modification
#'   time of "job.json".
running_time_of_workflow <- function(folder) {
  config_mtime <- file.mtime(file.path(folder, "job.json"))
  log_mtime <- file.mtime(file.path(folder, "job-logs.txt"))

  log_mtime - config_mtime
}
# Source the helper scripts. These paths are anchored at the home directory
# ("~"), so they are passed to source() directly, like the other source()
# calls of this script, instead of being wrapped in here::here(), which is
# meant for paths relative to the project root.
source("~/firms-gta/geoflow-tunaatlas/R/running_time_of_workflow.R")
source("~/firms-gta/geoflow-tunaatlas/R/executeAndRename.R")

# Build the workflow configuration object with initWorkflow() from the JSON
# file resolved against the project root by here::here().
config <- initWorkflow(here::here("tunaatlas_qa_global_datasets_catch.json"))
# Recursively delete the path held in config$job
# (presumably the job directory created by initWorkflow — TODO confirm).
unlink(config$job, recursive = TRUE)
Expand All @@ -76,7 +60,7 @@ action <- entity$data$actions[[1]]

# Raise an error so that sourcing the script halts at this point
# (presumably so the steps below are run manually, one at a time — TODO confirm).
stop("Stop")
# First step: create the database model and load the code lists (around 5 minutes).
# NOTE(review): the same workflow is executed twice in a row below and the second
# result overwrites the first — confirm whether the repetition is intended.
db_model <- executeWorkflow(here("tunaatlas_qa_dbmodel+codelists.json"))
db_model <- executeWorkflow(here("tunaatlas_qa_dbmodel+codelists.json"))
# Append the "_db_model" suffix to the job folder name and keep the new path.
db_model <- executeAndRename(db_model, "_db_model")
# Display the time difference computed by running_time_of_workflow() for that path.
running_time_of_workflow(db_model)

Expand All @@ -91,14 +75,14 @@ running_time_of_workflow(mappings)
## Nominal data: these datasets are mandatory to create the level 2 georeferenced dataset; they are not mandatory for level 0 or level 1.
# Running time: around 2.7 minutes
raw_nominal_catch <- executeWorkflow(here::here("Raw_nominal_catch.json"))
# NOTE(review): executeAndRename() is applied twice in a row, so the second
# suffix is appended to the name already renamed by the first call — confirm
# this is intended.
raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch")
raw_nominal_catch <- executeAndRename(raw_nominal_catch, "_raw_nominal_catch_2024")
running_time_of_workflow(raw_nominal_catch)


## Georeferenced catch: these datasets contain catch and, for some data, effort, because effort is used to raise catch data for levels 0 to 2.
# Running time: around 1.2 hours
raw_data_georef <- executeWorkflow(here::here("All_raw_data_georef.json"))
# NOTE(review): same double renaming as for the nominal catch above — confirm
# this is intended.
raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef")
raw_data_georef <- executeAndRename(raw_data_georef, "_raw_data_georef_2024")
running_time_of_workflow(raw_data_georef)

## Georeferenced effort: these datasets are used to create the georeferenced effort
Expand Down Expand Up @@ -159,106 +143,6 @@ tunaatlas_qa_global_datasets_catch_path <- executeAndRename(tunaatlas_qa_global_

# Display the time difference computed by running_time_of_workflow() for the
# path held in tunaatlas_qa_global_datasets_catch_path.
running_time_of_workflow(tunaatlas_qa_global_datasets_catch_path)
# NOTE(review): set to an empty string; where it is used is not visible here —
# confirm it is still needed.
create_materialized_view <- ""
#' Compare nominal catches with mapped georeferenced catches, stratum by stratum
#'
#' For every set of grouping columns in `list_strata`, both datasets are
#' aggregated and compared. The function reports the strata present in the
#' georeferenced data but absent from the nominal data, and the strata whose
#' georeferenced total (rows with `measurement_unit == "t"` only) exceeds the
#' nominal total by more than 1.
#'
#' Both inputs are converted with `setDT()` and receive a `year` column through
#' `:=`. These data.table operations work by reference, so the objects passed
#' by the caller may be modified (NOTE(review): confirm callers accept this).
#'
#' @param nominal Nominal catch table. Must contain `time_start`,
#'   `measurement_value` and every column named in `list_strata` (`year` is
#'   derived from `time_start`).
#' @param georef_mapped Georeferenced catch table with the same columns plus
#'   `measurement_unit`.
#' @param list_strata List of character vectors; each vector names the columns
#'   that define one stratification.
#' @return A list with one element per stratification, named
#'   `toString(strata)`. Each element is a list with:
#'   `georef_no_nominal` (strata in georef but not in nominal),
#'   `georef_no_nominal_with_value` (same strata with their tonnage, column
#'   `measurement_value`), `georef_tons_no_nominal` (strata of the tonnes-only
#'   data absent from nominal), `georef_sup_nominal` (common strata where
#'   georef tonnes exceed nominal by more than 1, with a `Difference` column),
#'   `tons_nei_nominal`, `tons_nei_georef` (tonnage with `fishing_fleet == "NEI"`,
#'   0 when `fishing_fleet` is not a stratum), `sum_georef_no_nominal`,
#'   `suffisant` (TRUE when the georef excess is not greater than the tonnage
#'   found under NEI fleets and under species codes TUN/TUS/BIL),
#'   `tons_aggregated_georef` (0 when `species` is not a stratum) and
#'   `sum_georef_sup_nom`.
compare_nominal_georef_corrected <- function(
    nominal,
    georef_mapped,
    list_strata = list(c(
      "species", "year", "source_authority", "gear_type",
      "fishing_fleet", "geographic_identifier_nom"
    ))) {
  # Convert the data.frames to data.tables (by reference)
  setDT(nominal)
  setDT(georef_mapped)

  # Create the "year" column from time_start
  georef_mapped[, year := as.character(year(ymd(time_start)))]
  nominal[, year := as.character(year(ymd(time_start)))]

  # Keep only the data expressed in tonnes
  georef_mapped_tons <- georef_mapped[measurement_unit == "t"]

  # One result per set of dimensions used for the comparison
  results <- list()

  for (strata in list_strata) {
    # Name of the current stratification
    name <- toString(strata)

    # Aggregate nominal and georef data over the columns listed in `strata`
    # (e.g. group by year, species, gear, fleet)
    nominal_grouped <- nominal[, .(measurement_value_nominal = sum(measurement_value, na.rm = TRUE)), by = strata]
    georef_mapped_grouped <- georef_mapped[, .(measurement_value_georef = sum(measurement_value, na.rm = TRUE)), by = strata]
    georef_mapped_tons_grouped <- georef_mapped_tons[, .(measurement_value_georef_tons = sum(measurement_value, na.rm = TRUE)), by = strata]

    # Drop the value columns so that only the strata themselves are compared
    nominal_grouped_without_value <- nominal_grouped[, .SD, .SDcols = strata]
    georef_grouped_without_value <- georef_mapped_grouped[, .SD, .SDcols = strata]
    georef_tons_grouped_without_value <- georef_mapped_tons_grouped[, .SD, .SDcols = strata]

    # Make sure the columns are in the same order before the set difference
    setcolorder(georef_grouped_without_value, names(nominal_grouped_without_value))
    setcolorder(georef_tons_grouped_without_value, names(nominal_grouped_without_value))

    # Strata present in georef_mapped but absent from nominal
    georef_no_nominal <- fsetdiff(georef_grouped_without_value, nominal_grouped_without_value, all = FALSE)
    georef_no_nominal_with_value <- merge(georef_mapped_tons_grouped, georef_no_nominal, by = strata, all = FALSE)
    sum_georef_no_nominal_tons <- sum(georef_no_nominal_with_value$measurement_value_georef_tons, na.rm = TRUE)

    # Same comparison restricted to the data in tonnes
    georef_tons_no_nominal <- fsetdiff(georef_tons_grouped_without_value, nominal_grouped_without_value, all = FALSE)

    # Compare values on the strata common to nominal and georef (tonnes only)
    georef_sup_nominal <- merge(nominal_grouped, georef_mapped_tons_grouped, by = strata, all = FALSE)

    # Check that both value columns exist after the merge
    if ("measurement_value_georef_tons" %in% names(georef_sup_nominal) &&
      "measurement_value_nominal" %in% names(georef_sup_nominal)) {
      georef_sup_nominal[, Difference := measurement_value_georef_tons - measurement_value_nominal]
      # Strictly greater than 1, so that small discrepancies are ignored
      georef_sup_nominal <- georef_sup_nominal[round(Difference, 3) > 1]
    } else {
      # Empty table when there is nothing to compare
      georef_sup_nominal <- data.table()
    }

    # Georef tonnage under "NEI" fleets; only computable when fishing_fleet
    # is one of the strata
    if ("fishing_fleet" %in% colnames(georef_sup_nominal)) {
      tons_nei_georef <- georef_no_nominal_with_value[
        fishing_fleet == "NEI",
        sum(measurement_value_georef_tons)
      ] + georef_sup_nominal[
        fishing_fleet == "NEI",
        sum(measurement_value_georef_tons)
      ]
    } else {
      tons_nei_georef <- 0
    }

    # Georef tonnage under the species codes TUN/TUS/BIL (presumably
    # aggregated species groups — TODO confirm). Guarded like fishing_fleet:
    # without a `species` stratum the filter below would fail on a missing
    # column, so the tonnage is reported as 0 instead.
    if ("species" %in% colnames(georef_sup_nominal)) {
      tons_aggregated_georef <- georef_no_nominal_with_value[
        species %in% c("TUN", "TUS", "BIL"),
        sum(measurement_value_georef_tons)
      ] + georef_sup_nominal[
        species %in% c("TUN", "TUS", "BIL"),
        sum(measurement_value_georef_tons)
      ]
    } else {
      tons_aggregated_georef <- 0
    }

    # Nominal tonnage under "NEI" fleets
    if ("fishing_fleet" %in% colnames(nominal_grouped)) {
      tons_nei_nominal <- nominal_grouped[
        fishing_fleet == "NEI",
        sum(measurement_value_nominal)
      ]
    } else {
      tons_nei_nominal <- 0
    }

    sum_georef_sup_nom <- sum(georef_sup_nominal$Difference, na.rm = TRUE)

    # TRUE when the georef excess does not exceed the tonnage found under
    # aggregated species and NEI fleets
    unexplained_tons <- sum_georef_no_nominal_tons + sum_georef_sup_nom -
      (tons_aggregated_georef + tons_nei_georef)
    suffisant <- !(unexplained_tons > 0)

    # Store the results for this stratification
    results[[name]] <- list(
      georef_no_nominal = georef_no_nominal, # strata in georef but absent from nominal
      georef_no_nominal_with_value = georef_no_nominal_with_value %>% dplyr::rename(measurement_value = measurement_value_georef_tons), # same strata with their total value
      georef_tons_no_nominal = georef_tons_no_nominal, # strata in tonnes absent from nominal
      georef_sup_nominal = georef_sup_nominal, # strata where georef is greater than nominal
      tons_nei_nominal = tons_nei_nominal, # NEI tonnage that could explain the differences
      tons_nei_georef = tons_nei_georef, # NEI tonnage that could explain the differences
      sum_georef_no_nominal = sum_georef_no_nominal_tons,
      suffisant = suffisant,
      tons_aggregated_georef = tons_aggregated_georef,
      sum_georef_sup_nom = sum_georef_sup_nom
    )
  }

  results
}
source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/process_fisheries_data_by_species.R")

# IRD_data <- readr::read_csv("data/IOTC_conv_fact_mapped.csv")
Expand Down Expand Up @@ -299,11 +183,6 @@ con <- config$software$output$dbi
# )
# Source the script expected to define Summarising_step()
# (presumably defined in this file — TODO confirm).
source("~/firms-gta/geoflow-tunaatlas/Analysis_markdown/functions/Summarising_step.R")
# NOTE(review): setwd() changes the working directory for the rest of the
# session; relative paths used afterwards resolve against this directory.
setwd("~/firms-gta/geoflow-tunaatlas")
# Run Summarising_step() on the global catch output with sizepdf = "short" and
# then "middle", for the source authorities "WCPFC" and "all".
Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "short",savestep = FALSE, usesave = FALSE,
source_authoritylist = c("WCPFC" ,"all" ))
Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE,
source_authoritylist = c("WCPFC" ,"all" ))

# Same "short" run with the longer list of source authorities.
Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "short",savestep = FALSE, usesave = FALSE,
source_authoritylist = c("all", "WCPFC", "IATTC", "ICCAT", "CCSBT", "IOTC" ))
Summarising_step(main_dir = tunaatlas_qa_global_datasets_catch_path, connectionDB = con, config = config, sizepdf = "middle",savestep = FALSE, usesave = FALSE,
Expand Down

0 comments on commit 93a6dfc

Please sign in to comment.