From f7e04090db6d78c295a04df4302c05974db5fe1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Courivaud?= Date: Fri, 21 Feb 2025 07:20:07 -0300 Subject: [PATCH] Feat/better handling hierarchy (#1158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix hierarchy and cities Signed-off-by: Raphaël Courivaud * add external communes departements epci regions Signed-off-by: Raphaël Courivaud --------- Signed-off-by: Raphaël Courivaud --- analytics/dagster/src/assets/dwh/__init__.py | 17 +++++++- .../dagster/src/assets/dwh/ingest/__init__.py | 6 ++- .../assets/dwh/ingest/administrative_cuts.py | 40 +++++++++++++++++++ analytics/dagster/src/config.py | 13 +++++- analytics/dbt/macros/geo/get_hierarchy.sql | 6 +-- .../marts/admin/marts_admin_communes.sql | 3 ++ .../marts/admin/marts_admin_departements.sql | 3 ++ .../models/marts/admin/marts_admin_epci.sql | 3 ++ .../marts/admin/marts_admin_regions.sql | 3 ++ .../marts/common/marts_common_cities.sql | 11 ++--- .../marts_public_establishments_hierarchy.sql | 36 ++++++++--------- analytics/dbt/models/staging/admin/schema.yml | 7 ++++ .../dbt/models/staging/admin/sources.yml | 10 +++++ .../staging/admin/stg_admin_communes.sql | 3 ++ .../staging/admin/stg_admin_departements.sql | 3 ++ .../models/staging/admin/stg_admin_epci.sql | 3 ++ .../staging/admin/stg_admin_regions.sql | 3 ++ .../staging/common/stg_common_cities.sql | 2 +- 18 files changed, 138 insertions(+), 34 deletions(-) create mode 100644 analytics/dagster/src/assets/dwh/ingest/administrative_cuts.py create mode 100644 analytics/dbt/models/marts/admin/marts_admin_communes.sql create mode 100644 analytics/dbt/models/marts/admin/marts_admin_departements.sql create mode 100644 analytics/dbt/models/marts/admin/marts_admin_epci.sql create mode 100644 analytics/dbt/models/marts/admin/marts_admin_regions.sql create mode 100644 analytics/dbt/models/staging/admin/schema.yml create mode 100644 
analytics/dbt/models/staging/admin/sources.yml create mode 100644 analytics/dbt/models/staging/admin/stg_admin_communes.sql create mode 100644 analytics/dbt/models/staging/admin/stg_admin_departements.sql create mode 100644 analytics/dbt/models/staging/admin/stg_admin_epci.sql create mode 100644 analytics/dbt/models/staging/admin/stg_admin_regions.sql diff --git a/analytics/dagster/src/assets/dwh/__init__.py b/analytics/dagster/src/assets/dwh/__init__.py index 4516420b6..1a6a8fc3d 100644 --- a/analytics/dagster/src/assets/dwh/__init__.py +++ b/analytics/dagster/src/assets/dwh/__init__.py @@ -1,6 +1,15 @@ from .checks import check_ff_lovac_on_duckdb from .copy import copy_dagster_duckdb_to_metabase_duckdb, export_mother_duck_local_duckdb, copy_dagster_duckdb_to_metabase_duckdb_through_s3 -from .ingest import import_postgres_data_from_replica_to_duckdb, import_cerema_ff_lovac_data_from_s3_to_duckdb, setup_replica_db, setup_s3_connection +from .ingest import ( + import_postgres_data_from_replica_to_duckdb, + import_cerema_ff_lovac_data_from_s3_to_duckdb, + setup_replica_db, + setup_s3_connection, + raw_communes, + raw_epci, + raw_departements, + raw_regions + ) from .upload import upload_duckdb_to_s3, upload_ff_to_s3, download_ff_from_s3 from .setup_duckdb import setup_duckdb @@ -16,5 +25,9 @@ "download_ff_from_s3", "setup_duckdb", "export_mother_duck_local_duckdb", - "copy_dagster_duckdb_to_metabase_duckdb_through_s3" + "copy_dagster_duckdb_to_metabase_duckdb_through_s3", + "raw_communes", + "raw_epci", + "raw_departements", + "raw_regions", ] diff --git a/analytics/dagster/src/assets/dwh/ingest/__init__.py b/analytics/dagster/src/assets/dwh/ingest/__init__.py index 2104e20ae..a8db705f1 100644 --- a/analytics/dagster/src/assets/dwh/ingest/__init__.py +++ b/analytics/dagster/src/assets/dwh/ingest/__init__.py @@ -1,9 +1,13 @@ from .ingest_lovac_ff_s3_asset import import_cerema_ff_lovac_data_from_s3_to_duckdb, setup_s3_connection from .ingest_postgres_asset import 
import_postgres_data_from_replica_to_duckdb, setup_replica_db - +from .administrative_cuts import raw_communes, raw_epci, raw_departements, raw_regions __all__ = [ "import_postgres_data_from_replica_to_duckdb", "import_cerema_ff_lovac_data_from_s3_to_duckdb", "setup_replica_db", "setup_s3_connection", + "raw_communes", + "raw_epci", + "raw_departements", + "raw_regions", ] \ No newline at end of file diff --git a/analytics/dagster/src/assets/dwh/ingest/administrative_cuts.py b/analytics/dagster/src/assets/dwh/ingest/administrative_cuts.py new file mode 100644 index 000000000..900c989b7 --- /dev/null +++ b/analytics/dagster/src/assets/dwh/ingest/administrative_cuts.py @@ -0,0 +1,40 @@ +# assets.py +import pandas as pd +from dagster_duckdb import DuckDBResource +from dagster import AssetKey, asset, AssetExecutionContext + + + +@asset(deps=[AssetKey("setup_duckdb")], + group_name="external_seeds") +def raw_communes(context: AssetExecutionContext, duckdb: DuckDBResource): + df = pd.read_json('https://geo.api.gouv.fr/communes') + with duckdb.get_connection() as conn: + conn.execute("CREATE SCHEMA IF NOT EXISTS external;") + conn.execute("CREATE OR REPLACE TABLE external.communes AS SELECT * FROM df") +@asset(deps=[AssetKey("setup_duckdb")], + group_name="external_seeds") +def raw_epci(context: AssetExecutionContext, duckdb: DuckDBResource): + df = pd.read_json('https://geo.api.gouv.fr/epcis') + with duckdb.get_connection() as conn: + conn.execute("CREATE SCHEMA IF NOT EXISTS external;") + conn.execute("CREATE OR REPLACE TABLE external.epci AS SELECT * FROM df") + + +@asset(deps=[AssetKey("setup_duckdb")], + group_name="external_seeds") +def raw_departements(context: AssetExecutionContext, duckdb: DuckDBResource): + df = pd.read_json('https://geo.api.gouv.fr/departements') + with duckdb.get_connection() as conn: + conn.execute("CREATE SCHEMA IF NOT EXISTS external;") + conn.execute("CREATE OR REPLACE TABLE external.departements AS SELECT * FROM df") + + + 
+@asset(deps=[AssetKey("setup_duckdb")], + group_name="external_seeds") +def raw_regions(context: AssetExecutionContext, duckdb: DuckDBResource): + df = pd.read_json('https://geo.api.gouv.fr/regions') + with duckdb.get_connection() as conn: + conn.execute("CREATE SCHEMA IF NOT EXISTS external;") + conn.execute("CREATE OR REPLACE TABLE external.regions AS SELECT * FROM df") \ No newline at end of file diff --git a/analytics/dagster/src/config.py b/analytics/dagster/src/config.py index 9a53d43de..5ebb22b1e 100644 --- a/analytics/dagster/src/config.py +++ b/analytics/dagster/src/config.py @@ -55,6 +55,13 @@ class Config: "marts_public_establishments_hierarchy", ] +admin_tables = [ + "marts_admin_epci", + "marts_admin_communes", + "marts_admin_departements", + "marts_admin_regions", +] + analysis_tables = ["marts_analysis_exit_flow_ff23_lovac"] common_tables = ["marts_common_cities", "marts_common_morphology"] @@ -84,6 +91,10 @@ class Config: "marts_production_join_owner_housing": "join_owner_housing", "marts_production_join_establishment_housing": "join_establishment_housing", "marts_common_cities": "cities_zonage_2024", + "marts_admin_epci": "admin_epci", + "marts_admin_communes": "admin_communes", + "marts_admin_departements": "admin_departements", + "marts_admin_regions": "admin_regions", # "marts_common_morphology": "infra_municipalities_morphology", # "marts_production_campaigns": "prod_campaigns", # "marts_production_establishments": "prod_establishments", @@ -96,7 +107,7 @@ class Config: # "marts_stats_monthly_global": "stats_activity_monthly", } -RESULT_TABLES = production_tables + join_tables + common_tables + public_tables +RESULT_TABLES = production_tables + join_tables + common_tables + public_tables + admin_tables def translate_table_name(table_name): diff --git a/analytics/dbt/macros/geo/get_hierarchy.sql b/analytics/dbt/macros/geo/get_hierarchy.sql index 80d8fc2ef..6200b3e11 100644 --- a/analytics/dbt/macros/geo/get_hierarchy.sql +++ 
b/analytics/dbt/macros/geo/get_hierarchy.sql @@ -1,4 +1,4 @@ -{% macro generate_hierarchy_relations(source_type, target_type, depth) %} +{% macro generate_hierarchy_relations(source_types, target_types, depth) %} ( SELECT DISTINCT source.id as ancestor_id, @@ -11,6 +11,6 @@ JOIN {{ ref('int_production_establishments') }} target CROSS JOIN UNNEST(target.localities_geo_code) as t_geo_code ON CAST(s_geo_code AS VARCHAR) = CAST(t_geo_code AS VARCHAR) - WHERE source.kind = '{{ source_type }}' - AND target.kind = '{{ target_type }}') + WHERE source.kind IN ({{ source_types }}) + AND target.kind IN ({{ target_types }})) {% endmacro %} \ No newline at end of file diff --git a/analytics/dbt/models/marts/admin/marts_admin_communes.sql b/analytics/dbt/models/marts/admin/marts_admin_communes.sql new file mode 100644 index 000000000..7c57f559c --- /dev/null +++ b/analytics/dbt/models/marts/admin/marts_admin_communes.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ ref('stg_admin_communes') }} \ No newline at end of file diff --git a/analytics/dbt/models/marts/admin/marts_admin_departements.sql b/analytics/dbt/models/marts/admin/marts_admin_departements.sql new file mode 100644 index 000000000..50de8e930 --- /dev/null +++ b/analytics/dbt/models/marts/admin/marts_admin_departements.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ ref('stg_admin_departements') }} \ No newline at end of file diff --git a/analytics/dbt/models/marts/admin/marts_admin_epci.sql b/analytics/dbt/models/marts/admin/marts_admin_epci.sql new file mode 100644 index 000000000..ba322a748 --- /dev/null +++ b/analytics/dbt/models/marts/admin/marts_admin_epci.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ ref('stg_admin_epci') }} \ No newline at end of file diff --git a/analytics/dbt/models/marts/admin/marts_admin_regions.sql b/analytics/dbt/models/marts/admin/marts_admin_regions.sql new file mode 100644 index 000000000..48b716592 --- /dev/null +++ b/analytics/dbt/models/marts/admin/marts_admin_regions.sql @@ -0,0 +1,3 @@ +SELECT +* 
+FROM {{ ref('stg_admin_regions') }} \ No newline at end of file diff --git a/analytics/dbt/models/marts/common/marts_common_cities.sql b/analytics/dbt/models/marts/common/marts_common_cities.sql index 067587e1c..5df3f5948 100644 --- a/analytics/dbt/models/marts/common/marts_common_cities.sql +++ b/analytics/dbt/models/marts/common/marts_common_cities.sql @@ -1,13 +1,8 @@ SELECT ccm.city_code, - MIN(label) AS label, -- Si le label est le même pour tous les arrondissements, sinon utiliser GROUP_CONCAT - MIN(zip_code) AS zip_code, -- Pour prendre un code postal représentatif, sinon utiliser GROUP_CONCAT - AVG(latitude) AS avg_latitude, - AVG(longitude) AS avg_longitude, - MIN(department_name) AS department_name, -- Même remarque que pour le label - MIN(department_number) AS department_number, - MIN(region_name) AS region_name, - MIN(region_geojson_name) AS region_geojson_name, + MIN(cc.libelle) AS label, + MIN(cc.department_code) AS department_code, + MIN(cc.region_code) AS region_code, MAX(ca1.is_in) AS tlv1, -- Prend 1 s'il y a au moins un arrondissement où la valeur est 1 MAX(ca2.is_in) AS tlv2, -- Idem MAX(action_coeur_de_ville) AS action_coeur_de_ville, diff --git a/analytics/dbt/models/marts/public/marts_public_establishments_hierarchy.sql b/analytics/dbt/models/marts/public/marts_public_establishments_hierarchy.sql index bc26c1f43..22f01369e 100644 --- a/analytics/dbt/models/marts/public/marts_public_establishments_hierarchy.sql +++ b/analytics/dbt/models/marts/public/marts_public_establishments_hierarchy.sql @@ -1,32 +1,32 @@ WITH +-- Définir les groupes de types pour chaque niveau +{% set niveau_1 = "'Commune'" %} +{% set niveau_2 = "'CA', 'CC', 'CU', 'EPCI', 'ME'" %} +{% set niveau_3 = "'SDED', 'DEP'" %} +{% set niveau_4 = "'REG', 'SDER'" %} -- Toutes les relations possibles all_relations AS ( - -- Commune -> EPCI (profondeur 1) - {{ generate_hierarchy_relations('Commune', 'CA', 1) }} + -- Niveau 1 -> Niveau 2 (profondeur 1) + {{ 
generate_hierarchy_relations(niveau_1, niveau_2, 1) }} UNION ALL - {{ generate_hierarchy_relations('Commune', 'CC', 1) }} + -- Niveau 1 -> Niveau 3 (profondeur 2) + {{ generate_hierarchy_relations(niveau_1, niveau_3, 2) }} UNION ALL - -- Commune -> Département (profondeur 2) - {{ generate_hierarchy_relations('Commune', 'DEP', 2) }} + -- Niveau 1 -> Niveau 4 (profondeur 3) + {{ generate_hierarchy_relations(niveau_1, niveau_4, 3) }} UNION ALL - -- Commune -> Région (profondeur 3) - {{ generate_hierarchy_relations('Commune', 'REG', 3) }} + -- Niveau 2 -> Niveau 3 (profondeur 1) + {{ generate_hierarchy_relations(niveau_2, niveau_3, 1) }} UNION ALL - -- EPCI -> Département (profondeur 1) - {{ generate_hierarchy_relations('CA', 'DEP', 1) }} + -- Niveau 2 -> Niveau 4 (profondeur 2) + {{ generate_hierarchy_relations(niveau_2, niveau_4, 2) }} UNION ALL - {{ generate_hierarchy_relations('CC', 'DEP', 1) }} - UNION ALL - -- EPCI -> Région (profondeur 2) - {{ generate_hierarchy_relations('CA', 'REG', 2) }} - UNION ALL - {{ generate_hierarchy_relations('CC', 'REG', 2) }} - UNION ALL - -- Département -> Région (profondeur 1) - {{ generate_hierarchy_relations('DEP', 'REG', 1) }} + -- Niveau 3 -> Niveau 4 (profondeur 1) + {{ generate_hierarchy_relations(niveau_3, niveau_4, 1) }} ) +-- Table finale de hiérarchie SELECT DISTINCT ancestor_id, descendant_id, diff --git a/analytics/dbt/models/staging/admin/schema.yml b/analytics/dbt/models/staging/admin/schema.yml new file mode 100644 index 000000000..907ce9d8a --- /dev/null +++ b/analytics/dbt/models/staging/admin/schema.yml @@ -0,0 +1,7 @@ +version: 2 + +models: + - name: stg_admin_communes + - name: stg_admin_epci + - name: stg_admin_departements + - name: stg_admin_regions \ No newline at end of file diff --git a/analytics/dbt/models/staging/admin/sources.yml b/analytics/dbt/models/staging/admin/sources.yml new file mode 100644 index 000000000..7eb686d0e --- /dev/null +++ b/analytics/dbt/models/staging/admin/sources.yml @@ -0,0 
+1,10 @@ +version: 2 + +sources: + - name: duckdb_raw + schema: external + tables: + - name: communes + - name: epci + - name: departements + - name: regions diff --git a/analytics/dbt/models/staging/admin/stg_admin_communes.sql b/analytics/dbt/models/staging/admin/stg_admin_communes.sql new file mode 100644 index 000000000..0bc54a17b --- /dev/null +++ b/analytics/dbt/models/staging/admin/stg_admin_communes.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM {{ source ('duckdb_raw', 'communes') }} diff --git a/analytics/dbt/models/staging/admin/stg_admin_departements.sql b/analytics/dbt/models/staging/admin/stg_admin_departements.sql new file mode 100644 index 000000000..12cc9479d --- /dev/null +++ b/analytics/dbt/models/staging/admin/stg_admin_departements.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ source ('duckdb_raw', 'departements') }} \ No newline at end of file diff --git a/analytics/dbt/models/staging/admin/stg_admin_epci.sql b/analytics/dbt/models/staging/admin/stg_admin_epci.sql new file mode 100644 index 000000000..b80eefb19 --- /dev/null +++ b/analytics/dbt/models/staging/admin/stg_admin_epci.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ source ('duckdb_raw', 'epci') }} \ No newline at end of file diff --git a/analytics/dbt/models/staging/admin/stg_admin_regions.sql b/analytics/dbt/models/staging/admin/stg_admin_regions.sql new file mode 100644 index 000000000..53e39189c --- /dev/null +++ b/analytics/dbt/models/staging/admin/stg_admin_regions.sql @@ -0,0 +1,3 @@ +SELECT +* +FROM {{ source ('duckdb_raw', 'regions') }} \ No newline at end of file diff --git a/analytics/dbt/models/staging/common/stg_common_cities.sql b/analytics/dbt/models/staging/common/stg_common_cities.sql index f6c1f699f..a4ef09700 100644 --- a/analytics/dbt/models/staging/common/stg_common_cities.sql +++ b/analytics/dbt/models/staging/common/stg_common_cities.sql @@ -1,7 +1,7 @@ SELECT TYPECOM as city_kind, COM as geo_code, -REG as region, +REG as region_code, DEP as department_code, CTCD as ctcd, ARR as arr,