From 920cfa8883cf851592a799827a455cb66cf2fd33 Mon Sep 17 00:00:00 2001 From: NicholasTurner23 Date: Thu, 12 Dec 2024 19:21:48 +0300 Subject: [PATCH 01/12] Create new datatable without tenant clustering --- src/workflows/airqo_etl_utils/bigquery_api.py | 2 ++ src/workflows/airqo_etl_utils/config.py | 4 ++++ src/workflows/airqo_etl_utils/meta_data_utils.py | 3 +-- src/workflows/airqo_etl_utils/schema/devices.json | 4 ++-- src/workflows/airqo_etl_utils/schema/sites.json | 12 +----------- src/workflows/dags/meta_data.py | 4 ++-- 6 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/workflows/airqo_etl_utils/bigquery_api.py b/src/workflows/airqo_etl_utils/bigquery_api.py index db491a20ac..761113a622 100644 --- a/src/workflows/airqo_etl_utils/bigquery_api.py +++ b/src/workflows/airqo_etl_utils/bigquery_api.py @@ -46,6 +46,7 @@ def __init__(self): self.raw_weather_table = configuration.BIGQUERY_RAW_WEATHER_TABLE self.consolidated_data_table = configuration.BIGQUERY_ANALYTICS_TABLE self.sites_table = configuration.BIGQUERY_SITES_TABLE + self.sites_sites_table = configuration.BIGQUERY_SITES_SITES_TABLE self.airqlouds_table = configuration.BIGQUERY_AIRQLOUDS_TABLE self.airqlouds_sites_table = configuration.BIGQUERY_AIRQLOUDS_SITES_TABLE self.grids_table = configuration.BIGQUERY_GRIDS_TABLE @@ -54,6 +55,7 @@ def __init__(self): self.cohorts_devices_table = configuration.BIGQUERY_COHORTS_DEVICES_TABLE self.sites_meta_data_table = configuration.BIGQUERY_SITES_META_DATA_TABLE self.devices_table = configuration.BIGQUERY_DEVICES_TABLE + self.devices_devices_table = configuration.BIGQUERY_DEVICES_DEVICES_TABLE self.devices_summary_table = configuration.BIGQUERY_DEVICES_SUMMARY_TABLE self.openweathermap_table = configuration.BIGQUERY_OPENWEATHERMAP_TABLE self.satellite_data_table = configuration.BIGQUERY_SATELLITE_DATA_TABLE diff --git a/src/workflows/airqo_etl_utils/config.py b/src/workflows/airqo_etl_utils/config.py index f583a5088b..1d30c0a3b0 100644 --- a/src/workflows/airqo_etl_utils/config.py +++ b/src/workflows/airqo_etl_utils/config.py @@ -57,8 +57,10 @@ class Config: # Meta data BIGQUERY_DEVICES_TABLE = os.getenv("BIGQUERY_DEVICES_TABLE") + BIGQUERY_DEVICES_DEVICES_TABLE = os.getenv("BIGQUERY_DEVICES_DEVICES_TABLE") BIGQUERY_DEVICES_DATA_TABLE = os.getenv("BIGQUERY_DEVICES_DATA_TABLE") BIGQUERY_SITES_TABLE = os.getenv("BIGQUERY_SITES_TABLE") + BIGQUERY_SITES_SITES_TABLE = os.getenv("BIGQUERY_SITES_SITES_TABLE") BIGQUERY_SITES_META_DATA_TABLE = os.getenv("BIGQUERY_SITES_META_DATA_TABLE") BIGQUERY_AIRQLOUDS_TABLE = os.getenv("BIGQUERY_AIRQLOUDS_TABLE") BIGQUERY_AIRQLOUDS_SITES_TABLE = os.getenv("BIGQUERY_AIRQLOUDS_SITES_TABLE") @@ -371,9 +373,11 @@ class Config: BIGQUERY_GRIDS_SITES_TABLE: "grids_sites.json", BIGQUERY_COHORTS_DEVICES_TABLE: "cohorts_devices.json", BIGQUERY_SITES_TABLE: "sites.json", + BIGQUERY_SITES_SITES_TABLE: "sites.json", BIGQUERY_SITES_META_DATA_TABLE: "sites_meta_data.json", SENSOR_POSITIONS_TABLE: "sensor_positions.json", BIGQUERY_DEVICES_TABLE: "devices.json", + BIGQUERY_DEVICES_DEVICES_TABLE: "devices.json", BIGQUERY_CLEAN_RAW_MOBILE_EVENTS_TABLE: "mobile_measurements.json", BIGQUERY_UNCLEAN_RAW_MOBILE_EVENTS_TABLE: "mobile_measurements.json", BIGQUERY_AIRQO_MOBILE_EVENTS_TABLE: "airqo_mobile_measurements.json", diff --git a/src/workflows/airqo_etl_utils/meta_data_utils.py b/src/workflows/airqo_etl_utils/meta_data_utils.py index b54ea6a485..046ccc1a01 100644 --- a/src/workflows/airqo_etl_utils/meta_data_utils.py +++ b/src/workflows/airqo_etl_utils/meta_data_utils.py @@ -129,7 +129,6 @@ def extract_sites_from_api(network: str = "all") -> pd.DataFrame: "approximate_latitude", "approximate_longitude", "name", - "location", "search_name", "location_name", "description", @@ -147,7 +146,7 @@ def extract_sites_from_api(network: str = "all") -> pd.DataFrame: }, inplace=True, ) - + dataframe["last_updated"] = datetime.now(timezone.utc) dataframe = DataValidationUtils.remove_outliers(dataframe) return dataframe diff --git a/src/workflows/airqo_etl_utils/schema/devices.json b/src/workflows/airqo_etl_utils/schema/devices.json index 04f7312d0a..82537f251a 100644 --- a/src/workflows/airqo_etl_utils/schema/devices.json +++ b/src/workflows/airqo_etl_utils/schema/devices.json @@ -22,7 +22,7 @@ { "name": "device_id", "type": "STRING", - "mode": "NULLABLE" + "mode": "REQUIRED" }, { "name": "device_number", @@ -52,6 +52,6 @@ { "name": "last_updated", "type": "TIMESTAMP", - "mode": "NULLABLE" + "mode": "REQUIRED" } ] diff --git a/src/workflows/airqo_etl_utils/schema/sites.json b/src/workflows/airqo_etl_utils/schema/sites.json index 7da7508e4c..d6883efc5f 100644 --- a/src/workflows/airqo_etl_utils/schema/sites.json +++ b/src/workflows/airqo_etl_utils/schema/sites.json @@ -1,9 +1,4 @@ [ - { - "name": "tenant", - "type": "STRING", - "mode": "REQUIRED" - }, { "name": "network", "type": "STRING", @@ -39,11 +34,6 @@ "type": "STRING", "mode": "NULLABLE" }, - { - "name": "location", - "type": "STRING", - "mode": "NULLABLE" - }, { "name": "display_name", "type": "STRING", @@ -76,7 +66,7 @@ }, { "name": "last_updated", - "type": "DATE", + "type": "TIMESTAMP", "mode": "NULLABLE" } ] diff --git a/src/workflows/dags/meta_data.py b/src/workflows/dags/meta_data.py index 914c038dfc..037ca5b260 100644 --- a/src/workflows/dags/meta_data.py +++ b/src/workflows/dags/meta_data.py @@ -46,7 +46,7 @@ def load_sites(data: pd.DataFrame): big_query_api = BigQueryApi() big_query_api.update_sites_and_devices( dataframe=data, - table=big_query_api.sites_table, + table=big_query_api.sites_sites_table, component="sites", ) @@ -75,7 +75,7 @@ def load_devices(data: pd.DataFrame): big_query_api = BigQueryApi() big_query_api.update_sites_and_devices( dataframe=data, - table=big_query_api.devices_table, + table=big_query_api.devices_devices_table, component="devices", ) From 424fe2508e1b8a6b593fe0911dc455af3ec41540 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:09:47 +0300 Subject: [PATCH 02/12] Update device registry staging image tag to stage-04fa3ebc-1734023297 --- k8s/device-registry/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-stage.yaml b/k8s/device-registry/values-stage.yaml index a7db8ac00a..c24cd08854 100644 --- a/k8s/device-registry/values-stage.yaml +++ b/k8s/device-registry/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-device-registry-api - tag: stage-a67d93c7-1733998369 + tag: stage-04fa3ebc-1734023297 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 49bcdf304ed64914cbe56774f22f7ee0638ff417 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:10:05 +0300 Subject: [PATCH 03/12] Update AirQo exceedance production image tag to prod-48e4e72c-1734023346 --- k8s/exceedance/values-prod-airqo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-airqo.yaml b/k8s/exceedance/values-prod-airqo.yaml index 363a966dd5..7ad3db6bb6 100644 --- a/k8s/exceedance/values-prod-airqo.yaml +++ b/k8s/exceedance/values-prod-airqo.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/airqo-exceedance-job - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' From d06519d9a83e5b9fda5b49c51908fbca65d22d33 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:10:13 +0300 Subject: [PATCH 04/12] Update KCCA exceedance production image tag to prod-48e4e72c-1734023346 --- k8s/exceedance/values-prod-kcca.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-kcca.yaml b/k8s/exceedance/values-prod-kcca.yaml index 785b4582de..8a9cbded63 100644 --- a/k8s/exceedance/values-prod-kcca.yaml +++ b/k8s/exceedance/values-prod-kcca.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/kcca-exceedance-job - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' From 350155ac4faad4acbc444c20701cf6304a3a6e13 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:10:55 +0300 Subject: [PATCH 05/12] Update device registry production image tag to prod-48e4e72c-1734023346 --- k8s/device-registry/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-prod.yaml b/k8s/device-registry/values-prod.yaml index 2e7c93671f..d63a9d8eac 100644 --- a/k8s/device-registry/values-prod.yaml +++ b/k8s/device-registry/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-device-registry-api - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 88a2128a5552a59daa0941bbc88b7a8e40f4309c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:10:59 +0300 Subject: [PATCH 06/12] Update website production image tag to prod-48e4e72c-1734023346 --- k8s/website/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/website/values-prod.yaml b/k8s/website/values-prod.yaml index b43069acfc..5fa30b847f 100644 --- a/k8s/website/values-prod.yaml +++ b/k8s/website/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-website-api - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' podAnnotations: {} From ba0140f6b5238fc4ead1b71cc51bb967afa05114 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:11:42 +0300 Subject: [PATCH 07/12] Update workflows prod image tag to prod-48e4e72c-1734023346 --- k8s/workflows/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/workflows/values-prod.yaml b/k8s/workflows/values-prod.yaml index 3e839b93ad..d975324084 100644 --- a/k8s/workflows/values-prod.yaml +++ b/k8s/workflows/values-prod.yaml @@ -10,7 +10,7 @@ images: initContainer: eu.gcr.io/airqo-250220/airqo-workflows-xcom redisContainer: eu.gcr.io/airqo-250220/airqo-redis containers: eu.gcr.io/airqo-250220/airqo-workflows - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 8fc8ec8dc1a4970d0908781c56188645f65c7557 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:12:08 +0300 Subject: [PATCH 08/12] Update analytics production image tag to prod-48e4e72c-1734023346 --- k8s/analytics/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/analytics/values-prod.yaml b/k8s/analytics/values-prod.yaml index 71693499d2..95a2d215e8 100644 --- a/k8s/analytics/values-prod.yaml +++ b/k8s/analytics/values-prod.yaml @@ -8,7 +8,7 @@ images: celeryWorker: eu.gcr.io/airqo-250220/airqo-analytics-celery-worker reportJob: eu.gcr.io/airqo-250220/airqo-analytics-report-job devicesSummaryJob: eu.gcr.io/airqo-250220/airqo-analytics-devices-summary-job - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 api: name: airqo-analytics-api label: analytics-api From b7765df0e6537bb1c7f2d0b187b3262e820f71d9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:12:42 +0300 Subject: [PATCH 09/12] Update predict production image tag to prod-48e4e72c-1734023346 --- k8s/predict/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/predict/values-prod.yaml b/k8s/predict/values-prod.yaml index 354e448b2f..81a044d332 100644 --- a/k8s/predict/values-prod.yaml +++ b/k8s/predict/values-prod.yaml @@ -7,7 +7,7 @@ images: predictJob: eu.gcr.io/airqo-250220/airqo-predict-job trainJob: eu.gcr.io/airqo-250220/airqo-train-job predictPlaces: eu.gcr.io/airqo-250220/airqo-predict-places-air-quality - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 api: name: airqo-prediction-api label: prediction-api From 3b2cde043614279a4040ac45ed77c11cb033e15f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:18:56 +0300 Subject: [PATCH 10/12] Update spatial production image tag to prod-48e4e72c-1734023346 --- k8s/spatial/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/spatial/values-prod.yaml b/k8s/spatial/values-prod.yaml index 1d2410ee20..c387dd85dc 100644 --- a/k8s/spatial/values-prod.yaml +++ b/k8s/spatial/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-spatial-api - tag: prod-3cb7d03d-1734014240 + tag: prod-48e4e72c-1734023346 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 78d3de1209f28a110eeb565da9266007ffbaccb3 Mon Sep 17 00:00:00 2001 From: NicholasTurner23 Date: Thu, 12 Dec 2024 20:34:28 +0300 Subject: [PATCH 11/12] Update cleaning for raw data to handle non airqo devices data --- src/analytics/api/models/events.py | 102 ++++++++++++++++------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/src/analytics/api/models/events.py b/src/analytics/api/models/events.py index a3883752f2..f4884aa24a 100644 --- a/src/analytics/api/models/events.py +++ b/src/analytics/api/models/events.py @@ -27,7 +27,9 @@ class EventsModel(BasePyMongoModel): BIGQUERY_COHORTS = f"`{CONFIGURATIONS.BIGQUERY_COHORTS}`" BIGQUERY_COHORTS_DEVICES = f"`{CONFIGURATIONS.BIGQUERY_COHORTS_DEVICES}`" BIGQUERY_SITES = f"`{CONFIGURATIONS.BIGQUERY_SITES}`" + BIGQUERY_SITES_SITES = f"`{CONFIGURATIONS.BIGQUERY_SITES_SITES}`" BIGQUERY_DEVICES = f"`{CONFIGURATIONS.BIGQUERY_DEVICES}`" + BIGQUERY_DEVICES_DEVICES = f"`{CONFIGURATIONS.BIGQUERY_DEVICES_DEVICES}`" DATA_EXPORT_DECIMAL_PLACES = CONFIGURATIONS.DATA_EXPORT_DECIMAL_PLACES BIGQUERY_EVENTS = CONFIGURATIONS.BIGQUERY_EVENTS @@ -51,28 +53,30 @@ def __init__(self, tenant): """ self.limit_mapper = {"pm2_5": 500.5, "pm10": 604.5, "no2": 2049} self.sites_table = self.BIGQUERY_SITES + self.sites_sites_table = self.BIGQUERY_SITES_SITES self.airqlouds_sites_table = self.BIGQUERY_AIRQLOUDS_SITES self.devices_table = self.BIGQUERY_DEVICES + self.devices_devices_table = self.BIGQUERY_DEVICES_DEVICES self.airqlouds_table = self.BIGQUERY_AIRQLOUDS super().__init__(tenant, collection_name="events") @property def device_info_query(self): - """Generates a device information query including site_id, tenant, and approximate location details.""" + """Generates a device information query including site_id, network, and approximate location details.""" return ( - f"{self.devices_table}.site_id AS site_id, " - f"{self.devices_table}.tenant AS tenant " + f"{self.devices_devices_table}.site_id AS site_id, " + f"{self.devices_devices_table}.network AS network " ) @property def device_info_query_airqloud(self): """Generates a device information query specifically for airqlouds, excluding the site_id.""" - return f"{self.devices_table}.tenant AS tenant " + return f"{self.devices_devices_table}.network AS network " @property def site_info_query(self): """Generates a site information query to retrieve site name and approximate location details.""" - return f"{self.sites_table}.name AS site_name " + return f"{self.sites_sites_table}.name AS site_name " @property def airqloud_info_query(self): @@ -92,8 +96,8 @@ def add_device_join(self, data_query, filter_clause=""): """ return ( f"SELECT {self.device_info_query}, data.* " - f"FROM {self.devices_table} " - f"RIGHT JOIN ({data_query}) data ON data.device_name = {self.devices_table}.device_id " + f"FROM {self.devices_devices_table} " + f"RIGHT JOIN ({data_query}) data ON data.device_name = {self.devices_devices_table}.device_id " f"{filter_clause}" ) @@ -110,8 +114,8 @@ def add_device_join_to_airqlouds(self, data_query, filter_clause=""): """ return ( f"SELECT {self.device_info_query_airqloud}, data.* " - f"FROM {self.devices_table} " - f"RIGHT JOIN ({data_query}) data ON data.site_id = {self.devices_table}.site_id " + f"FROM {self.devices_devices_table} " + f"RIGHT JOIN ({data_query}) data ON data.site_id = {self.devices_devices_table}.site_id " f"{filter_clause}" ) @@ -127,8 +131,8 @@ def add_site_join(self, data_query): """ return ( f"SELECT {self.site_info_query}, data.* " - f"FROM {self.sites_table} " - f"RIGHT JOIN ({data_query}) data ON data.site_id = {self.sites_table}.id " + f"FROM {self.sites_sites_table} " + f"RIGHT JOIN ({data_query}) data ON data.site_id = {self.sites_sites_table}.id " ) def add_airqloud_join(self, data_query): @@ -194,11 +198,11 @@ def get_device_query( including BAM data if applicable. """ query = ( - f"{pollutants_query}, {time_grouping}, {self.device_info_query}, {self.devices_table}.name AS device_name " + f"{pollutants_query}, {time_grouping}, {self.device_info_query}, {self.devices_devices_table}.name AS device_name " f"FROM {data_table} " - f"JOIN {self.devices_table} ON {self.devices_table}.device_id = {data_table}.device_id " + f"JOIN {self.devices_devices_table} ON {self.devices_devices_table}.device_id = {data_table}.device_id " f"WHERE {data_table}.timestamp BETWEEN '{start_date}' AND '{end_date}' " - f"AND {self.devices_table}.device_id IN UNNEST(@filter_value) " + f"AND {self.devices_devices_table}.device_id IN UNNEST(@filter_value) " ) if frequency in ["weekly", "monthly", "yearly"]: query += " GROUP BY ALL" @@ -206,11 +210,11 @@ def get_device_query( query = self.add_site_join(query) if frequency in ["hourly", "weekly", "monthly", "yearly"]: bam_query = ( - f"{bam_pollutants_query}, {time_grouping}, {self.device_info_query}, {self.devices_table}.name AS device_name " + f"{bam_pollutants_query}, {time_grouping}, {self.device_info_query}, {self.devices_devices_table}.name AS device_name " f"FROM {self.BIGQUERY_BAM_DATA} " - f"JOIN {self.devices_table} ON {self.devices_table}.device_id = {self.BIGQUERY_BAM_DATA}.device_id " + f"JOIN {self.devices_devices_table} ON {self.devices_devices_table}.device_id = {self.BIGQUERY_BAM_DATA}.device_id " f"WHERE {self.BIGQUERY_BAM_DATA}.timestamp BETWEEN '{start_date}' AND '{end_date}' " - f"AND {self.devices_table}.device_id IN UNNEST(@filter_value) " + f"AND {self.devices_devices_table}.device_id IN UNNEST(@filter_value) " ) if frequency in ["weekly", "monthly", "yearly"]: bam_query += " GROUP BY ALL" @@ -247,9 +251,9 @@ def get_site_query( query = ( f"{pollutants_query}, {time_grouping}, {self.site_info_query}, {data_table}.device_id AS device_name " f"FROM {data_table} " - f"JOIN {self.sites_table} ON {self.sites_table}.id = {data_table}.site_id " + f"JOIN {self.sites_sites_table} ON {self.sites_sites_table}.id = {data_table}.site_id " f"WHERE {data_table}.timestamp BETWEEN '{start_date}' AND '{end_date}' " - f"AND {self.sites_table}.id IN UNNEST(@filter_value) " + f"AND {self.sites_sites_table}.id IN UNNEST(@filter_value) " ) if frequency in ["weekly", "monthly", "yearly"]: query += " GROUP BY ALL" @@ -538,7 +542,7 @@ def download_from_bigquery( else: drop_columns.append("datetime") sorting_cols.append("datetime") - + dataframe.to_csv("raw_data50.csv") if data_type == "raw": cls.simple_data_cleaning(dataframe) @@ -551,39 +555,45 @@ def download_from_bigquery( @classmethod def simple_data_cleaning(cls, data: pd.DataFrame) -> pd.DataFrame: """ - Perform data cleaning on a pandas DataFrame to handle specific conditions - related to "pm2_5" and "pm2_5_raw_value" columns. - - The cleaning process includes: - 1. Ensuring correct numeric data types for "pm2_5" and "pm2_5_raw_value". - 2. Removing "pm2_5" values where "pm2_5_raw_value" has data. - 3. Dropping the "pm2_5_raw_value" column if it has no data at all. - 4. Retaining "pm2_5" values where "pm2_5_raw_value" has no data, and removing - "pm2_5" values where "pm2_5_raw_value" has data. - 5. Dropping any column (including "pm2_5" and "pm2_5_raw_value") if it is - entirely empty. - - Args: - cls: Class reference (used in classmethods). - data (pd.DataFrame): Input pandas DataFrame with "pm2_5" and - "pm2_5_raw_value" columns. - - Returns: - pd.DataFrame: Cleaned DataFrame with updates applied in place. + Perform data cleaning on a pandas DataFrame to handle specific conditions + related to "pm2_5" and "pm2_5_raw_value" columns. + + The cleaning process includes: + 1. Ensuring correct numeric data types for "pm2_5" and "pm2_5_raw_value". + 2. Removing "pm2_5" values where "pm2_5_raw_value" values are not 0s. + 3. Dropping the "pm2_5_raw_value" column if all its values are 0s. + 4. Retaining "pm2_5" values where "pm2_5_raw_value" values are all 0s. + 5. Dropping any column (including "pm2_5" and "pm2_5_raw_value") if all values are 0s. + Args: + cls: Class reference (used in classmethods). + data (pd.DataFrame): Input pandas DataFrame with "pm2_5" and + "pm2_5_raw_value" columns. + + Returns: + pd.DataFrame: Cleaned DataFrame with updates applied in place. + + Raises: + ValueError: If "pm2_5" or "pm2_5_raw_value" columns are missing. """ - data["pm2_5_raw_value"] = pd.to_numeric( - data["pm2_5_raw_value"], errors="coerce" + required_columns = ["pm2_5", "pm2_5_raw_value"] + + missing_columns = [col for col in required_columns if col not in data.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {missing_columns}") + + numeric_columns = ["pm2_5", "pm2_5_raw_value"] + data[numeric_columns] = data[numeric_columns].apply( + pd.to_numeric, errors="coerce" ) - data["pm2_5"] = pd.to_numeric(data["pm2_5"], errors="coerce") - data.loc[~data["pm2_5_raw_value"].isna(), "pm2_5"] = np.nan + data.loc[data["pm2_5_raw_value"] != 0, "pm2_5"] = np.nan - if data["pm2_5_raw_value"].isna().all(): + if (data["pm2_5_raw_value"] == 0).all(): data.drop(columns=["pm2_5_raw_value"], inplace=True) - data["pm2_5"] = data["pm2_5"].where(data["pm2_5_raw_value"].isna(), np.nan) - + zero_columns = data.loc[:, (data == 0).all()].columns + data.drop(columns=zero_columns, inplace=True) data.dropna(how="all", axis=1, inplace=True) return data From f9559ce8ad4a2c385f7ccaed8df3324dadd75a91 Mon Sep 17 00:00:00 2001 From: NicholasTurner23 Date: Thu, 12 Dec 2024 20:34:57 +0300 Subject: [PATCH 12/12] Update config with new tables --- src/analytics/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/analytics/config.py b/src/analytics/config.py index 57ca91b67f..7095a05bf4 100644 --- a/src/analytics/config.py +++ b/src/analytics/config.py @@ -32,7 +32,9 @@ class Config: CACHE_REDIS_URL = f"redis://{env_var('REDIS_SERVER')}:{env_var('REDIS_PORT')}" BIGQUERY_DEVICES = env_var("BIGQUERY_DEVICES") + BIGQUERY_DEVICES_DEVICES = env_var("BIGQUERY_DEVICES_DEVICES") BIGQUERY_SITES = env_var("BIGQUERY_SITES") + BIGQUERY_SITES_SITES = env_var("BIGQUERY_SITES_SITES") BIGQUERY_AIRQLOUDS_SITES = env_var("BIGQUERY_AIRQLOUDS_SITES") BIGQUERY_AIRQLOUDS = env_var("BIGQUERY_AIRQLOUDS") DATA_EXPORT_DECIMAL_PLACES = os.getenv("DATA_EXPORT_DECIMAL_PLACES", 2)