Commit 43afcce

NTD: ingest 2022 agency and create external table, expand dynamic url scrape, clean up previous code (#3604)
* expand dynamic NTD URL scrape to cover agency data for multiple years
* new DAG for 2022 agency scrape
* add new external table for 2022 agency info, fix bugs in previous external table creation
* clean up XCom fetch
1 parent f81d512 commit 43afcce

6 files changed: +45 -187 lines changed
airflow/dags/create_external_tables/ntd_data_products/2022__annual_database_agency_information.yml

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+operator: operators.ExternalTable
+bucket: gs://calitp-ntd-xlsx-products-clean
+prefix_bucket: false
+post_hook: |
+  SELECT *
+  FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2022__annual_database_agency_information
+  LIMIT 1;
+source_objects:
+  - "annual_database_agency_information/2022/_2022_agency_information/*.jsonl.gz"
+destination_project_dataset_table: "external_ntd__annual_reporting.2022__annual_database_agency_information"
+source_format: NEWLINE_DELIMITED_JSON
+use_bq_client: true
+hive_options:
+  mode: AUTO
+  require_partition_filter: false
+  source_uri_prefix: "annual_database_agency_information/2022/_2022_agency_information/"
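
For context, a minimal sketch of what this config amounts to when use_bq_client: true, assuming the operators.ExternalTable task roughly wraps the google-cloud-bigquery client like this; the bucket prefix on source_uri_prefix and the "my-project" project id are assumptions, and the real operator may differ:

from google.cloud import bigquery

client = bigquery.Client()

# External data configuration over the cleaned JSONL files in GCS
external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
external_config.source_uris = [
    "gs://calitp-ntd-xlsx-products-clean/annual_database_agency_information/2022/_2022_agency_information/*.jsonl.gz"
]
external_config.autodetect = True  # assumption: schema inferred from the JSONL

# Hive-style partition discovery, mirroring hive_options above
hive = bigquery.HivePartitioningOptions()
hive.mode = "AUTO"
hive.require_partition_filter = False
hive.source_uri_prefix = (
    "gs://calitp-ntd-xlsx-products-clean/"
    "annual_database_agency_information/2022/_2022_agency_information/"
)
external_config.hive_partitioning = hive

# "my-project" stands in for the value of get_project_id()
table = bigquery.Table(
    "my-project.external_ntd__annual_reporting.2022__annual_database_agency_information"
)
table.external_data_configuration = external_config
client.create_table(table, exists_ok=True)

# Post-hook-style sanity check, like the SELECT * ... LIMIT 1 above
client.query(
    "SELECT * FROM `my-project.external_ntd__annual_reporting."
    "2022__annual_database_agency_information` LIMIT 1"
).result()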

airflow/dags/create_external_tables/ntd_data_products/2023__annual_database_agency_information.yml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 operator: operators.ExternalTable
-bucket: gs://test-calitp-ntd-xlsx-products-clean
+bucket: gs://calitp-ntd-xlsx-products-clean
 prefix_bucket: false
 post_hook: |
   SELECT *

airflow/dags/create_external_tables/ntd_data_products/2023__annual_database_contractual_relationships.yml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 operator: operators.ExternalTable
-bucket: gs://test-calitp-ntd-xlsx-products-clean
+bucket: gs://calitp-ntd-xlsx-products-clean
 prefix_bucket: false
 post_hook: |
   SELECT *
airflow/dags/sync_ntd_data_xlsx/ (new task config for the 2022 agency information scrape)

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+operator: operators.NtdDataProductXLSXOperator
+
+product: 'annual_database_agency_information'
+xlsx_file_url: 'https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information' # placeholder for scraped url from scrape_ntd_xlsx_urls task
+year: '2022' # one of: 'historical' (long history), 'multi-year' (select history), or a specific year (ex: 2022)
+dependencies:
+  - scrape_ntd_xlsx_urls
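
For illustration, a hand-written rough equivalent of the task this YAML defines, assuming the DAG factory forwards these fields to NtdDataProductXLSXOperator (whose constructor takes product, xlsx_file_url, and year, per airflow/plugins/operators/scrape_ntd_xlsx.py); the import path and task_id below are illustrative, not taken from the repo:

from operators.scrape_ntd_xlsx import NtdDataProductXLSXOperator  # illustrative import path

agency_2022_xlsx = NtdDataProductXLSXOperator(
    task_id="2022_annual_database_agency_information",  # illustrative task id
    product="annual_database_agency_information",
    # placeholder URL; swapped at runtime for the scraped link pushed to XCom
    # under the "2022_agency_url" key by the scrape_ntd_xlsx_urls task
    xlsx_file_url=(
        "https://www.transit.dot.gov/ntd/data-product/"
        "2022-annual-database-agency-information"
    ),
    year="2022",
)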

airflow/dags/sync_ntd_data_xlsx/scrape_ntd_xlsx_urls.py

Lines changed: 2 additions & 34 deletions
@@ -10,7 +10,8 @@
 
 xlsx_urls = {
     "ridership_url": "https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release",
-    "agency_url": "https://www.transit.dot.gov/ntd/data-product/2023-annual-database-agency-information",
+    "2022_agency_url": "https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information",
+    "2023_agency_url": "https://www.transit.dot.gov/ntd/data-product/2023-annual-database-agency-information",
     "contractual_relationship_url": "https://www.transit.dot.gov/ntd/data-product/2023-annual-database-contractual-relationship",
 }
 
@@ -46,36 +47,3 @@ def scrape_ntd_xlsx_urls(**context):
         logging.info(f"Validated URL: {validated_url}.")
 
         push_url_to_xcom(key=key, scraped_url=validated_url, context=context)
-
-
-# # pushes the scraped URL value to XCom
-# def push_url_to_xcom(scraped_url, context):
-#     task_instance = context["ti"]
-#     task_instance.xcom_push(key="current_url", value=scraped_url)
-
-
-# # Look for an anchor tag where the href ends with '.xlsx' and starts with '/sites/fta.dot.gov/files/'
-# def href_matcher(href):
-#     return (
-#         href and href.startswith("/sites/fta.dot.gov/files/") and href.endswith(".xlsx")
-#     )
-
-
-# def scrape_ntd_xlsx_urls(**context):
-#     # page to find download URL
-#     url = "https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release"
-#     req = requests.get(url)
-#     soup = BeautifulSoup(req.text, "html.parser")
-
-#     link = soup.find("a", href=href_matcher)
-
-#     # Extract the href if the link is found
-#     file_link = link["href"] if link else None
-
-#     updated_url = f"https://www.transit.dot.gov{file_link}"
-
-#     validated_url = parse_obj_as(HttpUrl, updated_url)
-
-#     logging.info(f"Validated URL: {validated_url}.")
-
-#     push_url_to_xcom(scraped_url=validated_url, context=context)
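
For reference, a minimal sketch of the per-key scrape-and-push flow this change implies: each entry in xlsx_urls is scraped for its current .xlsx link and the validated URL is pushed to XCom under that entry's key. href_matcher and the parsing mirror the commented-out code removed above; the real task body may differ in details:

import logging

import requests
from bs4 import BeautifulSoup
from pydantic import HttpUrl, parse_obj_as

# Trimmed copy of the module-level dict from the diff above
xlsx_urls = {
    "2022_agency_url": "https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information",
    "2023_agency_url": "https://www.transit.dot.gov/ntd/data-product/2023-annual-database-agency-information",
}


def href_matcher(href):
    # anchor tags whose href starts with /sites/fta.dot.gov/files/ and ends with .xlsx
    return (
        href and href.startswith("/sites/fta.dot.gov/files/") and href.endswith(".xlsx")
    )


def scrape_ntd_xlsx_urls(**context):
    for key, url in xlsx_urls.items():
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        link = soup.find("a", href=href_matcher)
        file_link = link["href"] if link else None

        validated_url = parse_obj_as(HttpUrl, f"https://www.transit.dot.gov{file_link}")
        logging.info(f"Validated URL: {validated_url}.")

        # push each scraped URL under its own XCom key, e.g. "2022_agency_url"
        context["ti"].xcom_push(key=key, value=str(validated_url))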

airflow/plugins/operators/scrape_ntd_xlsx.py

Lines changed: 18 additions & 151 deletions
@@ -19,15 +19,25 @@
 RAW_XLSX_BUCKET = os.environ["CALITP_BUCKET__NTD_XLSX_DATA_PRODUCTS__RAW"]
 CLEAN_XLSX_BUCKET = os.environ["CALITP_BUCKET__NTD_XLSX_DATA_PRODUCTS__CLEAN"]
 
+# Map product and year combinations to their xcom keys for dynamic url scraping
+xcom_keys = {
+    (
+        "complete_monthly_ridership_with_adjustments_and_estimates",
+        "historical",
+    ): "ridership_url",
+    ("annual_database_agency_information", "2022"): "2022_agency_url",
+    ("annual_database_agency_information", "2023"): "2023_agency_url",
+    (
+        "annual_database_contractual_relationship",
+        "2023",
+    ): "contractual_relationship_url",
+}
+
 
 # pulls the URL from XCom
 def pull_url_from_xcom(key, context):
     task_instance = context["ti"]
-    pulled_value = task_instance.xcom_pull(
-        task_ids="scrape_ntd_xlsx_urls",
-        key=key
-        # task_ids="scrape_ntd_xlsx_urls", key="current_url"
-    )
+    pulled_value = task_instance.xcom_pull(task_ids="scrape_ntd_xlsx_urls", key=key)
     print(f"Pulled value from XCom: {pulled_value}")
     return pulled_value
 
@@ -120,16 +130,10 @@ def __init__(
     def execute(self, context, *args, **kwargs):
         download_url = self.raw_excel_extract.file_url
 
-        if self.product == "complete_monthly_ridership_with_adjustments_and_estimates":
-            download_url = pull_url_from_xcom(key="ridership_url", context=context)
-
-        if self.product == "annual_database_agency_information":
-            download_url = pull_url_from_xcom(key="agency_url", context=context)
+        key = (self.product, self.year)
 
-        if self.product == "annual_database_contractual_relationship":
-            download_url = pull_url_from_xcom(
-                key="contractual_relationship_url", context=context
-            )
+        if key in xcom_keys:
+            download_url = pull_url_from_xcom(key=xcom_keys[key], context=context)
 
         # see what is returned
         logging.info(f"reading {self.product} url as {download_url}")
@@ -164,140 +168,3 @@ def execute(self, context, *args, **kwargs):
             self.clean_excel_extract.save_content(
                 fs=get_fs(), content=self.clean_gzipped_content
             )
-
-
-# # pulls the URL from XCom
-# def pull_url_from_xcom(context):
-#     task_instance = context["ti"]
-#     pulled_value = task_instance.xcom_pull(
-#         task_ids="scrape_ntd_xlsx_urls", key="current_url"
-#     )
-#     print(f"Pulled value from XCom: {pulled_value}")
-#     return pulled_value
-
-
-# class NtdDataProductXLSXExtract(PartitionedGCSArtifact):
-#     bucket: ClassVar[str]
-#     year: str
-#     product: str
-#     execution_ts: pendulum.DateTime = pendulum.now()
-#     dt: pendulum.Date = execution_ts.date()
-#     file_url: HttpUrl = None
-#     partition_names: ClassVar[List[str]] = ["dt", "execution_ts"]
-
-#     @property
-#     def table(self) -> str:
-#         return self.product
-
-#     @property
-#     def filename(self) -> str:
-#         return self.table
-
-#     class Config:
-#         arbitrary_types_allowed = True
-
-#     def fetch_from_ntd_xlsx(self, file_url):
-#         # As of now, the only file that we are downloading is for complete_monthly_ridership_with_adjustments_and_estimates
-#         # and the download link changes every time they update the date, so we have special handling for that here, which is dependent
-#         # another dag task called scrape_ntd_xlsx_urls.py. if we look to download other xlsx files from the DOT portal and they
-#         # also change the file name every time they publish, they we will have to add the same handling for all of these files and make it programmatic
-
-#         validated_url = parse_obj_as(HttpUrl, file_url)
-
-#         logging.info(f"reading file from url {validated_url}")
-
-#         try:
-#             excel_content = requests.get(validated_url).content
-
-#             if excel_content is None or len(excel_content) == 0:
-#                 logging.info(
-#                     f"There is no data to download for {self.year} / {self.product}. Ending pipeline."
-#                 )
-
-#                 pass
-
-#             else:
-#                 logging.info(
-#                     f"Downloaded {self.product} data for {self.year} with {len(excel_content)} rows!"
-#                 )
-
-#                 return excel_content
-
-#         except requests.exceptions.RequestException as e:
-#             logging.info(f"An error occurred: {e}")
-
-#             raise
-
-
-# class RawExtract(NtdDataProductXLSXExtract):
-#     bucket = RAW_XLSX_BUCKET
-
-
-# class CleanExtract(NtdDataProductXLSXExtract):
-#     bucket = CLEAN_XLSX_BUCKET
-
-
-# class NtdDataProductXLSXOperator(BaseOperator):
-#     template_fields = ("year", "product", "xlsx_file_url")
-
-#     def __init__(
-#         self,
-#         product: str,
-#         xlsx_file_url,
-#         year: int,
-#         *args,
-#         **kwargs,
-#     ):
-#         self.year = year
-#         self.product = product
-#         self.xlsx_file_url = xlsx_file_url
-
-#         # Save initial excel files to the raw bucket
-#         self.raw_excel_extract = RawExtract(
-#             year=self.year,
-#             product=self.product + "_raw" + "/" + self.year,
-#             file_url=self.xlsx_file_url,
-#             filename=f"{self.year}__{self.product}_raw.xlsx",
-#         )
-
-#         super().__init__(*args, **kwargs)
-
-#     def execute(self, context, *args, **kwargs):
-#         download_url = self.raw_excel_extract.file_url
-
-#         if self.product == "complete_monthly_ridership_with_adjustments_and_estimates":
-#             download_url = pull_url_from_xcom(context=context)
-
-#         # see what is returned
-#         logging.info(f"reading ridership url as {download_url}")
-
-#         excel_content = self.raw_excel_extract.fetch_from_ntd_xlsx(download_url)
-
-#         self.raw_excel_extract.save_content(fs=get_fs(), content=excel_content)
-
-#         excel_data = BytesIO(excel_content)
-#         df_dict = pd.read_excel(excel_data, sheet_name=None, engine="openpyxl")
-
-#         for key, df in df_dict.items():
-#             df = df.rename(make_name_bq_safe, axis="columns")
-
-#             logging.info(f"read {df.shape[0]} rows and {df.shape[1]} columns")
-
-#             self.clean_gzipped_content = gzip.compress(
-#                 df.to_json(orient="records", lines=True).encode()
-#             )
-
-#             tab_name = ""
-
-#             tab_name = make_name_bq_safe(key)
-
-#             # Save clean gzipped jsonl files to the clean bucket
-#             self.clean_excel_extract = CleanExtract(
-#                 year=self.year,
-#                 product=self.product + "/" + self.year + "/" + tab_name,
-#                 filename=f"{self.year}__{self.product}__{tab_name}.jsonl.gz",
-#             )
-
-#             self.clean_excel_extract.save_content(
-#                 fs=get_fs(), content=self.clean_gzipped_content
-#             )
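
To make the new lookup concrete, a small standalone illustration of how execute() now resolves the download URL from the (product, year) pair; resolve_download_url is a hypothetical helper written here to mirror the diff above, not a function in the repo:

# Same mapping as the one added to scrape_ntd_xlsx.py above
xcom_keys = {
    ("complete_monthly_ridership_with_adjustments_and_estimates", "historical"): "ridership_url",
    ("annual_database_agency_information", "2022"): "2022_agency_url",
    ("annual_database_agency_information", "2023"): "2023_agency_url",
    ("annual_database_contractual_relationship", "2023"): "contractual_relationship_url",
}


def resolve_download_url(product, year, default_url, pull_url_from_xcom, context):
    # Hypothetical helper mirroring execute(): use the scraped URL when the
    # product/year pair is in xcom_keys, otherwise keep the configured file_url.
    key = (product, year)
    if key in xcom_keys:
        return pull_url_from_xcom(key=xcom_keys[key], context=context)
    return default_url


# e.g. the 2022 agency product no longer shares a single "agency_url" key with 2023:
assert xcom_keys[("annual_database_agency_information", "2022")] == "2022_agency_url"
assert xcom_keys[("annual_database_agency_information", "2023")] == "2023_agency_url"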

0 commit comments
