19 | 19 | RAW_XLSX_BUCKET = os.environ["CALITP_BUCKET__NTD_XLSX_DATA_PRODUCTS__RAW"]
20 | 20 | CLEAN_XLSX_BUCKET = os.environ["CALITP_BUCKET__NTD_XLSX_DATA_PRODUCTS__CLEAN"]
21 | 21 |
| 22 | +# Map product and year combinations to their xcom keys for dynamic url scraping
| 23 | +xcom_keys = {
| 24 | +    (
| 25 | +        "complete_monthly_ridership_with_adjustments_and_estimates",
| 26 | +        "historical",
| 27 | +    ): "ridership_url",
| 28 | +    ("annual_database_agency_information", "2022"): "2022_agency_url",
| 29 | +    ("annual_database_agency_information", "2023"): "2023_agency_url",
| 30 | +    (
| 31 | +        "annual_database_contractual_relationship",
| 32 | +        "2023",
| 33 | +    ): "contractual_relationship_url",
| 34 | +}
| 35 | +
22 | 36 |
23 | 37 | # pulls the URL from XCom
24 | 38 | def pull_url_from_xcom(key, context):
25 | 39 |     task_instance = context["ti"]
26 | | -    pulled_value = task_instance.xcom_pull(
27 | | -        task_ids="scrape_ntd_xlsx_urls",
28 | | -        key=key
29 | | -        # task_ids="scrape_ntd_xlsx_urls", key="current_url"
30 | | -    )
| 40 | +    pulled_value = task_instance.xcom_pull(task_ids="scrape_ntd_xlsx_urls", key=key)
31 | 41 |     print(f"Pulled value from XCom: {pulled_value}")
32 | 42 |     return pulled_value
33 | 43 |
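Note for reviewers: `pull_url_from_xcom` only works if the upstream `scrape_ntd_xlsx_urls` task has already pushed one XCom entry per value in `xcom_keys`. That task is not part of this diff; the sketch below is a hypothetical illustration of that contract, with placeholder URLs rather than real NTD links.

```python
# Hypothetical sketch of the upstream task's XCom contract -- not the real
# scrape_ntd_xlsx_urls implementation, and the URLs are placeholders.
def scrape_ntd_xlsx_urls(**context):
    scraped_urls = {
        "ridership_url": "https://example.com/ridership.xlsx",
        "2022_agency_url": "https://example.com/2022_agency.xlsx",
        "2023_agency_url": "https://example.com/2023_agency.xlsx",
        "contractual_relationship_url": "https://example.com/contractual_relationship.xlsx",
    }
    for key, url in scraped_urls.items():
        # Each push lands under task_id "scrape_ntd_xlsx_urls", which is what
        # pull_url_from_xcom(key, context) reads back downstream.
        context["ti"].xcom_push(key=key, value=url)
```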
@@ -120,16 +130,10 @@ def __init__(
120 | 130 |     def execute(self, context, *args, **kwargs):
121 | 131 |         download_url = self.raw_excel_extract.file_url
122 | 132 |
123 | | -        if self.product == "complete_monthly_ridership_with_adjustments_and_estimates":
124 | | -            download_url = pull_url_from_xcom(key="ridership_url", context=context)
125 | | -
126 | | -        if self.product == "annual_database_agency_information":
127 | | -            download_url = pull_url_from_xcom(key="agency_url", context=context)
| 133 | +        key = (self.product, self.year)
128 | 134 |
129 | | -        if self.product == "annual_database_contractual_relationship":
130 | | -            download_url = pull_url_from_xcom(
131 | | -                key="contractual_relationship_url", context=context
132 | | -            )
| 135 | +        if key in xcom_keys:
| 136 | +            download_url = pull_url_from_xcom(key=xcom_keys[key], context=context)
133 | 137 |
134 | 138 |         # see what is returned
135 | 139 |         logging.info(f"reading {self.product} url as {download_url}")
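The new dispatch is a tuple lookup with a fallback to the extract's static `file_url`. Below is a minimal standalone sketch of that behavior (outside Airflow, with a `pulled_urls` dict standing in for the XCom pulls); note the lookup only matches when `self.year` is the same string used in `xcom_keys`, e.g. `"2022"` or `"historical"`.

```python
# Standalone sketch of the lookup-with-fallback logic, not the operator itself.
xcom_keys = {
    ("annual_database_agency_information", "2022"): "2022_agency_url",
}


def resolve_download_url(product, year, static_url, pulled_urls):
    key = (product, year)
    if key in xcom_keys:
        # Scraped products: use the URL the upstream task pushed to XCom.
        return pulled_urls[xcom_keys[key]]
    # Everything else keeps the static file_url configured on the raw extract.
    return static_url


assert (
    resolve_download_url(
        "annual_database_agency_information",
        "2022",
        "static.xlsx",
        {"2022_agency_url": "scraped.xlsx"},
    )
    == "scraped.xlsx"
)
assert resolve_download_url("some_other_product", "2023", "static.xlsx", {}) == "static.xlsx"
```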
@@ -164,140 +168,3 @@ def execute(self, context, *args, **kwargs):
164 | 168 |             self.clean_excel_extract.save_content(
165 | 169 |                 fs=get_fs(), content=self.clean_gzipped_content
166 | 170 |             )
167 | | -
168 | | -
169 | | -# # pulls the URL from XCom
170 | | -# def pull_url_from_xcom(context):
171 | | -#     task_instance = context["ti"]
172 | | -#     pulled_value = task_instance.xcom_pull(
173 | | -#         task_ids="scrape_ntd_xlsx_urls", key="current_url"
174 | | -#     )
175 | | -#     print(f"Pulled value from XCom: {pulled_value}")
176 | | -#     return pulled_value
177 | | -
178 | | -
179 | | -# class NtdDataProductXLSXExtract(PartitionedGCSArtifact):
180 | | -#     bucket: ClassVar[str]
181 | | -#     year: str
182 | | -#     product: str
183 | | -#     execution_ts: pendulum.DateTime = pendulum.now()
184 | | -#     dt: pendulum.Date = execution_ts.date()
185 | | -#     file_url: HttpUrl = None
186 | | -#     partition_names: ClassVar[List[str]] = ["dt", "execution_ts"]
187 | | -
188 | | -#     @property
189 | | -#     def table(self) -> str:
190 | | -#         return self.product
191 | | -
192 | | -#     @property
193 | | -#     def filename(self) -> str:
194 | | -#         return self.table
195 | | -
196 | | -#     class Config:
197 | | -#         arbitrary_types_allowed = True
198 | | -
199 | | -#     def fetch_from_ntd_xlsx(self, file_url):
200 | | -#         # As of now, the only file that we are downloading is for complete_monthly_ridership_with_adjustments_and_estimates
201 | | -#         # and the download link changes every time they update the date, so we have special handling for that here, which is dependent
202 | | -#         # another dag task called scrape_ntd_xlsx_urls.py. if we look to download other xlsx files from the DOT portal and they
203 | | -#         # also change the file name every time they publish, they we will have to add the same handling for all of these files and make it programmatic
204 | | -
205 | | -#         validated_url = parse_obj_as(HttpUrl, file_url)
206 | | -
207 | | -#         logging.info(f"reading file from url {validated_url}")
208 | | -
209 | | -#         try:
210 | | -#             excel_content = requests.get(validated_url).content
211 | | -
212 | | -#             if excel_content is None or len(excel_content) == 0:
213 | | -#                 logging.info(
214 | | -#                     f"There is no data to download for {self.year} / {self.product}. Ending pipeline."
215 | | -#                 )
216 | | -
217 | | -#                 pass
218 | | -
219 | | -#             else:
220 | | -#                 logging.info(
221 | | -#                     f"Downloaded {self.product} data for {self.year} with {len(excel_content)} rows!"
222 | | -#                 )
223 | | -
224 | | -#                 return excel_content
225 | | -
226 | | -#         except requests.exceptions.RequestException as e:
227 | | -#             logging.info(f"An error occurred: {e}")
228 | | -
229 | | -#             raise
230 | | -
231 | | -
232 | | -# class RawExtract(NtdDataProductXLSXExtract):
233 | | -#     bucket = RAW_XLSX_BUCKET
234 | | -
235 | | -
236 | | -# class CleanExtract(NtdDataProductXLSXExtract):
237 | | -#     bucket = CLEAN_XLSX_BUCKET
238 | | -
239 | | -
240 | | -# class NtdDataProductXLSXOperator(BaseOperator):
241 | | -#     template_fields = ("year", "product", "xlsx_file_url")
242 | | -
243 | | -#     def __init__(
244 | | -#         self,
245 | | -#         product: str,
246 | | -#         xlsx_file_url,
247 | | -#         year: int,
248 | | -#         *args,
249 | | -#         **kwargs,
250 | | -#     ):
251 | | -#         self.year = year
252 | | -#         self.product = product
253 | | -#         self.xlsx_file_url = xlsx_file_url
254 | | -
255 | | -#         # Save initial excel files to the raw bucket
256 | | -#         self.raw_excel_extract = RawExtract(
257 | | -#             year=self.year,
258 | | -#             product=self.product + "_raw" + "/" + self.year,
259 | | -#             file_url=self.xlsx_file_url,
260 | | -#             filename=f"{self.year}__{self.product}_raw.xlsx",
261 | | -#         )
262 | | -
263 | | -#         super().__init__(*args, **kwargs)
264 | | -
265 | | -#     def execute(self, context, *args, **kwargs):
266 | | -#         download_url = self.raw_excel_extract.file_url
267 | | -
268 | | -#         if self.product == "complete_monthly_ridership_with_adjustments_and_estimates":
269 | | -#             download_url = pull_url_from_xcom(context=context)
270 | | -
271 | | -#         # see what is returned
272 | | -#         logging.info(f"reading ridership url as {download_url}")
273 | | -
274 | | -#         excel_content = self.raw_excel_extract.fetch_from_ntd_xlsx(download_url)
275 | | -
276 | | -#         self.raw_excel_extract.save_content(fs=get_fs(), content=excel_content)
277 | | -
278 | | -#         excel_data = BytesIO(excel_content)
279 | | -#         df_dict = pd.read_excel(excel_data, sheet_name=None, engine="openpyxl")
280 | | -
281 | | -#         for key, df in df_dict.items():
282 | | -#             df = df.rename(make_name_bq_safe, axis="columns")
283 | | -
284 | | -#             logging.info(f"read {df.shape[0]} rows and {df.shape[1]} columns")
285 | | -
286 | | -#             self.clean_gzipped_content = gzip.compress(
287 | | -#                 df.to_json(orient="records", lines=True).encode()
288 | | -#             )
289 | | -
290 | | -#             tab_name = ""
291 | | -
292 | | -#             tab_name = make_name_bq_safe(key)
293 | | -
294 | | -#             # Save clean gzipped jsonl files to the clean bucket
295 | | -#             self.clean_excel_extract = CleanExtract(
296 | | -#                 year=self.year,
297 | | -#                 product=self.product + "/" + self.year + "/" + tab_name,
298 | | -#                 filename=f"{self.year}__{self.product}__{tab_name}.jsonl.gz",
299 | | -#             )
300 | | -
301 | | -#             self.clean_excel_extract.save_content(
302 | | -#                 fs=get_fs(), content=self.clean_gzipped_content
303 | | -#             )
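For reference, the retained `save_content` context above is the tail of the per-tab loop that the deleted commented-out block spelled out: read every worksheet, make the column names BigQuery-safe, and gzip each sheet as JSONL. A rough standalone sketch of that transform, with `str.lower` standing in for the repo's `make_name_bq_safe` helper:

```python
import gzip
from io import BytesIO
from typing import Callable, Dict

import pandas as pd


def workbook_to_gzipped_jsonl(
    excel_content: bytes,
    rename_fn: Callable[[str], str] = str.lower,  # stand-in for make_name_bq_safe
) -> Dict[str, bytes]:
    """Return one gzipped JSONL payload per worksheet, mirroring the operator's loop."""
    df_dict = pd.read_excel(BytesIO(excel_content), sheet_name=None, engine="openpyxl")
    return {
        sheet: gzip.compress(
            df.rename(rename_fn, axis="columns")
            .to_json(orient="records", lines=True)
            .encode()
        )
        for sheet, df in df_dict.items()
    }
```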