@@ -77,13 +77,13 @@ def get_undefined_feeds(feeds_on_target: pd.DataFrame) -> pd.DataFrame:
7777
7878def report_unavailable_feeds (feeds , fname ):
7979 undefined = feeds .loc [
80- feeds ["valid_date_other_than_service_date" ] | ~ feeds ["usable_schedule_feed_exists " ]
80+ feeds ["valid_date_other_than_service_date" ] | feeds ["no_schedule_feed_found " ]
8181 ].copy ()
8282 if undefined .empty :
8383 print ('no undefined service feeds' )
8484 else :
8585 print ('these feeds have no service defined on target date, nor are their services captured in other feeds:' )
86- print (undefined .loc [~ undefined ["usable_schedule_feed_exists " ], "gtfs_dataset_name" ].drop_duplicates ())
86+ print (undefined .loc [undefined ["no_schedule_feed_found " ], "gtfs_dataset_name" ].drop_duplicates ())
8787 print ('these feeds have defined service, but only in a feed defined on a prior day' )
8888 print (undefined .loc [undefined ["valid_date_other_than_service_date" ], "gtfs_dataset_name" ].drop_duplicates ())
8989 print (f'saving detailed csv to { fname } ' )
@@ -102,63 +102,54 @@ def get_old_feeds(undefined_feeds_base64_urls: pd.Series, target_date: dt.date |
102102 SELECT
103103 `mart_gtfs.dim_schedule_feeds`.base64_url AS base64_url,
104104 `mart_gtfs.dim_schedule_feeds`.key as feed_key,
105- `mart_gtfs.dim_calendar`.{ day_of_the_week } AS target_day_of_the_week,
106- MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date,
105+ MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date
107106 from `mart_gtfs.dim_schedule_feeds`
108107 LEFT JOIN `mart_gtfs.dim_calendar`
109108 ON `mart_gtfs.dim_schedule_feeds`.key = `mart_gtfs.dim_calendar`.feed_key
110109 WHERE `mart_gtfs.dim_schedule_feeds`.base64_url IN { base_64_urls_str }
111- AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{ target_date } '
112110 AND `mart_gtfs.dim_schedule_feeds`._valid_to >= '{ max_lookback_date } '
113- AND `mart_gtfs.dim_calendar`.start_date <= '{ target_date } '
111+ AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{ target_date } '
112+ AND `mart_gtfs.dim_calendar`.{ day_of_the_week } = 1
113+ AND `mart_gtfs.dim_calendar`.start_date <= '{ target_date } '
114114 AND `mart_gtfs.dim_calendar`.end_date >= '{ target_date } '
115115 GROUP BY
116116 `mart_gtfs.dim_schedule_feeds`.base64_url,
117- `mart_gtfs.dim_schedule_feeds`.key,
118- `mart_gtfs.dim_calendar`.{ day_of_the_week }
119- ORDER BY target_day_of_the_week DESC
117+ `mart_gtfs.dim_schedule_feeds`.key
120118 LIMIT 1000
121119 """
122120 response = query_sql (
123121 query
124122 )
125123 response_grouped = response .groupby ("base64_url" )
126124 feed_info_by_url = response_grouped [["valid_feed_date" , "feed_key" ]].first ()
127- print (feed_info_by_url )
128- feed_info_by_url ["valid_feed_date" ] = feed_info_by_url ["valid_feed_date" ].dt .date - dt .timedelta (days = 1 )
125+ feed_info_by_url ["date_processed" ] = feed_info_by_url ["valid_feed_date" ].dt .date - dt .timedelta (days = 1 )
129126 # valid_feed_date is the day the feed becomes invalid, so the day we are interested in, where the feed *is* still valid, is the day before
130- feed_info_by_url ["no_operations_on_target_day_of_the_week" ] = ~ (response_grouped ["target_day_of_the_week" ].any ())
131- return feed_info_by_url
127+ return feed_info_by_url .drop ("valid_feed_date" , axis = 1 )
132128
def merge_old_feeds(
    df_all_feeds: pd.DataFrame,
    df_undefined_feeds: pd.DataFrame,
    target_date: dt.date,
    max_lookback_timedelta: dt.timedelta,
) -> pd.DataFrame:
    """Merge fallback schedule feeds into the full feed table.

    Looks up older feeds for every ``base64_url`` in ``df_undefined_feeds``
    via ``get_old_feeds`` and left-joins the result onto ``df_all_feeds``.

    Args:
        df_all_feeds: All feeds; must contain ``base64_url`` and ``feed_key``
            columns.
        df_undefined_feeds: Feeds needing a fallback; only its ``base64_url``
            column is read.
        target_date: The service date; used as the ``date`` for every row
            that did not receive a fallback feed.
        max_lookback_timedelta: Passed through to ``get_old_feeds``.

    Returns:
        ``df_all_feeds`` with these columns added or replaced:

        * ``feed_key`` - the fallback feed key where one was found, else the
          original key.
        * ``no_schedule_feed_found`` - True for rows listed in
          ``df_undefined_feeds`` for which the lookup returned nothing.
        * ``date`` - the fallback feed's ``date_processed`` where present,
          else ``target_date``.
        * ``valid_date_other_than_service_date`` - True where ``date``
          differs from ``target_date``.
    """
    # NOTE(review): assumes get_old_feeds returns a frame indexed by
    # base64_url with "feed_key" and "date_processed" columns - confirm
    # against its definition.
    feed_search_result = get_old_feeds(
        df_undefined_feeds["base64_url"],
        target_date,
        max_lookback_timedelta,
    )
    print(feed_search_result)

    feeds_merged = df_all_feeds.merge(
        feed_search_result,
        how="left",
        left_on="base64_url",
        right_index=True,
        validate="many_to_one",
    )

    # Both sides carry "feed_key", so pandas suffixes them _x (original) and
    # _y (fallback). Prefer the fallback key whenever the lookup found one.
    feeds_merged["feed_key"] = feeds_merged["feed_key_y"].fillna(feeds_merged["feed_key_x"])

    # isin() always yields plain booleans, so no NaN handling is needed here.
    needs_fallback = feeds_merged["base64_url"].isin(df_undefined_feeds["base64_url"])
    fallback_found = feeds_merged["base64_url"].isin(feed_search_result.index)
    feeds_merged["no_schedule_feed_found"] = needs_fallback & ~fallback_found

    # Rows without a fallback keep the requested service date.
    feeds_merged["date"] = feeds_merged["date_processed"].fillna(target_date)
    feeds_merged["valid_date_other_than_service_date"] = feeds_merged["date"] != target_date

    return feeds_merged.drop(
        ["date_processed", "feed_key_x", "feed_key_y"], axis=1
    )
163154
164155if __name__ == '__main__' :
0 commit comments