@@ -77,13 +77,13 @@ def get_undefined_feeds(feeds_on_target: pd.DataFrame) -> pd.DataFrame:
77
77
78
78
def report_unavailable_feeds (feeds , fname ):
79
79
undefined = feeds .loc [
80
- feeds ["valid_date_other_than_service_date" ] | ~ feeds ["usable_schedule_feed_exists " ]
80
+ feeds ["valid_date_other_than_service_date" ] | feeds ["no_schedule_feed_found " ]
81
81
].copy ()
82
82
if undefined .empty :
83
83
print ('no undefined service feeds' )
84
84
else :
85
85
print ('these feeds have no service defined on target date, nor are their services captured in other feeds:' )
86
- print (undefined .loc [~ undefined ["usable_schedule_feed_exists " ], "gtfs_dataset_name" ].drop_duplicates ())
86
+ print (undefined .loc [undefined ["no_schedule_feed_found " ], "gtfs_dataset_name" ].drop_duplicates ())
87
87
print ('these feeds have defined service, but only in a feed defined on a prior day' )
88
88
print (undefined .loc [undefined ["valid_date_other_than_service_date" ], "gtfs_dataset_name" ].drop_duplicates ())
89
89
print (f'saving detailed csv to { fname } ' )
@@ -102,63 +102,54 @@ def get_old_feeds(undefined_feeds_base64_urls: pd.Series, target_date: dt.date |
102
102
SELECT
103
103
`mart_gtfs.dim_schedule_feeds`.base64_url AS base64_url,
104
104
`mart_gtfs.dim_schedule_feeds`.key as feed_key,
105
- `mart_gtfs.dim_calendar`.{ day_of_the_week } AS target_day_of_the_week,
106
- MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date,
105
+ MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date
107
106
from `mart_gtfs.dim_schedule_feeds`
108
107
LEFT JOIN `mart_gtfs.dim_calendar`
109
108
ON `mart_gtfs.dim_schedule_feeds`.key = `mart_gtfs.dim_calendar`.feed_key
110
109
WHERE `mart_gtfs.dim_schedule_feeds`.base64_url IN { base_64_urls_str }
111
- AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{ target_date } '
112
110
AND `mart_gtfs.dim_schedule_feeds`._valid_to >= '{ max_lookback_date } '
113
- AND `mart_gtfs.dim_calendar`.start_date <= '{ target_date } '
111
+ AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{ target_date } '
112
+ AND `mart_gtfs.dim_calendar`.{ day_of_the_week } = 1
113
+ AND `mart_gtfs.dim_calendar`.start_date <= '{ target_date } '
114
114
AND `mart_gtfs.dim_calendar`.end_date >= '{ target_date } '
115
115
GROUP BY
116
116
`mart_gtfs.dim_schedule_feeds`.base64_url,
117
- `mart_gtfs.dim_schedule_feeds`.key,
118
- `mart_gtfs.dim_calendar`.{ day_of_the_week }
119
- ORDER BY target_day_of_the_week DESC
117
+ `mart_gtfs.dim_schedule_feeds`.key
120
118
LIMIT 1000
121
119
"""
122
120
response = query_sql (
123
121
query
124
122
)
125
123
response_grouped = response .groupby ("base64_url" )
126
124
feed_info_by_url = response_grouped [["valid_feed_date" , "feed_key" ]].first ()
127
- print (feed_info_by_url )
128
- feed_info_by_url ["valid_feed_date" ] = feed_info_by_url ["valid_feed_date" ].dt .date - dt .timedelta (days = 1 )
125
+ feed_info_by_url ["date_processed" ] = feed_info_by_url ["valid_feed_date" ].dt .date - dt .timedelta (days = 1 )
129
126
# valid_feed_date is the day the feed becomes invalid, so the day we are interested in, where the feed *is* still valid, is the day before (hence the one-day subtraction above)
130
- feed_info_by_url ["no_operations_on_target_day_of_the_week" ] = ~ (response_grouped ["target_day_of_the_week" ].any ())
131
- return feed_info_by_url
127
+ return feed_info_by_url .drop ("valid_feed_date" , axis = 1 )
132
128
133
129
def merge_old_feeds(
    df_all_feeds: pd.DataFrame,
    df_undefined_feeds: pd.DataFrame,
    target_date: dt.date,
    max_lookback_timedelta: dt.timedelta,
) -> pd.DataFrame:
    """Fill in feeds lacking service on ``target_date`` with an older feed version.

    Looks up older schedule feeds for every ``base64_url`` in
    ``df_undefined_feeds`` via ``get_old_feeds`` and left-merges the result
    onto ``df_all_feeds``.

    Args:
        df_all_feeds: All feeds under consideration. Must contain
            ``base64_url`` and ``feed_key`` columns.
        df_undefined_feeds: The subset of feeds with no service defined on
            ``target_date``. Must contain a ``base64_url`` column.
        target_date: The service date being analysed.
        max_lookback_timedelta: Passed through to ``get_old_feeds``.

    Returns:
        A copy of ``df_all_feeds`` with these columns added or replaced:

        * ``feed_key`` - the key of the older feed when one was found,
          otherwise the original key.
        * ``no_schedule_feed_found`` - True for feeds that were undefined and
          for which the lookup returned nothing.
        * ``date`` - the date taken from the older feed when one was found,
          otherwise ``target_date``.
        * ``valid_date_other_than_service_date`` - True where ``date`` differs
          from ``target_date``.
    """
    feed_search_result = get_old_feeds(
        df_undefined_feeds["base64_url"],
        target_date,
        max_lookback_timedelta,
    )
    # Diagnostic output: the replacement feeds that were found.
    print(feed_search_result)

    # feed_search_result is indexed by base64_url; since both frames carry a
    # feed_key column, pandas suffixes them as feed_key_x (original) and
    # feed_key_y (replacement).
    feeds_merged = df_all_feeds.merge(
        feed_search_result,
        how="left",
        left_on="base64_url",
        right_index=True,
        validate="many_to_one",
    )
    # Diagnostic output: the column layout after the merge.
    print(list(feeds_merged.columns))

    # Prefer the replacement feed's key; fall back to the original key.
    feeds_merged["feed_key"] = feeds_merged["feed_key_y"].fillna(
        feeds_merged["feed_key_x"]
    )

    # A feed is unrecoverable when it was undefined AND the lookup found
    # nothing for its URL.
    was_undefined = feeds_merged["base64_url"].isin(df_undefined_feeds["base64_url"])
    replacement_found = feeds_merged["base64_url"].isin(feed_search_result.index)
    feeds_merged["no_schedule_feed_found"] = (
        was_undefined & ~replacement_found
    ).fillna(False)

    # Rows without a replacement keep the target date itself.
    feeds_merged["date"] = feeds_merged["date_processed"].fillna(target_date)
    feeds_merged["valid_date_other_than_service_date"] = (
        feeds_merged["date"] != target_date
    )

    # Drop the merge helper columns; names must match exactly (no stray
    # whitespace) or DataFrame.drop raises KeyError.
    return feeds_merged.drop(
        ["date_processed", "feed_key_x", "feed_key_y"], axis=1
    )
163
154
164
155
if __name__ == '__main__' :
0 commit comments