3
3
from shared_utils import gtfs_utils_v2
4
4
5
5
from calitp_data_analysis .tables import tbls
6
+ from calitp_data_analysis .sql import query_sql
6
7
from siuba import *
7
8
import pandas as pd
8
9
import datetime as dt
9
10
10
11
import conveyal_vars
11
12
13
# Service date the Conveyal feed inputs are assembled for (configured in conveyal_vars).
TARGET_DATE = conveyal_vars.TARGET_DATE
# Name used for the combined regional subfeed dataset in the warehouse.
REGIONAL_SUBFEED_NAME = "Regional Subfeed"
15
+
12
16
def check_defined_elsewhere (row , df ):
13
17
'''
14
18
for feeds without service defined, check if the same service is captured in another feed that does include service
@@ -17,11 +21,6 @@ def check_defined_elsewhere(row, df):
17
21
row ['service_any_feed' ] = is_defined
18
22
return row
19
23
20
-
21
-
22
- TARGET_DATE = conveyal_vars .TARGET_DATE
23
- REGIONAL_SUBFEED_NAME = "Regional Subfeed"
24
-
25
24
def get_feeds_check_service ():
26
25
feeds_on_target = gtfs_utils_v2 .schedule_daily_feed_to_gtfs_dataset_name (selected_date = TARGET_DATE )
27
26
feeds_on_target = feeds_on_target .rename (columns = {'name' :'gtfs_dataset_name' })
@@ -62,25 +61,115 @@ def attach_transit_services(feeds_on_target: pd.DataFrame):
62
61
].copy ()
63
62
return feeds_services_filtered
64
63
65
- def report_undefined (feeds_on_target : pd .DataFrame ):
66
- fname = 'no_apparent_service.csv'
64
def get_undefined_feeds(feeds_on_target: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of feeds_on_target whose service is not defined in any feed on the target date.

    Each row is checked via check_defined_elsewhere against the whole frame; rows where
    service_any_feed comes back False (no feed captures the service) are kept.
    """
    undefined = feeds_on_target.apply(check_defined_elsewhere, axis=1, args=[feeds_on_target]) >> filter(-_.service_any_feed)
    return undefined
67
+
68
# Map Python's datetime.weekday() integers (Monday == 0) to the matching
# GTFS calendar.txt day-of-week column names.
INT_TO_GTFS_WEEKDAY = dict(
    enumerate(
        ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    )
)
77
+
78
def report_unavailable_feeds(feeds, fname):
    """Print a summary of feeds without usable service on the target date and save them to csv.

    Two categories are reported: feeds for which no usable schedule feed exists at all,
    and feeds whose only valid schedule feed is from a date before the service date.

    Parameters:
    feeds: DataFrame with boolean columns valid_date_other_than_service_date and
        usable_schedule_feed_exists, plus gtfs_dataset_name
    fname: path of the csv file to write the flagged rows to
    """
    problem_mask = (
        feeds["valid_date_other_than_service_date"] | ~feeds["usable_schedule_feed_exists"]
    )
    unavailable = feeds[problem_mask].copy()
    if unavailable.empty:
        print('no undefined service feeds')
    else:
        no_usable_feed = unavailable[~unavailable["usable_schedule_feed_exists"]]
        print('these feeds have no service defined on target date, nor are their services captured in other feeds:')
        print(no_usable_feed["gtfs_dataset_name"].drop_duplicates())
        stale_feed = unavailable[unavailable["valid_date_other_than_service_date"]]
        print('these feeds have defined service, but only in a feed defined on a prior day')
        print(stale_feed["gtfs_dataset_name"].drop_duplicates())
    print(f'saving detailed csv to {fname}')
    unavailable.to_csv(fname, index=False)
91
+
92
# strftime pattern for an ISO-8601 calendar date, e.g. "2024-03-05".
# Fixed: the previous "%y" produced a two-digit year ("24-03-05"), which is not ISO-8601;
# "%Y" is the four-digit year.
ISO_DATE_ONLY_FORMAT = "%Y-%m-%d"
93
+
94
def get_old_feeds(undefined_feeds_base64_urls: pd.Series, target_date: dt.date | dt.datetime, max_lookback_timedelta: dt.timedelta) -> pd.DataFrame:
    """Search the warehouse for recently-expired schedule feeds that could stand in
    for feeds with no service defined on the target date.

    Parameters:
    undefined_feeds_base64_urls: base64 urls of feeds lacking service on the target date
    target_date: the date service is needed for
    max_lookback_timedelta: how far before target_date an expired feed may still be considered

    Returns:
    DataFrame indexed by base64_url with columns:
    valid_feed_date — last date the substitute feed was valid,
    feed_key — warehouse key of the substitute feed,
    no_operations_on_target_day_of_the_week — True when no calendar entry of that
        feed runs on target_date's weekday
    """
    # Fixed: removed unused local target_date_iso (computed but never referenced);
    # return annotation corrected from pd.Series — the function builds a DataFrame.
    base_64_urls_str = "('" + "', '".join(undefined_feeds_base64_urls) + "')"
    day_of_the_week = INT_TO_GTFS_WEEKDAY[target_date.weekday()]
    max_lookback_date = target_date - max_lookback_timedelta

    # NOTE(review): values are interpolated directly into the SQL string. The urls
    # come from the warehouse itself, but query parameters would be safer — confirm.
    query = f"""
    SELECT
        `mart_gtfs.dim_schedule_feeds`.base64_url AS base64_url,
        `mart_gtfs.dim_schedule_feeds`.key as feed_key,
        `mart_gtfs.dim_calendar`.{day_of_the_week} AS target_day_of_the_week,
        MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date,
    from `mart_gtfs.dim_schedule_feeds`
    LEFT JOIN `mart_gtfs.dim_calendar`
        ON `mart_gtfs.dim_schedule_feeds`.key = `mart_gtfs.dim_calendar`.feed_key
    WHERE `mart_gtfs.dim_schedule_feeds`.base64_url IN {base_64_urls_str}
        AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{target_date}'
        AND `mart_gtfs.dim_schedule_feeds`._valid_to >= '{max_lookback_date}'
        AND `mart_gtfs.dim_calendar`.start_date <= '{target_date}'
        AND `mart_gtfs.dim_calendar`.end_date >= '{target_date}'
    GROUP BY
        `mart_gtfs.dim_schedule_feeds`.base64_url,
        `mart_gtfs.dim_schedule_feeds`.key,
        `mart_gtfs.dim_calendar`.{day_of_the_week}
    ORDER BY target_day_of_the_week DESC
    LIMIT 1000
    """
    response = query_sql(
        query
    )
    response_grouped = response.groupby("base64_url")
    feed_info_by_url = response_grouped[["valid_feed_date", "feed_key"]].first()
    print(feed_info_by_url)
    # assumes _valid_to is the first moment the feed is no longer valid, so the last
    # day the feed *is* valid is the preceding day — TODO confirm warehouse semantics
    feed_info_by_url["valid_feed_date"] = feed_info_by_url["valid_feed_date"].dt.date - dt.timedelta(days=1)
    # True when no calendar row for the feed operates on target_date's weekday
    feed_info_by_url["no_operations_on_target_day_of_the_week"] = ~(response_grouped["target_day_of_the_week"].any())
    return feed_info_by_url
132
+
133
def merge_old_feeds(df_all_feeds: pd.DataFrame, df_undefined_feeds: pd.DataFrame, target_date: dt.date, max_lookback_timedelta: dt.timedelta) -> pd.DataFrame:
    """Merge substitute feeds found by get_old_feeds into the full feed table and
    flag each feed's availability on the target date.

    Parameters:
    df_all_feeds: every feed under consideration, keyed by base64_url
    df_undefined_feeds: the subset with no service defined on the target date
    target_date: the date service is needed for
    max_lookback_timedelta: how far back get_old_feeds may search for a substitute

    Returns:
    df_all_feeds with feed_key possibly replaced by a substitute feed's key, plus
    boolean flag columns (no_schedule_feed_found, usable_schedule_feed_exists,
    no_operations_on_target_date_but_valid_feed_exists,
    valid_date_other_than_service_date) and a "date" column holding the date each
    feed should be pulled for.
    """
    feed_search_result = get_old_feeds(
        df_undefined_feeds["base64_url"],
        target_date,
        max_lookback_timedelta
    )
    # Left join keeps all feeds; substitute info attaches only where a match was found.
    feeds_merged = df_all_feeds.merge(
        feed_search_result,
        how="left",
        left_on="base64_url",
        right_index=True,
        validate="many_to_one"
    )
    # Prefer the substitute feed's key (suffix _y); fall back to the original (_x).
    feeds_merged["feed_key"] = feeds_merged["feed_key_y"].fillna(feeds_merged["feed_key_x"])
    # Undefined feeds for which the lookback search found nothing at all.
    feeds_merged["no_schedule_feed_found"] = (
        (feeds_merged["base64_url"].isin(df_undefined_feeds["base64_url"])) & (~feeds_merged["base64_url"].isin(feed_search_result.index))
    )
    feeds_merged["no_operations_on_target_date_but_valid_feed_exists"] = (feeds_merged["no_operations_on_target_day_of_the_week"].fillna(False))
    feeds_merged["usable_schedule_feed_exists"] = (
        ~(feeds_merged["no_schedule_feed_found"] | feeds_merged["no_operations_on_target_date_but_valid_feed_exists"])
    )
    # Use the substitute feed's last valid date where applicable, else the target date.
    feeds_merged["date"] = feeds_merged.loc[
        ~feeds_merged["no_operations_on_target_date_but_valid_feed_exists"], "valid_feed_date"
    ]
    feeds_merged["date"] = feeds_merged["date"].fillna(target_date)
    feeds_merged["valid_date_other_than_service_date"] = feeds_merged["date"] != target_date

    # Drop merge intermediates; callers only see the consolidated columns.
    return feeds_merged.drop(
        ["valid_feed_date", "no_operations_on_target_day_of_the_week", "feed_key_x", "feed_key_y"], axis=1
    )
78
163
79
164
if __name__ == '__main__':
    # Assemble the feed list for the target date and attach transit service info.
    target_feeds = get_feeds_check_service()
    target_feeds = attach_transit_services(target_feeds)
    print(f'feeds on target date shape: {target_feeds.shape}')
    # Find feeds with no service on the target date, then try to backfill them
    # with recently-expired feeds before reporting and saving the result.
    missing_service = get_undefined_feeds(target_feeds)
    backfilled = merge_old_feeds(
        target_feeds, missing_service, dt.date.fromisoformat(TARGET_DATE), conveyal_vars.LOOKBACK_TIME
    )
    report_unavailable_feeds(backfilled, 'no_apparent_service.csv')
    backfilled.to_parquet(f'{conveyal_vars.GCS_PATH}feeds_{TARGET_DATE}.parquet')
86
175
0 commit comments