1
- import pandas as pd
2
- import numpy as np
3
- from segment_speed_utils import helpers , time_series_utils , gtfs_schedule_wrangling
4
- from segment_speed_utils .project_vars import (COMPILED_CACHED_VIEWS , RT_SCHED_GCS , SCHED_GCS )
5
-
6
- from shared_utils import catalog_utils , rt_dates
7
-
8
- GTFS_DATA_DICT = catalog_utils .get_catalog ("gtfs_analytics_data" )
9
-
10
1
"""
11
2
Finding the total number of scheduled service hours for
12
3
an operator across its routes for a full week. The data is
13
4
downloaded every 1/2 a year.
14
5
15
6
Grain is operator-service_date-route
16
7
"""
8
+ import pandas as pd
9
+
10
+ from segment_speed_utils import (gtfs_schedule_wrangling , helpers ,
11
+ time_series_utils )
12
+ from segment_speed_utils .project_vars import (
13
+ COMPILED_CACHED_VIEWS , weeks_available )
14
+ from shared_utils import gtfs_utils_v2 , publish_utils , rt_dates
15
+ from update_vars import GTFS_DATA_DICT , RT_SCHED_GCS
16
+
17
+
17
18
def concatenate_trips (
18
19
date_list : list ,
19
20
) -> pd .DataFrame :
@@ -44,29 +45,6 @@ def concatenate_trips(
44
45
45
46
return df
46
47
47
def get_day_type(date):
    """
    Return the weekday name (e.g. "Monday", "Tuesday") for a datetime object.
    """
    # date.weekday() is 0 for Monday through 6 for Sunday,
    # so a simple positional lookup gives the name.
    day_names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    return day_names[date.weekday()]
60
def weekday_or_weekend(row):
    """
    Tag if a day is a weekday or Saturday/Sunday.
    """
    # Saturday and Sunday keep their own label;
    # every other day collapses into "Weekday".
    special_days = {"Saturday": "Saturday", "Sunday": "Sunday"}
    return special_days.get(row.day_type, "Weekday")
70
48
71
49
def total_service_hours(date_list: list) -> pd.DataFrame:
    """
    Total scheduled service hours for operators across their routes
    for one full week of service dates.

    Parameters
    ----------
    date_list: list
        the service dates (one full week) to aggregate over.

    Returns
    -------
    pd.DataFrame
        grain: name-month_year-weekday_weekend-departure_hour,
        with summed service_hours and an averaged daily_service_hours.
    """
    # Combine all the days' data for a week.
    df = concatenate_trips(date_list)

    # Collapse Monday-Friday into a single "Weekday" bucket;
    # Saturday and Sunday keep their own labels.
    WEEKDAY_DICT = {
        **{k: "Weekday" for k in ["Monday", "Tuesday", "Wednesday",
                                  "Thursday", "Friday"]},
        "Saturday": "Saturday",
        "Sunday": "Sunday",
    }

    # Find day type (Monday, Tuesday, etc), departure hour,
    # month_year, and weekday_weekend.
    df = df.assign(
        day_type=df.service_date.dt.day_name(),
        # nullable Int64 tolerates trips with a missing first departure
        departure_hour=df.trip_first_departure_datetime_pacific.dt.hour.astype(
            "Int64"
        ),
        # get month_year that's 2024-04 for Apr2024 format
        month_year=(
            df.service_date.dt.year.astype(str)
            + "-"
            + df.service_date.dt.month.astype(str).str.zfill(2)
        ),
    ).pipe(
        gtfs_schedule_wrangling.add_weekday_weekend_column, WEEKDAY_DICT
    )

    # Total up hourly service hours by weekday, Sunday, and Saturday.
    df2 = (
        df.groupby(["name",
                    "month_year",
                    "weekday_weekend",
                    "departure_hour"])
        .agg({"service_hours": "sum"})
        .reset_index()
    )

    # Weekday totals cover 5 days, so divide by 5 for a daily figure;
    # Saturday/Sunday are single days and stay intact.
    # Vectorized with Series.where instead of a row-wise apply.
    daily_hours = df2.service_hours.where(
        df2.weekday_weekend != "Weekday",
        df2.service_hours / 5,
    ).round(2)

    df2 = df2.assign(
        daily_service_hours=daily_hours,
        service_hours=df2.service_hours.round(2),
    )

    return df2
113
97
114
def total_service_hours_all_months(week_list: list[list]) -> pd.DataFrame:
    """
    Find service hours for a full week for one operator
    and for the months we have a full week's worth of data downloaded.
    As of 5/2024, we have April 2023, October 2023, and April 2024.
    """
    # Only publicly-published schedule feeds may appear in the export.
    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
        get_df=True
    )
    public_feeds = public_datasets.gtfs_dataset_name.unique().tolist()

    # Aggregate each week separately, stack the results,
    # then drop any private operators.
    weekly_frames = [total_service_hours(one_week) for one_week in week_list]

    all_df = pd.concat(weekly_frames).pipe(
        publish_utils.exclude_private_datasets,
        col="name",
        public_gtfs_dataset_keys=public_feeds,
    )

    return all_df
134
120
135
121
136
122
if __name__ == "__main__":

    print(f"Aggregating for dates: {weeks_available}")

    # Save service hours
    SERVICE_EXPORT = GTFS_DATA_DICT.digest_tables.scheduled_service_hours

    service_hours = total_service_hours_all_months(weeks_available)

    # Write the weekly aggregation out to GCS as parquet.
    export_path = f"{RT_SCHED_GCS}{SERVICE_EXPORT}.parquet"
    service_hours.to_parquet(export_path)
142
134
0 commit comments