Skip to content

Commit 848f35c

Browse files
authored
Merge pull request #1420 from cal-itp/more-averaging
weekday-time_of_day-year segment speeds aggregation
2 parents c060cc1 + c587f91 commit 848f35c

File tree

4 files changed

+100
-37
lines changed

4 files changed

+100
-37
lines changed

_shared_utils/shared_utils/gtfs_analytics_data.yml

+2-3
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ rt_vs_schedule_tables:
6262
digest_tables:
6363
dir: ${gcs_paths.RT_SCHED_GCS}
6464
route_schedule_vp: "digest/schedule_vp_metrics"
65-
route_segment_speeds: "digest/segment_speeds"
66-
route_segment_geometry: "digest/segment_speeds_geom"
6765
operator_profiles: "digest/operator_profiles"
6866
operator_routes_map: "digest/operator_routes"
6967
operator_sched_rt: "digest/operator_schedule_rt_category"
@@ -85,7 +83,8 @@ rt_stop_times:
8583
trip_speeds_single_summary: "rollup_singleday/speeds_trip"
8684
route_dir_timeofday: "rollup_singleday/speeds_route_dir"
8785
segment_peakoffpeak_weekday_month: "rollup_singleday/month_speeds_route_dir_peak_segments" # NEW? what to name
88-
segment_timeofday_weekday_year: "rollup_multiday/year_weekday_speeds_route_dir_segments"
86+
segment_timeofday_weekday_year: "rollup_multiday/weekday_speeds_route_dir_segments"
87+
segments_year_file: "rollup_multiday/stop_segments"
8988
min_trip_minutes: ${speed_vars.time_min_cutoff}
9089
max_trip_minutes: 180
9190
max_speed: ${speed_vars.max_speed}

_shared_utils/shared_utils/rt_dates.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
valid_weeks = ["apr2023", "oct2023", "apr2024", "oct2024"]
8888

8989
# Remove all the one-offs
90-
one_off_dates = ["jan2023", "feb2023" "aug2023a", "oct2024g"]
90+
one_off_dates = ["jan2023", "feb2023", "aug2023a", "oct2024g"]
9191
all_dates = [v for k, v in DATES.items() if k not in one_off_dates and "2022" not in k]
9292

9393

rt_segment_speeds/logs/avg_speeds.log

+3
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,6 @@
1818
2025-03-03 13:43:54.606 | INFO | __main__:summary_speeds_by_peak_offpeak:158 - rt_stop_times summary speed averaging by peak/offpeak for 2025-02-12 execution time: 0:00:13.043791
1919
2025-03-03 13:44:02.598 | INFO | __main__:trip_summary_speeds_by_time_of_day:92 - rt_stop_times summary speed averaging by time-of-day 2025-01-15 execution time: 0:00:07.986208
2020
2025-03-03 13:44:15.279 | INFO | __main__:summary_speeds_by_peak_offpeak:158 - rt_stop_times summary speed averaging by peak/offpeak for 2025-01-15 execution time: 0:00:12.679323
21+
2025-03-14 13:34:30.155 | INFO | __main__:annual_time_of_day_averages:171 - rt_stop_times: weekday/time-of-day averages for 2023 execution time: 0:02:23.782640
22+
2025-03-14 13:36:37.069 | INFO | __main__:annual_time_of_day_averages:171 - rt_stop_times: weekday/time-of-day averages for 2024 execution time: 0:02:06.806331
23+
2025-03-14 13:36:50.516 | INFO | __main__:annual_time_of_day_averages:171 - rt_stop_times: weekday/time-of-day averages for 2025 execution time: 0:00:13.404898

rt_segment_speeds/scripts/average_segment_speeds.py

+94-33
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from typing import Literal, Optional
1515

1616
from calitp_data_analysis import utils
17+
from calitp_data_analysis.geography_utils import WGS84
1718

1819
from segment_speed_utils import gtfs_schedule_wrangling, segment_calcs, time_series_utils
1920
from shared_utils import publish_utils, time_helpers
@@ -43,8 +44,60 @@ def import_singleday_segment_speeds(
4344
return df
4445

4546

47+
def export_segment_geometry(
48+
year: str,
49+
):
50+
"""
51+
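Dedupe segment geometries on the identifier columns in keep_cols
52+
(operator, route, direction, stop_pair) rather than on the geometry itself, since geometries for the same segment may slightly differ.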
53+
Visual inspection shows start and endpoints might be
54+
slightly different but still capture the same corridor.
55+
56+
Big Blue Bus: stop_pair = "1115__187"
57+
In 2024, there are 4 rows, but the 4 rows are basically the same,
58+
so let's keep the most recent row.
59+
"""
60+
SEGMENTS_FILE = GTFS_DATA_DICT.rt_stop_times.segments_file
61+
EXPORT_FILE = GTFS_DATA_DICT.rt_stop_times.segments_year_file
62+
63+
keep_cols = [
64+
"schedule_gtfs_dataset_key",
65+
"route_id", "direction_id",
66+
"stop_pair",
67+
]
68+
69+
dates_in_year = [
70+
date for date in rt_dates.all_dates if year in date
71+
]
72+
73+
df = time_series_utils.concatenate_datasets_across_dates(
74+
SEGMENT_GCS,
75+
SEGMENTS_FILE,
76+
dates_in_year,
77+
columns = keep_cols + ["geometry"],
78+
data_type = "gdf",
79+
get_pandas= False,
80+
).sort_values(
81+
"service_date", ascending=False
82+
).drop(
83+
columns = "service_date"
84+
).drop_duplicates(
85+
subset = keep_cols
86+
).reset_index(drop=True).to_crs(WGS84)
87+
88+
df = df.compute()
89+
90+
df.to_parquet(
91+
f"{SEGMENT_GCS}{EXPORT_FILE}_{year}.parquet",
92+
)
93+
94+
print(f"exported stop segments for year {year}")
95+
96+
return
97+
98+
4699
def annual_time_of_day_averages(
47-
analysis_date_list: list,
100+
year: str,
48101
segment_type: Literal[SEGMENT_TYPES],
49102
config_path: Optional = GTFS_DATA_DICT
50103
):
@@ -63,6 +116,7 @@ def annual_time_of_day_averages(
63116
dict_inputs = config_path[segment_type]
64117

65118
SPEED_FILE = dict_inputs["segment_timeofday"]
119+
SEGMENTS_YEAR_FILE = dict_inputs["segments_year_file"]
66120
EXPORT_FILE = dict_inputs["segment_timeofday_weekday_year"]
67121

68122
SEGMENT_COLS = [*dict_inputs["segment_cols"]]
@@ -71,6 +125,10 @@ def annual_time_of_day_averages(
71125
OPERATOR_COLS = ["schedule_gtfs_dataset_key"]
72126
CROSSWALK_COLS = [*dict_inputs.crosswalk_cols]
73127

128+
analysis_date_list = [
129+
date for date in rt_dates.all_dates if year in date
130+
]
131+
74132
df = import_singleday_segment_speeds(
75133
SEGMENT_GCS,
76134
SPEED_FILE,
@@ -80,50 +138,44 @@ def annual_time_of_day_averages(
80138
).pipe(
81139
time_helpers.add_quarter
82140
)
83-
84-
avg_speeds = segment_calcs.calculate_weighted_averages(
85-
df,
86-
OPERATOR_COLS + SEGMENT_COLS_NO_GEOM + ["time_of_day", "weekday_weekend", "year"],
87-
metric_cols = ["p20_mph", "p50_mph", "p80_mph"],
88-
weight_col = "n_trips"
89-
).persist()
90141

91-
publish_utils.if_exists_then_delete(
92-
f"{SEGMENT_GCS}{EXPORT_FILE}"
93-
)
142+
group_cols = OPERATOR_COLS + SEGMENT_COLS_NO_GEOM + [
143+
"time_of_day", "weekday_weekend", "year"]
94144

95-
avg_speeds.to_parquet(
96-
f"{SEGMENT_GCS}{EXPORT_FILE}",
97-
partition_on = "time_of_day"
98-
)
99-
'''
100-
speeds_gdf = delayed(segment_calcs.merge_in_segment_geometry)(
101-
avg_speeds,
102-
analysis_date_list,
103-
segment_type,
104-
SEGMENT_COLS
105-
).pipe(
106-
gtfs_schedule_wrangling.merge_operator_identifiers,
145+
speed_cols = ["p20_mph", "p50_mph", "p80_mph"]
146+
weight_col = "n_trips"
147+
148+
orig_dtypes = df[group_cols + speed_cols + [weight_col]].dtypes.to_dict()
149+
150+
avg_speeds = df.map_partitions(
151+
segment_calcs.calculate_weighted_averages,
152+
OPERATOR_COLS + SEGMENT_COLS_NO_GEOM + ["time_of_day", "weekday_weekend", "year"],
153+
metric_cols = speed_cols,
154+
weight_col = weight_col,
155+
meta = {
156+
**orig_dtypes,
157+
},
158+
align_dataframes = False
159+
).compute().pipe(
160+
gtfs_schedule_wrangling.merge_operator_identifiers,
107161
analysis_date_list,
108162
columns = CROSSWALK_COLS
109163
)
110164

111-
utils.geoparquet_gcs_export(
112-
speeds_gdf,
113-
SEGMENT_GCS,
114-
EXPORT_FILE
165+
avg_speeds.to_parquet(
166+
f"{SEGMENT_GCS}{EXPORT_FILE}_{year}.parquet"
115167
)
116-
'''
117168

118169
end = datetime.datetime.now()
119170

120171
logger.info(
121-
f"{segment_type}: weekday/time-of-day averages for {analysis_date_list} "
172+
f"{segment_type}: weekday/time-of-day averages for {year} "
122173
f"execution time: {end - start}"
123174
)
124175

125176
return
126177

178+
127179
if __name__ == "__main__":
128180

129181
from shared_utils import rt_dates
@@ -135,9 +187,18 @@ def annual_time_of_day_averages(
135187
format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
136188
level="INFO")
137189

190+
# export segment geometry and weekday/time-of-day averages one year at a time, each to its own {year} parquet
191+
# to rerun previous years when necessary, add them to the list below
192+
for year in ["2025"]:
193+
194+
export_segment_geometry(year)
138195

139-
annual_time_of_day_averages(
140-
rt_dates.all_dates,
141-
segment_type = "rt_stop_times",
142-
)
196+
annual_time_of_day_averages(
197+
year,
198+
segment_type = "rt_stop_times",
199+
)
200+
201+
202+
203+
143204

0 commit comments

Comments
 (0)