Skip to content

Commit 9e6b007

Browse files
authored
Merge pull request #1228 from cal-itp/sep-open-data
Sep open data
2 parents b1061b7 + e382995 commit 9e6b007

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1235
-1404
lines changed

_shared_utils/shared_utils/rt_dates.py

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
"jun2024": "2024-06-12",
6464
"jul2024": "2024-07-17",
6565
"aug2024": "2024-08-14",
66+
"sep2024": "2024-09-18",
6667
}
6768

6869
y2023_dates = [

gtfs_funnel/Makefile

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ route_typologies_data:
3030
# Clean route names for displaying across time
3131
timeseries_preprocessing:
3232
python clean_route_naming.py
33-
33+
python track_publish_dates.py
34+
3435
# monthly scheduled service, download after the end of each month
3536
monthly_scheduled_data:
3637
python download_monthly_service.py

gtfs_funnel/logs/download_data.log

+17
Original file line numberDiff line numberDiff line change
@@ -516,3 +516,20 @@
516516
2024-08-15 09:09:27.480 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
517517
2024-08-15 09:11:56.577 | INFO | __main__:download_one_day:56 - execution time: 0:02:30.991910
518518
2024-08-15 10:30:38.864 | INFO | __main__:download_one_year:35 - execution time: 0:00:25.978363
519+
2024-09-19 08:13:46.511 | INFO | __main__:download_one_day:45 - Analysis date: 2024-09-18
520+
2024-09-19 08:13:49.222 | INFO | __main__:download_one_day:52 - # operators to run: 221
521+
2024-09-19 08:13:49.223 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
522+
2024-09-19 08:14:16.573 | INFO | __main__:download_one_day:86 - execution time: 0:00:30.061230
523+
2024-09-19 08:14:35.388 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
524+
2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:29 - # operators to run: 221
525+
2024-09-19 08:14:37.294 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
526+
2024-09-19 08:14:47.392 | INFO | __main__:download_one_day:64 - execution time: 0:00:12.003376
527+
2024-09-19 08:15:03.834 | INFO | __main__:download_one_day:22 - Analysis date: 2024-09-18
528+
2024-09-19 08:15:05.784 | INFO | __main__:download_one_day:29 - # operators to run: 221
529+
2024-09-19 08:15:05.785 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
530+
2024-09-19 08:16:57.558 | INFO | __main__:download_one_day:63 - execution time: 0:01:53.723521
531+
2024-09-19 08:17:14.221 | INFO | __main__:download_one_day:21 - Analysis date: 2024-09-18
532+
2024-09-19 08:17:15.854 | INFO | __main__:download_one_day:29 - # operators to run: 190
533+
2024-09-19 08:17:15.855 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
534+
2024-09-19 08:19:06.258 | INFO | __main__:download_one_day:56 - execution time: 0:01:52.036660
535+
2024-09-19 09:28:35.882 | INFO | __main__:download_one_year:35 - execution time: 0:00:45.388883

gtfs_funnel/logs/download_vp_v2.log

+11
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,14 @@
339339
2024-08-15 09:29:03.589 | INFO | __main__:<module>:112 - export concatenated vp: 0:04:16.418987
340340
2024-08-15 09:34:04.743 | INFO | __main__:<module>:134 - remove batched parquets
341341
2024-08-15 09:34:04.745 | INFO | __main__:<module>:137 - execution time: 0:09:26.469734
342+
2024-09-19 08:19:35.573 | INFO | __main__:<module>:148 - Analysis date: 2024-09-18
343+
2024-09-19 08:21:52.859 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:17.254015
344+
2024-09-19 08:23:01.583 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:08.722700
345+
2024-09-19 08:26:57.364 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:03:55.780573
346+
2024-09-19 08:28:55.328 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:57.952237
347+
2024-09-19 08:28:55.328 | INFO | __main__:<module>:155 - execution time: 0:09:19.722825
348+
2024-09-19 08:29:19.967 | INFO | __main__:<module>:97 - Analysis date: 2024-09-18
349+
2024-09-19 08:29:38.182 | INFO | __main__:<module>:105 - concat and filter batched data: 0:00:18.208902
350+
2024-09-19 08:33:43.251 | INFO | __main__:<module>:112 - export concatenated vp: 0:04:05.069147
351+
2024-09-19 08:37:30.865 | INFO | __main__:<module>:134 - remove batched parquets
352+
2024-09-19 08:37:30.865 | INFO | __main__:<module>:137 - execution time: 0:08:10.892310

gtfs_funnel/logs/vp_preprocessing.log

+11
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,14 @@
200200
2024-08-15 10:05:01.848 | INFO | __main__:<module>:235 - vp with dwell time 2024-08-14: 0:07:09.680694
201201
2024-08-15 10:13:16.657 | INFO | __main__:<module>:120 - 2024-08-14: condense vp for trip 0:07:51.642337
202202
2024-08-15 10:24:50.802 | INFO | __main__:<module>:128 - 2024-08-14: prepare vp to use in nearest neighbor: 0:11:34.144491
203+
2024-09-19 08:46:17.298 | INFO | __main__:<module>:169 - 2024-09-18: pare down vp: 0:02:12.746302
204+
2024-09-19 08:51:10.542 | INFO | __main__:attach_prior_vp_add_direction:90 - persist vp gddf: 0:04:35.313281
205+
2024-09-19 08:55:04.346 | INFO | __main__:attach_prior_vp_add_direction:122 - np vectorize arrays for direction: 0:03:53.804190
206+
2024-09-19 08:55:11.908 | INFO | __main__:<module>:194 - 2024-09-18: export vp direction: 0:08:36.678934
207+
2024-09-19 08:56:33.980 | INFO | __main__:<module>:200 - 2024-09-18: export usable vp with direction: 0:01:22.071985
208+
2024-09-19 08:56:33.981 | INFO | __main__:<module>:203 - 2024-09-18: vp_direction script execution time: 0:09:58.750919
209+
2024-09-19 09:01:58.870 | INFO | __main__:<module>:212 - compute dwell df: 0:04:44.983561
210+
2024-09-19 09:03:13.198 | INFO | __main__:<module>:234 - merge with original and export: 0:01:14.327719
211+
2024-09-19 09:03:13.200 | INFO | __main__:<module>:235 - vp with dwell time 2024-09-18: 0:05:59.311280
212+
2024-09-19 09:08:43.742 | INFO | __main__:<module>:120 - 2024-09-18: condense vp for trip 0:05:09.575132
213+
2024-09-19 09:20:16.936 | INFO | __main__:<module>:128 - 2024-09-18: prepare vp to use in nearest neighbor: 0:11:33.194871

gtfs_funnel/published_operators.yml

+235
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
2024-09-18:
2+
- Alhambra Schedule
3+
- Amador Schedule
4+
- Anaheim Resort Schedule
5+
- Anaheim Resort Schedule v2
6+
- Antelope Valley Transit Authority Schedule
7+
- Arcadia Schedule
8+
- Arvin Schedule
9+
- Auburn Schedule
10+
- B-Line Schedule
11+
- Baldwin Park Schedule
12+
- Banning Pass Schedule
13+
- Bay Area 511 AC Transit Schedule
14+
- Bay Area 511 ACE Schedule
15+
- Bay Area 511 Angel Island-Tiburon Ferry Schedule
16+
- Bay Area 511 BART Schedule
17+
- Bay Area 511 Caltrain Schedule
18+
- Bay Area 511 Capitol Corridor Schedule
19+
- Bay Area 511 Commute.org Schedule
20+
- Bay Area 511 County Connection Schedule
21+
- Bay Area 511 Dumbarton Express Schedule
22+
- Bay Area 511 Emery Go-Round Schedule
23+
- Bay Area 511 Fairfield and Suisun Transit Schedule
24+
- Bay Area 511 Golden Gate Ferry Schedule
25+
- Bay Area 511 Golden Gate Transit Schedule
26+
- Bay Area 511 MVGO Schedule
27+
- Bay Area 511 Marin Schedule
28+
- Bay Area 511 Mission Bay Schedule
29+
- Bay Area 511 Muni Schedule
30+
- Bay Area 511 Petaluma Schedule
31+
- Bay Area 511 Rio Vista Delta Breeze Schedule
32+
- Bay Area 511 SFO AirTrain Schedule
33+
- Bay Area 511 SamTrans Schedule
34+
- Bay Area 511 San Francisco Bay Ferry Schedule
35+
- Bay Area 511 Santa Clara Transit Schedule
36+
- Bay Area 511 Santa Rosa CityBus Schedule
37+
- Bay Area 511 SolTrans Schedule
38+
- Bay Area 511 Sonoma County Transit Schedule
39+
- Bay Area 511 Sonoma-Marin Area Rail Transit Schedule
40+
- Bay Area 511 South San Francisco Shuttle Schedule
41+
- Bay Area 511 Treasure Island Ferry Schedule
42+
- Bay Area 511 Tri Delta Schedule
43+
- Bay Area 511 Tri-Valley Wheels Schedule
44+
- Bay Area 511 Union City Transit Schedule
45+
- Bay Area 511 Vacaville City Coach Schedule
46+
- Bay Area 511 Vine Transit Schedule
47+
- Bay Area 511 WestCAT Schedule
48+
- Beach Cities GMV Schedule
49+
- Bear Schedule
50+
- Beaumont Pass Schedule
51+
- Bell Gardens Schedule
52+
- Bellflower Bus Schedule
53+
- Big Blue Bus Schedule
54+
- Big Blue Bus Swiftly Schedule
55+
- BruinBus Schedule
56+
- Burbank Schedule
57+
- Calabasas Schedule
58+
- Calaveras Schedule
59+
- Cerritos on Wheels Schedule
60+
- Cerritos on Wheels Website Schedule
61+
- Clean Air Express Schedule
62+
- Clovis Schedule
63+
- Commerce Schedule
64+
- Corona Schedule
65+
- County Express Schedule
66+
- Cudahy Schedule
67+
- Culver City Schedule
68+
- Curry Public Transit Schedule
69+
- Dana Point Trolley Schedule
70+
- Delano Schedule
71+
- Desert Roadrunner GMV Schedule
72+
- Desert Roadrunner Schedule
73+
- DowneyLINK GMV Schedule
74+
- Eastern Sierra Schedule
75+
- El Dorado Schedule
76+
- El Monte Schedule
77+
- Elk Grove Schedule
78+
- Flixbus Schedule
79+
- Foothill Schedule
80+
- Fresno County Schedule
81+
- Fresno Schedule
82+
- G Trans Schedule
83+
- GET Schedule
84+
- Get Around Town Express Schedule
85+
- Glendale Schedule
86+
- Glendora Schedule
87+
- Glenn Schedule
88+
- Go West Schedule
89+
- Grapeline Schedule
90+
- Guadalupe Flyer Schedule
91+
- Havasu Landing Ferry Schedule
92+
- Humboldt Schedule
93+
- Huntington Schedule
94+
- Imperial Valley Transit Schedule
95+
- Inglewood Schedule
96+
- Irvine CONNECT Schedule
97+
- Kern Schedule
98+
- Kings Schedule
99+
- LA DOT Schedule
100+
- LA Metro Bus Schedule
101+
- LA Metro Rail Schedule
102+
- LADPW Schedule
103+
- LAX FlyAway Schedule
104+
- LAX Flyaway Bus Schedule
105+
- LAX Shuttles Schedule
106+
- La Campana Schedule
107+
- La Puente Schedule
108+
- Laguna Beach Schedule
109+
- Lake Schedule
110+
- Lassen Schedule
111+
- Lawndale Beat GMV Schedule
112+
- Lawndale Schedule
113+
- Lompoc Schedule
114+
- Long Beach Schedule
115+
- Lynwood Schedule IPS
116+
- MV Shuttle Schedule
117+
- Madera County Connection Schedule
118+
- Madera Metro Schedule
119+
- Mariposa Grove Shuttle Schedule
120+
- Maywood Schedule
121+
- Mendocino Schedule
122+
- Merced GMV Schedule
123+
- Merced Schedule
124+
- Metrolink Schedule
125+
- Montebello Schedule
126+
- Monterey Salinas Schedule
127+
- Morongo Basin Schedule
128+
- Morro Bay Cal-ITP Schedule
129+
- Mountain Transit GMV Schedule
130+
- Mountain Transit Schedule
131+
- Needles Schedule
132+
- Nevada County Schedule
133+
- North County Schedule
134+
- Norwalk Avail Schedule
135+
- OCTA Schedule
136+
- OmniTrans Schedule
137+
- Oregon POINT
138+
- Palos Verdes PTA Schedule
139+
- Pasadena Schedule
140+
- Placer Schedule
141+
- Plumas Schedule
142+
- PresidiGo Schedule
143+
- Redding Schedule
144+
- Redwood Coast Schedule
145+
- Riverside Schedule
146+
- Rosemead Passio Schedule
147+
- Roseville Schedule
148+
- Roseville Transit GMV Schedule
149+
- SBMTD Schedule
150+
- SLO Schedule
151+
- SLORTA Schedule
152+
- Sage Stage Schedule
153+
- San Clemente Trolley Schedule
154+
- San Diego Schedule
155+
- San Fernando Schedule
156+
- San Joaquin Schedule
157+
- San Juan Capistrano Trolley Schedule
158+
- Santa Clarita Schedule
159+
- Santa Maria Schedule
160+
- Santa Ynez Mecatran Schedule
161+
- Sierra Madre Schedule
162+
- Siskiyou Schedule
163+
- South County Transit Link Schedule
164+
- South San Francisco Schedule
165+
- Spirit Bus Passio Schedule
166+
- StanRTA Schedule
167+
- Stanford Schedule
168+
- SunLine Avail Schedule
169+
- 'TART, North Lake Tahoe Schedule'
170+
- TCRTA TripShot Schedule
171+
- Tahoe Transportation District GMV Schedule
172+
- Tahoe Transportation District Schedule
173+
- Tehama Schedule
174+
- Torrance Schedule
175+
- Tracy Schedule
176+
- Trinity Schedule
177+
- Tuolumne Remix Schedule
178+
- Turlock Schedule
179+
- UCSC Schedule
180+
- Unitrans Schedule
181+
- VCTC GMV Schedule
182+
- Victor Valley GMV Schedule
183+
- Victor Valley Schedule
184+
- Visalia Schedule
185+
- WeHo Schedule
186+
- YARTS Schedule
187+
- Yolobus Schedule
188+
- Yosemite Valley Shuttle Schedule
189+
- Yuba-Sutter Schedule
190+
- Yuma Schedule
191+
- eTrans Schedule
192+
2024-08-14:
193+
- Santa Cruz Schedule
194+
2024-06-12:
195+
- Anteater Express Schedule
196+
- Lassen Flex
197+
- Lynwood Schedule
198+
- Manteca Schedule
199+
2024-05-22:
200+
- El Segundo Schedule
201+
- Redwood Coast Schedulel
202+
2024-04-17:
203+
- Sacramento Schedule
204+
2024-03-13:
205+
- Avalon Schedule
206+
2024-02-14:
207+
- Rosemead Schedule
208+
2023-12-13:
209+
- DowneyLINK Schedule
210+
- Humboldt Flex
211+
- Laguna Beach Flex
212+
- Manteca Flex
213+
- Placer Flex
214+
- San Joaquin Flex
215+
- Spirit Bus Schedule
216+
- StanRTA Flex
217+
- TART Flex
218+
- Thousand Oaks Flex
219+
- Tracy Flex
220+
- Turlock Flex
221+
- Union City Flex
222+
- VCTC Flex
223+
- WestCAT Flex
224+
2023-11-15:
225+
- Amtrak Schedule
226+
- Mission Bay Schedule
227+
2023-08-15:
228+
- Blossom Express Schedule
229+
- Eastern Sierra Flex
230+
2023-06-14:
231+
- Tuolumne Schedule
232+
2023-04-12:
233+
- Guadalupe Flex
234+
2023-03-15:
235+
- TIME GMV Schedule

gtfs_funnel/track_publish_dates.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Grab all the operators by service date from
3+
saved scheduled_trips tables from GCS.
4+
5+
Create a yaml that tells us the most recent
6+
date available for each operator (schedule_gtfs_dataset_name).
7+
"""
8+
import pandas as pd
9+
import pyaml # use pyaml because it gets us prettier indents than yaml
10+
11+
from pathlib import Path
12+
from typing import Union
13+
14+
from shared_utils import rt_dates
15+
from segment_speed_utils import time_series_utils
16+
17+
def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep one row per operator with its latest service_date.

    For each `name`, take the maximum `service_date` found in df.
    The result is ordered by service_date descending, then name
    ascending, gets a fresh 0..n-1 index, and has service_date
    cast to string.
    """
    # One row per name holding the max service_date for that name.
    latest_by_name = (
        df.groupby("name", group_keys=False)["service_date"]
        .max()
        .reset_index()
    )

    # Latest dates first; names alphabetical within the same date.
    ordered = latest_by_name.sort_values(
        by=["service_date", "name"],
        ascending=[False, True],
    ).reset_index(drop=True)

    return ordered.astype({"service_date": "str"})
31+
32+
def export_results_yml(
    df: pd.DataFrame,
    export_yaml: Union[str, Path]
) -> None:
    """
    Save out our results from df as a yaml file.

    Rows whose `name` is in the exclusion list are dropped, then the
    remaining names are grouped into a {service_date: [names]} dictionary
    that is dumped to yaml and written to export_yaml.

    Args:
        df: table with `name` and `service_date` columns.
        export_yaml: path of the yaml file to write; an existing file
            is overwritten.
    """
    # TODO: check this list manually and there will be some
    # operator names that have more recent names that we are keeping,
    # so we can remove these from our yaml
    # NOTE(review): the published yaml still lists "TIME GMV Schedule",
    # so this entry may not match the `name` values exactly -- confirm
    # whether the " Schedule" suffix is needed here.
    exclude_me = [
        "TIME GMV"
    ]

    df2 = df[~df.name.isin(exclude_me)]

    # Keys are inserted in the order dates first appear in df2, so the
    # yaml follows whatever date ordering the incoming df already has.
    my_dict = {
        date_key: df2[df2.service_date == date_key].name.tolist()
        for date_key in df2.service_date.unique()
    }

    # sort_keys=False to prevent alphabetical sort (earliest date first)
    # because we want to maintain our results and yaml with most recent
    # date first
    output = pyaml.dump(my_dict, sort_keys=False)

    # Pin the encoding so the file does not depend on the machine's locale.
    with open(export_yaml, "w", encoding="utf-8") as f:
        f.write(output)

    print(f"{export_yaml} exported")
66+
67+
68+
if __name__ == "__main__":

    from update_vars import (
        GTFS_DATA_DICT,
        COMPILED_CACHED_VIEWS,
        PUBLISHED_OPERATORS_YAML,
    )

    TABLE = GTFS_DATA_DICT.schedule_downloads.trips

    # Every 2024 analysis date followed by every 2023 analysis date.
    all_dates = rt_dates.y2024_dates + rt_dates.y2023_dates

    # Only the operator name column is requested from each date's table.
    names_across_dates = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        TABLE,
        all_dates,
        data_type="df",
        get_pandas=True,
        columns=["name"],
    )

    # Drop repeated rows before reducing to the latest date per operator.
    deduped = names_across_dates.drop_duplicates()
    operators = filter_to_recent_date(deduped)

    export_results_yml(operators, PUBLISHED_OPERATORS_YAML)

0 commit comments

Comments
 (0)