Skip to content

Commit 913470f

Browse files
authored
Merge pull request #1421 from cal-itp/ah_gtfs
GTFS Digest
2 parents 848f35c + 9ccd51a commit 913470f

22 files changed

+7412
-522
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,121 +1,40 @@
1-
directory: ./gtfs_digest/
2-
notebook: ./gtfs_digest/03_report.ipynb
3-
parts:
4-
- chapters:
5-
- caption: 01 - Eureka (a253a8d7acd57657bb98050f37dd6b0f)
6-
params:
7-
District/Key: 01 - Eureka (a253a8d7acd57657bb98050f37dd6b0f)
8-
sections:
9-
- organization_name: City of Arcata
10-
- organization_name: City of Eureka
11-
- organization_name: Humboldt Transit Authority
12-
- caption: 02 - Redding (73c79ccbfd681df300489226a158b9db)
13-
params:
14-
District/Key: 02 - Redding (73c79ccbfd681df300489226a158b9db)
15-
sections:
16-
- organization_name: Tehama County
17-
- organization_name: Susanville Indian Rancheria
18-
- caption: 02 - Redding (91af7482fde58c6261f386b732404e11)
19-
params:
20-
District/Key: 02 - Redding (91af7482fde58c6261f386b732404e11)
21-
sections:
22-
- organization_name: Shasta County
23-
- organization_name: Redding Area Bus Authority
24-
- caption: 03 - Marysville (6fda78099793184fe08dd78945d188c0)
25-
params:
26-
District/Key: 03 - Marysville (6fda78099793184fe08dd78945d188c0)
27-
sections:
28-
- organization_name: North Lake Tahoe Express
29-
- organization_name: Tahoe Truckee Area Regional Transportation
30-
- caption: 03 - Marysville (70c8a8b71c815224299523bf2115924a)
31-
params:
32-
District/Key: 03 - Marysville (70c8a8b71c815224299523bf2115924a)
33-
sections:
34-
- organization_name: Sacramento Regional Transit District
35-
- organization_name: City of Rancho Cordova
36-
- caption: 04 - Oakland (09e16227fc42c4fe90204a9d11581034)
37-
params:
38-
District/Key: 04 - Oakland (09e16227fc42c4fe90204a9d11581034)
39-
sections:
40-
- organization_name: Sonoma County
41-
- organization_name: Cloverdale Transit
42-
- caption: 04 - Oakland (1ebafaca8716652559b2017b6eedc4ef)
43-
params:
44-
District/Key: 04 - Oakland (1ebafaca8716652559b2017b6eedc4ef)
45-
sections:
46-
- organization_name: Solano County Transit
47-
- organization_name: Solano Transportation Authority
48-
- caption: 04 - Oakland (82f30e22dafe8156367297eb9a316c57)
49-
params:
50-
District/Key: 04 - Oakland (82f30e22dafe8156367297eb9a316c57)
51-
sections:
52-
- organization_name: City of Alameda
53-
- organization_name: San Francisco Bay Area Water Emergency Transit Authority
54-
- caption: 04 - Oakland (c2a40ce92e76ec5beb88c40df3cd3a67)
55-
params:
56-
District/Key: 04 - Oakland (c2a40ce92e76ec5beb88c40df3cd3a67)
57-
sections:
58-
- organization_name: City of Menlo Park
59-
- organization_name: Commute.org
60-
- caption: 05 - San Luis Obispo (b34f8d2270968f55f23f80b267df1d5f)
61-
params:
62-
District/Key: 05 - San Luis Obispo (b34f8d2270968f55f23f80b267df1d5f)
63-
sections:
64-
- organization_name: City of Santa Cruz
65-
- organization_name: University of California, Santa Cruz
66-
- caption: 07 - Los Angeles (1770249a5a2e770ca90628434d4934b1)
67-
params:
68-
District/Key: 07 - Los Angeles (1770249a5a2e770ca90628434d4934b1)
69-
sections:
70-
- organization_name: City of Ojai
71-
- organization_name: Ventura County Transportation Commission
72-
- organization_name: Gold Coast Transit District
73-
- organization_name: City of Simi Valley
74-
- organization_name: City of Moorpark
75-
- organization_name: City of Thousand Oaks
76-
- organization_name: City of Camarillo
77-
- caption: 07 - Los Angeles (a37760dde6b9fdcb76b82e57afab7274)
78-
params:
79-
District/Key: 07 - Los Angeles (a37760dde6b9fdcb76b82e57afab7274)
80-
sections:
81-
- organization_name: Greyhound
82-
- organization_name: FlixBus
83-
- caption: 07 - Los Angeles (f74424acf8c41e4c1e9fd42838c4875c)
84-
params:
85-
District/Key: 07 - Los Angeles (f74424acf8c41e4c1e9fd42838c4875c)
86-
sections:
87-
- organization_name: City of Duarte
88-
- organization_name: Foothill Transit
89-
- caption: 07 - Los Angeles / Ventura (1770249a5a2e770ca90628434d4934b1)
90-
params:
91-
District/Key: 07 - Los Angeles / Ventura (1770249a5a2e770ca90628434d4934b1)
92-
sections:
93-
- organization_name: City of Camarillo
94-
- organization_name: Gold Coast Transit District
95-
- organization_name: City of Moorpark
96-
- organization_name: City of Ojai
97-
- organization_name: City of Simi Valley
98-
- organization_name: City of Thousand Oaks
99-
- organization_name: Ventura County Transportation Commission
100-
- caption: 07 - Los Angeles / Ventura (a37760dde6b9fdcb76b82e57afab7274)
101-
params:
102-
District/Key: 07 - Los Angeles / Ventura (a37760dde6b9fdcb76b82e57afab7274)
103-
sections:
104-
- organization_name: Greyhound
105-
- organization_name: FlixBus
106-
- caption: 07 - Los Angeles / Ventura (f74424acf8c41e4c1e9fd42838c4875c)
107-
params:
108-
District/Key: 07 - Los Angeles / Ventura (f74424acf8c41e4c1e9fd42838c4875c)
109-
sections:
110-
- organization_name: City of Duarte
111-
- organization_name: Foothill Transit
112-
- caption: 11 - San Diego (baeeb157e85a901e47b828ef9fe75091)
113-
params:
114-
District/Key: 11 - San Diego (baeeb157e85a901e47b828ef9fe75091)
115-
sections:
116-
- organization_name: San Diego Metropolitan Transit System
117-
- organization_name: Flagship Cruises and Events Inc.
118-
- organization_name: San Diego International Airport
119-
readme: ./gtfs_digest/README.md
120-
title: This file contains the Schedule GTFS Datset Keys with multiple operators associated
121-
with it.
1+
City of Eureka:
2+
- City of Arcata
3+
City of Moorpark:
4+
- City of Camarillo
5+
City of Ojai:
6+
- City of Camarillo
7+
City of Simi Valley:
8+
- City of Camarillo
9+
City of Thousand Oaks:
10+
- City of Camarillo
11+
Commute.org:
12+
- City of Menlo Park
13+
Foothill Transit:
14+
- City of Duarte
15+
Gold Coast Transit District:
16+
- City of Camarillo
17+
Greyhound:
18+
- FlixBus
19+
Humboldt Transit Authority:
20+
- City of Arcata
21+
Sacramento Regional Transit District:
22+
- City of Rancho Cordova
23+
San Diego International Airport:
24+
- Flagship Cruises and Events Inc.
25+
San Diego Metropolitan Transit System:
26+
- Flagship Cruises and Events Inc.
27+
Shasta County:
28+
- Redding Area Bus Authority
29+
Solano Transportation Authority:
30+
- Solano County Transit
31+
Sonoma County:
32+
- Cloverdale Transit
33+
Tahoe Truckee Area Regional Transportation:
34+
- North Lake Tahoe Express
35+
Tehama County:
36+
- Susanville Indian Rancheria
37+
University of California, Santa Cruz:
38+
- City of Santa Cruz
39+
Ventura County Transportation Commission:
40+
- City of Camarillo

_shared_utils/shared_utils/schedule_gtfs_keys_multi_orgs.py

+56-75
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from update_vars import GTFS_DATA_DICT
1212

1313
sys.path.append("../../gtfs_digest/")
14-
14+
import _operators_prep
1515

1616
def count_orgs(df: pd.DataFrame) -> list:
1717
"""
@@ -23,96 +23,77 @@ def count_orgs(df: pd.DataFrame) -> list:
2323
"""
2424
agg1 = (
2525
df.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
26-
.agg({"organization_name": "nunique"})
26+
.agg({"repeated_organization_name": "nunique"})
2727
.reset_index()
2828
)
2929

3030
# Keep only the rows with more than 1 organization_name
31-
agg1 = agg1.loc[agg1.organization_name > 1].reset_index(drop=True)
31+
agg1 = agg1.loc[agg1.repeated_organization_name > 1].reset_index(drop=True)
3232
# Grab schedule_gtfs_dataset_key into a list
3333
multi_org_list = list(agg1.schedule_gtfs_dataset_key.unique())
3434
return multi_org_list
3535

36-
37-
def find_schd_keys_multi_ops() -> pd.DataFrame:
36+
def find_schd_keys_multi_ops() -> dict:
3837
"""
3938
Return a dictionary (repeated organization_name -> [kept organization_name]) for the schedule_gtfs_dataset_keys
4039
that have more than one organization_name that corresponds to it.
4140
This way, we won't include duplicate organizations when publishing
4241
our GTFS products.
4342
"""
44-
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
45-
46-
subset = [
47-
"caltrans_district",
48-
"schedule_gtfs_dataset_key",
49-
"organization_name",
50-
"service_date",
51-
]
52-
53-
sort_cols = [
54-
"caltrans_district",
55-
"service_date",
56-
"schedule_gtfs_dataset_key",
57-
]
58-
59-
schd_vp_df = pd.read_parquet(
60-
schd_vp_url,
61-
filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
62-
columns=subset,
63-
)
64-
65-
# Sort dataframe to keep the row for district/gtfs_key for the most
66-
# current date
67-
schd_vp_df2 = schd_vp_df.dropna(subset="caltrans_district").sort_values(by=sort_cols, ascending=[True, False, True])
68-
schd_vp_df3 = schd_vp_df2.drop_duplicates(
69-
subset=[
70-
"organization_name",
71-
"schedule_gtfs_dataset_key",
72-
"caltrans_district",
73-
]
74-
)
75-
76-
# Aggregate the dataframe to find schedule_gtfs_dataset_keys
77-
# With multiple organization_names.
78-
multi_orgs_list = count_orgs(schd_vp_df3)
79-
80-
# Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
81-
schd_vp_df4 = schd_vp_df3.loc[schd_vp_df3.schedule_gtfs_dataset_key.isin(multi_orgs_list)].reset_index(drop=True)
82-
83-
# Drop duplicates for organization_name
84-
schd_vp_df5 = schd_vp_df4.drop_duplicates(subset=["caltrans_district", "organization_name"]).reset_index(drop=True)
85-
86-
# Aggregate the dataframe to find schedule_gtfs_dataset_keys
87-
# with multiple organization_names once more.
88-
multi_orgs_list2 = count_orgs(schd_vp_df5)
89-
90-
# Filter one last time to only include schedule_gtfs_keys with multiple orgs
91-
schd_vp_df6 = schd_vp_df5.loc[schd_vp_df5.schedule_gtfs_dataset_key.isin(multi_orgs_list2)].reset_index(drop=True)
92-
93-
# Clean
94-
schd_vp_df6 = schd_vp_df6.drop(columns=["service_date"])
95-
schd_vp_df6["combo"] = schd_vp_df6.caltrans_district + " (" + schd_vp_df6.schedule_gtfs_dataset_key + ")"
96-
97-
return schd_vp_df6
9843

44+
# Load in the various dataframes that create the GTFS Digest portfolio site yaml
45+
one_to_many_df, one_to_one_df, final = _operators_prep.operators_schd_vp_rt()
46+
47+
# Subset and clean the dataframes
48+
subset_cols = ["schedule_gtfs_dataset_key", "caltrans_district", "organization_name"]
49+
50+
# This dataframe displays the relationship of 1 schedule dataset key to many
51+
# organization names
52+
one_to_many_df = one_to_many_df[subset_cols]
53+
one_to_many_df = one_to_many_df.rename(
54+
columns={"organization_name": "repeated_organization_name"}
55+
)
56+
57+
# This dataframe displays the relationship of 1 schedule dataset key to 1
58+
# organization name
59+
one_to_one_df = one_to_one_df[subset_cols]
60+
one_to_one_df = one_to_one_df.rename(
61+
columns={"organization_name": "kept_organization_name"}
62+
)
63+
# Merge the two dataframes
64+
m1 = pd.merge(
65+
one_to_one_df,
66+
one_to_many_df,
67+
on=["schedule_gtfs_dataset_key", "caltrans_district"],
68+
)
69+
70+
# Find the schedule_dataset_keys with more than one organization_name
71+
# and filter out any rows that don't meet this criterion.
72+
multiple_organizations_list = count_orgs(m1)
73+
m2 = m1.loc[m1.schedule_gtfs_dataset_key.isin(multiple_organizations_list)]
74+
75+
# Delete the rows that house the organization name we use for the portfolio
76+
m2["kept_name_bool"] = m2.kept_organization_name == m2.repeated_organization_name
77+
m3 = m2.loc[m2.kept_name_bool == False]
78+
79+
# Clean and sort
80+
final_cols = ["kept_organization_name", "repeated_organization_name"]
81+
m3 = m3.sort_values(by=final_cols)[final_cols]
82+
83+
# Turn it into a dictionary
84+
my_dict = m3.set_index("repeated_organization_name").T.to_dict("list")
85+
return my_dict
9986

10087
SITE_YML = "./schedule_gtfs_dataset_key_multi_operator.yml"
10188

10289
if __name__ == "__main__":
103-
df = find_schd_keys_multi_ops()
104-
105-
portfolio_utils.create_portfolio_yaml_chapters_with_sections(
106-
SITE_YML,
107-
df,
108-
chapter_info={
109-
"column": "combo",
110-
"name": "District/Key",
111-
"caption_prefix": "",
112-
"caption_suffix": "",
113-
},
114-
section_info={
115-
"column": "organization_name",
116-
"name": "organization_name",
117-
},
118-
)
90+
my_dict = find_schd_keys_multi_ops()
91+
92+
with open(SITE_YML) as f:
93+
site_yaml_dict = yaml.load(f, yaml.Loader)
94+
95+
output = yaml.dump(my_dict)
96+
97+
with open(SITE_YML, "w") as f:
98+
f.write(output)
99+

0 commit comments

Comments
 (0)