
Commit aaa8816

Merge pull request #1223 from cal-itp/private-datasets
Exclude private datasets from 6 geoportal datasets, GTFS digest
2 parents 8b5e4cf + 01f2382 commit aaa8816

13 files changed  (+199, -116 lines)

_shared_utils/setup.py  (+1, -1)
@@ -4,7 +4,7 @@
 setup(
     name="shared_utils",
     packages=find_packages(),
-    version="2.5",
+    version="2.5.1",
     description="Shared utility functions for data analyses",
     author="Cal-ITP",
     license="Apache",

_shared_utils/shared_utils/gtfs_utils_v2.py  (+20)
@@ -503,3 +503,23 @@ def get_stop_times(
     )
 
     return stop_times
+
+
+def filter_to_public_schedule_gtfs_dataset_keys(get_df: bool = False) -> list:
+    """
+    Return a list of schedule_gtfs_dataset_keys that have
+    private_dataset == None.
+    private_dataset holds values: True or None, no False.
+    """
+    dim_gtfs_datasets = schedule_rt_utils.filter_dim_gtfs_datasets(
+        keep_cols=["key", "name", "private_dataset"],
+        custom_filtering={
+            "type": ["schedule"],
+        },
+        get_df=True,
+    ) >> filter(_.private_dataset != True)
+
+    if get_df:
+        return dim_gtfs_datasets
+    else:
+        return dim_gtfs_datasets.gtfs_dataset_key.unique().tolist()
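For reference, downstream scripts in this PR call the new helper either for a plain list of keys or for the filtered dataframe; a minimal usage sketch, assuming the shared_utils package is installed and warehouse access is configured:

    from shared_utils import gtfs_utils_v2

    # List of schedule_gtfs_dataset_keys whose feeds are not private
    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

    # Or keep the filtered dataframe, e.g. to filter downstream tables on feed name
    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(get_df=True)
    public_names = public_datasets.gtfs_dataset_name.unique().tolist()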

_shared_utils/shared_utils/publish_utils.py  (+12)
@@ -3,6 +3,7 @@
 from typing import Union
 
 import gcsfs
+import pandas as pd
 
 fs = gcsfs.GCSFileSystem()
 PUBLIC_BUCKET = "gs://calitp-publish-data-analysis/"
@@ -47,3 +48,14 @@ def if_exists_then_delete(filepath: str):
         fs.rm(filepath)
 
     return
+
+
+def exclude_private_datasets(
+    df: pd.DataFrame,
+    col: str = "schedule_gtfs_dataset_key",
+    public_gtfs_dataset_keys: list = [],
+) -> pd.DataFrame:
+    """
+    Filter out private datasets.
+    """
+    return df[df[col].isin(public_gtfs_dataset_keys)].reset_index(drop=True)
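Because the dataframe is the first argument, the new filter drops straight into a pandas pipe chain, which is how the digest scripts below use it; a minimal sketch with made-up keys and columns:

    import pandas as pd

    from shared_utils import publish_utils

    # Toy frame: one public feed and one private feed (keys are illustrative)
    df = pd.DataFrame({
        "schedule_gtfs_dataset_key": ["key_public", "key_private"],
        "n_scheduled_trips": [120, 45],
    })
    public_feeds = ["key_public"]

    public_only = df.pipe(
        publish_utils.exclude_private_datasets,
        col="schedule_gtfs_dataset_key",
        public_gtfs_dataset_keys=public_feeds,
    )
    # Only the key_public row remains, with the index reset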

gtfs_digest/merge_data.py  (+19, -7)
@@ -8,6 +8,7 @@
 
 from calitp_data_analysis import utils
 from segment_speed_utils import gtfs_schedule_wrangling, time_series_utils
+from shared_utils import gtfs_utils_v2, publish_utils
 from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS, SCHED_GCS
 
 route_time_cols = ["schedule_gtfs_dataset_key",
@@ -220,16 +221,21 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
 
     return df3
 
+
 if __name__ == "__main__":
 
     from shared_utils import rt_dates
 
-    analysis_date_list = (rt_dates.y2024_dates + rt_dates.y2023_dates
-    )
+    analysis_date_list = (
+        rt_dates.y2024_dates + rt_dates.y2023_dates
+    )
 
     DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.route_schedule_vp
     DIGEST_SEGMENT_SPEEDS = GTFS_DATA_DICT.digest_tables.route_segment_speeds
 
+    # These are public schedule_gtfs_dataset_keys
+    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+
     # Get cardinal direction for each route
     df_sched = concatenate_schedule_by_route_direction(analysis_date_list)
 
@@ -274,11 +280,15 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
         df_crosswalk,
         on = ["schedule_gtfs_dataset_key", "name", "service_date"],
         how = "left"
+    ).pipe(
+        # Find the most common cardinal direction
+        gtfs_schedule_wrangling.top_cardinal_direction
+    ).pipe(
+        # Drop any private datasets before exporting
+        publish_utils.exclude_private_datasets,
+        public_gtfs_dataset_keys= public_feeds
     )
-
-    # Find the most common cardinal direction
-    df = gtfs_schedule_wrangling.top_cardinal_direction(df)
-
+
     integrify = [
         "n_scheduled_trips", "n_vp_trips",
         "minutes_atleast1_vp", "minutes_atleast2_vp",
@@ -304,7 +314,9 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
         primary_typology,
         on = route_time_cols,
        how = "left"
-    )
+    ).pipe(
+        publish_utils.exclude_private_datasets,
+        public_gtfs_dataset_keys= public_feeds)
 
     utils.geoparquet_gcs_export(
         segment_speeds2,

gtfs_digest/merge_operator_data.py  (+28, -7)
@@ -8,6 +8,7 @@
 
 from calitp_data_analysis import utils
 from segment_speed_utils import time_series_utils
+from shared_utils import publish_utils
 from merge_data import merge_in_standardized_route_names
 from update_vars import GTFS_DATA_DICT, SCHED_GCS, RT_SCHED_GCS
 
@@ -100,6 +101,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:
 
     return operator_category_counts
 
+
 if __name__ == "__main__":
 
     from shared_utils import rt_dates
@@ -111,6 +113,8 @@ def operator_category_counts_by_date() -> pd.DataFrame:
     SCHED_RT_CATEGORY = GTFS_DATA_DICT.digest_tables.operator_sched_rt
     CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
 
+    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+
     # Concat operator profiles
     df = concatenate_operator_stats(analysis_date_list)
 
@@ -141,14 +145,20 @@ def operator_category_counts_by_date() -> pd.DataFrame:
 
     # Merge
     merge_cols = ["schedule_gtfs_dataset_key", "service_date"]
-    op_profiles_df1 = pd.merge(df,
-                               crosswalk_df,
-                               on = merge_cols,
-                               how = "left")
+    op_profiles_df1 = pd.merge(
+        df,
+        crosswalk_df,
+        on = merge_cols,
+        how = "left"
+    )
 
     # Drop duplicates created after merging
     op_profiles_df2 = (op_profiles_df1
-        .drop_duplicates(subset = list(op_profiles_df1.columns))
+        .pipe(
+            publish_utils.exclude_private_datasets,
+            col = "schedule_gtfs_dataset_key",
+            public_gtfs_dataset_keys = public_feeds
+        ).drop_duplicates(subset = list(op_profiles_df1.columns))
         .reset_index(drop = True))
 
     op_profiles_df2.to_parquet(
@@ -157,15 +167,26 @@ def operator_category_counts_by_date() -> pd.DataFrame:
 
     gdf = concatenate_operator_routes(
         analysis_date_list
-    ).pipe(merge_in_standardized_route_names)
+    ).pipe(
+        merge_in_standardized_route_names
+    ).pipe(
+        publish_utils.exclude_private_datasets,
+        col = "schedule_gtfs_dataset_key",
+        public_gtfs_dataset_keys = public_feeds
+    )
 
     utils.geoparquet_gcs_export(
         gdf,
         RT_SCHED_GCS,
         OPERATOR_ROUTE
     )
 
-    operator_category_counts = operator_category_counts_by_date()
+    operator_category_counts = operator_category_counts_by_date().pipe(
+        publish_utils.exclude_private_datasets,
+        col = "schedule_gtfs_dataset_key",
+        public_gtfs_dataset_keys = public_feeds
+    )
+
     operator_category_counts.to_parquet(
         f"{RT_SCHED_GCS}{SCHED_RT_CATEGORY}.parquet"
     )

gtfs_digest/merge_operator_service.py  (+64, -72)
@@ -1,19 +1,20 @@
-import pandas as pd
-import numpy as np
-from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling
-from segment_speed_utils.project_vars import (COMPILED_CACHED_VIEWS, RT_SCHED_GCS, SCHED_GCS)
-
-from shared_utils import catalog_utils, rt_dates
-
-GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
-
 """
 Finding the total number of scheduled service hours for
 an operator across its routes for a full week. The data is
 downloaded every 1/2 a year.
 
 Grain is operator-service_date-route
 """
+import pandas as pd
+
+from segment_speed_utils import (gtfs_schedule_wrangling, helpers,
+                                 time_series_utils)
+from segment_speed_utils.project_vars import (
+    COMPILED_CACHED_VIEWS, weeks_available)
+from shared_utils import gtfs_utils_v2, publish_utils, rt_dates
+from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS
+
+
 def concatenate_trips(
     date_list: list,
 ) -> pd.DataFrame:
@@ -44,29 +45,6 @@ def concatenate_trips(
 
     return df
 
-def get_day_type(date):
-    """
-    Function to return the day type (e.g., Monday, Tuesday, etc.) from a datetime object.
-    """
-    days_of_week = ["Monday",
-                    "Tuesday",
-                    "Wednesday",
-                    "Thursday",
-                    "Friday",
-                    "Saturday",
-                    "Sunday"]
-    return days_of_week[date.weekday()]
-
-def weekday_or_weekend(row):
-    """
-    Tag if a day is a weekday or Saturday/Sunday
-    """
-    if row.day_type == "Sunday":
-        return "Sunday"
-    if row.day_type == "Saturday":
-        return "Saturday"
-    else:
-        return "Weekday"
 
 def total_service_hours(date_list: list) -> pd.DataFrame:
     """
@@ -76,67 +54,81 @@ def total_service_hours(date_list: list) -> pd.DataFrame:
     # Combine all the days' data for a week.
     df = concatenate_trips(date_list)
 
-    # Find day type aka Monday, Tuesday, Wednesday based on service date.
-    df['day_type'] = df['service_date'].apply(get_day_type)
-
-    # Tag if the day is a weekday, Saturday, or Sunday.
-    df["weekday_weekend"] = df.apply(weekday_or_weekend, axis=1)
+    WEEKDAY_DICT = {
+        **{k: "Weekday" for k in ["Monday", "Tuesday", "Wednesday",
+                                  "Thursday", "Friday"]},
+        "Saturday": "Saturday",
+        "Sunday": "Sunday"
+    }
 
-    # df = gtfs_schedule_wrangling.add_weekday_weekend_column(df)
-
-    # Find the minimum departure hour.
-    df["departure_hour"] = df.trip_first_departure_datetime_pacific.dt.hour
+    # Find day type (Monday, Tuesday, etc), departure hour, month_year, and weekday_weekend
+    df = df.assign(
+        day_type = df.service_date.dt.day_name(),
+        departure_hour = df.trip_first_departure_datetime_pacific.dt.hour.astype("Int64"),
+        # get month_year that's 2024-04 for Apr2024 format
+        month_year = (df.service_date.dt.year.astype(str) +
+                      "-" + df.service_date.dt.month.astype(str).str.zfill(2)),
+    ).pipe(
+        gtfs_schedule_wrangling.add_weekday_weekend_column, WEEKDAY_DICT
+    )
 
-    # Delete out the specific day, leave only month & year.
-    df["month"] = df.service_date.astype(str).str.slice(stop=7)
 
-    # Total up service hours by weekday, Sunday, and Saturday.
+    # Total up hourly service hours by weekday, Sunday, and Saturday.
     df2 = (
         df.groupby(["name",
-                    "month",
+                    "month_year",
                    "weekday_weekend",
                    "departure_hour"])
-        .agg(
-            {
-                "service_hours": "sum",
-            }
-        )
+        .agg({"service_hours": "sum"})
        .reset_index()
    )
 
-    # For weekday hours, divide by 5.
-    df2["weekday_service_hours"] = df2.service_hours/5
+    # weekday hours should be divided by 5, while keeping sat/sun intact
+    df2 = df2.assign(
+        daily_service_hours = df2.apply(
+            lambda x: round(x.service_hours / 5, 2)
+            if x.weekday_weekend=="Weekday"
+            else round(x.service_hours, 2), axis=1
+        ),
+        service_hours = df2.service_hours.round(2),
+    )
 
-    # Rename projects.
-    df2 = df2.rename(columns = {'service_hours':'weekend_service_hours'})
     return df2
 
-def total_service_hours_all_months() -> pd.DataFrame:
+
+def total_service_hours_all_months(week_list: list[list]) -> pd.DataFrame:
     """
     Find service hours for a full week for one operator
     and for the months we have a full week's worth of data downloaded.
-    As of 5/2024, we have April 2023 and October 2023.
-    """
-    # Grab the dataframes with a full week's worth of data.
-    apr_23week = rt_dates.get_week(month="apr2023", exclude_wed=False)
-    oct_23week = rt_dates.get_week(month="oct2023", exclude_wed=False)
-    apr_24week = rt_dates.get_week(month="apr2024", exclude_wed=False)
-
-    # Sum up total service_hours
-    apr_23df = total_service_hours(apr_23week)
-    oct_23df = total_service_hours(oct_23week)
-    apr_24df = total_service_hours(apr_24week)
+    As of 5/2024, we have April 2023, October 2023, and April 2024.
+    """
+    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
+        get_df=True
+    )
+    public_feeds = public_datasets.gtfs_dataset_name.unique().tolist()
 
     # Combine everything
-    all_df = pd.concat([apr_23df, oct_23df, apr_24df])
-
+    all_df = pd.concat(
+        [total_service_hours(one_week) for one_week in week_list]
+    ).pipe(
+        publish_utils.exclude_private_datasets,
+        col = "name",
+        public_gtfs_dataset_keys = public_feeds
+    )
+
     return all_df
 
 
 if __name__ == "__main__":
 
-    # Save service hours.
-    SERVICE_EXPORT = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"
-    service_hours = total_service_hours_all_months()
-    service_hours.to_parquet(SERVICE_EXPORT)
+    print(f"Aggregating for dates: {weeks_available}")
+
+    # Save service hours
+    SERVICE_EXPORT = GTFS_DATA_DICT.digest_tables.scheduled_service_hours
+
+    service_hours = total_service_hours_all_months(weeks_available)
+
+    service_hours.to_parquet(
+        f"{RT_SCHED_GCS}{SERVICE_EXPORT}.parquet"
+    )
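The dropped get_day_type / weekday_or_weekend helpers are replaced with vectorized datetime accessors plus gtfs_schedule_wrangling.add_weekday_weekend_column and the WEEKDAY_DICT mapping above. A self-contained sketch of the equivalent day-type logic, using a plain .map in place of the project helper and made-up dates:

    import pandas as pd

    WEEKDAY_DICT = {
        **{k: "Weekday" for k in ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]},
        "Saturday": "Saturday",
        "Sunday": "Sunday",
    }

    df = pd.DataFrame(
        {"service_date": pd.to_datetime(["2024-04-15", "2024-04-20", "2024-04-21"])}
    )

    df = df.assign(
        day_type=df.service_date.dt.day_name(),  # Monday ... Sunday
        weekday_weekend=df.service_date.dt.day_name().map(WEEKDAY_DICT),
        month_year=(df.service_date.dt.year.astype(str)
                    + "-" + df.service_date.dt.month.astype(str).str.zfill(2)),
    )
    # 2024-04-15 -> Weekday, 2024-04-20 -> Saturday, 2024-04-21 -> Sunday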
