Commit 01f2382

tiffanychu90 committed: update refs for speeds

1 parent cd065ff
5 files changed: +28 -27 lines changed

gtfs_digest/merge_data.py (+6 -16)

@@ -8,7 +8,7 @@
 
 from calitp_data_analysis import utils
 from segment_speed_utils import gtfs_schedule_wrangling, time_series_utils
-from shared_utils import gtfs_utils_v2
+from shared_utils import gtfs_utils_v2, publish_utils
 from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS, SCHED_GCS
 
 route_time_cols = ["schedule_gtfs_dataset_key",
@@ -222,19 +222,6 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
     return df3
 
 
-def exclude_private_datasets(
-    df: pd.DataFrame,
-    col: str = "schedule_gtfs_dataset_key",
-    public_gtfs_dataset_keys: list = [],
-) -> pd.DataFrame:
-    """
-    Filter out private datasets.
-    """
-    return df[
-        df[col].isin(public_gtfs_dataset_keys)
-    ].reset_index(drop=True)
-
-
 if __name__ == "__main__":
 
     from shared_utils import rt_dates
@@ -298,7 +285,8 @@ def exclude_private_datasets(
         gtfs_schedule_wrangling.top_cardinal_direction
     ).pipe(
         # Drop any private datasets before exporting
-        exclude_private_datasets, public_gtfs_dataset_keys= public_feeds
+        publish_utils.exclude_private_datasets,
+        public_gtfs_dataset_keys= public_feeds
     )
 
     integrify = [
@@ -326,7 +314,9 @@ def exclude_private_datasets(
         primary_typology,
         on = route_time_cols,
         how = "left"
-    ).pipe(exclude_private_datasets, public_gtfs_dataset_keys= public_feeds)
+    ).pipe(
+        publish_utils.exclude_private_datasets,
+        public_gtfs_dataset_keys= public_feeds)
 
     utils.geoparquet_gcs_export(
         segment_speeds2,
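
The helper deleted above now lives in shared_utils. A minimal sketch of the relocated shared_utils/publish_utils.py, assuming it kept the exact signature and body removed from merge_data.py in this diff:

    # shared_utils/publish_utils.py -- sketch; mirrors the body removed above
    import pandas as pd

    def exclude_private_datasets(
        df: pd.DataFrame,
        col: str = "schedule_gtfs_dataset_key",
        public_gtfs_dataset_keys: list = [],
    ) -> pd.DataFrame:
        """
        Filter out private datasets, keeping only rows whose
        identifier column appears in the list of public keys.
        """
        return df[
            df[col].isin(public_gtfs_dataset_keys)
        ].reset_index(drop=True)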

gtfs_digest/merge_operator_data.py (+5 -4)

@@ -8,7 +8,8 @@
 
 from calitp_data_analysis import utils
 from segment_speed_utils import time_series_utils
-from merge_data import merge_in_standardized_route_names, exclude_private_datasets
+from shared_utils import publish_utils
+from merge_data import merge_in_standardized_route_names
 from update_vars import GTFS_DATA_DICT, SCHED_GCS, RT_SCHED_GCS
 
 sort_cols = ["schedule_gtfs_dataset_key", "service_date"]
@@ -154,7 +155,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:
     # Drop duplicates created after merging
     op_profiles_df2 = (op_profiles_df1
         .pipe(
-            exclude_private_datasets,
+            publish_utils.exclude_private_datasets,
             col = "schedule_gtfs_dataset_key",
             public_gtfs_dataset_keys = public_feeds
         ).drop_duplicates(subset = list(op_profiles_df1.columns))
@@ -169,7 +170,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:
     ).pipe(
         merge_in_standardized_route_names
     ).pipe(
-        exclude_private_datasets,
+        publish_utils.exclude_private_datasets,
         col = "schedule_gtfs_dataset_key",
         public_gtfs_dataset_keys = public_feeds
     )
@@ -181,7 +182,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:
     )
 
     operator_category_counts = operator_category_counts_by_date().pipe(
-        exclude_private_datasets,
+        publish_utils.exclude_private_datasets,
         col = "schedule_gtfs_dataset_key",
         public_gtfs_dataset_keys = public_feeds
     )
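
Because the DataFrame is the first positional argument of exclude_private_datasets, every call site keeps the pandas .pipe chaining style: .pipe(f, **kwargs) calls f(df, **kwargs). A minimal self-contained illustration with made-up data, reusing the function body shown above:

    import pandas as pd

    def exclude_private_datasets(
        df, col="schedule_gtfs_dataset_key", public_gtfs_dataset_keys=[]
    ):
        return df[df[col].isin(public_gtfs_dataset_keys)].reset_index(drop=True)

    df = pd.DataFrame({
        "schedule_gtfs_dataset_key": ["abc", "xyz"],
        "n_trips": [10, 20],
    })

    # The privacy filter chains cleanly between other transforms,
    # exactly as in the .pipe(...) calls in the diffs above.
    public_only = df.pipe(
        exclude_private_datasets,
        col="schedule_gtfs_dataset_key",
        public_gtfs_dataset_keys=["abc"],
    )
    # public_only now holds only the "abc" row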

gtfs_digest/merge_operator_service.py (+5 -4)

@@ -7,12 +7,11 @@
 """
 import pandas as pd
 
-from merge_data import exclude_private_datasets
 from segment_speed_utils import (gtfs_schedule_wrangling, helpers,
                                  time_series_utils)
 from segment_speed_utils.project_vars import (
     COMPILED_CACHED_VIEWS, weeks_available)
-from shared_utils import gtfs_utils_v2, rt_dates
+from shared_utils import gtfs_utils_v2, publish_utils, rt_dates
 from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS
 
 
@@ -103,14 +102,16 @@ def total_service_hours_all_months(week_list: list[list]) -> pd.DataFrame:
     and for the months we have a full week's worth of data downloaded.
     As of 5/2024, we have April 2023, October 2023, and April 2024.
     """
-    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(get_df=True)
+    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
+        get_df=True
+    )
     public_feeds = public_datasets.gtfs_dataset_name.unique().tolist()
 
     # Combine everything
     all_df = pd.concat(
         [total_service_hours(one_week) for one_week in week_list]
     ).pipe(
-        exclude_private_datasets,
+        publish_utils.exclude_private_datasets,
         col = "name",
         public_gtfs_dataset_keys = public_feeds
     )
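
One subtlety here: the service-hours table is keyed by feed name, not by schedule_gtfs_dataset_key, hence col = "name". The helper is called in two modes across this commit; the sketch below is inferred purely from the call sites in these diffs (the helper's internals are not shown here, so treat the return types as an assumption):

    from shared_utils import gtfs_utils_v2

    # Default call (publish scripts below): the result is used directly
    # as the value list in a parquet "in" filter, so it is presumably a
    # plain list of schedule_gtfs_dataset_key values.
    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

    # get_df=True (this file): the result is a DataFrame carrying at
    # least a gtfs_dataset_name column, from which feed names are pulled
    # to filter a table keyed by "name" rather than by dataset key.
    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
        get_df=True
    )
    public_names = public_datasets.gtfs_dataset_name.unique().tolist()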

rt_segment_speeds/scripts/publish_open_data.py (+5 -1)

@@ -7,6 +7,7 @@
 from pathlib import Path
 
 from calitp_data_analysis import utils
+from shared_utils import gtfs_utils_v2
 from update_vars import GTFS_DATA_DICT, SEGMENT_GCS
 
 
@@ -16,6 +17,8 @@ def stage_open_data_exports(analysis_date: str):
     export them to a stable GCS URL so we can always
     read it in open_data/catalog.yml.
     """
+    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+
     datasets = [
         GTFS_DATA_DICT.stop_segments.route_dir_single_segment,
         #GTFS_DATA_DICT.speedmap_segments.route_dir_single_segment,
@@ -24,7 +27,8 @@ def stage_open_data_exports(analysis_date: str):
 
     for d in datasets:
         gdf = gpd.read_parquet(
-            f"{SEGMENT_GCS}{d}_{analysis_date}.parquet"
+            f"{SEGMENT_GCS}{d}_{analysis_date}.parquet",
+            filters = [[("schedule_gtfs_dataset_key", "in", public_feeds)]]
        )
 
        utils.geoparquet_gcs_export(
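
The new filters argument is forwarded by geopandas.read_parquet to pyarrow, which expects disjunctive normal form: the outer list ORs groups together, and each inner list ANDs its predicate tuples, so [[("col", "in", values)]] is a single "in" predicate. Rows for private feeds are dropped at read time, before anything is re-exported. A minimal sketch with a hypothetical local file and keys:

    import geopandas as gpd

    public_feeds = ["key_a", "key_b"]  # hypothetical public dataset keys

    # Only rows whose schedule_gtfs_dataset_key is in public_feeds are
    # materialized, so private feeds never reach the public export.
    gdf = gpd.read_parquet(
        "segments.parquet",  # hypothetical path
        filters=[[("schedule_gtfs_dataset_key", "in", public_feeds)]],
    )

publish_public_gcs.py below applies the same read-time filter to its exports.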

rt_segment_speeds/scripts/publish_public_gcs.py (+7 -2)

@@ -8,7 +8,7 @@
 from pathlib import Path
 
 from calitp_data_analysis import utils
-from shared_utils import rt_dates
+from shared_utils import rt_dates, gtfs_utils_v2
 from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, PUBLIC_GCS
 
 if __name__ == "__main__":
@@ -19,11 +19,16 @@
         GTFS_DATA_DICT.speedmap_segments.route_dir_single_segment,
     ]
 
+    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+
     for d in datasets:
 
         start = datetime.datetime.now()
 
-        df = gpd.read_parquet(f"{SEGMENT_GCS}{d}_{analysis_date}.parquet")
+        df = gpd.read_parquet(
+            f"{SEGMENT_GCS}{d}_{analysis_date}.parquet",
+            filters = [[("schedule_gtfs_dataset_key", "in", public_feeds)]]
+        )
 
         utils.geoparquet_gcs_export(
             df,
