Commit cb5314b

Merge branch 'main' into ah_gtfs
2 parents: dc6df71 + 27991ba

50 files changed (+12529, -28173 lines)

_shared_utils/requirements.txt (+3, -7)

@@ -1,11 +1,7 @@
 -e .
-altair==5.3.0
 altair-transform==0.2.0
-gtfs-segments==2.1.7
-pyairtable==2.2.2
-great_tables==0.14.0
+great_tables==0.16.1
 omegaconf==2.3.0 # better yaml configuration
-polars==0.20.29
-quarto-cli==1.4.554
+polars==1.22.0
+quarto-cli==1.6.40
 quarto==0.1.0
-typing_extensions==4.12.2

_shared_utils/setup.py (+1, -1)

@@ -4,7 +4,7 @@
 setup(
     name="shared_utils",
     packages=find_packages(),
-    version="3.0",
+    version="4.0",
     description="Shared utility functions for data analyses",
     author="Cal-ITP",
     license="Apache",

_shared_utils/shared_utils/dask_utils.py (+29)

@@ -142,6 +142,35 @@ def import_df_func(
     return df


+def import_ddf_func(path, date_list, data_type, **kwargs):
+    """
+    Equivalent to import_df_func, except uses dask to read in the dataframe
+    instead of pandas.
+    Concatenates the various dates.
+    """
+    if data_type == "df":
+        ddf = dd.multi.concat(
+            [
+                dd.read_parquet(f"{path}_{one_date}.parquet", **kwargs).assign(service_date=one_date)
+                for one_date in date_list
+            ],
+            axis=0,
+            ignore_index=True,
+        )
+
+    elif data_type == "gdf":
+        ddf = dd.multi.concat(
+            [
+                dg.read_parquet(f"{path}_{one_date}.parquet", **kwargs).assign(service_date=one_date)
+                for one_date in date_list
+            ],
+            axis=0,
+            ignore_index=True,
+        )
+
+    return ddf
+
+
 def get_ddf(paths, date_list, data_type, get_pandas: bool = False, **kwargs):
     """
     Set up function with little modifications based on
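
A minimal usage sketch of the new import_ddf_func. The function signature and rt_dates.y2024_dates come from this commit; the GCS prefix and column names below are invented placeholders.

```python
# Hedged sketch only: the bucket prefix and columns are placeholders, not real paths.
from shared_utils import dask_utils, rt_dates

# Lazily read one parquet per service date, tag each with service_date,
# and concatenate them into a single dask dataframe.
ddf = dask_utils.import_ddf_func(
    path="gs://example-bucket/speeds/speeds_by_route",  # placeholder prefix
    date_list=rt_dates.y2024_dates,
    data_type="df",
    columns=["route_id", "direction_id", "speed_mph"],  # hypothetical kwargs passed to read_parquet
)
df = ddf.compute()  # materialize with pandas once filtering/aggregation is set up
```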

_shared_utils/shared_utils/gtfs_analytics_data.yml (+5, -1)

@@ -80,10 +80,14 @@ stop_segments:
   stop_pair_cols: ["stop_pair", "stop_pair_name"]
   route_dir_cols: ["route_id", "direction_id"]
   segment_cols: ["route_id", "direction_id", "stop_pair", "geometry"]
+  segment_timeofday: "rollup_singleday/speeds_route_dir_timeofday_segments"
+  # segment_peakoffpeak
+  # segment_weekday_timeofday
+  # -- cache segment_timeofday first and use this to build other layers? other keys to make peak/offpeak, weekday/weekend grains clear?
   #shape_stop_single_segment: "rollup_singleday/speeds_shape_stop_segments" #-- stop after Oct 2024
   route_dir_single_segment: "rollup_singleday/speeds_route_dir_segments"
   route_dir_single_segment_detail: "rollup_singleday/speeds_route_dir_segments_detail" # interim for speedmaps
-  route_dir_multi_segment: "rollup_multiday/speeds_route_dir_segments"
+  route_dir_multi_segment: "rollup_multiday/speeds_route_dir_segments" # -- this one should be replaced with weekday/weekend, make clear the grain
   segments_file: "segment_options/shape_stop_segments"
   max_speed: ${speed_vars.max_speed}
   route_dir_quarter_segment: "rollup_multiday/quarter_speeds_route_dir_segments"
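
Assuming the catalog is read with omegaconf (pinned above in requirements.txt as "better yaml configuration"), a quick sketch of pulling the new segment_timeofday key; how the repo actually loads this file is not shown in this diff.

```python
# Sketch of reading the new stop_segments.segment_timeofday entry with OmegaConf.
from omegaconf import OmegaConf

catalog = OmegaConf.load("_shared_utils/shared_utils/gtfs_analytics_data.yml")
print(catalog.stop_segments.segment_timeofday)
# rollup_singleday/speeds_route_dir_timeofday_segments
```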

_shared_utils/shared_utils/rt_dates.py (+2, -2)

@@ -80,7 +80,7 @@
     v for k, v in DATES.items() if k.endswith("2023") and not any(substring in k for substring in ["jan", "feb"])
 ]

-y2024_dates = [v for k, v in DATES.items() if k.endswith("2024")]
+y2024_dates = [v for k, v in DATES.items() if k.endswith("2024") and k not in ["oct2024g"]]


 valid_weeks = ["apr2023", "oct2023", "apr2024", "oct2024"]
@@ -96,7 +96,7 @@ def get_week(month: Literal[[*valid_weeks]], exclude_wed: bool) -> list:
 apr2023_week = get_week(month="apr2023", exclude_wed=False)
 oct2023_week = get_week(month="oct2023", exclude_wed=False)
 apr2024_week = get_week(month="apr2024", exclude_wed=False)
-oct2024_week = get_week(month="oct2024", exclude_wed=False)
+oct2024_week = [d for d in get_week(month="oct2024", exclude_wed=False) if d != DATES["oct2024g"]]

 MONTH_DICT = {
     1: "January",
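
A toy illustration of the oct2024_week exclusion above; the dates are invented and get_week is stubbed, so only the filtering pattern matches rt_dates.

```python
# Toy example: drop the DATES["oct2024g"] date from the October 2024 week.
DATES = {"oct2024g": "2024-10-20"}  # invented value for illustration


def get_week(month: str, exclude_wed: bool) -> list:
    # stand-in for rt_dates.get_week
    return ["2024-10-14", "2024-10-15", "2024-10-20"]


oct2024_week = [d for d in get_week(month="oct2024", exclude_wed=False) if d != DATES["oct2024g"]]
print(oct2024_week)  # ['2024-10-14', '2024-10-15']
```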

_shared_utils/shared_utils/rt_utils.py (+5, -3)

@@ -402,7 +402,7 @@ def get_vehicle_positions(ix_df: pd.DataFrame) -> gpd.GeoDataFrame:
         vp_all = gpd.read_parquet(f"{VP_FILE_PATH}vp_{date_str}.parquet")
         org_vp = vp_all >> filter(_.gtfs_dataset_key.isin(ix_df.vehicle_positions_gtfs_dataset_key))
         org_vp = org_vp >> select(-_.location_timestamp, -_.service_date, -_.activity_date)
-        org_vp = org_vp.to_crs(geography_utils.CA_NAD83Albers)
+        org_vp = org_vp.to_crs(geography_utils.CA_NAD83Albers_m)
         utils.geoparquet_gcs_export(org_vp, GCS_FILE_PATH + V2_SUBFOLDER, filename)

     return org_vp
@@ -459,7 +459,9 @@ def get_stops(ix_df: pd.DataFrame) -> gpd.GeoDataFrame:
         org_stops = gpd.read_parquet(path)
     else:
         feed_key_list = list(ix_df.feed_key.unique())
-        org_stops = gtfs_utils_v2.get_stops(service_date, feed_key_list, stop_cols, crs=geography_utils.CA_NAD83Albers)
+        org_stops = gtfs_utils_v2.get_stops(
+            service_date, feed_key_list, stop_cols, crs=geography_utils.CA_NAD83Albers_m
+        )
         utils.geoparquet_gcs_export(org_stops, GCS_FILE_PATH + V2_SUBFOLDER, filename)

     return org_stops
@@ -478,7 +480,7 @@ def get_shapes(ix_df: pd.DataFrame) -> gpd.GeoDataFrame:
     else:
         feed_key_list = list(ix_df.feed_key.unique())
         org_shapes = gtfs_utils_v2.get_shapes(
-            service_date, feed_key_list, crs=geography_utils.CA_NAD83Albers, shape_cols=shape_cols
+            service_date, feed_key_list, crs=geography_utils.CA_NAD83Albers_m, shape_cols=shape_cols
         )
         # invalid geos are nones in new df...
         org_shapes = org_shapes.dropna(subset=["geometry"])
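
For reference, a hedged sketch of the renamed meter-based constant in downstream use; the geography_utils import location, the parquet path, and the exact EPSG code behind CA_NAD83Albers_m are assumptions, not part of this diff.

```python
# Sketch: reproject to the meter-based CRS so lengths/distances come out in meters.
import geopandas as gpd
from calitp_data_analysis import geography_utils  # assumed import location

vp = gpd.read_parquet("gs://example-bucket/vp_2024-10-16.parquet")  # placeholder path
vp_m = vp.to_crs(geography_utils.CA_NAD83Albers_m)

# Any geometry math in this projection is in meters.
vp_m["dist_from_first_m"] = vp_m.geometry.distance(vp_m.geometry.iloc[0])
```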

_shared_utils/shared_utils/shared_data.py (+4, -4)

@@ -19,7 +19,7 @@ def make_county_centroids():
     """
     URL = "https://opendata.arcgis.com/datasets/" "8713ced9b78a4abb97dc130a691a8695_0.geojson"

-    gdf = gpd.read_file(URL).to_crs(geography_utils.CA_StatePlane)
+    gdf = gpd.read_file(URL).to_crs(geography_utils.CA_NAD83Albers_ft)
     gdf.columns = gdf.columns.str.lower()

     gdf = (
@@ -167,7 +167,7 @@ def segment_highway_lines_by_postmile(gdf: gpd.GeoDataFrame):

     # Assign segment geometry and overwrite the postmile geometry column
     gdf2 = (
-        gdf.assign(geometry=gpd.GeoSeries(segment_geom, crs=geography_utils.CA_NAD83Albers))
+        gdf.assign(geometry=gpd.GeoSeries(segment_geom, crs=geography_utils.CA_NAD83Albers_m))
         .drop(columns=drop_cols)
         .set_geometry("geometry")
     )
@@ -205,7 +205,7 @@ def create_postmile_segments(
         .explode("geometry")
         .reset_index(drop=True)
         .pipe(round_odometer_values, ["bodometer", "eodometer"], num_decimals=3)
-        .to_crs(geography_utils.CA_NAD83Albers)
+        .to_crs(geography_utils.CA_NAD83Albers_m)
     )

     # Have a list accompany the geometry
@@ -222,7 +222,7 @@ def create_postmile_segments(
             f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet", columns=group_cols + ["odometer", "geometry"]
         )
         .pipe(round_odometer_values, ["odometer"], num_decimals=3)
-        .to_crs(geography_utils.CA_NAD83Albers)
+        .to_crs(geography_utils.CA_NAD83Albers_m)
     )
     # Round to 3 digits for odometer. When there are more decimal places, it makes our cutoffs iffy
     # when we use this condition below: odometer >= bodometer & odometer <= eodometer

_shared_utils/shared_utils/time_helpers.py (+5)

@@ -38,6 +38,11 @@
     **{k: "weekend" for k in ["Saturday", "Sunday"]},
 }

+WEEKDAY_DICT2 = {
+    **{k: "Weekday" for k in ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]},
+    **{k: k for k in ["Saturday", "Sunday"]},
+}
+

 def time_span_labeling(date_list: list) -> tuple[str]:
     """

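A small sketch of how the new WEEKDAY_DICT2 would typically be applied, mapping day names to Weekday/Saturday/Sunday labels; the sample dataframe is made up.

```python
# Sketch: label service dates as Weekday / Saturday / Sunday with WEEKDAY_DICT2.
import pandas as pd
from shared_utils import time_helpers

df = pd.DataFrame({"service_date": pd.to_datetime(["2024-10-14", "2024-10-19", "2024-10-20"])})
df["day_name"] = df.service_date.dt.day_name()
df["weekday_weekend"] = df.day_name.map(time_helpers.WEEKDAY_DICT2)
# Monday -> "Weekday", Saturday -> "Saturday", Sunday -> "Sunday"
```
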
bus_service_increase/bus_service_utils/create_parallel_corridors.py (+3, -3)

@@ -31,8 +31,8 @@ def process_transit_routes(
     ## Clean transit routes
     df = df.assign(
         route_length = df.to_crs(
-            geography_utils.CA_StatePlane).geometry.length
-    ).to_crs(geography_utils.CA_StatePlane)
+            geography_utils.CA_NAD83Albers_ft).geometry.length
+    ).to_crs(geography_utils.CA_NAD83Albers_ft)

     # Get it down to route_id and pick longest shape
     df2 = (df.sort_values(operator_cols + ["route_id", "route_length"],
@@ -63,7 +63,7 @@ def prep_highway_directions_for_dissolve(
     '''
     df = (gpd.read_parquet("gs://calitp-analytics-data/data-analyses/"
                            "shared_data/state_highway_network.parquet")
-          .to_crs(geography_utils.CA_StatePlane))
+          .to_crs(geography_utils.CA_NAD83Albers_ft))

     # Get dummies for direction
     # Can make data wide instead of long
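
Since route_length is now computed in the foot-based CRS, a hedged sketch of converting it to miles with FEET_PER_MI; the parquet path and geography_utils import location are assumptions.

```python
# Sketch: measure length in the foot-based CRS, then convert feet to miles.
import geopandas as gpd
from calitp_data_analysis import geography_utils  # assumed import location

routes = gpd.read_parquet("gs://example-bucket/transit_routes.parquet")  # placeholder path
routes = routes.assign(
    route_length=routes.to_crs(geography_utils.CA_NAD83Albers_ft).geometry.length
)
routes["route_miles"] = routes.route_length / geography_utils.FEET_PER_MI
```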

bus_service_increase/create_analysis_data.py (+2, -2)

@@ -134,7 +134,7 @@ def get_shapes(selected_date: str) -> gpd.GeoDataFrame:
         selected_date,
         columns = ["shape_array_key", "geometry"],
         get_pandas = True,
-        crs = geography_utils.CA_NAD83Albers
+        crs = geography_utils.CA_NAD83Albers_m
     ).pipe(
         helpers.remove_shapes_outside_ca
     ).merge(
@@ -151,7 +151,7 @@ def get_shapes(selected_date: str) -> gpd.GeoDataFrame:


 def dissolve_census_tracts(
-    crs: str = geography_utils.CA_NAD83Albers
+    crs: str = geography_utils.CA_NAD83Albers_m
 ) -> gpd.GeoDataFrame:
     census_tracts = (
         catalog.calenviroscreen_lehd_by_tract.read()

bus_service_increase/highways-existing-transit.ipynb (+1, -1)

@@ -78,7 +78,7 @@
 "plot_df = gdf[\n",
 "    gdf.route_length >= geography_utils.FEET_PER_MI * 0.5\n",
 "    ].assign(\n",
-"    geometry = (gdf.geometry.to_crs(geography_utils.CA_StatePlane)\n",
+"    geometry = (gdf.geometry.to_crs(geography_utils.CA_NAD83Albers_ft)\n",
 "    .buffer(300)\n",
 "    .to_crs(geography_utils.WGS84)\n",
 "    )\n",
