Skip to content

Commit e66fdd9

Browse files
authored
Merge pull request #770 from cal-itp/dask-client-mvp
refactor bottlenecks, sjoin and cutting special segments
2 parents abe0252 + 594e410 commit e66fdd9

14 files changed

+349
-252
lines changed

Makefile

+5-1
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,8 @@ install_env:
6464
cd ~/data-analyses/_shared_utils && make setup_env && cd ..
6565
#cd bus_service_increase/ && make setup_bus_service_utils && cd ..
6666
#cd rt_delay/ && make setup_rt_analysis && cd ..
67-
cd rt_segment_speeds && pip install -r requirements.txt && cd ..
67+
cd rt_segment_speeds && pip install -r requirements.txt && cd ..
68+
69+
# Create .egg to upload to dask cloud cluster
70+
egg_modules:
71+
cd ~/data-analyses/rt_segment_speeds && python setup.py bdist_egg && cd ..

_shared_utils/shared_utils/dask_utils.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -96,17 +96,11 @@ def concatenate_list_of_files(
9696
with pandas or geopandas. Use dask.delayed to loop through and
9797
assemble a concatenated pandas or geopandas dataframe.
9898
"""
99-
dfs = []
100-
10199
if file_type == "df":
102-
for f in list_of_filepaths:
103-
indiv_df = delayed(pd.read_parquet)(f)
104-
dfs.append(indiv_df)
100+
dfs = [delayed(pd.read_parquet)(f) for f in list_of_filepaths]
105101

106102
elif file_type == "gdf":
107-
for f in list_of_filepaths:
108-
indiv_df = delayed(gpd.read_parquet)(f)
109-
dfs.append(indiv_df)
103+
dfs = [delayed(gpd.read_parquet)(f) for f in list_of_filepaths]
110104

111105
results = [compute(i)[0] for i in dfs]
112106
full_df = pd.concat(results, axis=0).reset_index(drop=True)

_shared_utils/shared_utils/gtfs_utils_v2.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def get_metrolink_feed_key(selected_date: Union[str, datetime.date], get_df: boo
169169

170170
metrolink_feed = (
171171
tbls.mart_gtfs.fct_daily_schedule_feeds()
172-
>> filter(_.date == selected_date, _.is_future == False)
172+
>> filter(_.date == selected_date)
173173
>> inner_join(_, metrolink_in_airtable, on="gtfs_dataset_key")
174174
>> subset_cols(["feed_key", "name"])
175175
>> collect()

open_data/logs/download_data.log

+15
Original file line numberDiff line numberDiff line change
@@ -6216,3 +6216,18 @@
62166216
2023-05-18 09:06:45.814 | INFO | __main__:<module>:33 - # operators to run: 158
62176217
2023-05-18 09:06:45.815 | INFO | __main__:<module>:37 - *********** Download st data ***********
62186218
2023-05-18 09:09:01.946 | INFO | __main__:<module>:66 - execution time: 0:02:17.484357
6219+
2023-05-31 16:09:26.157 | INFO | __main__:<module>:49 - Analysis date: 2023-04-12
6220+
2023-05-31 16:09:28.746 | INFO | __main__:<module>:56 - # operators to run: 201
6221+
2023-05-31 16:09:28.747 | INFO | __main__:<module>:59 - *********** Download trips data ***********
6222+
2023-05-31 16:18:58.486 | INFO | __main__:<module>:49 - Analysis date: 2023-04-12
6223+
2023-05-31 16:19:00.349 | INFO | __main__:<module>:56 - # operators to run: 201
6224+
2023-05-31 16:19:00.351 | INFO | __main__:<module>:59 - *********** Download trips data ***********
6225+
2023-05-31 16:19:26.201 | INFO | __main__:<module>:87 - execution time: 0:00:27.714166
6226+
2023-05-31 16:21:50.716 | INFO | __main__:<module>:49 - Analysis date: 2023-03-15
6227+
2023-05-31 16:21:52.439 | INFO | __main__:<module>:56 - # operators to run: 202
6228+
2023-05-31 16:21:52.440 | INFO | __main__:<module>:59 - *********** Download trips data ***********
6229+
2023-05-31 16:22:17.726 | INFO | __main__:<module>:87 - execution time: 0:00:27.009313
6230+
2023-05-31 16:24:03.293 | INFO | __main__:<module>:49 - Analysis date: 2023-02-15
6231+
2023-05-31 16:24:04.974 | INFO | __main__:<module>:56 - # operators to run: 196
6232+
2023-05-31 16:24:04.975 | INFO | __main__:<module>:59 - *********** Download trips data ***********
6233+
2023-05-31 16:24:30.056 | INFO | __main__:<module>:87 - execution time: 0:00:26.762197

rt_segment_speeds/logs/cut_stop_segments.log

+4
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,7 @@
3838
2023-05-19 16:59:20.971 | INFO | __main__:<module>:265 - Analysis date: 2023-04-12
3939
2023-05-19 16:59:22.251 | INFO | __main__:<module>:299 - Cut special stop segments: 0:00:01.274668
4040
2023-05-19 18:27:38.239 | INFO | __main__:<module>:319 - Export results: 1:28:15.988022
41+
2023-05-31 14:06:27.162 | INFO | __main__:<module>:268 - Analysis date: 2023-04-12
42+
2023-05-31 14:46:10.347 | INFO | __main__:<module>:313 - Cut special stop segments: 0:39:43.179497
43+
2023-05-31 14:46:12.143 | INFO | __main__:<module>:332 - export results: 0:00:01.795914
44+
2023-05-31 14:46:12.144 | INFO | __main__:<module>:333 - execution time: 0:39:44.975411

rt_segment_speeds/logs/sjoin_vp_segments.log

+10-2
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,17 @@
4949
2023-05-15 14:11:43.758 | INFO | __main__:<module>:280 - Analysis date: 2023-03-15
5050
2023-05-15 16:09:03.504 | INFO | __main__:<module>:313 - attach vp to stop-to-stop segments: 1:57:19.737088
5151
2023-05-15 16:09:03.505 | INFO | __main__:<module>:316 - execution time: 1:57:19.745243
52-
2023-05-19 18:40:22.816 | INFO | __main__:<module>:278 - Analysis date: 2023-04-12
53-
2023-05-19 18:44:49.439 | INFO | __main__:<module>:278 - Analysis date: 2023-04-12
5452
2023-05-19 18:46:21.102 | INFO | __main__:<module>:278 - Analysis date: 2023-04-12
5553
2023-05-19 20:18:23.376 | INFO | __main__:<module>:296 - attach vp to stop-to-stop segments: 1:32:02.273276
5654
2023-05-19 20:19:56.588 | INFO | __main__:<module>:306 - compiled partitioned parquets: 0:01:33.211730
5755
2023-05-19 20:19:56.589 | INFO | __main__:<module>:307 - execution time: 1:33:35.485006
56+
2023-05-26 17:58:19.393 | INFO | __main__:<module>:111 - Analysis date: 2023-04-12
57+
2023-05-26 18:01:16.079 | INFO | __main__:<module>:176 - delayed results: 0:02:56.684602
58+
2023-05-26 19:46:57.454 | INFO | __main__:<module>:182 - stack arrays: 1:45:41.374605
59+
2023-05-26 19:47:05.557 | INFO | __main__:<module>:191 - execution time: 1:48:46.162546
60+
2023-05-26 21:24:22.480 | INFO | __main__:<module>:111 - Analysis date: 2023-04-12
61+
2023-05-26 23:14:16.971 | INFO | __main__:<module>:202 - execution time: 1:49:54.490501
62+
2023-05-30 11:09:47.568 | INFO | __main__:<module>:286 - Analysis date: 2023-04-12
63+
2023-05-30 11:58:29.934 | INFO | __main__:<module>:298 - attach vp to stop-to-stop segments: 0:48:42.364791
64+
2023-05-30 12:00:23.165 | INFO | __main__:<module>:308 - compiled parquets: 0:01:53.231698
65+
2023-05-30 12:00:23.166 | INFO | __main__:<module>:309 - execution time: 0:50:35.596489

0 commit comments

Comments
 (0)