Commit d2f526c

Merge pull request #1224 from cal-itp/pems-daytype
Match PEMS stations with SHN postmiles
2 parents aaa8816 + 81ebba0 commit d2f526c

12 files changed: +1,803 −17 lines

.pre-commit-config.yaml (+2 −1)

@@ -12,11 +12,12 @@ repos:
     rev: 6.0.0
     hooks:
       - id: flake8
-        args: ["--ignore=E501,W503,F403,F405,E711,E712,E231,E702"]
+        args: ["--ignore=E501,W503,F403,F405,E711,E712,E231,E702,E203"]
         # E711: comparison to None should be 'if cond is not None:' (siuba filtering requires we use != None and not is not)
         # E712: line too long and line before binary operator (black is ok with these), assign lambda expression OK, comparison to True with is (siuba uses ==)
         # E231: missing whitespace after colon (we don't want white space when setting gs://)
         # E702: multiple statements on one line (semicolon)
+        # E203: whitespace before ':', this rule is in conflict with another formatter's.
         types:
           - python
         files: _shared_utils
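The "other formatter" referenced in the new E203 comment is Black: when a slice bound is an arithmetic expression, Black puts a space on both sides of the colon, which flake8's E203 then flags. A minimal illustration (hypothetical snippet, though the same pattern appears in arcgis_query.py below):

# Black formats slices with arithmetic bounds like this:
sub_list = all_objectids[i : i + block_size]
# Without the E203 ignore, flake8 would report:
#   E203 whitespace before ':'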

_shared_utils/setup.py (+1 −1)

@@ -4,7 +4,7 @@
 setup(
     name="shared_utils",
     packages=find_packages(),
-    version="2.5.1",
+    version="2.6",
     description="Shared utility functions for data analyses",
     author="Cal-ITP",
     license="Apache",

_shared_utils/shared_utils/__init__.py (+2)

@@ -1,4 +1,5 @@
 from . import (
+    arcgis_query,
     catalog_utils,
     dask_utils,
     gtfs_utils_v2,
@@ -10,6 +11,7 @@
 )

 __all__ = [
+    "arcgis_query",
     "catalog_utils",
     "dask_utils",
     "gtfs_utils_v2",
_shared_utils/shared_utils/arcgis_query.py (+156, new file)

@@ -0,0 +1,156 @@
"""
Query beyond the 2,000 rows ESRI gives.

https://gis.stackexchange.com/questions/266897/how-to-get-around-the-1000-objectids-limit-on-arcgis-server
"""
import urllib.parse

import geopandas as gpd
import numpy as np
import pandas as pd
import requests


def query_arcgis_feature_server(url_feature_server=""):
    """
    This function downloads all of the features available on a given ArcGIS
    feature server. The function is written to bypass the limitations imposed
    by the online service, such as only returning up to 1,000 or 2,000 features
    at a time.

    Parameters
    ----------
    url_feature_server : string
        String containing the URL of the service API you want to query. It should
        end in a forward slash and look something like this:
        'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'

    Returns
    -------
    geodata_final : gpd.GeoDataFrame
        This is a GeoDataFrame that contains all of the features from the
        Feature Server. After calling this function, the `geodata_final` object
        can be used to store the data on disk in several different formats
        including, but not limited to, Shapefile (.shp), GeoJSON (.geojson),
        GeoPackage (.gpkg), or PostGIS.
        See https://geopandas.org/en/stable/docs/user_guide/io.html#writing-spatial-data
        for more details.

    """
    if url_feature_server == "":
        geodata_final = gpd.GeoDataFrame()
        return geodata_final

    # Fixing last character in case the URL provided didn't end in a
    # forward slash
    if url_feature_server[-1] != "/":
        url_feature_server = url_feature_server + "/"

    # Getting the layer definitions. This contains important info such as the
    # name of the column used as feature_ids/object_ids, among other things.
    layer_def = requests.get(url_feature_server + "?f=pjson").json()

    # The `objectIdField` is the column name used for the
    # feature_ids/object_ids
    fid_colname = layer_def["objectIdField"]

    # The `maxRecordCount` tells us the maximum number of records this REST
    # API service can return at once. The code below is written such that we
    # perform multiple calls to the API, each one being short enough never to
    # go beyond this limit.
    record_count_max = layer_def["maxRecordCount"]

    # Part of the URL that specifically requests only the object IDs
    url_query_get_ids = f"query?f=geojson&returnIdsOnly=true" f"&where={fid_colname}+is+not+null"

    url_comb = url_feature_server + url_query_get_ids

    # Getting all the object IDs
    service_request = requests.get(url_comb)
    all_objectids = np.sort(service_request.json()["properties"]["objectIds"])

    # This variable will store all the parts of the multiple queries. These
    # parts will, at the end, be concatenated into one large GeoDataFrame.
    geodata_parts = []

    # This part of the query is fixed and never actually changes
    url_query_fixed = "query?f=geojson&outFields=*&where="

    # Identifying the largest query size allowed per request. This will dictate
    # how many queries will need to be made. We start the search at
    # the max record count, but that generates errors sometimes - the query
    # might time out because it's too big. If the test query times out, we try
    # to shrink the query size until the test query goes through without
    # generating a time-out error.
    block_size = min(record_count_max, len(all_objectids))
    worked = False
    while not worked:
        # Moving the "cursors" to their appropriate locations
        id_start = all_objectids[0]
        id_end = all_objectids[block_size - 1]

        readable_query_string = f"{fid_colname}>={id_start} " f"and {fid_colname}<={id_end}"

        url_query_variable = urllib.parse.quote(readable_query_string)

        url_comb = url_feature_server + url_query_fixed + url_query_variable

        url_get = requests.get(url_comb)

        if "error" in url_get.json():
            block_size = int(block_size / 2) + 1
        else:
            geodata_part = gpd.read_file(url_get.text)

            geodata_parts.append(geodata_part.copy())
            worked = True

    # Performing the actual query to the API multiple times. This skips the
    # first few rows/features in the data because those rows were already
    # captured in the query performed in the code chunk above.
    for i in range(block_size, len(all_objectids), block_size):
        # Moving the "cursors" to their appropriate locations and finding the
        # limits of each block
        sub_list = all_objectids[i : i + block_size]
        id_start = sub_list[0]
        id_end = sub_list[-1]

        readable_query_string = f"{fid_colname}>={id_start} " f"and {fid_colname}<={id_end}"

        # Encoding from readable text to URL
        url_query_variable = urllib.parse.quote(readable_query_string)

        # Constructing the full request URL
        url_comb = url_feature_server + url_query_fixed + url_query_variable

        # Actually performing the query and storing its results in a
        # GeoDataFrame
        geodata_part = gpd.read_file(url_comb, driver="GeoJSON")

        # Appending the result to `geodata_parts`
        if geodata_part.shape[0] > 0:
            geodata_parts.append(geodata_part)

    # Concatenating all of the query parts into one large GeoDataFrame
    geodata_final = pd.concat(geodata_parts, ignore_index=True).sort_values(by=fid_colname).reset_index(drop=True)

    # Checking if any object ID is missing
    ids_queried = set(geodata_final[fid_colname])
    for i, this_id in enumerate(all_objectids):
        if this_id not in ids_queried:
            print("WARNING! The following ObjectID is missing from the final " f"GeoDataFrame: ObjectID={this_id}")
            pass

    # Checking if any object ID is included twice
    geodata_temp = geodata_final[[fid_colname]].copy()
    geodata_temp["temp"] = 1
    geodata_temp = geodata_temp.groupby(fid_colname).agg({"temp": "sum"}).reset_index()
    geodata_temp = geodata_temp.loc[geodata_temp["temp"] > 1].copy()
    for i, this_id in enumerate(geodata_temp[fid_colname].values):
        n_times = geodata_temp["temp"].values[i]
        print(
            "WARNING! The following ObjectID is included multiple times in "
            f"the final GeoDataFrame: ObjectID={this_id}\tOccurrences={n_times}"
        )

    return geodata_final
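As a usage sketch: point the function at any public FeatureServer layer URL ending in a trailing slash. The URL below is the postmiles layer used later in shared_data.py; writing to a local GeoJSON file is illustrative only, since the actual pipeline exports to GCS.

from shared_utils.arcgis_query import query_arcgis_feature_server

URL = (
    "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/"
    "CHhighway/SHN_Postmiles_Tenth/FeatureServer/0/"
)

# Downloads every feature, looping over ObjectID ranges so the
# server's maxRecordCount limit is never exceeded.
gdf = query_arcgis_feature_server(URL)

# Illustrative local output; the pipeline itself writes geoparquet to GCS.
gdf.to_file("shn_postmiles.geojson", driver="GeoJSON")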

_shared_utils/shared_utils/shared_data.py (+89 −5)

@@ -3,7 +3,10 @@
 """
 import geopandas as gpd
 import pandas as pd
+import shapely
 from calitp_data_analysis import geography_utils, utils
+from calitp_data_analysis.sql import to_snakecase
+from shared_utils.arcgis_query import query_arcgis_feature_server

 GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"

@@ -47,20 +50,21 @@ def make_county_centroids():
     ca_row2 = pd.DataFrame.from_dict(ca_row, orient="index").T
     gdf2 = gdf.append(ca_row2).reset_index(drop=True)

-    print("County centroids dataset created")
-
     # Save as parquet, because lat/lon held in list, not point geometry anymore
     gdf2.to_parquet(f"{GCS_FILE_PATH}ca_county_centroids.parquet")

     print("County centroids exported to GCS")

+    return
+

 def make_clean_state_highway_network():
     """
     Create State Highway Network dataset.
     """
-    HIGHWAY_URL = "https://opendata.arcgis.com/datasets/" "77f2d7ba94e040a78bfbe36feb6279da_0.geojson"
-    gdf = gpd.read_file(HIGHWAY_URL)
+    URL = "https://opendata.arcgis.com/datasets/" "77f2d7ba94e040a78bfbe36feb6279da_0.geojson"
+
+    gdf = gpd.read_file(URL)

     keep_cols = ["Route", "County", "District", "RouteType", "Direction", "geometry"]

@@ -78,8 +82,88 @@ def make_clean_state_highway_network():
     utils.geoparquet_gcs_export(gdf2, GCS_FILE_PATH, "state_highway_network")


-# Run functions to create these datasets...store in GCS
+def export_shn_postmiles():
+    """
+    Create State Highway Network postmiles dataset.
+    These are points....maybe we can somehow create line segments?
+    """
+    URL = "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/" "CHhighway/SHN_Postmiles_Tenth/" "FeatureServer/0/"
+
+    gdf = query_arcgis_feature_server(URL)
+
+    gdf2 = to_snakecase(gdf).drop(columns="objectid")
+
+    utils.geoparquet_gcs_export(gdf2, GCS_FILE_PATH, "state_highway_network_postmiles")
+
+    return
+
+
+def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
+    """
+    Use the current postmile as the
+    starting geometry / segment beginning
+    and the subsequent postmile (based on odometer)
+    as the ending geometry / segment end.
+
+    Segment goes from current to next postmile.
+    """
+    # Grab the subsequent point geometry
+    # We can drop whenever the last point is missing within
+    # a group. If we have 3 points, we can draw 2 lines.
+    gdf = gdf.assign(end_geometry=(gdf.groupby(group_cols, group_keys=False).geometry.shift(-1))).dropna(
+        subset="end_geometry"
+    )
+
+    # Construct linestring with 2 point coordinates
+    gdf = (
+        gdf.assign(
+            line_geometry=gdf.apply(lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1).set_crs(
+                geography_utils.WGS84
+            )
+        )
+        .drop(columns=["geometry", "end_geometry"])
+        .rename(columns={"line_geometry": "geometry"})
+    )
+
+    return gdf
+
+
+def create_postmile_segments(group_cols: list) -> gpd.GeoDataFrame:
+    """
+    Take the SHN postmiles gdf, group by highway / odometer
+    and convert the points into lines.
+    We'll lose the last postmile for each highway-direction.
+    Segment goes from current postmile point to subseq postmile point.
+    """
+    gdf = gpd.read_parquet(
+        f"{GCS_FILE_PATH}state_highway_network_postmiles.parquet",
+        columns=["route", "direction", "odometer", "geometry"],
+    )
+
+    # If there are duplicates with highway-direction and odometer
+    # (where pm or other column differs slightly),
+    # we'll drop and cut as long of a segment as we can.
+    # There may be differences in postmile (relative to county start)
+    # and odometer (relative to line's origin).
+    gdf2 = (
+        gdf.sort_values(group_cols + ["odometer"])
+        .drop_duplicates(subset=group_cols + ["odometer"])
+        .reset_index(drop=True)
+    )
+
+    gdf3 = draw_line_between_points(gdf2, group_cols)
+
+    utils.geoparquet_gcs_export(gdf3, GCS_FILE_PATH, "state_highway_network_postmile_segments")
+
+    return
+
+
 if __name__ == "__main__":
+    # Run functions to create these datasets...store in GCS
+
     make_county_centroids()

     make_clean_state_highway_network()
+    export_shn_postmiles()
+
+    create_postmile_segments(["route", "direction"])
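To make the point-to-segment step concrete, here is a tiny self-contained sketch with made-up coordinates that mirrors what draw_line_between_points does (all values are hypothetical; it is an illustration, not part of the commit):

import geopandas as gpd
import shapely

# Three hypothetical postmile points along one route/direction.
points = gpd.GeoDataFrame(
    {"route": [5, 5, 5], "direction": ["N", "N", "N"], "odometer": [0.0, 0.1, 0.2]},
    geometry=[shapely.Point(0, 0), shapely.Point(0, 1), shapely.Point(0, 2)],
    crs="EPSG:4326",
)

# Pair each postmile with the next one in its group; the last point drops out.
points = points.assign(
    end_geometry=points.groupby(["route", "direction"]).geometry.shift(-1)
).dropna(subset="end_geometry")

# Connect each pair of points into a two-vertex line segment.
segments = gpd.GeoDataFrame(
    points[["route", "direction", "odometer"]],
    geometry=points.apply(
        lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1
    ),
    crs="EPSG:4326",
)

# `segments` now has 2 rows; each geometry runs from one postmile to the next.
print(segments)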

_shared_utils/shared_utils/shared_data_catalog.yml (+13)

@@ -59,3 +59,16 @@ sources:
     args:
       # source: bus_service_increase/bus_service_utils/generate_calenviroscreen_lehd_data.py
       urlpath: gs://calitp-analytics-data/data-analyses/bus_service_increase/calenviroscreen_lehd_by_tract.parquet
+  state_highway_network_postmiles:
+    driver: geoparquet
+    description: Caltrans State Highway Network postmiles (every 0.1 mile) with postmiles as point geometry.
+    args:
+      # source: https://gisdata-caltrans.opendata.arcgis.com/datasets/c22341fec9c74c6b9488ee4da23dd967_0/about
+      # hitting url directly would limit us to 2,000 rows
+      urlpath: gs://calitp-analytics-data/data-analyses/shared_data/state_highway_network_postmiles.parquet
+  state_highway_network_postmile_segments:
+    driver: geoparquet
+    description: Caltrans State Highway Network postmile segments (postmiles converted to line segments)
+    args:
+      # source: shared_utils/shared_data.py
+      urlpath: gs://calitp-analytics-data/data-analyses/shared_data/state_highway_network_postmile_segments.parquet
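Assuming the catalog is read through intake (with a plugin such as intake-geopandas supplying the geoparquet driver the entries declare), loading the two new sources might look like this; the catalog path is relative to a local checkout and is illustrative:

import intake

# Illustrative path; adjust to wherever the catalog lives in your checkout.
catalog = intake.open_catalog("_shared_utils/shared_utils/shared_data_catalog.yml")

# Point geometry, one postmile every 0.1 mile.
postmiles = catalog.state_highway_network_postmiles.read()

# Line segments drawn between consecutive postmiles.
postmile_segments = catalog.state_highway_network_postmile_segments.read()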
