Commit a1e6cda

Merge pull request #303 from cal-itp/more-metadata

More metadata

2 parents b8ec7b2 + 9a35637

13 files changed (+104, -146 lines)

open_data/README.md

+4 -1
@@ -38,4 +38,7 @@
 
 ## Open Data Portal Datasets
 1. [High Quality Transit Areas (HQTA)](./hqta.py)
-1. [Transit Stops and Routes (Traffic Ops request)](./traffic_ops.py)
+1. [Transit Stops and Routes (Traffic Ops request)](./traffic_ops.py)
+
+## Open Data Intake Process
+Open a ticket on the Intranet to update or add new services and provide [justification](./intake_justification.md)

open_data/arcgis_script.py

+17 -40
@@ -14,7 +14,6 @@
 # Save a version of script that runs within ArcGIS
 import arcpy
 import os
-import zipfile
 
 #arcpy.env.workspace = "C:\Users\s153936\Documents\ArcGIS"
 arcpy.env.workspace = ARCGIS_PATH
@@ -35,14 +34,14 @@
 
 # Export metadata using FGDC
 translator = directory + 'Metadata\Translator\ArcGIS2FGDC.xml'
-#translator = directory + 'Metadata\Translator\ArcGIS2ISO19139.xml'
-
 
+## (1) Convert shapefile layer to gdb feature class
 for f in in_features:
     # construct the filename, which takes the form of routes_assembled/routes_assembled.shp
     shp_file_name = f + '/' + f + '.shp'
 
-    # Need this try/except because arcpy won't let you overwrite, and will error if it finds it
+    # Need this try/except because arcpy won't let you overwrite,
+    # and will error if it finds it
     try:
         arcpy.management.Delete(staging_location + '/' + f)
     except:
@@ -59,7 +58,9 @@
         print(field.name)
 
 
-# Rename fields
+## (2) Rename fields where needed
+# Do this once it's a feature class, so we can preserve the new column names
+# before metadata is created
 need_renaming = [
     'ca_hq_transit_areas',
     'ca_hq_transit_stops',
@@ -92,6 +93,7 @@
             field.name, RENAME_CA_HQTA[field.name])
 
 
+## (3) Export metadata associated with file gdb feature class in FGDC format
 for f in in_features:
     # Construct XML filename
     # Spit out one that's before it's manually changed
@@ -103,10 +105,12 @@
 
 
 
-### UPDATE XML METADATA SEPARATELY IN PYTHON OUTSIDE OF ARCGIS
+### (4) UPDATE XML METADATA SEPARATELY IN PYTHON OUTSIDE OF ARCGIS
 
-# Run this after putting the updated XML in the file gdb
-# Clean up the open_data file gdb
+## (5) Copy the feature class from staging location to out location
+# In the out location, we can drop the new XML and use it to sync
+# Use staging location and out location because otherwise, arcpy errors when it detects
+# another XML when you try and update the layer in a subsequent update
 for f in in_features:
     # Delete the feature class in this gdb, because we don't want _1 appended to end
     try:
@@ -119,7 +123,7 @@
         out_location + '/',
         f)
 
-
+## (6) Sync the XML with the feature class
 for f in in_features:
     # This is the one after it's manually changed. Keep separate to see what works.
     updated_xml_file = out_location + '/' + f + '.xml'
@@ -131,38 +135,11 @@
         "ENABLED")
 
 
-# Clean up XML file in staging.gdb
-# If not, next time, it will error because it can't output an XML file when one is present (no overwriting)
+## (7) Clean up XML file in staging.gdb
+# If not, next time, it will error because it can't output an XML file
+# when one is present (no overwriting)
 for f in in_features:
     try:
         os.remove(xml_file)
     except:
-        pass
-
-# Compress file gdb for sending -- nope, this doesn't create a zipped file
-#arcpy.CompressFileGeodatabaseData_management(out_location, "Lossless compression")
-
-#https://community.esri.com/t5/python-questions/zip-a-file-geodatabase-using-arcpy-or-zipfile/td-p/388286
-#Creates the empty zip file and opens it for writing
-
-# This isn't working. Can't open the file gdb as directory in ArcGIS,
-# though outside of ArcGIS, it seems like it's finding the folder
-def zip_gdb(input_gdb):
-    gdb_file = str(input_gdb)
-    out_file = gdb_file + '.zip'
-    gdb_name = os.path.basename(gdb_file)
-
-    with zipfile.ZipFile(out_file, mode='w',
-                         compression=zipfile.ZIP_DEFLATED,
-                         allowZip64=True) as myzip:
-        for f in os.listdir(gdb_file):
-            if f[-5:] != '.lock':
-                myzip.write(os.path.join(gdb_file, f), gdb_name + '/' + os.path.basename(f))
-
-    print('Completed zipping: {}'.format(gdb_file))
-
-zip_gdb(out_location)
-
-
-out_file = out_location + '.zip'
-shutil.make_archive(out_file, 'zip', out_location)
+        pass

open_data/hqta.py

+5 -10
@@ -8,17 +8,11 @@
 ]
 
 PURPOSE = ('''
-Estimated High Quality Transit Areas as described in
-Public Resources Code 21155, 21064.3, 21060.2.
+Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.
 '''
 )
 
-METHODOLOGY = ('''
-This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of “bus rapid transit” in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables.
-
-Notes: Null values may be present. The "hqta_details" columns defines which part of the Public Resources Code definition the HQTA classification was based on. If "hqta_details" references a single operator, then "itp_id_secondary" and "agency_secondary" are null. If "hqta_details" references the same operator, then "itp_id_secondary" and "agency_secondary" are the same as "itp_id_primary" and "agency_primary".
-'''
-)
+METHODOLOGY = "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `itp_id_secondary` and `agency_secondary` are null. If `hqta_details` references the same operator, then `itp_id_secondary` and `agency_secondary` are the same as `itp_id_primary` and `agency_primary`."
 
 HQTA_TRANSIT_AREAS_DICT = {
     "dataset_name": "ca_hq_transit_areas",
2216

2317
HQTA_TRANSIT_AREAS_DICT = {
2418
"dataset_name": "ca_hq_transit_areas",
@@ -43,8 +37,9 @@
 
     "contact_organization": "Caltrans",
     "contact_person": "Eric Dasmalchi",
-    "contact_email": "[email protected]"
-
+    "contact_email": "[email protected]",
+
+    "horiz_accuracy": "0.00004 decimal degrees",
 }
 
 # Use same data dictionary with tiny modifications
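
The METHODOLOGY string above spells out the corridor test in prose: split each corridor into 1,500 meter segments, take the busiest stop in each segment, and require at least 4 trips per hour for at least one hour in the morning and again in the afternoon. A minimal sketch of that test follows. It is not the repository's HQTA pipeline; the input frame, the column names (`stop_id`, `hour`, `n_trips`), and the AM/PM hour windows are illustrative assumptions only.

```python
import pandas as pd

def is_hq_corridor_segment(stop_trips_by_hour: pd.DataFrame) -> bool:
    """Rows = (stop_id, hour, n_trips) for the stops falling in one 1,500 m segment."""
    if stop_trips_by_hour.empty:
        # Segments without a stop are not considered high-quality corridors.
        return False
    # Busiest stop in the segment = stop with the most total trips.
    busiest = stop_trips_by_hour.groupby("stop_id")["n_trips"].sum().idxmax()
    hours = stop_trips_by_hour[stop_trips_by_hour["stop_id"] == busiest]
    am = hours[(hours["hour"] >= 6) & (hours["hour"] < 12)]   # assumed AM window
    pm = hours[(hours["hour"] >= 12) & (hours["hour"] < 19)]  # assumed PM window
    # At least 4 trips/hour for at least one hour in the AM, and again in the PM.
    return bool((am["n_trips"] >= 4).any() and (pm["n_trips"] >= 4).any())

# A stop with 5 trips in the 8 o'clock hour and 4 trips in the 17 o'clock hour qualifies.
demo = pd.DataFrame({"stop_id": ["A", "A"], "hour": [8, 17], "n_trips": [5, 4]})
print(is_hq_corridor_segment(demo))  # True
```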

open_data/intake_justification.md

+20
@@ -0,0 +1,20 @@
+# Open Data Intake
+
+Document links and justification for why datasets need to be made public. Submit this information every time a ticket is opened to update the existing layers.
+
+## Intranet Links
+1. [Geospatial open data request](https://sv03tmcpo.ct.dot.ca.gov/portal/apps/sites/#/geep/pages/open-data-request)
+1. [Open support ticket for GIS](https://sv03tmcpo.ct.dot.ca.gov/portal/apps/sites/#/geep/pages/support-request)
+
+### High Quality Transit Areas
+
+High Quality Transit Areas, as described in Public Resources Code 21155, 21064.3, 21060.2, rely on the intersection of frequent transit service. These are subject to the transit schedules available in the General Transit Feed Specification (GTFS). The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse.
+
+Capturing where these HQTAs are is one expected spatial product from GTFS data. Given that GTFS data is always being updated by operators, whether it is to increase or reduce service,
+this HQTA dataset also reflects the most recent boundaries given operators' latest scheduled trips.
+
+### Transit Routes / Stops
+
+The General Transit Feed Specification (GTFS) provides transit schedules, including transit route and stop information, in text files. The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse.
+
+This dataset compiles all the route and stop information for all CA transit operators and provides it in a geospatial format. Transit routes are shown with their line geometry, and transit stops are shown with their point geometry. Given that GTFS data is always being updated by operators, whether it is to increase or reduce service, the CA transit routes / transit stops datasets provide monthly snapshots of operators' latest schedules.

open_data/metadata_update.py

+6 -3
@@ -95,7 +95,8 @@ class metadata_input(BaseModel):
     data_dict_url: str
     contact_organization: str = "Caltrans"
     contact_person: str
-    contact_email: str = "[email protected]"
+    contact_email: str = "[email protected]"
+    horiz_accuracy: str = "0.00004 decimal degrees"
 
 
 def fix_values_in_validated_dict(d):
def fix_values_in_validated_dict(d):
@@ -110,6 +111,8 @@ def fix_values_in_validated_dict(d):
     d["beginning_date"] = validation.check_dates(d["beginning_date"])
     d["end_date"] = validation.check_dates(d["end_date"])
 
+    d["horiz_accuracy"] = validation.check_horiz_accuracy(d["horiz_accuracy"])
+
     return d
 

@@ -139,12 +142,12 @@ def overwrite_metadata_json(metadata_json, DATASET_INFO):
 
     m["idinfo"]["keywords"] = d["theme_topics"]
 
-    # Add resource contact
     m["idinfo"]["ptcontac"]["cntinfo"]["cntorgp"]["cntorg"] = d["contact_organization"]
     m["idinfo"]["ptcontac"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]
     m["idinfo"]["ptcontac"]["cntinfo"]["cntpos"] = d["publish_entity"]
     m["idinfo"]["ptcontac"]["cntinfo"]["cntemail"] = d["contact_email"]
 
+    m["dataqual"]["posacc"]["horizpa"]["horizpar"] = d["horiz_accuracy"]
     m["dataqual"]["lineage"]["procstep"]["procdesc"] = d["methodology"]
 
     m["eainfo"]["detailed"]["enttyp"]["enttypl"] = d["dataset_name"]
@@ -155,7 +158,7 @@ def overwrite_metadata_json(metadata_json, DATASET_INFO):
     m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]
     m["metainfo"]["metc"]["cntinfo"]["cntpos"] = d["publish_entity"]
     m["metainfo"]["metc"]["cntinfo"]["cntemail"] = d["contact_email"]
-
+
     return new_metadata
 
 
open_data/metadata_xml/ca_hq_transit_areas.xml

+7 -7
@@ -14,8 +14,7 @@
     <descript>
       <abstract>Public. EPSG: 4326</abstract>
       <purpose>
-Estimated High Quality Transit Areas as described in
-Public Resources Code 21155, 21064.3, 21060.2.
+Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.
       </purpose>
     </descript>
     <timeperd>
@@ -57,13 +56,14 @@
     <native>Version 6.2 (Build 9200) ; Esri ArcGIS 10.8.1.14362</native>
   </idinfo>
   <dataqual>
+    <posacc>
+      <horizpa>
+        <horizpar>0.00004 decimal degrees</horizpar>
+      </horizpa>
+    </posacc>
     <lineage>
       <procstep>
-        <procdesc>
-This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of “bus rapid transit” in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables.
-
-Notes: Null values may be present. The "hqta_details" columns defines which part of the Public Resources Code definition the HQTA classification was based on. If "hqta_details" references a single operator, then "itp_id_secondary" and "agency_secondary" are null. If "hqta_details" references the same operator, then "itp_id_secondary" and "agency_secondary" are the same as "itp_id_primary" and "agency_primary".
-        </procdesc>
+        <procdesc>This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `itp_id_secondary` and `agency_secondary` are null. If `hqta_details` references the same operator, then `itp_id_secondary` and `agency_secondary` are the same as `itp_id_primary` and `agency_primary`.</procdesc>
       </procstep>
     </lineage>
   </dataqual>
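
The new `<posacc><horizpa><horizpar>` block mirrors the `m["dataqual"]["posacc"]["horizpa"]["horizpar"]` path written in metadata_update.py. As a quick illustration (standard library only, not the project's tooling), the element can be read back from the FGDC XML like this:

```python
import xml.etree.ElementTree as ET

snippet = """
<metadata>
  <dataqual>
    <posacc>
      <horizpa>
        <horizpar>0.00004 decimal degrees</horizpar>
      </horizpa>
    </posacc>
  </dataqual>
</metadata>
"""

root = ET.fromstring(snippet)
# Same nesting the script writes: dataqual -> posacc -> horizpa -> horizpar
print(root.findtext("./dataqual/posacc/horizpa/horizpar"))  # 0.00004 decimal degrees
```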
