Skip to content

Commit 39ba4b5

Browse files
authored
Merge pull request #3314 from cal-itp/vb-3305-gtfs-schedule-validator-v50-update
Updates to v5.0.0 of the schedule validator
2 parents 87d81b8 + 10b4dc3 commit 39ba4b5

File tree

8 files changed

+193
-3
lines changed

8 files changed

+193
-3
lines changed

Diff for: jobs/gtfs-schedule-validator/Dockerfile

+4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ COPY ./gtfs-validator-4.1.0-cli.jar ${V4_1_VALIDATOR_JAR}
2828
ENV V4_2_VALIDATOR_JAR=/gtfs-validator-4.2.0-cli.jar
2929
COPY ./gtfs-validator-4.2.0-cli.jar ${V4_2_VALIDATOR_JAR}
3030

31+
# v5 from https://github.com/MobilityData/gtfs-validator/releases/download/v5.0.0/gtfs-validator-5.0.0-cli.jar
32+
ENV V5_VALIDATOR_JAR=/gtfs-validator-5.0.0-cli.jar
33+
COPY ./gtfs-validator-5.0.0-cli.jar ${V5_VALIDATOR_JAR}
34+
3135
WORKDIR /app
3236

3337
COPY ./pyproject.toml /app/pyproject.toml

Diff for: jobs/gtfs-schedule-validator/README.md

+33
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,36 @@ available version of the validator. Instead, we use extract dates to determine w
2222
version of the validator was correct to use at the time the data was created. That way,
2323
we don't "punish" older data for not conforming to expectations that changed in the time
2424
since data creation.
25+
26+
## Tips for Upgrading the Schedule Validator Version
27+
If you run into trouble when committing the new validator jar, it is most likely because of the file size limit that the `check-added-large-files` hook enforces in our pre-commit config, which is a relatively low 500 KB. That limit is meant more as an alarm for local development than as an enforcement mechanism.
28+
You can make one commit that adds the jar and temporarily adds a higher file size threshold to the pre-commit config [like this one](https://github.com/cal-itp/data-infra/pull/2893/commits/7d40c81f2f5a2622123d4ac5dbbb064eb35565c6) and then a second commit that removes the threshold modification [like this one](https://github.com/cal-itp/data-infra/pull/2893/commits/1ec4e4a1f30ac95b9c0edffcf1f2b12e53e40733). That'll get the file through.
29+
30+
Remember that you need to rebuild the Docker image and push it to `ghcr.io` before your changes will be reflected in Airflow runs.
31+
32+
You will need to parse the `rules.json` file from the MobilityData validator release into a rule-details seed CSV. Here is a code example for the upgrade to v5:
33+
```
34+
# https://github.com/MobilityData/gtfs-validator/releases/tag/v5.0.0
35+
import json
36+
import pandas as pd
37+
38+
# Replace with your JSON data
39+
with open('rules.json') as f:
40+
data = json.load(f)
41+
result = []
42+
for key in data.keys():
43+
# print(key)
44+
result.append({
45+
'code': data[key]['code'],
46+
'human_readable_description': data[key]['shortSummary'],
47+
'version': 'v5.0.0',
48+
'severity': data[key]['severityLevel']
49+
})
50+
# Create CSV
51+
df = pd.DataFrame(result)
52+
df.to_csv('gtfs_schedule_validator_rule_details_v5_0_0.csv', index=False)
53+
```
54+
55+
Here is a command to test with once you have appropriate GTFS zip files in the test bucket:
56+
57+
`docker-compose run airflow tasks test unzip_and_validate_gtfs_schedule_hourly validate_gtfs_schedule YYYY-MM-DDTHH:MM:SS`
35.4 MB
Binary file not shown.

Diff for: jobs/gtfs-schedule-validator/gtfs_schedule_validator_hourly.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
V4_VALIDATOR_JAR = os.getenv("V4_VALIDATOR_JAR")
4242
V4_1_VALIDATOR_JAR = os.getenv("V4_1_VALIDATOR_JAR")
4343
V4_2_VALIDATOR_JAR = os.getenv("V4_2_VALIDATOR_JAR")
44+
V5_VALIDATOR_JAR = os.getenv("V5_VALIDATOR_JAR")
4445

4546
JAR_DEFAULT = typer.Option(
4647
default=os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY),
@@ -161,9 +162,12 @@ def execute_schedule_validator(
161162
elif extract_ts.date() < pendulum.Date(2024, 1, 20):
162163
versioned_jar_path = V4_1_VALIDATOR_JAR
163164
validator_version = "v4.1.0"
164-
else:
165+
elif extract_ts.date() < pendulum.Date(2024, 3, 27):
165166
versioned_jar_path = V4_2_VALIDATOR_JAR
166167
validator_version = "v4.2.0"
168+
else:
169+
versioned_jar_path = V5_VALIDATOR_JAR
170+
validator_version = "v5.0.0"
167171

168172
assert versioned_jar_path
169173

@@ -181,7 +185,6 @@ def execute_schedule_validator(
181185

182186
report_path = Path(output_dir) / "report.json"
183187
system_errors_path = Path(output_dir) / "system_errors.json"
184-
185188
log(f"executing schedule validator: {' '.join(args)}", pbar=pbar)
186189
subprocess.run(
187190
args,

Diff for: warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ WITH unioned AS (
66
ref('gtfs_schedule_validator_rule_details_v4_0_0'),
77
ref('gtfs_schedule_validator_rule_details_v4_1_0'),
88
ref('gtfs_schedule_validator_rule_details_v4_2_0'),
9+
ref('gtfs_schedule_validator_rule_details_v5_0_0'),
910
],
1011
) }}
1112
),

Diff for: warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml

+5-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,11 @@ models:
9090
where: validation_validator_version = 'v4.1.0'
9191
- dbt_utils.accepted_range:
9292
min_value: "DATE'2024-01-20'"
93+
max_value: "DATE'2024-03-26'"
9394
where: validation_validator_version = 'v4.2.0'
95+
- dbt_utils.accepted_range:
96+
min_value: "DATE'2024-03-27'"
97+
where: validation_validator_version = 'v5.0.0'
9498
- &schedule_feed_key
9599
name: feed_key
96100
tests:
@@ -115,7 +119,7 @@ models:
115119
tests:
116120
- not_null
117121
- accepted_values:
118-
values: ['v2.0.0', 'v3.1.1', 'v4.0.0', 'v4.1.0', 'v4.2.0']
122+
values: ['v2.0.0', 'v3.1.1', 'v4.0.0', 'v4.1.0', 'v4.2.0', 'v5.0.0']
119123
- &schedule_validator_code
120124
name: code
121125
description: |

Diff for: warehouse/seeds/_seeds.yml

+20
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,26 @@ seeds:
151151
tests:
152152
- not_null
153153

154+
- name: gtfs_schedule_validator_rule_details_v5_0_0
155+
description: |
156+
A list of validation codes output by the GTFS Schedule validator, and their severities and descriptions.
157+
This data was parsed from the contents of the rules.json file in the v5.0.0 release of the validator,
158+
sourced from: https://github.com/MobilityData/gtfs-validator/releases/tag/v5.0.0
159+
columns:
160+
- name: code
161+
tests:
162+
- not_null
163+
- unique
164+
- name: human_readable_description
165+
tests:
166+
- not_null
167+
- name: version
168+
tests:
169+
- not_null
170+
- name: severity
171+
tests:
172+
- not_null
173+
154174
- name: _deprecated__ntd_agency_to_organization
155175
description: |
156176
*Deprecated May 2023 in favor of the `organizations.raw_ntd_id` column entered directly in Airtable.*
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
code,human_readable_description,version,severity
2+
attribution_without_role,Attribution with no role.,v5.0.0,WARNING
3+
block_trips_with_overlapping_stop_times,Trips with the same block id have overlapping stop times.,v5.0.0,ERROR
4+
csv_parsing_failed,Parsing of a CSV file failed.,v5.0.0,ERROR
5+
decreasing_or_equal_stop_time_distance,Decreasing or equal `shape_dist_traveled` in `stop_times.txt`.,v5.0.0,ERROR
6+
decreasing_shape_distance,Decreasing `shape_dist_traveled` in `shapes.txt`.,v5.0.0,ERROR
7+
duplicate_fare_media,Two distinct fare media have the same fare media name and type.,v5.0.0,WARNING
8+
duplicate_key,Duplicated entity.,v5.0.0,ERROR
9+
duplicate_route_name,"Two distinct routes have either the same `route_short_name`, the same `route_long_name`, or the same combination of `route_short_name` and `route_long_name`.",v5.0.0,WARNING
10+
duplicated_column,Duplicated column in CSV.,v5.0.0,ERROR
11+
empty_column_name,A column name is empty.,v5.0.0,ERROR
12+
empty_file,A CSV file is empty.,v5.0.0,ERROR
13+
empty_row,A row in the input file has only spaces.,v5.0.0,WARNING
14+
equal_shape_distance_diff_coordinates,Two consecutive points have equal `shape_dist_traveled` and different lat/lon coordinates in `shapes.txt` and the distance between the two points is greater than 1.11m.,v5.0.0,ERROR
15+
equal_shape_distance_diff_coordinates_distance_below_threshold,Two consecutive points have equal `shape_dist_traveled` and different lat/lon coordinates in `shapes.txt` and the distance between the two points is less than 1.11m.,v5.0.0,WARNING
16+
equal_shape_distance_same_coordinates,Two consecutive points have equal `shape_dist_traveled` and the same lat/lon coordinates in `shapes.txt`.,v5.0.0,WARNING
17+
expired_calendar,Dataset should not contain date ranges for services that have already expired.,v5.0.0,WARNING
18+
fare_transfer_rule_duration_limit_type_without_duration_limit,A row from GTFS file `fare_transfer_rules.txt` has a defined `duration_limit_type` field but no `duration_limit` specified.,v5.0.0,ERROR
19+
fare_transfer_rule_duration_limit_without_type,A row from GTFS file `fare_transfer_rules.txt` has a defined `duration_limit` field but no `duration_limit_type` specified.,v5.0.0,ERROR
20+
fare_transfer_rule_invalid_transfer_count,A row from GTFS file `fare_transfer_rules.txt` has a defined `transfer_count` with an invalid value.,v5.0.0,ERROR
21+
fare_transfer_rule_missing_transfer_count,"A row from `fare_transfer_rules.txt` has `from_leg_group_id` equal to `to_leg_group_id`, but has no `transfer_count` specified.",v5.0.0,ERROR
22+
fare_transfer_rule_with_forbidden_transfer_count,"A row from `fare_transfer_rules.txt` has `from_leg_group_id` not equal to `to_leg_group_id`, but has `transfer_count` specified.",v5.0.0,ERROR
23+
fast_travel_between_consecutive_stops,A transit vehicle moves too fast between two consecutive stops.,v5.0.0,WARNING
24+
fast_travel_between_far_stops,A transit vehicle moves too fast between two far stops.,v5.0.0,WARNING
25+
feed_expiration_date30_days,Dataset should cover at least the next 30 days of service.,v5.0.0,WARNING
26+
feed_expiration_date7_days,Dataset should be valid for at least the next 7 days.,v5.0.0,WARNING
27+
feed_info_lang_and_agency_lang_mismatch,Mismatching feed and agency language fields.,v5.0.0,WARNING
28+
foreign_key_violation,Wrong foreign key.,v5.0.0,ERROR
29+
i_o_error,Error in IO operation.,v5.0.0,ERROR
30+
inconsistent_agency_lang,Inconsistent language among agencies.,v5.0.0,WARNING
31+
inconsistent_agency_timezone,Inconsistent Timezone among agencies.,v5.0.0,ERROR
32+
invalid_color,A field contains an invalid color value.,v5.0.0,ERROR
33+
invalid_currency,A field contains a wrong currency code.,v5.0.0,ERROR
34+
invalid_currency_amount,A currency amount field has a value that does not match the format of its corresponding currency code field.,v5.0.0,ERROR
35+
invalid_date,A field cannot be parsed as date.,v5.0.0,ERROR
36+
invalid_email,A field contains a malformed email address.,v5.0.0,ERROR
37+
invalid_float,A field cannot be parsed as a floating point number.,v5.0.0,ERROR
38+
invalid_input_files_in_subfolder,At least 1 GTFS file is in a subfolder.,v5.0.0,ERROR
39+
invalid_integer,A field cannot be parsed as an integer.,v5.0.0,ERROR
40+
invalid_language_code,A field contains a wrong language code.,v5.0.0,ERROR
41+
invalid_phone_number,A field contains a malformed phone number.,v5.0.0,ERROR
42+
invalid_row_length,Invalid csv row length.,v5.0.0,ERROR
43+
invalid_time,A field cannot be parsed as time.,v5.0.0,ERROR
44+
invalid_timezone,A field cannot be parsed as a timezone.,v5.0.0,ERROR
45+
invalid_url,A field contains a malformed URL.,v5.0.0,ERROR
46+
leading_or_trailing_whitespaces,The value in CSV file has leading or trailing whitespaces.,v5.0.0,WARNING
47+
location_with_unexpected_stop_time,A location in `stops.txt` that is not a stop is referenced by some `stop_times.stop_id`.,v5.0.0,ERROR
48+
location_without_parent_station,A location that must have `parent_station` field does not have it.,v5.0.0,ERROR
49+
missing_bike_allowance,Ferry trips should include bike allowance information.,v5.0.0,WARNING
50+
missing_calendar_and_calendar_date_files,Missing GTFS files `calendar.txt` and `calendar_dates.txt`.,v5.0.0,ERROR
51+
missing_feed_contact_email_and_url,Best Practices for `feed_info.txt` suggest providing at least one of `feed_contact_email` and `feed_contact_url`.,v5.0.0,WARNING
52+
missing_feed_info_date,"One of `feed_start_date` or `feed_end_date` is specified, but not both.",v5.0.0,WARNING
53+
missing_level_id,`stops.level_id` is conditionally required.,v5.0.0,ERROR
54+
missing_recommended_column,A recommended column is missing in the input file.,v5.0.0,WARNING
55+
missing_recommended_field,A recommended field is missing.,v5.0.0,WARNING
56+
missing_recommended_file,A recommended file is missing.,v5.0.0,WARNING
57+
missing_required_column,A required column is missing in the input file.,v5.0.0,ERROR
58+
missing_required_field,A required field is missing.,v5.0.0,ERROR
59+
missing_required_file,A required file is missing.,v5.0.0,ERROR
60+
missing_stop_name,"`stops.stop_name` is required for `location_type` equal to `0`, `1`, or `2`.",v5.0.0,ERROR
61+
missing_timepoint_value,`stop_times.timepoint` value is missing for a record.,v5.0.0,WARNING
62+
missing_trip_edge,Missing trip edge `arrival_time` or `departure_time`.,v5.0.0,ERROR
63+
mixed_case_recommended_field,This field has customer-facing text and should use Mixed Case (should contain upper and lower case letters).,v5.0.0,WARNING
64+
more_than_one_entity,More than one row in CSV.,v5.0.0,WARNING
65+
new_line_in_value,New line or carriage return in a value in CSV file.,v5.0.0,ERROR
66+
non_ascii_or_non_printable_char,Non ascii or non printable char in ID field.,v5.0.0,WARNING
67+
number_out_of_range,Out of range value.,v5.0.0,ERROR
68+
overlapping_frequency,Trip frequencies overlap.,v5.0.0,ERROR
69+
pathway_dangling_generic_node,A generic node has only one incident location in a pathway graph.,v5.0.0,WARNING
70+
pathway_loop,A pathway starts and ends at the same location.,v5.0.0,WARNING
71+
pathway_to_platform_with_boarding_areas,A pathway has an endpoint that is a platform which has boarding areas.,v5.0.0,ERROR
72+
pathway_to_wrong_location_type,A pathway has an endpoint that is a station.,v5.0.0,ERROR
73+
pathway_unreachable_location,A location is not reachable at least in one direction: from the entrances or to the exits.,v5.0.0,ERROR
74+
platform_without_parent_station,A platform has no `parent_station` field set.,v5.0.0,INFO
75+
point_near_origin,"A point is too close to origin `(0, 0)`.",v5.0.0,ERROR
76+
point_near_pole,A point is too close to the North or South Pole.,v5.0.0,ERROR
77+
route_both_short_and_long_name_missing,Both `route_short_name` and `route_long_name` are missing for a route.,v5.0.0,ERROR
78+
route_color_contrast,Insufficient route color contrast.,v5.0.0,WARNING
79+
route_long_name_contains_short_name,Long name should not contain short name for a single route.,v5.0.0,WARNING
80+
route_networks_specified_in_more_than_one_file,Indicates that route network identifiers are specified across multiple files.,v5.0.0,ERROR
81+
route_short_name_too_long,Short name of a route is too long (more than 12 characters).,v5.0.0,WARNING
82+
runtime_exception_in_loader_error,RuntimeException while loading GTFS dataset in memory.,v5.0.0,ERROR
83+
runtime_exception_in_validator_error,RuntimeException while validating GTFS archive.,v5.0.0,ERROR
84+
same_name_and_description_for_route,Same name and description for route.,v5.0.0,WARNING
85+
same_name_and_description_for_stop,Same name and description for stop.,v5.0.0,WARNING
86+
same_route_and_agency_url,Same `routes.route_url` and `agency.agency_url`.,v5.0.0,WARNING
87+
same_stop_and_agency_url,Same `stops.stop_url` and `agency.agency_url`.,v5.0.0,WARNING
88+
same_stop_and_route_url,Same `stops.stop_url` and `routes.route_url`.,v5.0.0,WARNING
89+
start_and_end_range_equal,Two date or time fields are equal.,v5.0.0,ERROR
90+
start_and_end_range_out_of_order,Two date or time fields are out of order.,v5.0.0,ERROR
91+
station_with_parent_station,A station has `parent_station` field set.,v5.0.0,ERROR
92+
stop_has_too_many_matches_for_shape,"Stop entry that has many potential matches to the trip's path of travel, as defined by the shape entry in `shapes.txt`.",v5.0.0,WARNING
93+
stop_time_timepoint_without_times,`arrival_time` or `departure_time` not specified for timepoint.,v5.0.0,ERROR
94+
stop_time_with_arrival_before_previous_departure_time,Backwards time travel between stops in `stop_times.txt`,v5.0.0,ERROR
95+
stop_time_with_only_arrival_or_departure_time,Missing `stop_times.arrival_time` or `stop_times.departure_time`.,v5.0.0,ERROR
96+
stop_too_far_from_shape,Stop too far from trip shape.,v5.0.0,WARNING
97+
stop_too_far_from_shape_using_user_distance,Stop time too far from shape.,v5.0.0,WARNING
98+
stop_without_location,"`stop_lat` and/or `stop_lon` is missing for stop with `location_type` equal to `0`, `1`, or `2`.",v5.0.0,ERROR
99+
stop_without_stop_time,A stop in `stops.txt` is not referenced by any `stop_times.stop_id`.,v5.0.0,WARNING
100+
stop_without_zone_id,Stop without value for `stops.zone_id` contained in a route with a zone-dependent fare rule.,v5.0.0,INFO
101+
stops_match_shape_out_of_order,Two stop entries are different than their arrival-departure order defined by `shapes.txt`.,v5.0.0,WARNING
102+
thread_execution_error,ExecutionException during multithreaded validation,v5.0.0,ERROR
103+
timeframe_only_start_or_end_time_specified,A row from `timeframes.txt` was found with only one of `start_time` and `end_time` specified.,v5.0.0,ERROR
104+
timeframe_overlap,Two entries in `timeframes.txt` with the same `timeframe_group_id` and `service_id` have overlapping time intervals.,v5.0.0,ERROR
105+
timeframe_start_or_end_time_greater_than_twenty_four_hours,A time in `timeframes.txt` is greater than `24:00:00`.,v5.0.0,ERROR
106+
too_many_rows,A CSV file has too many rows.,v5.0.0,ERROR
107+
transfer_with_invalid_stop_location_type,A stop id field from GTFS file `transfers.txt` references a stop that has a `location_type` other than 0 or 1 (aka Stop/Platform or Station).,v5.0.0,ERROR
108+
transfer_with_invalid_trip_and_route,A trip id field from GTFS file `transfers.txt` references a route that does not match its `trips.txt` `route_id`.,v5.0.0,ERROR
109+
transfer_with_invalid_trip_and_stop,A trip id field from GTFS file `transfers.txt` references a stop that is not included in the referenced trip's stop-times.,v5.0.0,ERROR
110+
transfer_with_suspicious_mid_trip_in_seat,A trip id field from GTFS file `transfers.txt` with an in-seat transfer type references a stop that is not in the expected position in the trip's stop-times.,v5.0.0,WARNING
111+
translation_foreign_key_violation,An entity with the given `record_id` and `record_sub_id` cannot be found in the referenced table.,v5.0.0,ERROR
112+
translation_unexpected_value,A field in a translations row has value but must be empty.,v5.0.0,ERROR
113+
translation_unknown_table_name,A translation references an unknown or missing GTFS table.,v5.0.0,WARNING
114+
trip_coverage_not_active_for_next7_days,Trips data should be valid for at least the next seven days.,v5.0.0,WARNING
115+
trip_distance_exceeds_shape_distance,The distance between the last shape point and last stop point is greater than or equal to the 11.1m threshold.,v5.0.0,ERROR
116+
trip_distance_exceeds_shape_distance_below_threshold,The distance between the last shape point and last stop point is less than the 11.1m threshold.,v5.0.0,WARNING
117+
u_r_i_syntax_error,A string could not be parsed as a URI reference.,v5.0.0,ERROR
118+
unexpected_enum_value,An enum has an unexpected value.,v5.0.0,WARNING
119+
unknown_column,A column name is unknown.,v5.0.0,INFO
120+
unknown_file,A file is unknown.,v5.0.0,INFO
121+
unusable_trip,Trips must have more than one stop to be usable.,v5.0.0,WARNING
122+
unused_parent_station,Unused parent station.,v5.0.0,INFO
123+
unused_shape,Shape is not used in GTFS file `trips.txt`.,v5.0.0,WARNING
124+
unused_trip,Trip is not used in `stop_times.txt`.,v5.0.0,WARNING
125+
wrong_parent_location_type,Incorrect type of the parent location.,v5.0.0,ERROR

0 commit comments

Comments
 (0)