From 804067ee0fae72aa3685f9f8dbb09190dbd9b7f2 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 31 Oct 2024 17:29:26 +0100 Subject: [PATCH 1/4] =?UTF-8?q?remove=20--routes-without-agency-id=20optio?= =?UTF-8?q?n=20=F0=9F=92=A5=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cli.js | 5 ----- index.js | 1 - lib/deps.js | 3 +-- lib/routes.js | 2 +- readme.md | 1 - 5 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cli.js b/cli.js index ed0b0be..eb39661 100755 --- a/cli.js +++ b/cli.js @@ -35,9 +35,6 @@ const { 'trips-without-shape-id': { type: 'boolean', }, - 'routes-without-agency-id': { - type: 'boolean', - }, 'stops-without-level-id': { type: 'boolean', }, @@ -98,7 +95,6 @@ Options: Default: google-extended --trips-without-shape-id Don't require trips.txt items to have a shape_id. Default if shapes.txt has not been provided. - --routes-without-agency-id Don't require routes.txt items to have an agency_id. --stops-without-level-id Don't require stops.txt items to have a level_id. Default if levels.txt has not been provided. --stops-location-index Create a spatial index on stops.stop_loc for efficient @@ -179,7 +175,6 @@ const opt = { ignoreUnsupportedFiles: !!flags['ignore-unsupported'], routeTypesScheme: flags['route-types-scheme'] || 'google-extended', tripsWithoutShapeId: !!flags['trips-without-shape-id'], - routesWithoutAgencyId: !!flags['routes-without-agency-id'], stopsLocationIndex: !!flags['stops-location-index'], statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', diff --git a/index.js b/index.js index 052291b..b623c9f 100644 --- a/index.js +++ b/index.js @@ -17,7 +17,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { ignoreUnsupportedFiles: false, routeTypesScheme: 'google-extended', tripsWithoutShapeId: !files.some(f => f.name === 'shapes'), - routesWithoutAgencyId: false, stopsWithoutLevelId: !files.some(f => f.name === 'levels'), stopsLocationIndex: false, lowerCaseLanguageCodes: false, diff --git a/lib/deps.js b/lib/deps.js index 5d9e2c3..d00d793 100644 --- a/lib/deps.js +++ b/lib/deps.js @@ -3,7 +3,6 @@ const getDependencies = (opt, files) => { const { tripsWithoutShapeId, - routesWithoutAgencyId, stopsWithoutLevelId, } = opt return { @@ -27,7 +26,7 @@ const getDependencies = (opt, files) => { 'frequencies', ], routes: [ - ...(routesWithoutAgencyId ? [] : ['agency']), + 'agency', ], trips: [ 'routes', diff --git a/lib/routes.js b/lib/routes.js index fe416e5..11b3ef5 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -293,7 +293,7 @@ COPY "${opt.schema}".routes ( const formatRoutesRow = (r, opt, workingState) => { const agency_id = r.agency_id || null - if (agency_id === null && !opt.routesWithoutAgencyId) { + if (!agency_id) { // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. // It seems that GTFS has allowed this at least since 2016: // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 diff --git a/readme.md b/readme.md index 44c87b2..857c5d5 100644 --- a/readme.md +++ b/readme.md @@ -156,7 +156,6 @@ Options: Default: google-extended --trips-without-shape-id Don't require trips.txt items to have a shape_id. Default if shapes.txt has not been provided. - --routes-without-agency-id Don't require routes.txt items to have an agency_id. --stops-without-level-id Don't require stops.txt items to have a level_id. Default if levels.txt has not been provided. --stops-location-index Create a spatial index on stops.stop_loc for efficient From bf78e076b2cdaef8dd10220c22a838cb72304628 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 31 Oct 2024 17:38:00 +0100 Subject: [PATCH 2/4] =?UTF-8?q?handle=20feeds=20with=201=20agency=20&=20ro?= =?UTF-8?q?utes.agency=5Fid=20=3D=20null=20=F0=9F=90=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit follow-up of 02d307b --- index.js | 16 ++++++++++++++++ lib/routes.js | 23 +++++++++-------------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/index.js b/index.js index b623c9f..003194a 100644 --- a/index.js +++ b/index.js @@ -208,6 +208,22 @@ LANGUAGE sql; const nrOfRowsByName = new Map() const workingState = { nrOfRowsByName, + onlyAgencyId: null, + } + + // The GTFS spec allows agency.txt to be empty/null if there is exactly one agency in the feed. + // It seems that GTFS has allowed this at least since 2016: + // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 + // However, because we have to use left join instead of an inner join in tables referencing `agency`, this prevents the PostgreSQL query planner from doing some filter pushdowns, e.g. + // - when querying `arrivals_departures` by route, stop, date and t_departure/t_arrival + { + for await (const agency of await readCsv('agency')) { + workingState.onlyAgencyId = agency.agency_id + if (++agencies >= 2) { + workingState.onlyAgencyId = null + break + } + } } for (const name of order) { diff --git a/lib/routes.js b/lib/routes.js index 11b3ef5..97d3d9f 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -292,21 +292,16 @@ COPY "${opt.schema}".routes ( } const formatRoutesRow = (r, opt, workingState) => { - const agency_id = r.agency_id || null + // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. In this case, we insert a default agency. + const agency_id = r.agency_id || workingState.onlyAgencyId if (!agency_id) { - // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. - // It seems that GTFS has allowed this at least since 2016: - // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - if (workingState.nrOfRowsByName.get('agency') !== 1) { - // todo: throw special error indicating an error in the input data - throw new DataError( - 'routes', - 'agency_id must not be empty/null', - [ - 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' - ], - ) - } + throw new DataError( + 'routes', + 'agency_id must not be empty/null', + [ + 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' + ], + ) } return [ From 21d658e8cd10d006f765ecfca066d3907fd692dd Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 31 Oct 2024 17:40:03 +0100 Subject: [PATCH 3/4] =?UTF-8?q?handle=20feeds=20with=200=20agencies=20&=20?= =?UTF-8?q?routes.agency=5Fid=20=3D=20null=20=F0=9F=90=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit follow-up of 02d307b --- index.js | 10 ++++++++++ lib/agency.js | 25 ++++++++++++++++++++++++- lib/stop_times.js | 18 ++---------------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/index.js b/index.js index 003194a..0a8a394 100644 --- a/index.js +++ b/index.js @@ -9,6 +9,7 @@ const {Stringifier} = require('csv-stringify') const formatters = require('./lib') const getDependencies = require('./lib/deps') const pkg = require('./package.json') +const {DEFAULT_AGENCY_ID} = require('./lib/agency') const convertGtfsToSql = async function* (files, opt = {}) { opt = { @@ -24,6 +25,8 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByAgencyIdAndRouteIdAndStopAndHour: 'none', statsActiveTripsByHour: 'none', schema: 'public', + // todo: find something more helpful than falling back to Etc/GMT! + defaultTimezone: new Intl.DateTimeFormat().resolvedOptions().timeZone || 'Etc/GMT', postgraphile: false, postgraphilePassword: process.env.POSTGRAPHILE_PGPASSWORD || null, postgrest: false, @@ -208,6 +211,7 @@ LANGUAGE sql; const nrOfRowsByName = new Map() const workingState = { nrOfRowsByName, + insertDefaultAgency: false, onlyAgencyId: null, } @@ -217,6 +221,7 @@ LANGUAGE sql; // However, because we have to use left join instead of an inner join in tables referencing `agency`, this prevents the PostgreSQL query planner from doing some filter pushdowns, e.g. // - when querying `arrivals_departures` by route, stop, date and t_departure/t_arrival { + let agencies = 0 for await (const agency of await readCsv('agency')) { workingState.onlyAgencyId = agency.agency_id if (++agencies >= 2) { @@ -224,6 +229,11 @@ LANGUAGE sql; break } } + // We insert a mock agency in order to use an inner join in tables referencing `agency`. + if (agencies === 0) { + workingState.insertDefaultAgency = true + workingState.onlyAgencyId = DEFAULT_AGENCY_ID + } } for (const name of order) { diff --git a/lib/agency.js b/lib/agency.js index f9e3697..7cd2b03 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -1,5 +1,7 @@ 'use strict' +const DEFAULT_AGENCY_ID = 'default-agency' + // https://gtfs.org/schedule/reference/#agencytxt const beforeAll = (opt) => `\ CREATE TABLE "${opt.schema}".agency ( @@ -39,11 +41,32 @@ const formatAgencyRow = (a) => { ] } -const afterAll = `\ +const afterAll = (opt, workingState) => { + let sql = `\ \\. ` + if (workingState.insertDefaultAgency) { + sql += `\ +INSERT INTO "${opt.schema}".agency ( + agency_id, + agency_name, + agency_url, + agency_timezone +) VALUES ( + '${DEFAULT_AGENCY_ID}', + 'implicit default agency, the CSV file doesn\\'t contain one', + 'http://example.org', + '${opt.defaultTimezone}' +); +` + } + + return sql +} + module.exports = { + DEFAULT_AGENCY_ID, beforeAll, formatRow: formatAgencyRow, afterAll, diff --git a/lib/stop_times.js b/lib/stop_times.js index dd1c37c..a2ad1c9 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -225,14 +225,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( LEFT JOIN "${opt.schema}".stops stations ON stops.parent_station = stations.stop_id JOIN "${opt.schema}".trips ON s.trip_id = trips.trip_id JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( - -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. - -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! - -- It seems that GTFS has allowed this at least since 2016: - -- https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - routes.agency_id IS NULL -- match first (and only) agency - OR routes.agency_id = agency.agency_id -- match by ID - ) + JOIN "${opt.schema}".agency ON routes.agency_id = agency.agency_id JOIN "${opt.schema}".service_days ON trips.service_id = service_days.service_id ) -- todo: this slows down slightly @@ -465,14 +458,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( ) AS to_wheelchair_boarding FROM "${opt.schema}".trips LEFT JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( - -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. - -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! - -- It seems that GTFS has allowed this at least since 2016: - -- https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - routes.agency_id IS NULL -- match first (and only) agency - OR routes.agency_id = agency.agency_id -- match by ID - ) + JOIN "${opt.schema}".agency ON routes.agency_id = agency.agency_id LEFT JOIN "${opt.schema}".stop_times ON trips.trip_id = stop_times.trip_id LEFT JOIN "${opt.schema}".stops from_stops ON stop_times.stop_id = from_stops.stop_id LEFT JOIN "${opt.schema}".stops from_stations ON from_stops.parent_station = from_stations.stop_id From a8c3e95868feac71022cfbef91198c8f3c0d1e8e Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 31 Oct 2024 17:41:04 +0100 Subject: [PATCH 4/4] agency handling: add todos --- index.js | 1 + lib/stop_times.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/index.js b/index.js index 0a8a394..5daf8e4 100644 --- a/index.js +++ b/index.js @@ -220,6 +220,7 @@ LANGUAGE sql; // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 // However, because we have to use left join instead of an inner join in tables referencing `agency`, this prevents the PostgreSQL query planner from doing some filter pushdowns, e.g. // - when querying `arrivals_departures` by route, stop, date and t_departure/t_arrival + // todo: add tests: 0 agencies (implicit single agency), 1 agency { let agencies = 0 for await (const agency of await readCsv('agency')) { diff --git a/lib/stop_times.js b/lib/stop_times.js index a2ad1c9..ba49db2 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -225,6 +225,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( LEFT JOIN "${opt.schema}".stops stations ON stops.parent_station = stations.stop_id JOIN "${opt.schema}".trips ON s.trip_id = trips.trip_id JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id + -- todo: what if the route is missing (LEFT JOIN), does this work? JOIN "${opt.schema}".agency ON routes.agency_id = agency.agency_id JOIN "${opt.schema}".service_days ON trips.service_id = service_days.service_id ) @@ -458,6 +459,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( ) AS to_wheelchair_boarding FROM "${opt.schema}".trips LEFT JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id + -- todo: what if the route is missing (LEFT JOIN), does this work? JOIN "${opt.schema}".agency ON routes.agency_id = agency.agency_id LEFT JOIN "${opt.schema}".stop_times ON trips.trip_id = stop_times.trip_id LEFT JOIN "${opt.schema}".stops from_stops ON stop_times.stop_id = from_stops.stop_id