Skip to content

Commit a7c861e

Browse files
authored
Merge branch 'main' into curriculum_docs_update
2 parents 4dd18e4 + c9f7a2d commit a7c861e

File tree

210 files changed

+8391
-3054
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

210 files changed

+8391
-3054
lines changed

.github/pull_request_template.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Resolves #\[issue\]
1515

1616
_Include commands/logs/screenshots as relevant._
1717

18-
_If making changes to dbt models, please run the command `poetry run dbt run -s CHANGED_MODEL` and include the output in this section of the PR._
18+
_If making changes to dbt models, please run the command `poetry run dbt run -s CHANGED_MODEL` and `poetry run dbt test -s CHANGED_MODEL`, then include the output in this section of the PR._
1919

2020
## Post-merge follow-ups
2121

.github/workflows/publish-docs.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929

3030
- name: Build jupyter book
3131
run: jb build docs --warningiserror --keep-going # set doc to fail on any sphinx warning
32-
- uses: actions/upload-artifact@v2
32+
- uses: actions/upload-artifact@v3
3333
if: always()
3434
with:
3535
name: docs-build

.pre-commit-config.yaml

+4-2
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ repos:
1515
rev: 6.0.0
1616
hooks:
1717
- id: flake8
18-
args: ["--ignore=E501,W503"] # line too long and line before binary operator (black is ok with these)
18+
args: ["--ignore=E501,W503,E231"] # E501 line too long and W503 line break before binary operator (black is ok with these); E231 missing whitespace after colon is ignored explicitly
1919
types:
2020
- python
21+
# Suppress SyntaxWarning about invalid escape sequence from calitp-data-infra dependency without modifying source
22+
entry: env PYTHONWARNINGS="ignore::SyntaxWarning" flake8
2123
- repo: https://github.com/psf/black
2224
rev: 23.1.0
2325
hooks:
@@ -71,6 +73,6 @@ repos:
7173
exclude: 'README.md|warehouse/.*'
7274
args: ["--number"]
7375
additional_dependencies:
74-
- mdformat-gfm
76+
- mdformat-gfm==0.3.5
7577
- mdformat-frontmatter
7678
- mdformat-footnote
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-xlsx-products-clean
3+
prefix_bucket: false
4+
post_hook: |
5+
SELECT *
6+
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2022__annual_database_agency_information
7+
LIMIT 1;
8+
source_objects:
9+
- "annual_database_agency_information/2022/_2022_agency_information/*.jsonl.gz"
10+
destination_project_dataset_table: "external_ntd__annual_reporting.2022__annual_database_agency_information"
11+
source_format: NEWLINE_DELIMITED_JSON
12+
use_bq_client: true
13+
hive_options:
14+
mode: AUTO
15+
require_partition_filter: false
16+
source_uri_prefix: "annual_database_agency_information/2022/_2022_agency_information/"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-xlsx-products-clean
3+
prefix_bucket: false
4+
post_hook: |
5+
SELECT *
6+
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2023__annual_database_agency_information
7+
LIMIT 1;
8+
source_objects:
9+
- "annual_database_agency_information/2023/agency_information/*.jsonl.gz"
10+
destination_project_dataset_table: "external_ntd__annual_reporting.2023__annual_database_agency_information"
11+
source_format: NEWLINE_DELIMITED_JSON
12+
use_bq_client: true
13+
hive_options:
14+
mode: AUTO
15+
require_partition_filter: false
16+
source_uri_prefix: "annual_database_agency_information/2023/agency_information/"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-xlsx-products-clean
3+
prefix_bucket: false
4+
post_hook: |
5+
SELECT *
6+
FROM `{{ get_project_id() }}`.external_ntd__annual_reporting.2023__annual_database_contractual_relationships
7+
LIMIT 1;
8+
source_objects:
9+
- "annual_database_contractual_relationship/2023/contractual_relationships/*.jsonl.gz"
10+
destination_project_dataset_table: "external_ntd__annual_reporting.2023__annual_database_contractual_relationships"
11+
source_format: NEWLINE_DELIMITED_JSON
12+
use_bq_client: true
13+
hive_options:
14+
mode: AUTO
15+
require_partition_filter: false
16+
source_uri_prefix: "annual_database_contractual_relationship/2023/contractual_relationships/"

airflow/dags/create_external_tables/ntd_data_products/annual_database_agency_information.yml

+23-60
Original file line numberDiff line numberDiff line change
@@ -16,125 +16,88 @@ hive_options:
1616
source_uri_prefix: "annual-database-agency-information/{dt:DATE}/{ts:TIMESTAMP}/{year:INTEGER}/"
1717
schema_fields:
1818
- name: number_of_state_counties
19-
type: FLOAT
20-
mode: NULLABLE
19+
type: NUMERIC
2120
- name: tam_tier
2221
type: STRING
23-
mode: NULLABLE
2422
- name: personal_vehicles
25-
type: FLOAT
26-
mode: NULLABLE
23+
type: NUMERIC
2724
- name: density
2825
type: FLOAT
29-
mode: NULLABLE
3026
- name: uza_name
3127
type: STRING
32-
mode: NULLABLE
3328
- name: tribal_area_name
3429
type: STRING
35-
mode: NULLABLE
3630
- name: service_area_sq_miles
37-
type: FLOAT
38-
mode: NULLABLE
31+
type: NUMERIC
3932
- name: total_voms
40-
type: FLOAT
41-
mode: NULLABLE
33+
type: NUMERIC
4234
- name: city
4335
type: STRING
44-
mode: NULLABLE
4536
- name: fta_recipient_id
46-
type: FLOAT
47-
mode: NULLABLE
37+
type: NUMERIC
4838
- name: region
49-
type: FLOAT
50-
mode: NULLABLE
39+
type: NUMERIC
5140
- name: state_admin_funds_expended
52-
type: FLOAT
53-
mode: NULLABLE
41+
type: NUMERIC
5442
- name: zip_code_ext
55-
type: FLOAT
56-
mode: NULLABLE
43+
type: STRING
5744
- name: zip_code
58-
type: FLOAT
59-
mode: NULLABLE
45+
type: STRING
6046
- name: ueid
6147
type: STRING
62-
mode: NULLABLE
48+
- name: division_department
49+
type: STRING
50+
- name: state_parent_ntd_id
51+
type: STRING
6352
- name: address_line_2
6453
type: STRING
65-
mode: NULLABLE
6654
- name: number_of_counties_with_service
67-
type: FLOAT
68-
mode: NULLABLE
55+
type: NUMERIC
6956
- name: reporter_acronym
7057
type: STRING
71-
mode: NULLABLE
7258
- name: original_due_date
73-
type: INTEGER
74-
mode: NULLABLE
59+
type: STRING
7560
- name: sq_miles
76-
type: FLOAT
77-
mode: NULLABLE
61+
type: NUMERIC
7862
- name: address_line_1
7963
type: STRING
80-
mode: NULLABLE
8164
- name: p_o__box
8265
type: STRING
83-
mode: NULLABLE
8466
- name: fy_end_date
85-
type: INTEGER
86-
mode: NULLABLE
67+
type: STRING
8768
- name: reported_by_ntd_id
8869
type: STRING
89-
mode: NULLABLE
9070
- name: population
91-
type: FLOAT
92-
mode: NULLABLE
71+
type: NUMERIC
9372
- name: reporting_module
9473
type: STRING
95-
mode: NULLABLE
9674
- name: service_area_pop
97-
type: FLOAT
98-
mode: NULLABLE
75+
type: NUMERIC
9976
- name: subrecipient_type
10077
type: STRING
101-
mode: NULLABLE
10278
- name: state
10379
type: STRING
104-
mode: NULLABLE
10580
- name: volunteer_drivers
106-
type: FLOAT
107-
mode: NULLABLE
81+
type: NUMERIC
10882
- name: primary_uza
109-
type: FLOAT
110-
mode: NULLABLE
83+
type: NUMERIC
11184
- name: doing_business_as
11285
type: STRING
113-
mode: NULLABLE
11486
- name: reporter_type
11587
type: STRING
116-
mode: NULLABLE
11788
- name: legacy_ntd_id
11889
type: STRING
119-
mode: NULLABLE
12090
- name: voms_do
121-
type: FLOAT
122-
mode: NULLABLE
91+
type: NUMERIC
12392
- name: url
12493
type: STRING
125-
mode: NULLABLE
12694
- name: reported_by_name
12795
type: STRING
128-
mode: NULLABLE
12996
- name: voms_pt
130-
type: FLOAT
131-
mode: NULLABLE
97+
type: NUMERIC
13298
- name: organization_type
13399
type: STRING
134-
mode: NULLABLE
135100
- name: agency_name
136101
type: STRING
137-
mode: NULLABLE
138102
- name: ntd_id
139103
type: STRING
140-
mode: NULLABLE
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-api-products
3+
source_objects:
4+
- "fra_regulated_mode_major_security_events/historical/*.jsonl.gz"
5+
source_format: NEWLINE_DELIMITED_JSON
6+
use_bq_client: true
7+
hive_options:
8+
mode: CUSTOM
9+
require_partition_filter: false
10+
source_uri_prefix: "fra_regulated_mode_major_security_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
11+
destination_project_dataset_table: "external_ntd__safety_and_security.historical__fra_regulated_mode_major_security_events"
12+
prefix_bucket: false
13+
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__fra_regulated_mode_major_security_events LIMIT 1;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-api-products
3+
source_objects:
4+
- "major_safety_events/historical/*.jsonl.gz"
5+
source_format: NEWLINE_DELIMITED_JSON
6+
use_bq_client: true
7+
hive_options:
8+
mode: CUSTOM
9+
require_partition_filter: false
10+
source_uri_prefix: "major_safety_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
11+
destination_project_dataset_table: "external_ntd__safety_and_security.historical__major_safety_events"
12+
prefix_bucket: false
13+
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__major_safety_events LIMIT 1;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-api-products
3+
source_objects:
4+
- "monthly_modal_time_series_safety_and_service/historical/*.jsonl.gz"
5+
source_format: NEWLINE_DELIMITED_JSON
6+
use_bq_client: true
7+
hive_options:
8+
mode: CUSTOM
9+
require_partition_filter: false
10+
source_uri_prefix: "monthly_modal_time_series_safety_and_service/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
11+
destination_project_dataset_table: "external_ntd__safety_and_security.historical__monthly_modal_time_series_safety_and_service"
12+
prefix_bucket: false
13+
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__monthly_modal_time_series_safety_and_service LIMIT 1;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-ntd-api-products
3+
source_objects:
4+
- "nonmajor_safety_and_security_events/historical/*.jsonl.gz"
5+
source_format: NEWLINE_DELIMITED_JSON
6+
use_bq_client: true
7+
hive_options:
8+
mode: CUSTOM
9+
require_partition_filter: false
10+
source_uri_prefix: "nonmajor_safety_and_security_events/historical/{dt:DATE}/{execution_ts:TIMESTAMP}"
11+
destination_project_dataset_table: "external_ntd__safety_and_security.historical__nonmajor_safety_and_security_events"
12+
prefix_bucket: false
13+
post_hook: SELECT * FROM `{{ get_project_id() }}`.external_ntd__safety_and_security.historical__nonmajor_safety_and_security_events LIMIT 1;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
operator: operators.ExternalTable
2+
bucket: gs://calitp-state-geoportal-scrape
3+
source_objects:
4+
- "state_highway_network_geodata/*.jsonl.gz"
5+
source_format: NEWLINE_DELIMITED_JSON
6+
use_bq_client: true
7+
hive_options:
8+
mode: CUSTOM
9+
require_partition_filter: false
10+
source_uri_prefix: "state_highway_network_geodata/{dt:DATE}/{execution_ts:TIMESTAMP}/"
11+
destination_project_dataset_table: "external_state_geoportal.state_highway_network"
12+
prefix_bucket: false
13+
post_hook: |
14+
SELECT *
15+
FROM `{{ get_project_id() }}`.external_state_geoportal.state_highway_network
16+
LIMIT 1;
17+
schema_fields:
18+
- name: Route
19+
type: INTEGER
20+
- name: County
21+
type: STRING
22+
- name: District
23+
type: INTEGER
24+
- name: RouteType
25+
type: STRING
26+
- name: Direction
27+
type: STRING
28+
- name: wkt_coordinates
29+
type: GEOGRAPHY
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
description: "Scrape State Highway Network from State Geoportal"
2+
schedule_interval: "0 4 1 * *" # 4am UTC first day of every month
3+
tags:
4+
- all_gusty_features
5+
default_args:
6+
owner: airflow
7+
depends_on_past: False
8+
catchup: False
9+
start_date: "2024-09-15"
10+
email:
11+
12+
email_on_failure: True
13+
email_on_retry: False
14+
retries: 1
15+
retry_delay: !timedelta 'minutes: 2'
16+
concurrency: 50
17+
#sla: !timedelta 'hours: 2'
18+
wait_for_defaults:
19+
timeout: 3600
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
operator: operators.StateGeoportalAPIOperator
2+
3+
root_url: 'https://caltrans-gis.dot.ca.gov/arcgis/rest/services/'
4+
service: "CHhighway/SHN_Lines"
5+
layer: "0"
6+
product: 'state_highway_network'
7+
resultRecordCount: 2000

airflow/dags/sync_ntd_data_api/safety_service_and_security_historical/major_safety_events.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ operator: operators.NtdDataProductAPIOperator
33
year: 'historical'
44
product: 'major_safety_events'
55
root_url: 'https://data.transportation.gov/resource/'
6-
endpoint_id: '9ivb-8ae9'
6+
endpoint_id: 'urir-txqm'
77
file_format: '.json'

airflow/dags/sync_ntd_data_api/safety_service_and_security_historical/monthly_modal_time_series_safety_and_service.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ operator: operators.NtdDataProductAPIOperator
33
year: 'historical'
44
product: 'monthly_modal_time_series_safety_and_service'
55
root_url: 'https://data.transportation.gov/resource/'
6-
endpoint_id: '65fa-qbkf'
6+
endpoint_id: '5ti2-5uiv'
77
file_format: '.json'

airflow/dags/sync_ntd_data_xlsx/METADATA.yml

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
description: "Scrape tables from DOT Ridership XLSX file daily"
2-
schedule_interval: "0 10 * * *" # 10am UTC every day
1+
description: "Scrape tables from DOT Ridership XLSX file weekly"
2+
schedule_interval: "0 10 * * 1" # 10am UTC every Monday
33
tags:
44
- all_gusty_features
55
default_args:
@@ -15,5 +15,6 @@ default_args:
1515
retry_delay: !timedelta 'minutes: 2'
1616
concurrency: 50
1717
#sla: !timedelta 'hours: 2'
18+
provide_context: True
1819
wait_for_defaults:
1920
timeout: 3600
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
operator: operators.NtdDataProductXLSXOperator
2+
3+
product: 'annual_database_agency_information'
4+
xlsx_file_url: 'https://www.transit.dot.gov/ntd/data-product/2022-annual-database-agency-information' # placeholder for scraped url from scrape_ntd_xlsx_urls task
5+
year: '2022' # one of: 'historical' (long history), 'multi-year' (select history), or a specific year (ex: 2022)
6+
dependencies:
7+
- scrape_ntd_xlsx_urls
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
operator: operators.NtdDataProductXLSXOperator
2+
3+
product: 'annual_database_agency_information'
4+
xlsx_file_url: 'https://www.transit.dot.gov/ntd/data-product/2023-annual-database-agency-information' # placeholder for scraped url from scrape_ntd_xlsx_urls task
5+
year: '2023' # one of: 'historical' (long history), 'multi-year' (select history), or a specific year (ex: 2022)
6+
dependencies:
7+
- scrape_ntd_xlsx_urls

0 commit comments

Comments
 (0)