# bigquery.yml
version: 2

sources:
  - name: snowplow
    database: analytics
    loader: gcloud storage

    tables:
      - name: event
        description: "External table of Snowplow events, stored as CSV files in Cloud Storage"
        external:
          location: 'gs://bucket/path/*'
          options:
            format: csv
            skip_leading_rows: 1
            # if you want a partitioned table, file paths MUST be Hive-style,
            # keyed by the partition name declared below:
            # 'gs://bucket/path/collector_date=2020-01-01/'
            # 'gs://bucket/path/collector_date=2020-01-02/' (etc.)
            hive_partition_uri_prefix: 'gs://bucket/path/'
          partitions:
            - name: collector_date
              data_type: date
        columns:
          - name: app_id
            data_type: string # BigQuery has no varchar type
            description: "Application ID"
          - name: domain_sessionidx
            data_type: int64
            description: "A visit / session index"
          - name: etl_tstamp
            data_type: timestamp
            description: "Timestamp event began ETL"
          - name: contexts
            data_type: string # JSON payload; BigQuery has no variant type
            description: "Contexts attached to event by Tracker"
      # alternatively, BigQuery can infer your schema (columns + partitions)
      - name: event_inferred
        external:
          location: 'gs://bucket/path/*'
          options:
            format: csv
            skip_leading_rows: 1
            hive_partition_uri_prefix: 'gs://bucket/path/'
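      # with no columns or partitions declared, the generated DDL omits the
      # column list, and BigQuery infers column names and types from the files
      # themselves (and, with hive_partition_uri_prefix set, the partition keys
      # from the Hive-style paths)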
      # optionally, BigQuery can pull data from multiple GCS paths, instead of just one
      - name: event_multiple_paths
        external:
          location: 'this is still a required property, but it will be ignored'
          options:
            format: csv
            skip_leading_rows: 1
            # list all file paths with relevant source data
            uris:
              - 'gs://bucket_a/path/*'
              - 'gs://bucket_b/path/*'
              - 'gs://bucket_c/more/specific/path/file.csv'
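
# To create or refresh these external tables in BigQuery, run the macro
# provided by the dbt-external-tables package (which this sample assumes):
#
#   dbt run-operation stage_external_sources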