From ea9e492478815d2cb824403756e55f2215ae37a0 Mon Sep 17 00:00:00 2001 From: jarno-r <19144726+jarno-r@users.noreply.github.com> Date: Tue, 1 Mar 2022 08:39:51 +0200 Subject: [PATCH] Inferred partitions (#1) Add 'recover_partitions' option for Spark to run ALTER TABLE RECOVER PARTITIONS even if partitions are not explicitly specified. This makes using inferred partitions possible. --- macros/plugins/spark/helpers/recover_partitions.sql | 2 +- sample_sources/spark.yml | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/macros/plugins/spark/helpers/recover_partitions.sql b/macros/plugins/spark/helpers/recover_partitions.sql index 2d20212b..a79de3e2 100644 --- a/macros/plugins/spark/helpers/recover_partitions.sql +++ b/macros/plugins/spark/helpers/recover_partitions.sql @@ -2,7 +2,7 @@ {# https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-alter-table.html #} {% set ddl %} - {%- if source_node.external.partitions and source_node.external.using and source_node.external.using|lower != 'delta' -%} + {%- if (source_node.external.partitions or source_node.external.recover_partitions) and source_node.external.using and source_node.external.using|lower != 'delta' -%} ALTER TABLE {{ source(source_node.source_name, source_node.name) }} RECOVER PARTITIONS {%- endif -%} {% endset %} diff --git a/sample_sources/spark.yml b/sample_sources/spark.yml index 658e198c..5e49b6fa 100644 --- a/sample_sources/spark.yml +++ b/sample_sources/spark.yml @@ -30,3 +30,15 @@ sources: - name: contexts data_type: string description: "Contexts attached to event by Tracker" + +- name: event_inferred_schema + description: "Snowplow events stored as partitioned parquet files in HDFS with inferred schema" + external: + # File path can contain partitions such as: hdfs://.../events/my_partition=2022-03-01/events1.parquet + # These partitions are excluded from 'location'. 
+ location: 'hdfs://.../events/' + using: parquet + + # Setting recover_partitions to true runs ALTER TABLE ... RECOVER PARTITIONS even when no + # partitions are explicitly specified (requires 'using' to be set and not 'delta'). + recover_partitions: true