Merge pull request #204 from marklogic/feature/fix-spark-config-option
Added docs for --spark-master-url and -C
rjrudin authored Jul 23, 2024
2 parents ec70e8e + c38fb0e commit cec070d
Showing 9 changed files with 67 additions and 10 deletions.
26 changes: 26 additions & 0 deletions docs/common-options.md
@@ -254,3 +254,29 @@ potentially being helpful. The initial error message displayed by Flux is intend
Flux uses a [Log4J2 properties file](https://logging.apache.org/log4j/2.x/manual/configuration.html#Properties) to
configure its logging. The file is located in a Flux installation at `./conf/log4j2.properties`. You are free to
customize this file to meet your needs for logging.

## Advanced Spark options

Flux is built on top of [Apache Spark](https://spark.apache.org/) and provides a number of command line options for
configuring the underlying Spark runtime environment used by Flux.

### Configuring a Spark URL

By default, Flux creates a Spark session with a master URL of `local[*]`. You can change this via the
`--spark-master-url` option; please see
[the Spark documentation](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) for examples
of valid values. If you are looking to run a Flux command on a remote Spark cluster, please instead see the
[Spark Integration guide](spark-integration.md) for details on integrating Flux with `spark-submit`.
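
As an illustrative sketch, assuming Flux is run via `./bin/flux` from its installation directory (the file path and connection string below are placeholders):

```
./bin/flux import-files \
    --path /path/to/input/files \
    --connection-string "flux-example-user:password@localhost:8004" \
    --spark-master-url "local[4]"
```

This limits the embedded Spark runtime to four worker threads instead of one per available core.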

### Configuring the Spark runtime

Some Flux commands reuse [Spark data sources](https://spark.apache.org/docs/latest/sql-data-sources.html) that
accept configuration items via the Spark runtime. You can provide these configuration items via the `-C` option.
For example, the [Spark Avro data source](https://spark.apache.org/docs/latest/sql-data-sources-avro.html#configuration)
identifies several configuration items, such as `spark.sql.avro.compression.codec`. You can set this value by
including `-Cspark.sql.avro.compression.codec=snappy` as a command line option.
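
As a sketch of what this looks like in a full command (the connection string, Optic query, and output path below are placeholders), the `-C` option can be added to any command, such as `export-avro-files`:

```
# Placeholder connection details, query, and path; adjust for your environment.
./bin/flux export-avro-files \
    --connection-string "flux-example-user:password@localhost:8004" \
    --query "op.fromView('example', 'employees')" \
    --path /path/to/avro/output \
    -Cspark.sql.avro.compression.codec=snappy
```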

Note that the majority of [Spark cluster configuration properties](https://spark.apache.org/docs/latest/configuration.html)
cannot be set via the `-C` option as those options must be set before a Spark session is created. For further control
over the Spark session, please see the [Spark Integration guide](spark-integration.md) for details on integrating Flux
with `spark-submit`.
15 changes: 12 additions & 3 deletions docs/export/export-rows.md
@@ -76,9 +76,12 @@ Rows selected via an Optic query can be exported to any of the below file format

The `export-avro-files` command writes one or more Avro files to the directory specified by the `--path` option. This
command reuses Spark's support for writing Avro files. You can include any of the
[Spark Avro options](https://spark.apache.org/docs/latest/sql-data-sources-avro.html) via the `-P` option to
[Spark Avro data source options](https://spark.apache.org/docs/latest/sql-data-sources-avro.html) via the `-P` option to
control how Avro content is written. These options are expressed as `-PoptionName=optionValue`.

For configuration options listed in the above Spark Avro guide, use the `-C` option instead. For example,
`-Cspark.sql.avro.compression.codec=deflate` would change the type of compression used for writing Avro files.
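
In a full command, the two kinds of options can be combined, as in this sketch (connection string, Optic query, and output path are placeholders):

```
# Placeholder connection details, query, and path; adjust for your environment.
./bin/flux export-avro-files \
    --connection-string "flux-example-user:password@localhost:8004" \
    --query "op.fromView('example', 'employees')" \
    --path /path/to/avro/output \
    -PrecordName=employee \
    -Cspark.sql.avro.compression.codec=deflate
```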

### Delimited text

The `export-delimited-files` command writes one or more delimited text (commonly CSV) files to the directory
@@ -125,16 +128,22 @@ By default, each file will be written using the UTF-8 encoding. You can specify

The `export-orc-files` command writes one or more ORC files to the directory specified by the `--path` option. This
command reuses Spark's support for writing ORC files. You can include any of the
[Spark ORC options](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) via the `-P` option to
[Spark ORC data source options](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) via the `-P` option to
control how ORC content is written. These options are expressed as `-PoptionName=optionValue`.

For configuration options listed in the above Spark ORC guide, use the `-C` option instead. For example,
`-Cspark.sql.orc.impl=hive` would change which ORC implementation is used.
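
In a full command, this might look like the following sketch (connection string, Optic query, and output path are placeholders):

```
# Placeholder connection details, query, and path; adjust for your environment.
./bin/flux export-orc-files \
    --connection-string "flux-example-user:password@localhost:8004" \
    --query "op.fromView('example', 'employees')" \
    --path /path/to/orc/output \
    -Pcompression=zlib \
    -Cspark.sql.orc.impl=hive
```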

### Parquet

The `export-parquet-files` command writes one or more Parquet files to the directory specified by the `--path` option. This
command reuses Spark's support for writing Parquet files. You can include any of the
[Spark Parquet options](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) via the `-P` option to
[Spark Parquet data source options](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) via the `-P` option to
control how Parquet content is written. These options are expressed as `-PoptionName=optionValue`.

For configuration options listed in the above Spark Parquet guide, use the `-C` option instead. For example,
`-Cspark.sql.parquet.compression.codec=gzip` would change the compression codec used for writing Parquet files.
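
In a full command, this might look like the following sketch (connection string, Optic query, and output path are placeholders):

```
# Placeholder connection details, query, and path; adjust for your environment.
./bin/flux export-parquet-files \
    --connection-string "flux-example-user:password@localhost:8004" \
    --query "op.fromView('example', 'employees')" \
    --path /path/to/parquet/output \
    -Cspark.sql.parquet.compression.codec=gzip
```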

## Controlling the save mode

Each of the commands for exporting rows to files supports a `--mode` option that controls how data is written to a
6 changes: 5 additions & 1 deletion docs/import/import-files/avro.md
@@ -95,5 +95,9 @@ explicitly specify a compression algorithm if Flux is not able to read your comp
## Advanced options

The `import-avro-files` command reuses Spark's support for reading Avro files. You can include any of
the [Spark Avro options](https://spark.apache.org/docs/latest/sql-data-sources-avro.html) via the `-P` option
the [Spark Avro data source options](https://spark.apache.org/docs/latest/sql-data-sources-avro.html) via the `-P` option
to control how Avro content is read. These options are expressed as `-PoptionName=optionValue`.

For the configuration options listed in the above Spark Avro guide, use the `-C` option instead. For example,
`-Cspark.sql.avro.filterPushdown.enabled=false` would configure Spark Avro to not push down filters.
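
As a sketch (the input path, connection string, and collection name are placeholders):

```
# Placeholder connection details, path, and collection; adjust for your environment.
./bin/flux import-avro-files \
    --path /path/to/avro/input \
    --connection-string "flux-example-user:password@localhost:8004" \
    --collections avro-data \
    -Cspark.sql.avro.filterPushdown.enabled=false
```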

6 changes: 4 additions & 2 deletions docs/import/import-files/orc.md
@@ -95,6 +95,8 @@ explicitly specify a compression algorithm if Flux is not able to read your comp
## Advanced options

The `import-orc-files` command reuses Spark's support for reading ORC files. You can include any of
the [Spark ORC options](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) via the `-P` option
to control how Avro content is read. These options are expressed as `-PoptionName=optionValue`.
the [Spark ORC data source options](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) via the `-P` option
to control how ORC content is read. These options are expressed as `-PoptionName=optionValue`.

For the configuration options listed in the above Spark ORC guide, use the `-C` option instead. For example,
`-Cspark.sql.orc.filterPushdown=false` would configure Spark ORC to not push down filters.
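
As a sketch (the input path, connection string, and collection name are placeholders):

```
# Placeholder connection details, path, and collection; adjust for your environment.
./bin/flux import-orc-files \
    --path /path/to/orc/input \
    --connection-string "flux-example-user:password@localhost:8004" \
    --collections orc-data \
    -Cspark.sql.orc.filterPushdown=false
```
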
5 changes: 4 additions & 1 deletion docs/import/import-files/parquet.md
@@ -95,5 +95,8 @@ explicitly specify a compression algorithm if Flux is not able to read your comp
## Advanced options

The `import-parquet-files` command reuses Spark's support for reading Parquet files. You can include any of
the [Spark Parquet options](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) via the `-P` option
the [Spark Parquet data source options](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) via the `-P` option
to control how Parquet content is read. These options are expressed as `-PoptionName=optionValue`.

For the configuration options listed in the above Spark Parquet guide, use the `-C` option instead. For example,
`-Cspark.sql.parquet.filterPushdown=false` would configure Spark Parquet to not push down filters.
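
As a sketch (the input path, connection string, and collection name are placeholders):

```
# Placeholder connection details, path, and collection; adjust for your environment.
./bin/flux import-parquet-files \
    --path /path/to/parquet/input \
    --connection-string "flux-example-user:password@localhost:8004" \
    --collections parquet-data \
    -Cspark.sql.parquet.filterPushdown=false
```
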
4 changes: 4 additions & 0 deletions flux-cli/src/main/java/com/marklogic/flux/impl/SparkUtil.java
@@ -22,6 +22,10 @@ public static SparkSession buildSparkSession() {
public static SparkSession buildSparkSession(String masterUrl, boolean showConsoleProgress) {
SparkSession.Builder builder = SparkSession.builder()
.master(masterUrl)
// Spark config options can be provided now or at runtime via spark.conf().set(). The downside to setting
// user-defined options now is that they won't work when used with spark-submit, which handles constructing
// the SparkSession itself. We may eventually add a feature for providing options at this point for local
// users that want more control over the SparkSession.
.config("spark.sql.session.timeZone", "UTC");

if (showConsoleProgress) {
@@ -25,7 +25,10 @@ void defaultSettingsMultipleFiles() {

// Including these for manual verification of progress logging.
"--batch-size", "1",
"--log-progress", "2"
"--log-progress", "2",

// Including this to ensure a valid -C option doesn't cause an error.
"-Cspark.sql.avro.filterPushdown.enabled=false"
);

assertCollectionSize("avro-test", 6);
@@ -25,7 +25,10 @@ void orcFileTest() {

// Including these for manual verification of progress logging.
"--batch-size", "5",
"--log-progress", "5"
"--log-progress", "5",

// Including this to ensure a valid -C option doesn't cause an error.
"-Cspark.sql.orc.filterPushdown=false"
);

getUrisInCollection("orcFile-test", 15).forEach(this::verifyDocContent);
@@ -26,7 +26,10 @@ void defaultSettingsSingleFile() {

// Including these for manual verification of progress logging.
"--batch-size", "5",
"--log-progress", "10"
"--log-progress", "10",

// Including this to ensure a valid -C option doesn't cause an error.
"-Cspark.sql.parquet.filterPushdown=false"
);

assertCollectionSize("parquet-test", 32);
