
Commit

Merge pull request #201 from marklogic/feature/json-gzip
Added docs for compression with Spark data sources
rjrudin authored Jul 23, 2024
2 parents 37c51aa + 0ddda35 commit 2eaaf15
Showing 8 changed files with 55 additions and 3 deletions.
6 changes: 3 additions & 3 deletions docs/common-options.md
@@ -32,10 +32,10 @@ are followed by a list of options common to every Flux command.
You can specify a command name without entering its full name, as long as you enter a sufficient number of characters
such that Flux can uniquely identify the command name.

- For example, instead of entering `import-aggregate-xml-files`, you can enter `import-ag` as it is the only command in
- Flux with that sequence of letters:
+ For example, instead of entering `import-parquet-files`, you can enter `import-p` as it is the only command in
+ Flux beginning with that sequence of letters:

-     ./bin/flux import-ag --path path/to/data etc...
+     ./bin/flux import-p --path path/to/data etc...

If Flux cannot uniquely identify the command name, it will print an error and list the command names that match what
you entered.
6 changes: 6 additions & 0 deletions docs/import/import-files/avro.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
The `import-avro-files` command supports aggregating related rows together to produce hierarchical documents. See
[Aggregating rows](../aggregating-rows.md) for more information.

## Reading compressed files

Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
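As a sketch, the two forms might look like the following; the file paths and connection string are placeholders, and `gzip` as the codec name is an assumption based on Spark's usual naming:

```shell
# Automatic: the .gz suffix is enough, no compression option required.
./bin/flux import-avro-files --path path/to/data.avro.gz \
    --connection-string "user:password@localhost:8000"

# Explicit: name the codec when the file extension does not reveal it.
./bin/flux import-avro-files --path path/to/data \
    -Pcompression=gzip \
    --connection-string "user:password@localhost:8000"
```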

## Advanced options

The `import-avro-files` command reuses Spark's support for reading Avro files. You can include any of
6 changes: 6 additions & 0 deletions docs/import/import-files/delimited-text.md
@@ -106,6 +106,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic - e.g
The `import-delimited-files` command supports aggregating related rows together to produce hierarchical documents. See
[Aggregating rows](../aggregating-rows.md) for more information.

## Reading compressed files

Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.

## Advanced options

The `import-delimited-files` command reuses Spark's support for reading delimited text data. You can include any of
6 changes: 6 additions & 0 deletions docs/import/import-files/json.md
@@ -83,6 +83,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic:
etc...
```

## Reading compressed files

Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
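For instance, a JSON Lines file gzipped with standard tools can be imported as-is. This sketch prepares such a file and verifies the round trip; the filename and record shape are arbitrary, and the Flux invocation is shown as an illustration with placeholder connection details:

```shell
# Build a small gzipped JSON Lines file with one record per line.
printf '{"lastName":"lastName-1"}\n{"lastName":"lastName-2"}\n' | gzip > /tmp/records.txt.gz

# Sanity check: decompressing yields the two original lines.
gzip -dc /tmp/records.txt.gz | wc -l

# Importing it then requires no compression option:
#   ./bin/flux import-aggregate-json-files --path /tmp/records.txt.gz --json-lines \
#       --connection-string "user:password@localhost:8000"
```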

## Advanced options

The `import-aggregate-json-files` command reuses Spark's support for reading JSON files. You can include any of
6 changes: 6 additions & 0 deletions docs/import/import-files/orc.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
The `import-orc-files` command supports aggregating related rows together to produce hierarchical documents. See
[Aggregating rows](../aggregating-rows.md) for more information.

## Reading compressed files

Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.

## Advanced options

The `import-orc-files` command reuses Spark's support for reading ORC files. You can include any of
6 changes: 6 additions & 0 deletions docs/import/import-files/parquet.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
The `import-parquet-files` command supports aggregating related rows together to produce hierarchical documents. See
[Aggregating rows](../aggregating-rows.md) for more information.

## Reading compressed files

Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.

## Advanced options

The `import-parquet-files` command reuses Spark's support for reading Parquet files. You can include any of
@@ -124,6 +124,28 @@ void jsonLines() {
    verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
}

@Test
void gzippedJsonLines() {
    run(
        "import-aggregate-json-files",
        "--path", "src/test/resources/delimited-files/line-delimited-json.txt.gz",
        "--json-lines",
        "--connection-string", makeConnectionString(),
        "--permissions", DEFAULT_PERMISSIONS,
        "--collections", "delimited-json-test",
        "--uri-template", "/delimited/{lastName}.json"
    );

    assertCollectionSize(
        "Spark data sources will automatically handle .gz files without -Pcompression=gzip being specified.",
        "delimited-json-test", 3
    );
    verifyDoc("/delimited/lastName-1.json", "firstName-1", "lastName-1");
    verifyDoc("/delimited/lastName-2.json", "firstName-2", "lastName-2");
    verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
}

@Test
void jsonRootName() {
    run(
Binary file not shown.
