From 0ddda3570b4a32ac03c306b5ee07087a0b422b65 Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Mon, 22 Jul 2024 17:45:36 -0400
Subject: [PATCH] Added docs for compression with Spark data sources

---
 docs/common-options.md                     |   6 +++---
 docs/import/import-files/avro.md           |   6 ++++++
 docs/import/import-files/delimited-text.md |   6 ++++++
 docs/import/import-files/json.md           |   6 ++++++
 docs/import/import-files/orc.md            |   6 ++++++
 docs/import/import-files/parquet.md        |   6 ++++++
 .../ImportAggregateJsonFilesTest.java      |  22 ++++++++++++++++++++++
 .../line-delimited-json.txt.gz             | Bin 0 -> 75 bytes
 8 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz

diff --git a/docs/common-options.md b/docs/common-options.md
index 0709894d..748e64ee 100644
--- a/docs/common-options.md
+++ b/docs/common-options.md
@@ -32,10 +32,10 @@ are followed by a list of options common to every Flux command.
 You can specify a command name without entering its full name, as long as you enter a sufficient number of characters
 such that Flux can uniquely identify the command name.
 
-For example, instead of entering `import-aggregate-xml-files`, you can enter `import-ag` as it is the only command in
-Flux with that sequence of letters:
+For example, instead of entering `import-parquet-files`, you can enter `import-p` as it is the only command in
+Flux beginning with that sequence of letters:
 
-    ./bin/flux import-ag --path path/to/data etc...
+    ./bin/flux import-p --path path/to/data etc...
 
 If Flux cannot uniquely identify the command name, it will print an error and list the command names that match
 what you entered.
diff --git a/docs/import/import-files/avro.md b/docs/import/import-files/avro.md
index 1cdbcadc..da20357e 100644
--- a/docs/import/import-files/avro.md
+++ b/docs/import/import-files/avro.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-avro-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-avro-files` command reuses Spark's support for reading Avro files. You can include any of
diff --git a/docs/import/import-files/delimited-text.md b/docs/import/import-files/delimited-text.md
index 1d70a805..0b99ef60 100644
--- a/docs/import/import-files/delimited-text.md
+++ b/docs/import/import-files/delimited-text.md
@@ -106,6 +106,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic - e.g
 The `import-delimited-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-delimited-files` command reuses Spark's support for reading delimited text data. You can include any of
diff --git a/docs/import/import-files/json.md b/docs/import/import-files/json.md
index 84f8f80b..1ea01396 100644
--- a/docs/import/import-files/json.md
+++ b/docs/import/import-files/json.md
@@ -83,6 +83,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic:
     etc...
 ```
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-aggregate-json-files` command reuses Spark's support for reading JSON files. You can include any of
diff --git a/docs/import/import-files/orc.md b/docs/import/import-files/orc.md
index 9e4ce773..3093dab6 100644
--- a/docs/import/import-files/orc.md
+++ b/docs/import/import-files/orc.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-orc-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-orc-files` command reuses Spark's support for reading ORC files. You can include any of
diff --git a/docs/import/import-files/parquet.md b/docs/import/import-files/parquet.md
index 6e23b286..0b288329 100644
--- a/docs/import/import-files/parquet.md
+++ b/docs/import/import-files/parquet.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-parquet-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-parquet-files` command reuses Spark's support for reading Parquet files. You can include any of
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
index 376a18a1..c20d86e5 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
@@ -124,6 +124,28 @@ void jsonLines() {
         verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
     }
 
+    @Test
+    void gzippedJsonLines() {
+        run(
+            "import-aggregate-json-files",
+            "--path", "src/test/resources/delimited-files/line-delimited-json.txt.gz",
+            "--json-lines",
+            "--connection-string", makeConnectionString(),
+            "--permissions", DEFAULT_PERMISSIONS,
+            "--collections", "delimited-json-test",
+            "--uri-template", "/delimited/{lastName}.json"
+        );
+
+        assertCollectionSize(
+            "Spark data sources will automatically handle .gz files without -Pcompression=gzip being specified.",
+            "delimited-json-test", 3
+        );
+        verifyDoc("/delimited/lastName-1.json", "firstName-1", "lastName-1");
+        verifyDoc("/delimited/lastName-2.json", "firstName-2", "lastName-2");
+        verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
+    }
+
+
     @Test
     void jsonRootName() {
         run(
diff --git a/flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz b/flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a892c5974f7657a129f96503bd1792c45f285b01
GIT binary patch
literal 75
zcmV-R0JQ%fiwFqD(Vk`i19W9=a4vLsbO5VXO3N%NF7ZpuO;xf|KyY*om2{MH5)m>m
hHc+CLs~VRsBV4+S@aZzfrOOzo3jhvjeg~oe002i?9AN+e

literal 0
HcmV?d00001
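A usage sketch of the compression behavior this patch documents, in the same style as the Flux docs' own examples. The file paths are placeholders and `etc...` stands in for whatever other options your environment needs; only `--path`, `--json-lines`, and `-Pcompression=` come from the patch itself. The first invocation relies on a `.gz` filename being decompressed automatically; the second passes `-Pcompression=gzip` explicitly for a file whose name gives no hint about the codec:

    ./bin/flux import-aggregate-json-files --path path/to/data.txt.gz --json-lines etc...

    ./bin/flux import-aggregate-json-files --path path/to/data.txt --json-lines -Pcompression=gzip etc...

The first form is what the new `gzippedJsonLines` test exercises; the second is the explicit fallback described in each new "Reading compressed files" section.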