From 0ddda3570b4a32ac03c306b5ee07087a0b422b65 Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Mon, 22 Jul 2024 17:45:36 -0400
Subject: [PATCH] Added docs for compression with Spark data sources

---
 docs/common-options.md                     |   6 +++---
 docs/import/import-files/avro.md           |   6 ++++++
 docs/import/import-files/delimited-text.md |   6 ++++++
 docs/import/import-files/json.md           |   6 ++++++
 docs/import/import-files/orc.md            |   6 ++++++
 docs/import/import-files/parquet.md        |   6 ++++++
 .../ImportAggregateJsonFilesTest.java      |  22 ++++++++++++++++++++++
 .../line-delimited-json.txt.gz             | Bin 0 -> 75 bytes
 8 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz

diff --git a/docs/common-options.md b/docs/common-options.md
index 0709894d..748e64ee 100644
--- a/docs/common-options.md
+++ b/docs/common-options.md
@@ -32,10 +32,10 @@ are followed by a list of options common to every Flux command.
 You can specify a command name without entering its full name, as long as you enter a sufficient number of characters
 such that Flux can uniquely identify the command name.
 
-For example, instead of entering `import-aggregate-xml-files`, you can enter `import-ag` as it is the only command in
-Flux with that sequence of letters:
+For example, instead of entering `import-parquet-files`, you can enter `import-p` as it is the only command in
+Flux beginning with that sequence of letters:
 
-    ./bin/flux import-ag --path path/to/data etc...
+    ./bin/flux import-p --path path/to/data etc...
 
 If Flux cannot uniquely identify the command name, it will print an error and list the command names that match
 what you entered.
diff --git a/docs/import/import-files/avro.md b/docs/import/import-files/avro.md
index 1cdbcadc..da20357e 100644
--- a/docs/import/import-files/avro.md
+++ b/docs/import/import-files/avro.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-avro-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-avro-files` command reuses Spark's support for reading Avro files. You can include any of
diff --git a/docs/import/import-files/delimited-text.md b/docs/import/import-files/delimited-text.md
index 1d70a805..0b99ef60 100644
--- a/docs/import/import-files/delimited-text.md
+++ b/docs/import/import-files/delimited-text.md
@@ -106,6 +106,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic - e.g
 The `import-delimited-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-delimited-files` command reuses Spark's support for reading delimited text data. You can include any of
diff --git a/docs/import/import-files/json.md b/docs/import/import-files/json.md
index 84f8f80b..1ea01396 100644
--- a/docs/import/import-files/json.md
+++ b/docs/import/import-files/json.md
@@ -83,6 +83,12 @@ the content can be correctly translated to UTF-8 when written to MarkLogic:
     etc...
 ```
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-aggregate-json-files` command reuses Spark's support for reading JSON files. You can include any of
diff --git a/docs/import/import-files/orc.md b/docs/import/import-files/orc.md
index 9e4ce773..3093dab6 100644
--- a/docs/import/import-files/orc.md
+++ b/docs/import/import-files/orc.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-orc-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-orc-files` command reuses Spark's support for reading ORC files. You can include any of
diff --git a/docs/import/import-files/parquet.md b/docs/import/import-files/parquet.md
index 6e23b286..0b288329 100644
--- a/docs/import/import-files/parquet.md
+++ b/docs/import/import-files/parquet.md
@@ -86,6 +86,12 @@ it may be important to query for documents that have a particular field with a v
 The `import-parquet-files` command supports aggregating related rows together to produce hierarchical documents. See
 [Aggregating rows](../aggregating-rows.md) for more information.
 
+## Reading compressed files
+
+Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to
+specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to
+explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically.
+
 ## Advanced options
 
 The `import-parquet-files` command reuses Spark's support for reading Parquet files. You can include any of
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
index 376a18a1..c20d86e5 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
@@ -124,6 +124,28 @@ void jsonLines() {
         verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
     }
 
+    @Test
+    void gzippedJsonLines() {
+        run(
+            "import-aggregate-json-files",
+            "--path", "src/test/resources/delimited-files/line-delimited-json.txt.gz",
+            "--json-lines",
+            "--connection-string", makeConnectionString(),
+            "--permissions", DEFAULT_PERMISSIONS,
+            "--collections", "delimited-json-test",
+            "--uri-template", "/delimited/{lastName}.json"
+        );
+
+        assertCollectionSize(
+            "Spark data sources will automatically handle .gz files without -Pcompression=gzip being specified.",
+            "delimited-json-test", 3
+        );
+        verifyDoc("/delimited/lastName-1.json", "firstName-1", "lastName-1");
+        verifyDoc("/delimited/lastName-2.json", "firstName-2", "lastName-2");
+        verifyDoc("/delimited/lastName-3.json", "firstName-3", "lastName-3");
+    }
+
+
     @Test
     void jsonRootName() {
         run(
diff --git a/flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz b/flux-cli/src/test/resources/delimited-files/line-delimited-json.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a892c5974f7657a129f96503bd1792c45f285b01
GIT binary patch
literal 75
zcmV-R0JQ%fiwFqD(Vk`i19W9=a4vLsbO5VXO3N%NF7ZpuO;xf|KyY*om2{MH5)m>m
hHc+CLs~VRsBV4+S@aZzfrOOzo3jhvjeg~oe002i?9AN+e

literal 0
HcmV?d00001
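A usage sketch of the compression behavior this patch documents, in the same style as the Flux docs' own examples. The file paths are placeholders and `etc...` stands in for whatever other options your environment needs; only `--path`, `--json-lines`, and `-Pcompression=` come from the patch itself. The first invocation relies on a `.gz` filename being decompressed automatically; the second passes `-Pcompression=gzip` explicitly for a file whose name gives no hint about the codec:

    ./bin/flux import-aggregate-json-files --path path/to/data.txt.gz --json-lines etc...

    ./bin/flux import-aggregate-json-files --path path/to/data.txt --json-lines -Pcompression=gzip etc...

The first form is what the new `gzippedJsonLines` test exercises; the second is the explicit fallback described in each new "Reading compressed files" section.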