Merge pull request #175 from marklogic/feature/custom-docs
Tweaks for custom command docs
rjrudin authored Jul 15, 2024
2 parents 1e5ba3c + cbf2dbb commit 96d61ca
Showing 13 changed files with 41 additions and 23 deletions.
33 changes: 25 additions & 8 deletions docs/export/custom-export.md
@@ -8,6 +8,14 @@ nav_order: 6
The `custom-export-rows` and `custom-export-documents` commands allow you to read rows and documents, respectively, from
MarkLogic and write the results to a custom target.

## Table of contents
{: .no_toc .text-delta }

- TOC
{:toc}

## Usage

With the required `--target` option, you can specify
[any Spark data source](https://spark.apache.org/docs/latest/sql-data-sources.html) or the name of a third-party Spark
connector. For a third-party Spark connector, you must include the necessary JAR files for the connector in the
@@ -25,17 +33,26 @@ via `custom-export-rows`:
--query "op.fromView('schema', 'view')" etc...
```
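
For illustration, a complete invocation might resemble the following sketch. It assumes the Spark Parquet data source as the `--target`, passes the data source's `path` option through the `-P` mechanism, and uses placeholder connection details:

```
# Sketch: "parquet" and the path value are placeholders for any Spark data source and its options.
./bin/flux custom-export-rows \
    --connection-string "user:password@localhost:8004" \
    --query "op.fromView('schema', 'view')" \
    --target parquet \
    -Ppath=/tmp/row-export
```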

## Exporting rows

When using `custom-export-rows` with an Optic query to select rows from MarkLogic, each row sent to the connector or
data source defined by `--target` will have a schema based on the output of the Optic query. You may find the
`--preview` option helpful in understanding what data will be in these rows. See [Common Options](../common-options.md)
for more information.
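
As an example, here is a sketch of previewing the rows produced by an Optic query before writing them anywhere; it assumes `--preview` accepts a number of rows to display and uses placeholder connection details:

```
# Sketch only: --preview is assumed to take a row count.
./bin/flux custom-export-rows \
    --connection-string "user:password@localhost:8004" \
    --query "op.fromView('schema', 'view')" \
    --target parquet \
    --preview 10
```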

## Exporting documents

When using `custom-export-documents`, each document returned by MarkLogic will be represented as a Spark row with
the following column definitions:

1. "URI" containing a string.
2. "content" containing a byte array.
3. "format" containing a string.
4. "collections" containing an array of strings.
5. "permissions" containing a map of strings and arrays of strings representing roles and permissions.
6. "quality" containing an integer.
7. "properties" containing an XML document serialized to a string.
8. "metadataValues" containing a map of string keys and string values.
1. `URI` containing a string.
2. `content` containing a byte array.
3. `format` containing a string.
4. `collections` containing an array of strings.
5. `permissions` containing a map of strings and arrays of strings representing roles and permissions.
6. `quality` containing an integer.
7. `properties` containing an XML document serialized to a string.
8. `metadataValues` containing a map of string keys and string values.

These are normal Spark rows that can be written via Spark data sources like Parquet and ORC. If using a third-party
Spark connector, you will likely need to understand how that connector will make use of rows defined via the above
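
As an illustration of the document case, a minimal sketch of writing these rows to Spark's ORC data source follows; the `--collections` selector, connection string, and `path` value are placeholders rather than options defined in this file:

```
# Sketch: assumes documents can be selected with --collections, as in the getting-started example.
./bin/flux custom-export-documents \
    --connection-string "user:password@localhost:8004" \
    --collections example \
    --target orc \
    -Ppath=/tmp/document-export
```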
7 changes: 4 additions & 3 deletions docs/getting-started.md
@@ -227,14 +227,15 @@ documents:
--write-javascript "declareUpdate(); xdmp.documentAddCollections(URI, 'reprocessed')"
```

In qconsole, you can see that the 1000 employee documents are now also in the `reprocessed` collection. Or, use
Flux and its `--count` option, which allows you to get a count of all the data read by a command without processing or
In qconsole, you can see that the 1000 employee documents are now also in the `reprocessed` collection. You can also
use Flux and its `--count` option, which allows you to get a count of all the data read by a command without processing or
writing any of the data:

```
./bin/flux export-files \
--connection-string "flux-example-user:password@localhost:8004" \
--path export --collections reprocessed --count
--path export --collections reprocessed \
--count
```

For more information, please see the [Reprocessing guide](reprocess.md).
@@ -37,7 +37,7 @@ public static class CustomWriteParams implements CustomExportWriteOptions {

@CommandLine.Option(
names = "-P",
description = "Specify any number of options to be passed to the connector identified by '--target'."
description = "Specify any number of options to be passed to the connector identified by '--target' - e.g. -PmyOption=someValue."
)
private Map<String, String> additionalOptions = new HashMap<>();

@@ -48,14 +48,14 @@ public static class CustomReadParams implements CustomReadOptions {

@CommandLine.Option(
names = "--source",
description = "Identifier for the Spark connector that is the source of data to import.",
description = "Identifier for the Spark connector or data source that is used to read data.",
required = true
)
private String source;

@CommandLine.Option(
names = "-P",
description = "Specify any number of options to be passed to the connector identified by '--source'."
description = "Specify any number of options to be passed to the connector identified by '--source' - e.g. -PmyOption=someValue."
)
private Map<String, String> additionalOptions = new HashMap<>();
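
For illustration of the read side, here is a hypothetical invocation of the command backed by these options. The command name `custom-import` and the `parquet`/`path` values are assumptions; only `--source` and `-P` are defined above:

```
# Hypothetical sketch: the command name and option values are assumptions.
./bin/flux custom-import \
    --source parquet \
    -Ppath=/tmp/row-export \
    --connection-string "user:password@localhost:8004"
```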

@@ -15,7 +15,7 @@
@CommandLine.Command(
name = "export-avro-files",
description = "Read rows via Optic from MarkLogic and write them to Avro files on a local filesystem, HDFS, or S3 " +
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-avro.html."
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-avro.html ."
)
public class ExportAvroFilesCommand extends AbstractExportRowsToFilesCommand<AvroFilesExporter> implements AvroFilesExporter {
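
Based on this description, a sketch of invoking the command might look as follows; it assumes the command shares the `--path` option used by `export-files` in the getting-started guide, with placeholder connection details:

```
# Sketch: --path is assumed to behave as it does for export-files.
./bin/flux export-avro-files \
    --connection-string "user:password@localhost:8004" \
    --query "op.fromView('schema', 'view')" \
    --path /tmp/avro-export
```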

@@ -15,7 +15,7 @@
@CommandLine.Command(
name = "export-delimited-files",
description = "Read rows via Optic from MarkLogic and write them to delimited text files on a local filesystem, " +
"HDFS, or S3 using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-csv.html."
"HDFS, or S3 using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-csv.html ."
)
public class ExportDelimitedFilesCommand extends AbstractExportRowsToFilesCommand<DelimitedFilesExporter> implements DelimitedFilesExporter {

@@ -16,7 +16,7 @@
@CommandLine.Command(
name = "export-jdbc",
description = "Read rows via Optic from MarkLogic and write them to a table using Spark's support defined at" +
"%nhttps://spark.apache.org/docs/latest/sql-data-sources-jdbc.html."
"%nhttps://spark.apache.org/docs/latest/sql-data-sources-jdbc.html ."
)
public class ExportJdbcCommand extends AbstractCommand<JdbcExporter> implements JdbcExporter {
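
A hypothetical invocation is sketched below; the `--jdbc-url` and `--table` option names and values are assumptions rather than options shown in this diff:

```
# Hypothetical sketch: JDBC option names and values are assumptions.
./bin/flux export-jdbc \
    --connection-string "user:password@localhost:8004" \
    --query "op.fromView('schema', 'view')" \
    --jdbc-url "jdbc:postgresql://localhost:5432/example?user=postgres&password=postgres" \
    --table employee_export
```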

@@ -15,7 +15,7 @@
@CommandLine.Command(
name = "export-json-lines-files",
description = "Read rows via Optic from MarkLogic and write them to JSON Lines files on a local filesystem, HDFS, or S3 " +
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-json.html."
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-json.html ."
)
public class ExportJsonLinesFilesCommand extends AbstractExportRowsToFilesCommand<JsonLinesFilesExporter> implements JsonLinesFilesExporter {

@@ -15,7 +15,7 @@
@CommandLine.Command(
name = "export-orc-files",
description = "Read rows via Optic from MarkLogic and write them to ORC files on a local filesystem, HDFS, or S3 " +
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-orc.html."
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-orc.html ."
)
public class ExportOrcFilesCommand extends AbstractExportRowsToFilesCommand<OrcFilesExporter> implements OrcFilesExporter {

@@ -15,7 +15,7 @@
@CommandLine.Command(
name = "export-parquet-files",
description = "Read rows via Optic from MarkLogic and write them to Parquet files on a local filesystem, HDFS, or S3 " +
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-jdbc.html."
"using Spark's support defined at %nhttps://spark.apache.org/docs/latest/sql-data-sources-parquet.html ."
)
public class ExportParquetFilesCommand extends AbstractExportRowsToFilesCommand<ParquetFilesExporter> implements ParquetFilesExporter {

@@ -27,6 +27,6 @@ void test() {
assertFalse(options.containsKey("spark.sql.parquet.filterPushdown"),
"Dynamic params starting with 'spark.sql' should not be added to the 'read' options. They should " +
"instead be added to the SparkConf object, per the documentation at " +
"https://spark.apache.org/docs/latest/sql-data-sources-avro.html.");
"https://spark.apache.org/docs/latest/sql-data-sources-avro.html .");
}
}
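
The assertion above implies that dynamic parameters whose names start with `spark.sql` are applied to the SparkConf rather than to the data source's read options. As a sketch only (the `-P` routing of `spark.sql` keys is inferred from this test, not documented here, and the other values are placeholders):

```
# Sketch: the spark.sql-prefixed value is assumed to reach the SparkConf, per the assertion above.
./bin/flux export-avro-files \
    --connection-string "user:password@localhost:8004" \
    --query "op.fromView('schema', 'view')" \
    --path /tmp/avro-export \
    -Pspark.sql.parquet.filterPushdown=false
```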
@@ -27,6 +27,6 @@ void test() {
assertFalse(options.containsKey("spark.sql.parquet.filterPushdown"),
"Dynamic params starting with 'spark.sql' should not be added to the 'read' options. They should " +
"instead be added to the SparkConf object, per the documentation at " +
"https://spark.apache.org/docs/latest/sql-data-sources-orc.html.");
"https://spark.apache.org/docs/latest/sql-data-sources-orc.html .");
}
}
@@ -27,6 +27,6 @@ void configurationAndDataSourceOptions() {
assertFalse(options.containsKey("spark.sql.parquet.filterPushdown"),
"Dynamic params starting with 'spark.sql' should not be added to the 'read' options. They should " +
"instead be added to the SparkConf object, per the documentation at " +
"https://spark.apache.org/docs/latest/sql-data-sources-parquet.html.");
"https://spark.apache.org/docs/latest/sql-data-sources-parquet.html .");
}
}
