Skip to content

Commit

Permalink
Merge pull request #177 from marklogic/feature/preview-schema
Browse files Browse the repository at this point in the history
Added --preview-schema
  • Loading branch information
rjrudin authored Jul 16, 2024
2 parents 7e09dd6 + 561c4e1 commit aa9fec4
Show file tree
Hide file tree
Showing 12 changed files with 93 additions and 85 deletions.
13 changes: 13 additions & 0 deletions docs/common-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,19 @@ Note that in the case of previewing an import, Flux will show the data as it has
Spark rows with columns. The data is not shown as a set of documents yet, as the transformation of rows to documents
occurs when the data is written to MarkLogic.

For some commands, it may be helpful to see the schema of the data read from the command's data source. For example,
when exporting data with a MarkLogic Optic query, you may wish to verify that the datatypes of each column are what you
expect before writing the data to a Parquet file or relational database. Use the `--preview-schema` option to request
that Flux log the schema and not write any data:

```
./bin/flux export-parquet-files \
--connection-string "flux-example-user:password@localhost:8004" \
--query "op.fromView('Example', 'Employees')" \
--path export/parquet \
--preview-schema
```

## Applying a limit

For many use cases, it can be useful to only process a small subset of the source data to ensure that the results
Expand Down
4 changes: 2 additions & 2 deletions docs/export/custom-export.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ via `custom-export-rows`:

When using `custom-export-rows` with an Optic query to select rows from MarkLogic, each row sent to the connector or
data source defined by `--target` will have a schema based on the output of the Optic query. You may find the
`--preview` option helpful in understanding what data will be these rows. See [Common Options](../common-options.md)
for more information.
`--preview` and `--preview-schema` options helpful in understanding what data will be in these rows.
See [Common Options](../common-options.md) for more information.

## Exporting documents

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public final void execute(SparkSession session) {
Dataset<Row> dataset = readDataset(session);
if (commonParams.isCount()) {
logger.info("Count: {}", dataset.count());
} else if (commonParams.isPreviewRequested()) {
} else if (commonParams.getPreview().isPreviewRequested()) {
commonParams.getPreview().showPreview(dataset);
} else {
applyWriter(session, dataset.write());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,10 @@ public Dataset<Row> applyParams(Dataset<Row> dataset) {
return dataset;
}

public void setCount(boolean count) {
this.count = count;
}

public boolean isCount() {
return count;
}

public boolean isPreviewRequested() {
return preview != null && preview.getNumberRows() > 0;
}

// Optional cap on the number of rows read from the data source; a null value
// is presumably treated as "no limit" by applyParams — TODO confirm, as the
// consuming code is outside this hunk.
public void setLimit(Integer limit) {
    this.limit = limit;
}
Expand Down
17 changes: 13 additions & 4 deletions flux-cli/src/main/java/com/marklogic/flux/impl/Preview.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/
package com.marklogic.flux.impl;

import com.marklogic.spark.Util;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import picocli.CommandLine;
Expand All @@ -20,16 +21,24 @@ public class Preview {
@CommandLine.Option(names = "--preview-vertical", description = "Preview the data in a vertical format instead of in a table.")
private boolean vertical;

@CommandLine.Option(names = "--preview-schema", description = "Show the schema of the data read by the command.")
private boolean showSchema;

/**
 * Displays the requested preview of the given dataset: optionally logs its Spark schema,
 * and optionally shows up to {@code numberRows} rows, after dropping any user-specified columns.
 */
public void showPreview(Dataset<Row> dataset) {
    // Operate on a trimmed view when the user asked for columns to be dropped.
    Dataset<Row> toShow = (columnsToDrop == null || columnsToDrop.isEmpty())
        ? dataset
        : dataset.drop(columnsToDrop.toArray(new String[0]));
    if (showSchema && Util.MAIN_LOGGER.isInfoEnabled()) {
        Util.MAIN_LOGGER.info("Spark schema of the data read by this command:\n{}", toShow.schema().prettyJson());
    }
    if (numberRows > 0) {
        // Values are deliberately not truncated; users can drop overly-long columns instead.
        toShow.show(numberRows, Integer.MAX_VALUE, vertical);
    }
}

public int getNumberRows() {
return numberRows;
/**
 * @return true when the user asked for any form of preview — either sample rows
 * (via {@code --preview}) or the schema (via {@code --preview-schema}).
 */
public boolean isPreviewRequested() {
    return showSchema || numberRows > 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,6 @@ private void validatePartitionParams(CommandLine.ParseResult parseResult) {
}

private void validateWriteParams(CommandLine.ParseResult parseResult) {
if (parseResult.subcommand().hasMatchedOption("--preview")) {
return;
}
String[] options = new String[]{
"--write-invoke", "--write-javascript", "--write-xquery", "--write-javascript-file", "--write-xquery-file"
};
Expand Down
61 changes: 61 additions & 0 deletions flux-cli/src/test/java/com/marklogic/flux/impl/PreviewTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.flux.impl;

import com.marklogic.flux.AbstractTest;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Verifies the --preview, --preview-vertical, --preview-drop, and --preview-schema options.
 * Each test asserts that previewing suppresses the command's normal write/export behavior.
 */
class PreviewTest extends AbstractTest {

    @Test
    void limitAndPreview() {
        run(
            "import-delimited-files",
            "--path", "src/test/resources/delimited-files/three-rows.csv",
            "--connection-string", makeConnectionString(),
            "--limit", "1",
            "--preview", "3",
            "--preview-vertical",
            "--collections", "sample"
        );

        // We have not had reliable success with capturing stdout and making assertions on it, so this just verifies
        // that nothing is written.
        assertCollectionSize("Verifying that nothing is written to the collection", "sample", 0);
    }

    @Test
    void preview() {
        run(
            "import-files",
            "--path", "src/test/resources/mixed-files",
            "--preview", "2",
            "--preview-drop", "content", "modificationTime",
            "--preview-vertical",
            "--collections", "sample"
        );

        assertCollectionSize("Verifying that nothing is written to the collection", "sample", 0);
    }

    @Test
    void previewSchema(@TempDir Path tempDir) {
        run(
            "export-parquet",
            "--path", tempDir.toFile().getAbsolutePath(),
            "--connection-string", makeConnectionString(),
            "--query", "op.fromView('Medical', 'Authors')",
            "--preview-schema"
        );

        // File.list() returns null when the path does not denote a directory; guard so
        // that scenario fails the assertion (-1 != 0) instead of throwing an NPE.
        String[] exportedFiles = tempDir.toFile().list();
        assertEquals(0, exportedFiles != null ? exportedFiles.length : -1,
            "Verifying that nothing was exported since preview-schema was used, which should result in the Spark " +
                "dataset schema being logged at the INFO level (which we're not yet able to assert on).");
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import com.marklogic.client.query.QueryManager;
import com.marklogic.client.query.StructuredQueryDefinition;
import com.marklogic.flux.AbstractTest;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -127,29 +126,6 @@ void dontInferSchema() {
assertEquals(JsonNodeType.STRING, doc.get("flag").getNodeType());
}

/**
* We expect limit and preview to work for every command; this is just a simple sanity check
* for this command.
*/
@Test
@Disabled("stdout isn't being captured correctly for this test, will debug soon.")
void limitAndPreview() {
String stdout = runAndReturnStdout(() -> {
run(
"import-delimited-files",
"--path", "src/test/resources/delimited-files/three-rows.csv",
"--connection-string", makeConnectionString(),
"--limit", "1",
"--preview", "3",
"--preview-vertical"
);
});

String message = "Unexpected stdout: " + stdout;
assertTrue(stdout.contains("number | 1"), message);
assertFalse(stdout.contains("number | 2"), message);
}

@Test
void dontAbortOnReadFailure() {
String stderr = runAndReturnStderr(() -> run(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import com.marklogic.flux.AbstractTest;
import com.marklogic.junit5.XmlNode;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.springframework.util.FileCopyUtils;
Expand All @@ -16,7 +15,8 @@
import java.util.List;
import java.util.stream.Stream;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class ImportFilesTest extends AbstractTest {

Expand Down Expand Up @@ -86,30 +86,6 @@ void documentType() {
assertEquals("element", kind);
}

/**
* preview = show the first N rows from the reader, and don't invoke the writer.
*/
@Test
@Disabled("Another stdout test that runs fine by itself, but fails when the suite is run.")
void preview() {
String stdout = runAndReturnStdout(() -> run(
"import-files",
"--path", "src/test/resources/mixed-files",
"--preview", "2",
"--preview-drop", "content", "modificationTime",
"--preview-vertical"
));

String message = "Unexpected output to stdout: " + stdout;
assertTrue(stdout.contains("RECORD 0"), message);
assertTrue(stdout.contains("RECORD 1"), message);
assertFalse(stdout.contains("RECORD 2"), message);
assertTrue(stdout.contains("path"), message);
assertTrue(stdout.contains("length"), message);
assertFalse(stdout.contains("content"), message);
assertFalse(stdout.contains("modificationTime"), message);
}

@Test
void fileOptions(@TempDir Path tempDir) throws IOException {
File optionsFile = new File(tempDir.toFile(), "options.txt");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ void badJdbcDriverValue() {
"--jdbc-url", PostgresUtil.URL_WITH_AUTH,
"--jdbc-driver", "not.valid.driver.value",
"--connection-string", makeConnectionString(),
"--query", "select * from customer",
"--preview", "10"
"--query", "select * from customer"
), "Command failed, cause: Unable to load class: not.valid.driver.value; " +
"for a JDBC driver, ensure you are specifying the fully-qualified class name for your JDBC driver.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ void invalidParquetFile() {
run("import-parquet-files",
"--connection-string", makeConnectionString(),
"--path", "src/test/resources/parquet/individual/invalid.parquet",
"--preview", "10",
"--abort-on-read-failure"
)
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,6 @@ void test() {
}
}

@Test
void previewDoesntRequireWriteParam() {
String stdout = runAndReturnStdout(() -> run(
"reprocess",
"--connection-string", makeConnectionString(),
"--read-javascript", "cts.uris(null, null, cts.collectionQuery('author'))",
"--preview", "2"
));

assertTrue(stdout.contains("only showing top 2 rows"),
"No 'write' param should be required when a user uses '--preview', as the user is specifically asking " +
"just to see the read data and not to write anything.");
}

@Test
void missingReadParam() {
String stderr = runAndReturnStderr(() -> run(
Expand Down

0 comments on commit aa9fec4

Please sign in to comment.