diff --git a/.gitignore b/.gitignore index c84c103c5..5201cfc95 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ out/** *.iws .vscode .pmdCache + +*/bin \ No newline at end of file diff --git a/examples/substrait-spark/.gitignore b/examples/substrait-spark/.gitignore index 8965a89f4..a6765eee4 100644 --- a/examples/substrait-spark/.gitignore +++ b/examples/substrait-spark/.gitignore @@ -1,2 +1,4 @@ +spark-warehouse +derby.log _apps _data diff --git a/examples/substrait-spark/README.md b/examples/substrait-spark/README.md index 97a53b707..a8079e2cd 100644 --- a/examples/substrait-spark/README.md +++ b/examples/substrait-spark/README.md @@ -1,7 +1,6 @@ # Introduction to the Substrait-Spark library -The Substrait-Spark library was recently added to the [substrait-java](https://github.com/substrait-io/substrait-java) project; this library allows Substrait plans to convert to and from Spark Plans. - +The Substrait-Spark library allows Substrait plans to convert to and from Spark Plans. This example will show how this can be used. ## How does this work in practice? 
@@ -27,7 +26,7 @@ To run these you will need: - Java 17 or greater - Docker to start a test Spark Cluster - you could use your own cluster, but would need to adjust file locations defined in [SparkHelper](./app/src/main/java/io/substrait/examples/SparkHelper.java) -- [just task runner](https://github.com/casey/just#installation) optional, but very helpful to run the bash commands +- The [just task runner](https://github.com/casey/just#installation) is optional, but very helpful to run the bash commands - [Two datafiles](./app/src/main/resources/) are provided (CSV format) For building using the `substrait-spark` library youself, using the [mvn repository](https://mvnrepository.com/artifact/io.substrait/spark) diff --git a/examples/substrait-spark/app/.gitignore b/examples/substrait-spark/app/.gitignore deleted file mode 100644 index 2ee4e319c..000000000 --- a/examples/substrait-spark/app/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -spark-warehouse -derby.log diff --git a/examples/substrait-spark/app/build.gradle b/examples/substrait-spark/app/build.gradle deleted file mode 100644 index cb2710b8b..000000000 --- a/examples/substrait-spark/app/build.gradle +++ /dev/null @@ -1,62 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. 
- */ - -plugins { - id 'buildlogic.java-application-conventions' -} - -dependencies { - implementation 'org.apache.commons:commons-text' - // for running as a Spark application for real, this could be compile-only - - - implementation libs.substrait.core - implementation libs.substrait.spark - implementation libs.spark.sql - - // For a real Spark application, these would not be required since they would be in the Spark server classpath - runtimeOnly libs.spark.core -// https://mvnrepository.com/artifact/org.apache.spark/spark-hive - runtimeOnly libs.spark.hive - - - -} - -def jvmArguments = [ - "--add-exports", - "java.base/sun.nio.ch=ALL-UNNAMED", - "--add-opens=java.base/java.net=ALL-UNNAMED", - "--add-opens=java.base/java.nio=ALL-UNNAMED", - "-Dspark.master=local" -] - -application { - // Define the main class for the application. - mainClass = 'io.substrait.examples.App' - applicationDefaultJvmArgs = jvmArguments -} - -jar { - zip64 = true - duplicatesStrategy = DuplicatesStrategy.EXCLUDE - - manifest { - attributes 'Main-Class': 'io.substrait.examples.App' - } - - from { - configurations.runtimeClasspath.collect { it.isDirectory() ? it : zipTree(it) } - } - - exclude 'META-INF/*.RSA' - exclude 'META-INF/*.SF' - exclude 'META-INF/*.DSA' -} - -repositories { - -} diff --git a/examples/substrait-spark/build-logic/build.gradle b/examples/substrait-spark/build-logic/build.gradle deleted file mode 100644 index d29beaf6e..000000000 --- a/examples/substrait-spark/build-logic/build.gradle +++ /dev/null @@ -1,16 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. - */ - -plugins { - // Support convention plugins written in Groovy. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build. - id 'groovy-gradle-plugin' -} - -repositories { - - // Use the plugin portal to apply community plugins in convention plugins. 
- gradlePluginPortal() -} diff --git a/examples/substrait-spark/build-logic/settings.gradle b/examples/substrait-spark/build-logic/settings.gradle deleted file mode 100644 index 58fbfd5cb..000000000 --- a/examples/substrait-spark/build-logic/settings.gradle +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This settings file is used to specify which projects to include in your build-logic build. - * This project uses @Incubating APIs which are subject to change. - */ - -dependencyResolutionManagement { - // Reuse version catalog from the main build. - versionCatalogs { - create('libs', { from(files("../gradle/libs.versions.toml")) }) - } -} - -rootProject.name = 'build-logic' diff --git a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-application-conventions.gradle b/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-application-conventions.gradle deleted file mode 100644 index 1006b9b31..000000000 --- a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-application-conventions.gradle +++ /dev/null @@ -1,13 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. - */ - -plugins { - // Apply the common convention plugin for shared build configuration between library and application projects. - id 'buildlogic.java-common-conventions' - - // Apply the application plugin to add support for building a CLI application in Java. 
- id 'application' -} diff --git a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-common-conventions.gradle b/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-common-conventions.gradle deleted file mode 100644 index 1f605ee5f..000000000 --- a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-common-conventions.gradle +++ /dev/null @@ -1,39 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. - */ - -plugins { - // Apply the java Plugin to add support for Java. - id 'java' -} - -repositories { - // Use Maven Central for resolving dependencies. - mavenCentral() -} - -dependencies { - constraints { - // Define dependency versions as constraints - implementation 'org.apache.commons:commons-text:1.11.0' - } -} - -testing { - suites { - // Configure the built-in test suite - test { - // Use JUnit Jupiter test framework - useJUnitJupiter('5.10.1') - } - } -} - -// Apply a specific Java toolchain to ease working on different environments. -java { - toolchain { - languageVersion = JavaLanguageVersion.of(17) - } -} diff --git a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-library-conventions.gradle b/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-library-conventions.gradle deleted file mode 100644 index 526803e32..000000000 --- a/examples/substrait-spark/build-logic/src/main/groovy/buildlogic.java-library-conventions.gradle +++ /dev/null @@ -1,13 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. - */ - -plugins { - // Apply the common convention plugin for shared build configuration between library and application projects. - id 'buildlogic.java-common-conventions' - - // Apply the java-library plugin for API and implementation separation. 
- id 'java-library' -} diff --git a/examples/substrait-spark/build.gradle.kts b/examples/substrait-spark/build.gradle.kts new file mode 100644 index 000000000..05db91980 --- /dev/null +++ b/examples/substrait-spark/build.gradle.kts @@ -0,0 +1,48 @@ + +plugins { + // Apply the application plugin to add support for building a CLI application in Java. + id("java") +} + +repositories { + // Use Maven Central for resolving dependencies. + mavenCentral() +} + +dependencies { + implementation("org.apache.spark:spark-core_2.12:3.5.1") + implementation("io.substrait:spark:0.36.0") + implementation("io.substrait:core:0.36.0") + implementation("org.apache.spark:spark-sql_2.12:3.5.1") + + // For a real Spark application, these would not be required since they would be in the Spark server classpath + runtimeOnly("org.apache.spark:spark-core_2.12:3.5.1") + runtimeOnly("org.apache.spark:spark-hive_2.12:3.5.1") + +} + +tasks.jar { + isZip64 = true + exclude ("META-INF/*.RSA") + exclude ("META-INF/*.SF") + exclude ("META-INF/*.DSA") + + duplicatesStrategy = DuplicatesStrategy.EXCLUDE + manifest.attributes["Main-Class"] = "io.substrait.examples.App" + from(configurations.runtimeClasspath.get().map({ if (it.isDirectory) it else zipTree(it) })) + +} + +tasks.named("test") { + // Use JUnit Platform for unit tests. + useJUnitPlatform() +} +java { + toolchain { languageVersion.set(JavaLanguageVersion.of(17)) } +} +// // Apply a specific Java toolchain to ease working on different environments. +// java { +// toolchain { +// languageVersion = JavaLanguageVersion.of(17) +// } +// } diff --git a/examples/substrait-spark/gradle.properties b/examples/substrait-spark/gradle.properties deleted file mode 100644 index 18f452c73..000000000 --- a/examples/substrait-spark/gradle.properties +++ /dev/null @@ -1,6 +0,0 @@ -# This file was generated by the Gradle 'init' task. 
-# https://docs.gradle.org/current/userguide/build_environment.html#sec:gradle_configuration_properties - -org.gradle.parallel=true -org.gradle.caching=true - diff --git a/examples/substrait-spark/gradle/libs.versions.toml b/examples/substrait-spark/gradle/libs.versions.toml deleted file mode 100644 index 8a36ae4d9..000000000 --- a/examples/substrait-spark/gradle/libs.versions.toml +++ /dev/null @@ -1,14 +0,0 @@ -# This file was generated by the Gradle 'init' task. -# https://docs.gradle.org/current/userguide/platforms.html#sub::toml-dependencies-format -[versions] -spark = "3.5.1" -spotless = "6.25.0" -substrait = "0.36.0" -substrait-spark = "0.36.0" - -[libraries] -spark-core = { module = "org.apache.spark:spark-core_2.12", version.ref = "spark" } -spark-sql = { module = "org.apache.spark:spark-sql_2.12", version.ref = "spark" } -spark-hive = { module = "org.apache.spark:spark-hive_2.12", version.ref = "spark" } -substrait-spark = { module = "io.substrait:spark", version.ref = "substrait-spark" } -substrait-core = { module = "io.substrait:core", version.ref = "substrait" } diff --git a/examples/substrait-spark/gradle/wrapper/gradle-wrapper.jar b/examples/substrait-spark/gradle/wrapper/gradle-wrapper.jar deleted file mode 100644 index e6441136f..000000000 Binary files a/examples/substrait-spark/gradle/wrapper/gradle-wrapper.jar and /dev/null differ diff --git a/examples/substrait-spark/gradle/wrapper/gradle-wrapper.properties b/examples/substrait-spark/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index b82aa23a4..000000000 --- a/examples/substrait-spark/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,7 +0,0 @@ -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip -networkTimeout=10000 -validateDistributionUrl=true -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists diff --git a/examples/substrait-spark/gradlew 
b/examples/substrait-spark/gradlew deleted file mode 100755 index 1aa94a426..000000000 --- a/examples/substrait-spark/gradlew +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/sh - -# -# Copyright © 2015-2021 the original authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -############################################################################## -# -# Gradle start up script for POSIX generated by Gradle. -# -# Important for running: -# -# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is -# noncompliant, but you have some other compliant shell such as ksh or -# bash, then to run this script, type that shell name before the whole -# command line, like: -# -# ksh Gradle -# -# Busybox and similar reduced shells will NOT work, because this script -# requires all of these POSIX shell features: -# * functions; -# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», -# «${var#prefix}», «${var%suffix}», and «$( cmd )»; -# * compound commands having a testable exit status, especially «case»; -# * various built-in commands including «command», «set», and «ulimit». -# -# Important for patching: -# -# (2) This script targets any POSIX shell, so it avoids extensions provided -# by Bash, Ksh, etc; in particular arrays are avoided. 
-# -# The "traditional" practice of packing multiple parameters into a -# space-separated string is a well documented source of bugs and security -# problems, so this is (mostly) avoided, by progressively accumulating -# options in "$@", and eventually passing that to Java. -# -# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, -# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; -# see the in-line comments for details. -# -# There are tweaks for specific operating systems such as AIX, CygWin, -# Darwin, MinGW, and NonStop. -# -# (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt -# within the Gradle project. -# -# You can find Gradle at https://github.com/gradle/gradle/. -# -############################################################################## - -# Attempt to set APP_HOME - -# Resolve links: $0 may be a link -app_path=$0 - -# Need this for daisy-chained symlinks. -while - APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path - [ -h "$app_path" ] -do - ls=$( ls -ld "$app_path" ) - link=${ls#*' -> '} - case $link in #( - /*) app_path=$link ;; #( - *) app_path=$APP_HOME$link ;; - esac -done - -# This is normally unused -# shellcheck disable=SC2034 -APP_BASE_NAME=${0##*/} -# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) -APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD=maximum - -warn () { - echo "$*" -} >&2 - -die () { - echo - echo "$*" - echo - exit 1 -} >&2 - -# OS specific support (must be 'true' or 'false'). 
-cygwin=false -msys=false -darwin=false -nonstop=false -case "$( uname )" in #( - CYGWIN* ) cygwin=true ;; #( - Darwin* ) darwin=true ;; #( - MSYS* | MINGW* ) msys=true ;; #( - NONSTOP* ) nonstop=true ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD=$JAVA_HOME/jre/sh/java - else - JAVACMD=$JAVA_HOME/bin/java - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD=java - if ! command -v java >/dev/null 2>&1 - then - die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -fi - -# Increase the maximum file descriptors if we can. -if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then - case $MAX_FD in #( - max*) - # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC2039,SC3045 - MAX_FD=$( ulimit -H -n ) || - warn "Could not query maximum file descriptor limit" - esac - case $MAX_FD in #( - '' | soft) :;; #( - *) - # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC2039,SC3045 - ulimit -n "$MAX_FD" || - warn "Could not set maximum file descriptor limit to $MAX_FD" - esac -fi - -# Collect all arguments for the java command, stacking in reverse order: -# * args from the command line -# * the main class name -# * -classpath -# * -D...appname settings -# * --module-path (only if needed) -# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
- -# For Cygwin or MSYS, switch paths to Windows format before running java -if "$cygwin" || "$msys" ; then - APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) - CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) - - JAVACMD=$( cygpath --unix "$JAVACMD" ) - - # Now convert the arguments - kludge to limit ourselves to /bin/sh - for arg do - if - case $arg in #( - -*) false ;; # don't mess with options #( - /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath - [ -e "$t" ] ;; #( - *) false ;; - esac - then - arg=$( cygpath --path --ignore --mixed "$arg" ) - fi - # Roll the args list around exactly as many times as the number of - # args, so each arg winds up back in the position where it started, but - # possibly modified. - # - # NB: a `for` loop captures its iteration list before it begins, so - # changing the positional parameters here affects neither the number of - # iterations, nor the values presented in `arg`. - shift # remove old arg - set -- "$@" "$arg" # push replacement arg - done -fi - - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' - -# Collect all arguments for the java command: -# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, -# and any embedded shellness will be escaped. -# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be -# treated as '${Hostname}' itself on the command line. - -set -- \ - "-Dorg.gradle.appname=$APP_BASE_NAME" \ - -classpath "$CLASSPATH" \ - org.gradle.wrapper.GradleWrapperMain \ - "$@" - -# Stop when "xargs" is not available. -if ! command -v xargs >/dev/null 2>&1 -then - die "xargs is not available" -fi - -# Use "xargs" to parse quoted args. -# -# With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
-# -# In Bash we could simply go: -# -# readarray ARGS < <( xargs -n1 <<<"$var" ) && -# set -- "${ARGS[@]}" "$@" -# -# but POSIX shell has neither arrays nor command substitution, so instead we -# post-process each arg (as a line of input to sed) to backslash-escape any -# character that might be a shell metacharacter, then use eval to reverse -# that process (while maintaining the separation between arguments), and wrap -# the whole thing up as a single "set" statement. -# -# This will of course break if any of these variables contains a newline or -# an unmatched quote. -# - -eval "set -- $( - printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | - xargs -n1 | - sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | - tr '\n' ' ' - )" '"$@"' - -exec "$JAVACMD" "$@" diff --git a/examples/substrait-spark/gradlew.bat b/examples/substrait-spark/gradlew.bat deleted file mode 100644 index 7101f8e46..000000000 --- a/examples/substrait-spark/gradlew.bat +++ /dev/null @@ -1,92 +0,0 @@ -@rem -@rem Copyright 2015 the original author or authors. -@rem -@rem Licensed under the Apache License, Version 2.0 (the "License"); -@rem you may not use this file except in compliance with the License. -@rem You may obtain a copy of the License at -@rem -@rem https://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, software -@rem distributed under the License is distributed on an "AS IS" BASIS, -@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -@rem See the License for the specific language governing permissions and -@rem limitations under the License. 
-@rem - -@if "%DEBUG%"=="" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%"=="" set DIRNAME=. -@rem This is normally unused -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Resolve any "." and ".." in APP_HOME to make it shorter. -for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if %ERRORLEVEL% equ 0 goto execute - -echo. 1>&2 -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 1>&2 - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto execute - -echo. 1>&2 -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 
1>&2 - -goto fail - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* - -:end -@rem End local scope for the variables with windows NT shell -if %ERRORLEVEL% equ 0 goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -set EXIT_CODE=%ERRORLEVEL% -if %EXIT_CODE% equ 0 set EXIT_CODE=1 -if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% -exit /b %EXIT_CODE% - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/examples/substrait-spark/justfile b/examples/substrait-spark/justfile index 9a138d278..6fcad7666 100644 --- a/examples/substrait-spark/justfile +++ b/examples/substrait-spark/justfile @@ -10,7 +10,7 @@ CWDIR := justfile_directory() SPARK_VERSION := "3.5.1" -SPARK_MASTER_CONTAINER := "subtrait-spark-spark-1" +SPARK_MASTER_CONTAINER := "substrait-spark-spark-1" _default: @just -f {{justfile()}} --list @@ -19,14 +19,14 @@ buildapp: #!/bin/bash set -e -o pipefail - ${CWDIR}/gradlew build + ${CWDIR}/../../gradlew build # need to let the SPARK user be able to write to the _data mount mkdir -p ${CWDIR}/_data && chmod g+w ${CWDIR}/_data mkdir -p ${CWDIR}/_apps - cp ${CWDIR}/app/build/libs/app.jar ${CWDIR}/_apps - cp ${CWDIR}/app/src/main/resources/*.csv ${CWDIR}/_data + cp ${CWDIR}/build/libs/substrait-spark*.jar ${CWDIR}/_apps/app.jar + cp ${CWDIR}/src/main/resources/*.csv ${CWDIR}/_data dataset: #!/bin/bash diff --git a/examples/substrait-spark/settings.gradle b/examples/substrait-spark/settings.gradle deleted file mode 100644 index ed37a683e..000000000 --- a/examples/substrait-spark/settings.gradle +++ /dev/null @@ -1,20 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. 
- * - * The settings file is used to specify which projects to include in your build. - * For more detailed information on multi-project builds, please refer to https://docs.gradle.org/8.7/userguide/multi_project_builds.html in the Gradle documentation. - * This project uses @Incubating APIs which are subject to change. - */ - -pluginManagement { - // Include 'plugins build' to define convention plugins. - includeBuild('build-logic') -} - -plugins { - // Apply the foojay-resolver plugin to allow automatic download of JDKs - id 'org.gradle.toolchains.foojay-resolver-convention' version '0.8.0' -} - -rootProject.name = 'flexdata-spark' -include('app') diff --git a/examples/substrait-spark/app/src/main/java/io/substrait/examples/App.java b/examples/substrait-spark/src/main/java/io/substrait/examples/App.java similarity index 69% rename from examples/substrait-spark/app/src/main/java/io/substrait/examples/App.java rename to examples/substrait-spark/src/main/java/io/substrait/examples/App.java index fed789b3f..b401c0417 100644 --- a/examples/substrait-spark/app/src/main/java/io/substrait/examples/App.java +++ b/examples/substrait-spark/src/main/java/io/substrait/examples/App.java @@ -1,20 +1,26 @@ package io.substrait.examples; -import java.nio.file.Files; -import java.nio.file.Paths; +/** Main class */ +public final class App { -import io.substrait.plan.Plan; -import io.substrait.plan.ProtoPlanConverter; + /** Implemented by all examples */ + public interface Action { -public class App { - - public static interface Action { - public void run(String arg); + /** Run + * + * @param arg argument + */ + void run(String arg); } private App() { } + /** + * Traditional main method + * + * @param args string[] + */ public static void main(String args[]) { try { diff --git a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java similarity index 97% 
rename from examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java rename to examples/substrait-spark/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java index 761209850..13805515b 100644 --- a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java +++ b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkConsumeSubstrait.java @@ -17,8 +17,7 @@ /** Minimal Spark application */ public class SparkConsumeSubstrait implements App.Action { - public SparkConsumeSubstrait() { - } + @Override public void run(String arg) { diff --git a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkDataset.java b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkDataset.java similarity index 85% rename from examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkDataset.java rename to examples/substrait-spark/src/main/java/io/substrait/examples/SparkDataset.java index 4f0e668c7..e03916ceb 100644 --- a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkDataset.java +++ b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkDataset.java @@ -15,10 +15,6 @@ /** Minimal Spark application */ public class SparkDataset implements App.Action { - public SparkDataset() { - - } - @Override public void run(String arg) { @@ -32,8 +28,8 @@ public void run(String arg) { String vehiclesFile = Paths.get(ROOT_DIR, VEHICLES_CSV).toString(); String testsFile = Paths.get(ROOT_DIR, TESTS_CSV).toString(); - System.out.println("Reading "+vehiclesFile); - System.out.println("Reading "+testsFile); + System.out.println("Reading " + vehiclesFile); + System.out.println("Reading " + testsFile); dsVehicles = spark.read().option("delimiter", ",").option("header", "true").csv(vehiclesFile); dsVehicles.show(); @@ -61,6 +57,11 @@ public void run(String arg) { } } + /** + * Create substrait plan and save to file based on logical plan + * + * 
@param enginePlan logical plan + */ public void createSubstrait(LogicalPlan enginePlan) { ToSubstraitRel toSubstrait = new ToSubstraitRel(); io.substrait.plan.Plan plan = toSubstrait.convert(enginePlan); @@ -70,8 +71,8 @@ public void createSubstrait(LogicalPlan enginePlan) { PlanProtoConverter planToProto = new PlanProtoConverter(); byte[] buffer = planToProto.toProto(plan).toByteArray(); try { - Files.write(Paths.get(ROOT_DIR,"spark_dataset_substrait.plan"), buffer); - System.out.println("File written to "+Paths.get(ROOT_DIR,"spark_sql_substrait.plan")); + Files.write(Paths.get(ROOT_DIR, "spark_dataset_substrait.plan"), buffer); + System.out.println("File written to " + Paths.get(ROOT_DIR, "spark_dataset_substrait.plan")); } catch (IOException e) { e.printStackTrace(System.out); } diff --git a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkHelper.java b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkHelper.java similarity index 59% rename from examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkHelper.java rename to examples/substrait-spark/src/main/java/io/substrait/examples/SparkHelper.java index 7bed7fae4..efe516578 100644 --- a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkHelper.java +++ b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkHelper.java @@ -2,25 +2,47 @@ import org.apache.spark.sql.SparkSession; -public class SparkHelper { +/** Collection of helper fns */ +public final class SparkHelper { + + private SparkHelper() { + } + + /** + * Namespace to use for the data + */ public static final String NAMESPACE = "demo_db"; + + /** Vehicles table */ public static final String VEHICLE_TABLE = "vehicles"; + + /** Tests table (the vehicle safety tests) */ public static final String TESTS_TABLE = "tests"; + /** Source data - parquet */ public static final String VEHICLES_PQ = "vehicles_subset_2023.parquet"; + + /** Source data - parquet */ public static final 
String TESTS_PQ = "tests_subset_2023.parquet"; + /** Source data - csv */ public static final String VEHICLES_CSV = "vehicles_subset_2023.csv"; + + /** Source data - csv */ public static final String TESTS_CSV = "tests_subset_2023.csv"; + /** In-container data location */ public static final String ROOT_DIR = "/opt/spark-data"; - // Connect to local spark for demo purposes - public static SparkSession connectSpark(String spark_master) { + /** Connect to local spark for demo purposes + * @param sparkMaster address of the Spark Master to connect to + * @return SparkSession + */ + public static SparkSession connectSpark(String sparkMaster) { SparkSession spark = SparkSession.builder() // .config("spark.sql.warehouse.dir", "spark-warehouse") - .config("spark.master", spark_master) + .config("spark.master", sparkMaster) .enableHiveSupport() .getOrCreate(); @@ -29,6 +51,9 @@ public static SparkSession connectSpark(String spark_master) { return spark; } + /** Connects to the local spark cluster + * @return SparkSession + */ public static SparkSession connectLocalSpark() { SparkSession spark = SparkSession.builder() @@ -40,5 +65,4 @@ public static SparkSession connectLocalSpark() { return spark; } - } diff --git a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkSQL.java b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkSQL.java similarity index 85% rename from examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkSQL.java rename to examples/substrait-spark/src/main/java/io/substrait/examples/SparkSQL.java index 3bdd26e96..3da061801 100644 --- a/examples/substrait-spark/app/src/main/java/io/substrait/examples/SparkSQL.java +++ b/examples/substrait-spark/src/main/java/io/substrait/examples/SparkSQL.java @@ -19,10 +19,6 @@ /** Minimal Spark application */ public class SparkSQL implements App.Action { - public SparkSQL() { - - } - @Override public void run(String arg) { @@ -42,14 +38,12 @@ 
spark.read().option("delimiter", ",").option("header", "true").csv(testsFile) .createOrReplaceTempView(TESTS_TABLE); - String sqlQuery = """ - SELECT vehicles.colour, count(*) as colourcount - FROM vehicles - INNER JOIN tests ON vehicles.vehicle_id=tests.vehicle_id - WHERE tests.test_result = 'P' - GROUP BY vehicles.colour - ORDER BY count(*) - """; + String sqlQuery = "SELECT vehicles.colour, count(*) as colourcount"+ + " FROM vehicles"+ + " INNER JOIN tests ON vehicles.vehicle_id=tests.vehicle_id"+ + " WHERE tests.test_result = 'P'"+ + " GROUP BY vehicles.colour"+ + " ORDER BY count(*)"; var result = spark.sql(sqlQuery); result.show(); @@ -67,6 +61,9 @@ ORDER BY count(*) } } + /** creates a substrait plan based on the logical plan + * @param enginePlan Spark Local Plan + */ public void createSubstrait(LogicalPlan enginePlan) { ToSubstraitRel toSubstrait = new ToSubstraitRel(); io.substrait.plan.Plan plan = toSubstrait.convert(enginePlan); diff --git a/examples/substrait-spark/app/src/main/resources/tests_subset_2023.csv b/examples/substrait-spark/src/main/resources/tests_subset_2023.csv similarity index 100% rename from examples/substrait-spark/app/src/main/resources/tests_subset_2023.csv rename to examples/substrait-spark/src/main/resources/tests_subset_2023.csv diff --git a/examples/substrait-spark/app/src/main/resources/vehicles_subset_2023.csv b/examples/substrait-spark/src/main/resources/vehicles_subset_2023.csv similarity index 100% rename from examples/substrait-spark/app/src/main/resources/vehicles_subset_2023.csv rename to examples/substrait-spark/src/main/resources/vehicles_subset_2023.csv diff --git a/settings.gradle.kts b/settings.gradle.kts index 224c6b509..013449786 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -1,6 +1,6 @@ rootProject.name = "substrait" -include("bom", "core", "isthmus", "isthmus-cli", "spark") +include("bom", "core", "isthmus", "isthmus-cli", "spark", "examples:substrait-spark") pluginManagement { plugins {