Skip to content

Commit

Permalink
Add more template options for the command line sink.
Browse files Browse the repository at this point in the history
  • Loading branch information
yruslan committed Apr 11, 2024
1 parent a522924 commit 4dcdca8
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 15 deletions.
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,15 @@ The pipeline operation for this sink could look like this:
tables = [
{
input.metastore.table = metastore_table
output.cmd.line = "/my_apps/cmd_line_tool --path @dataPath --date @infoDate"
# Supported substitutions:
# - @dataPath - the path to generated data or to the original metastore table
# - @dataUri - the same location as @dataPath, rendered as a URI
# - @partitionPath - the path to the partition corresponding to the information date being processed
# - @bucket - the bucket of the table location if the output is on S3
# - @prefix - the prefix on the bucket for tables located on S3
# - @partitionPrefix - the prefix to the data for the information date currently being processed
# - @infoDate - the information date in yyyy-MM-dd format
# - @infoMonth - the information month in yyyy-MM format
output.cmd.line = "/my_apps/cmd_line_tool --path @dataPath --partition-path @partitionPath --date @infoDate"
## All following settings are OPTIONAL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ import com.typesafe.config.Config
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
import za.co.absa.pramen.api.{ExternalChannelFactory, MetastoreReader, Sink, SinkResult}
import za.co.absa.pramen.api.{DataFormat, ExternalChannelFactory, MetaTableDef, MetastoreReader, Query, Sink, SinkResult}
import za.co.absa.pramen.core.exceptions.CmdFailedException
import za.co.absa.pramen.core.process.{ProcessRunner, ProcessRunnerImpl}
import za.co.absa.pramen.core.sink.CmdLineSink.{CMD_LINE_KEY, CmdLineDataParams}
import za.co.absa.pramen.core.utils.{ConfigUtils, FsUtils}
import za.co.absa.pramen.core.utils.{ConfigUtils, FsUtils, SparkUtils}

import java.time.LocalDate
import java.time.format.DateTimeFormatter
Expand All @@ -51,7 +51,7 @@ import scala.util.control.NonFatal
*
* Otherwise, the data can be accessed by the command line tool directly from the metastore.
*
* Example sink definition:
* ==Example sink definition:==
* {{{
* {
* name = "cmd_line"
Expand All @@ -73,6 +73,7 @@ import scala.util.control.NonFatal
* Here is an example of a sink definition in a pipeline. As for any other operation you can specify
* dependencies, transformations, filters and columns to select.
*
* ==Example operation:==
* {{{
* {
* name = "Command Line sink"
Expand Down Expand Up @@ -154,15 +155,18 @@ class CmdLineSink(sinkConfig: Config,

log.info(s"$count records saved to $tempPath.")

val cmdLine = getCmdLine(cmdLineTemplate, Option(tempPath), infoDate)
val cmdLine = getCmdLine(cmdLineTemplate, Option(tempPath), Option(tempPath), infoDate)

runCmd(cmdLine)

log.info(s"$count records sent to the cmd line sink ($cmdLine).")
}
SinkResult(count)
case None =>
val cmdLine = getCmdLine(cmdLineTemplate, None, infoDate)
val metaTable = metastore.getTableDef(tableName)
val (dataPath, partitionPath) = getPaths(metaTable, infoDate)

val cmdLine = getCmdLine(cmdLineTemplate, dataPath, partitionPath, infoDate)

runCmd(cmdLine)

Expand All @@ -173,21 +177,80 @@ class CmdLineSink(sinkConfig: Config,
}
}

/**
  * Resolves the locations that can be substituted into the command line template
  * for a metastore table.
  *
  * A base location is available only for Parquet tables and for Delta tables that
  * are defined by a path query. For every other format no location is resolved.
  *
  * @param metaTable the metastore table definition to inspect.
  * @param infoDate  the information date whose partition path is requested.
  * @return a pair of (table base path, partition path for `infoDate`); both elements
  *         are `None` when the table format does not expose a path.
  */
private[core] def getPaths(metaTable: MetaTableDef, infoDate: LocalDate): (Option[Path], Option[Path]) = {
  val noPaths: (Option[Path], Option[Path]) = (None, None)

  // Only path-backed formats have a location that can be handed to an external tool.
  val tableLocation = metaTable.format match {
    case DataFormat.Parquet(location, _)           => Option(location)
    case DataFormat.Delta(Query.Path(location), _) => Option(location)
    case _                                         => None
  }

  tableLocation.map { location =>
    val tablePath = Option(new Path(location))
    val datePath = Option(
      SparkUtils.getPartitionPath(infoDate, metaTable.infoDateColumn, metaTable.infoDateFormat, location)
    )

    (tablePath, datePath)
  }.getOrElse(noPaths)
}

/**
  * Splits a path that has a URI authority (for example, an S3 location) into its
  * bucket and its prefix.
  *
  * @param path the path to inspect.
  * @return `Some((bucket, prefix))` when the path URI has an authority, where the prefix
  *         is the URI path without the leading slash; `None` otherwise.
  */
private def getBucketAndPrefix(path: Path): Option[(String, String)] = {
  val uri = path.toUri

  Option(uri.getAuthority).map(bucket => (bucket, uri.getPath.stripPrefix("/")))
}

/**
  * Renders the command line to execute by substituting placeholders in the template.
  *
  * Supported placeholders:
  *  - `@infoDate`        - the information date as rendered by `LocalDate.toString`
  *  - `@infoMonth`       - the information date formatted as `yyyy-MM`
  *  - `@dataPath`        - the data path
  *  - `@dataUri`         - the data path rendered as a URI
  *  - `@bucket`          - the authority of the data path URI (of the partition path URI
  *                         if it is still unresolved after the data path was processed)
  *  - `@prefix`          - the data path URI path without the leading slash
  *  - `@partitionPath`   - the partition path
  *  - `@partitionPrefix` - the partition path URI path without the leading slash
  *
  * Placeholders that depend on a path that is not provided, or on a URI authority
  * that the path does not have, are left in the command line as they are.
  *
  * @param cmdLineTemplate the command line containing placeholders.
  * @param dataPath        the optional path to the data.
  * @param partitionPath   the optional path to the partition of the information date.
  * @param infoDate        the information date being processed.
  * @return the command line with all resolvable placeholders substituted.
  */
private[core] def getCmdLine(cmdLineTemplate: String,
                             dataPath: Option[Path],
                             partitionPath: Option[Path],
                             infoDate: LocalDate): String = {
  log.info(s"CmdLine template: $cmdLineTemplate")

  val cmdWithDates = cmdLineTemplate
    .replace("@infoDate", infoDate.toString)
    .replace("@infoMonth", infoDate.format(DateTimeFormatter.ofPattern("yyyy-MM")))

  val cmdWithDataPath = dataPath.fold(cmdWithDates) { path =>
    // Bucket-related placeholders are resolved first, and only for paths that have an authority.
    val cmdWithBucket = getBucketAndPrefix(path).fold(cmdWithDates) { case (bucket, prefix) =>
      cmdWithDates
        .replace("@bucket", bucket)
        .replace("@prefix", prefix)
    }

    cmdWithBucket
      .replace("@dataPath", path.toString)
      .replace("@dataUri", path.toUri.toString)
  }

  partitionPath.fold(cmdWithDataPath) { path =>
    // '@bucket' is already substituted if the data path had an authority, so the replacement
    // below only takes effect when the data path did not provide a bucket.
    val cmdWithBucket = getBucketAndPrefix(path).fold(cmdWithDataPath) { case (bucket, prefix) =>
      cmdWithDataPath
        .replace("@bucket", bucket)
        .replace("@partitionPrefix", prefix)
    }

    cmdWithBucket.replace("@partitionPath", path.toString)
  }
}

private[core] def runCmd(cmdLine: String): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ package za.co.absa.pramen.core.mocks.sink
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.DataFrame
import org.mockito.Mockito.{mock, when}
import org.scalatest.wordspec.AnyWordSpec
import za.co.absa.pramen.api.{DataFormat, MetastoreReader}
import za.co.absa.pramen.core.MetaTableDefFactory
import za.co.absa.pramen.core.base.SparkTestBase
import za.co.absa.pramen.core.exceptions.CmdFailedException
import za.co.absa.pramen.core.fixtures.TempDirFixture
Expand Down Expand Up @@ -93,22 +96,43 @@ class CmdLineSinkSuite extends AnyWordSpec with SparkTestBase with TempDirFixtur
}

"work without a temporary path" in {
val (sink, _) = getUseCase(null, recordCountToReturn = Some(5))
withTempDirectory("cmd_sink") { tempDir =>
val metastoreReader = mock(classOf[MetastoreReader])
val metatable = MetaTableDefFactory.getDummyMetaTableDef(name = "table1",
format = DataFormat.Parquet(tempDir, None)
)
when(metastoreReader.getTableDef("table1")).thenReturn(metatable)

val (sink, _) = getUseCase(null, recordCountToReturn = Some(5))

val sinkResult = sink.send(exampleDf, "table1", null, infoDate, Map[String, String]("cmd.line" -> "dummy @infoDate"))
val sinkResult = sink.send(exampleDf, "table1", metastoreReader, infoDate, Map[String, String]("cmd.line" -> "dummy @infoDate @partitionPath"))

assert(sinkResult.recordsSent == 5)
assert(sinkResult.recordsSent == 5)
}
}
}

"getCmdLine()" should {
"replace variables with actual values" in {
val (sink, _) = getUseCase()
val dataPath = Some(new Path("/dummy/path"))
val partitionPath = Some(new Path(s"/dummy/path/date=$infoDate"))

val cmdTemplate = "--data-path @dataPath --data-uri @dataUri --partition-path @partitionPath --info-date @infoDate --infoMonth @infoMonth"

assert(sink.getCmdLine(cmdTemplate, dataPath, partitionPath, infoDate) ==
"--data-path /dummy/path --data-uri /dummy/path --partition-path /dummy/path/date=2021-12-28 --info-date 2021-12-28 --infoMonth 2021-12")
}

"replace s3 variables with actual values" in {
val (sink, _) = getUseCase()
val dataPath = Some(new Path("s3a://my_bucket1/dummy/path"))
val partitionPath = Some(new Path("s3a://my_bucket2/dummy/path/enceladus_info_date=2023-12-30"))

val cmdTemplate = "--data-path @dataPath --data-uri @dataUri --info-date @infoDate --infoMonth @infoMonth"
val cmdTemplate = "--bucket @bucket --prefix @prefix --partition-prefix @partitionPrefix"

assert(sink.getCmdLine(cmdTemplate, Some(new Path("/dummy/path")), infoDate) ==
"--data-path /dummy/path --data-uri /dummy/path --info-date 2021-12-28 --infoMonth 2021-12")
assert(sink.getCmdLine(cmdTemplate, dataPath, partitionPath, infoDate) ==
"--bucket my_bucket1 --prefix dummy/path --partition-prefix dummy/path/enceladus_info_date=2023-12-30")
}
}

Expand Down

0 comments on commit 4dcdca8

Please sign in to comment.