@@ -20,11 +20,11 @@ import com.typesafe.config.Config
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.slf4j.LoggerFactory
- import za.co.absa.pramen.api.{ExternalChannelFactory, MetastoreReader, Sink, SinkResult}
+ import za.co.absa.pramen.api.{DataFormat, ExternalChannelFactory, MetaTableDef, MetastoreReader, Query, Sink, SinkResult}
import za.co.absa.pramen.core.exceptions.CmdFailedException
import za.co.absa.pramen.core.process.{ProcessRunner, ProcessRunnerImpl}
import za.co.absa.pramen.core.sink.CmdLineSink.{CMD_LINE_KEY, CmdLineDataParams}
- import za.co.absa.pramen.core.utils.{ConfigUtils, FsUtils}
+ import za.co.absa.pramen.core.utils.{ConfigUtils, FsUtils, SparkUtils}

import java.time.LocalDate
import java.time.format.DateTimeFormatter
@@ -51,7 +51,7 @@ import scala.util.control.NonFatal
 *
 * Otherwise, the data can be accessed by the command line tool directly from the metastore.
 *
- * Example sink definition:
+ * == Example sink definition ==
 * {{{
 *   {
 *     name = "cmd_line"
@@ -73,6 +73,7 @@ import scala.util.control.NonFatal
 * Here is an example of a sink definition in a pipeline. As with any other operation, you can specify
 * dependencies, transformations, filters and columns to select.
 *
+ * == Example operation ==
 * {{{
 *   {
 *     name = "Command Line sink"
@@ -154,15 +155,18 @@ class CmdLineSink(sinkConfig: Config,

          log.info(s"$count records saved to $tempPath.")

-          val cmdLine = getCmdLine(cmdLineTemplate, Option(tempPath), infoDate)
+          val cmdLine = getCmdLine(cmdLineTemplate, Option(tempPath), Option(tempPath), infoDate)

          runCmd(cmdLine)

          log.info(s"$count records sent to the cmd line sink ($cmdLine).")
        }
        SinkResult(count)
      case None =>
-        val cmdLine = getCmdLine(cmdLineTemplate, None, infoDate)
+        val metaTable = metastore.getTableDef(tableName)
+        val (dataPath, partitionPath) = getPaths(metaTable, infoDate)
+
+        val cmdLine = getCmdLine(cmdLineTemplate, dataPath, partitionPath, infoDate)

        runCmd(cmdLine)
@@ -173,21 +177,80 @@ class CmdLineSink(sinkConfig: Config,
    }
  }

+  private[core] def getPaths(metaTable: MetaTableDef, infoDate: LocalDate): (Option[Path], Option[Path]) = {
+    val basePathOpt = metaTable.format match {
+      case DataFormat.Parquet(path, _) =>
+        Option(path)
+      case DataFormat.Delta(query, _) =>
+        query match {
+          case Query.Path(path) =>
+            Option(path)
+          case _ => None
+        }
+      case _ =>
+        None
+    }
+
+    basePathOpt match {
+      case Some(basePath) =>
+        (Option(new Path(basePath)), Option(SparkUtils.getPartitionPath(infoDate, metaTable.infoDateColumn, metaTable.infoDateFormat, basePath)))
+      case None =>
+        (None, None)
+    }
+  }
+
  private[core] def getCmdLine(cmdLineTemplate: String,
                               dataPath: Option[Path],
+                               partitionPath: Option[Path],
                               infoDate: LocalDate): String = {
    log.info(s"CmdLine template: $cmdLineTemplate")

    val cmdWithDates = cmdLineTemplate.replace("@infoDate", infoDate.toString)
      .replace("@infoMonth", infoDate.format(DateTimeFormatter.ofPattern("yyyy-MM")))

-    dataPath match {
+    val cmdWithDataPath = dataPath match {
      case Some(path) =>
-        cmdWithDates.replace("@dataPath", path.toString)
-          .replace("@dataUri", path.toUri.toString)
+        if (Option(path.toUri.getAuthority).isDefined) {
+          val bucket = path.toUri.getAuthority
+          val prefixOrg = path.toUri.getPath
+          val prefix = if (prefixOrg.startsWith("/", 0))
+            prefixOrg.substring(1)
+          else
+            prefixOrg
+
+          cmdWithDates
+            .replace("@bucket", bucket)
+            .replace("@prefix", prefix)
+            .replace("@dataPath", path.toString)
+            .replace("@dataUri", path.toUri.toString)
+        } else {
+          cmdWithDates.replace("@dataPath", path.toString)
+            .replace("@dataUri", path.toUri.toString)
+        }
      case None =>
        cmdWithDates
    }
+
+    partitionPath match {
+      case Some(path) =>
+        if (Option(path.toUri.getAuthority).isDefined) {
+          val bucket = path.toUri.getAuthority
+          val prefixOrg = path.toUri.getPath
+          val prefix = if (prefixOrg.startsWith("/", 0))
+            prefixOrg.substring(1)
+          else
+            prefixOrg
+
+          cmdWithDataPath
+            .replace("@bucket", bucket)
+            .replace("@partitionPrefix", prefix)
+            .replace("@partitionPath", path.toString)
+        } else {
+          cmdWithDataPath.replace("@partitionPath", path.toString)
+        }
+      case None =>
+        cmdWithDataPath
+    }
  }

  private[core] def runCmd(cmdLine: String): Unit = {
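
For reference, here is a minimal, self-contained sketch (not part of this commit) of what the new `@bucket`, `@prefix`, and `@partitionPrefix` substitutions resolve to for an object-store path. The bucket name, table location, partition column, and command template below are illustrative assumptions, not values from the codebase.

```scala
import org.apache.hadoop.fs.Path

object CmdLineTemplateExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical table location and the partition path that getPaths() would
    // derive for infoDate = 2022-10-12, assuming an info date column named
    // 'pramen_info_date' with a 'yyyy-MM-dd' format.
    val dataPath      = new Path("s3://mybucket/datalake/table1")
    val partitionPath = new Path("s3://mybucket/datalake/table1/pramen_info_date=2022-10-12")

    // Mirrors getCmdLine(): the URI authority becomes @bucket, and the URI path
    // without its leading '/' becomes @prefix (or @partitionPrefix).
    val bucket          = dataPath.toUri.getAuthority               // "mybucket"
    val prefix          = dataPath.toUri.getPath.stripPrefix("/")   // "datalake/table1"
    val partitionPrefix = partitionPath.toUri.getPath.stripPrefix("/")

    val template = "aws s3 sync s3://@bucket/@partitionPrefix /local/landing"
    val cmdLine = template
      .replace("@bucket", bucket)
      .replace("@prefix", prefix)
      .replace("@partitionPrefix", partitionPrefix)

    // Prints:
    // aws s3 sync s3://mybucket/datalake/table1/pramen_info_date=2022-10-12 /local/landing
    println(cmdLine)
  }
}
```

Note that in the sink itself these substitutions only apply when the path's URI has an authority (an S3/HDFS-style location); for plain local paths only `@dataPath`, `@dataUri`, and `@partitionPath` are replaced.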