Skip to content

Commit 3aa4cf3

Browse files
committed
#666 Add record length value mapping option to Cobrix.
1 parent 6f78ed2 commit 3aa4cf3

File tree

5 files changed

+65
-0
lines changed

5 files changed

+65
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1547,6 +1547,7 @@ The output looks like this:
15471547
| .option("bdw_adjustment", 0) | If there is a mismatch between BDW and record length this option can be used to adjust the difference. |
15481548
| .option("re_additional_info", "") | Passes a string as an additional info parameter passed to a custom record extractor to its constructor. |
15491549
| .option("record_length_field", "RECORD-LEN") | Specifies a record length field or expression to use instead of RDW. Use `rdw_adjustment` option if the record length field differs from the actual length by a fixed amount of bytes. The `record_format` should be set to `F`. This option is incompatible with `is_record_sequence`. |
1550+
| .option("record_length_map", """{"A":100}""") | Specifies a mapping between record length field values and actual record lengths. |
15501551
| .option("record_extractor", "com.example.record.extractor") | Specifies a class for parsing record in a custom way. The class must inherit `RawRecordExtractor` and `Serializable` traits. See the chapter on record extractors above. |
15511552
| .option("minimum_record_length", 1) | Specifies the minimum length a record is considered valid, will be skipped otherwise. |
15521553
| .option("maximum_record_length", 1000) | Specifies the maximum length a record is considered valid, will be skipped otherwise. |

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
4242
* @param minimumRecordLength Minimum record length for which the record is considered valid.
4343
* @param maximumRecordLength Maximum record length for which the record is considered valid.
4444
* @param lengthFieldExpression A name of a field that contains record length. Optional. If not set the copybook record length will be used.
45+
* @param lengthFieldMap Mapping from record length field values to actual record lengths. The field name should be specified in lengthFieldExpression.
4546
* @param isRecordSequence Does input files have 4 byte record length headers
4647
* @param bdw Block descriptor word (if specified), for FB and VB record formats
4748
* @param isRdwPartRecLength Does RDW count itself as part of record length itself
@@ -88,6 +89,7 @@ case class ReaderParameters(
8889
minimumRecordLength: Int = 1,
8990
maximumRecordLength: Int = Int.MaxValue,
9091
lengthFieldExpression: Option[String] = None,
92+
lengthFieldMap: Map[String, Int] = Map.empty,
9193
isRecordSequence: Boolean = false,
9294
bdw: Option[Bdw] = None,
9395
isRdwBigEndian: Boolean = false,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/VariableLengthParameters.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ package za.co.absa.cobrix.cobol.reader.parameters
2929
* @param rhpAdditionalInfo An optional additional option string passed to a custom record header parser
3030
* @param reAdditionalInfo An optional additional option string passed to a custom record extractor
3131
* @param recordLengthField A field that stores record length
32+
* @param recordLengthMap A mapping from record length field values to record sizes.
3233
* @param fileStartOffset A number of bytes to skip at the beginning of each file
3334
* @param fileEndOffset A number of bytes to skip at the end of each file
3435
* @param generateRecordId Generate a sequential record number for each record to be able to retain the order of the original data
@@ -50,6 +51,7 @@ case class VariableLengthParameters(
5051
rhpAdditionalInfo: Option[String],
5152
reAdditionalInfo: String,
5253
recordLengthField: String,
54+
recordLengthMap: Map[String, Int],
5355
fileStartOffset: Int,
5456
fileEndOffset: Int,
5557
generateRecordId: Boolean,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
3232

3333
import scala.collection.mutable
3434
import scala.collection.mutable.ListBuffer
35+
import scala.util.control.NonFatal
3536

3637
/**
3738
* This class provides methods for parsing the parameters set as Spark options.
@@ -55,6 +56,7 @@ object CobolParametersParser extends Logging {
5556
val PARAM_MAXIMUM_RECORD_LENGTH = "maximum_record_length"
5657
val PARAM_IS_RECORD_SEQUENCE = "is_record_sequence"
5758
val PARAM_RECORD_LENGTH_FIELD = "record_length_field"
59+
val PARAM_RECORD_LENGTH_MAP = "record_length_map"
5860
val PARAM_RECORD_START_OFFSET = "record_start_offset"
5961
val PARAM_RECORD_END_OFFSET = "record_end_offset"
6062
val PARAM_FILE_START_OFFSET = "file_start_offset"
@@ -348,6 +350,7 @@ object CobolParametersParser extends Logging {
348350
rhpAdditionalInfo = None,
349351
reAdditionalInfo = "",
350352
recordLengthField = "",
353+
recordLengthMap = Map.empty,
351354
fileStartOffset = 0,
352355
fileEndOffset = 0,
353356
generateRecordId = false,
@@ -380,6 +383,7 @@ object CobolParametersParser extends Logging {
380383
minimumRecordLength = parameters.minimumRecordLength.getOrElse(1),
381384
maximumRecordLength = parameters.maximumRecordLength.getOrElse(Int.MaxValue),
382385
lengthFieldExpression = recordLengthField,
386+
lengthFieldMap = varLenParams.recordLengthMap,
383387
isRecordSequence = varLenParams.isRecordSequence,
384388
bdw = varLenParams.bdw,
385389
isRdwBigEndian = varLenParams.isRdwBigEndian,
@@ -461,6 +465,7 @@ object CobolParametersParser extends Logging {
461465
params.get(PARAM_RHP_ADDITIONAL_INFO),
462466
params.get(PARAM_RE_ADDITIONAL_INFO).getOrElse(""),
463467
recordLengthFieldOpt.getOrElse(""),
468+
getRecordLengthMappings(params.getOrElse(PARAM_RECORD_LENGTH_MAP, "{}")),
464469
fileStartOffset,
465470
fileEndOffset,
466471
isRecordIdGenerationEnabled,
@@ -912,6 +917,33 @@ object CobolParametersParser extends Logging {
912917
}
913918
}
914919

920+
/**
921+
* Parses the options for the record length mappings.
922+
*
923+
* @param recordLengthMapJson Parameters provided by spark.read.option(...)
924+
* @return Returns a mapping from the record length field values to the actual record length
925+
*/
926+
@throws(classOf[IllegalArgumentException])
927+
def getRecordLengthMappings(recordLengthMapJson: String): Map[String, Int] = {
928+
val parser = new ParserJson()
929+
parser.parseMap(recordLengthMapJson)
930+
.toSeq // Converting to a non-lazy sequence first. If .mapValues() is used the map stays lazy and errors pop up later
931+
.map { case (k, v) =>
932+
val vInt = v match {
933+
case num: Int => num
934+
case num: Long => num.toInt
935+
case str: String =>
936+
try {
937+
str.toInt
938+
} catch {
939+
case NonFatal(ex) => throw new IllegalArgumentException(s"Unsupported record length value: '$str'. Please, use numeric values only", ex)
940+
}
941+
case any => throw new IllegalArgumentException(s"Unsupported record length value: '$any'. Please, use numeric values only")
942+
}
943+
(k, vInt)
944+
}.toMap[String, Int]
945+
}
946+
915947
/**
916948
* Parses the options for the occurs mappings.
917949
*

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/ParametersParsingSpec.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,32 @@ class ParametersParsingSpec extends AnyFunSuite {
101101
assert(fieldCodePageMap("field_3") == "us-ascii")
102102
}
103103

104+
test("Test getRecordLengthMappings() works as expected") {
105+
val map1 = CobolParametersParser.getRecordLengthMappings("""{}""")
106+
assert(map1.isEmpty)
107+
108+
val map2 = CobolParametersParser.getRecordLengthMappings("""{"A": 12}""")
109+
assert(map2("A") == 12)
110+
111+
val map3 = CobolParametersParser.getRecordLengthMappings("""{"0A1": "1234", "B": 122}""")
112+
assert(map3("0A1") == 1234)
113+
assert(map3("B") == 122)
114+
}
115+
116+
test("Test getRecordLengthMappings() exceptional situations") {
117+
val ex = intercept[IllegalArgumentException] {
118+
CobolParametersParser.getRecordLengthMappings("""{"A": "ABC"}""")
119+
}
120+
assert(ex.getMessage == "Unsupported record length value: 'ABC'. Please, use numeric values only")
121+
122+
val ex2 = intercept[IllegalArgumentException] {
123+
CobolParametersParser.getRecordLengthMappings("""{"A": {"B": 12}}""")
124+
}
125+
assert(ex2.getMessage == "Unsupported record length value: 'Map(B -> 12)'. Please, use numeric values only")
126+
127+
assertThrows[IllegalArgumentException] {
128+
CobolParametersParser.getRecordLengthMappings("""{"A": {"B": 5000000000}}""")
129+
}
130+
}
131+
104132
}

0 commit comments

Comments
 (0)