Commit 0d5ee08

#666 Implement record splitting based on record length field mapping.
1 parent 896c885 commit 0d5ee08

10 files changed: +189 −26 lines changed

README.md

Lines changed: 8 additions & 1 deletion

@@ -478,6 +478,13 @@ or
 .option("record_length_field", "FIELD1 * 10 + 200")
 ```
 
+If the record length field contains a string value that can be mapped to a record size, you can provide the mapping as JSON:
+```
+.option("record_format", "F")
+.option("record_length_field", "FIELD_STR")
+.option("record_length_map", """{"SEG1":100,"SEG2":200}""")
+```
+
 ### Use cases for various variable length formats
 
 In order to understand the file format it is often sufficient to look at the first 4 bytes of the file (in case of RDW only files),
@@ -1547,7 +1554,7 @@ The output looks like this:
 | .option("bdw_adjustment", 0) | If there is a mismatch between BDW and record length, this option can be used to adjust the difference. |
 | .option("re_additional_info", "") | Passes a string as an additional info parameter to the constructor of a custom record extractor. |
 | .option("record_length_field", "RECORD-LEN") | Specifies a record length field or expression to use instead of RDW. Use the `rdw_adjustment` option if the record length field differs from the actual length by a fixed number of bytes. The `record_format` should be set to `F`. This option is incompatible with `is_record_sequence`. |
-| .option("record_length_map", """{"A":100}""") | Specifies a mapping between record length field values and actual record lengths. |
+| .option("record_length_map", """{"A":100,"B":50}""") | Specifies a mapping between record length field values and actual record lengths. |
 | .option("record_extractor", "com.example.record.extractor") | Specifies a class for parsing records in a custom way. The class must inherit the `RawRecordExtractor` and `Serializable` traits. See the chapter on record extractors above. |
 | .option("minimum_record_length", 1) | Specifies the minimum length a record must have to be considered valid; shorter records are skipped. |
 | .option("maximum_record_length", 1000) | Specifies the maximum length a record can have to be considered valid; longer records are skipped. |
cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/RecordLengthField.scala

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.cobol.reader.iterator
+
+import za.co.absa.cobrix.cobol.parser.ast.Primitive
+import za.co.absa.cobrix.cobol.parser.expression.NumberExprEvaluator
+
+case class RecordLengthField(
+  field: Primitive,
+  valueMap: Map[String, Int]
+)
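For illustration (not part of the commit), pairing a length field with its mapping could look like this; `segIdField` and `lenField` stand in for `Primitive` AST nodes resolved from a copybook:

```scala
// Hypothetical construction: segIdField is assumed to be the Primitive AST
// node resolved from the copybook, e.g. via Copybook.getFieldByName("SEG-ID").
val mappedLengthField = RecordLengthField(segIdField, Map("A" -> 4, "B" -> 7, "C" -> 8))

// An empty valueMap means the field's own (numeric) value is the record length.
val plainLengthField = RecordLengthField(lenField, Map.empty)
```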

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/VRLRecordReader.scala

Lines changed: 17 additions & 5 deletions

@@ -51,7 +51,9 @@ class VRLRecordReader(cobolSchema: Copybook,
   private var recordIndex = startRecordId - 1
 
   private val copyBookRecordSize = cobolSchema.getRecordSize
-  private val (lengthField, lengthFieldExpr) = ReaderParametersValidator.getEitherFieldAndExpression(readerProperties.lengthFieldExpression, cobolSchema)
+  private val (recordLengthField, lengthFieldExpr) = ReaderParametersValidator.getEitherFieldAndExpression(readerProperties.lengthFieldExpression, readerProperties.lengthFieldMap, cobolSchema)
+  private val lengthField = recordLengthField.map(_.field)
+  private val lengthMap = recordLengthField.map(_.valueMap).getOrElse(Map.empty)
   private val segmentIdField = ReaderParametersValidator.getSegmentIdField(readerProperties.multisegment, cobolSchema)
   private val recordLengthAdjustment = readerProperties.rdwAdjustment
   private val useRdw = lengthField.isEmpty && lengthFieldExpr.isEmpty
@@ -130,12 +132,22 @@
 
     val recordLength = lengthField match {
       case Some(lengthAST) =>
-        cobolSchema.extractPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
-          case i: Int => i + recordLengthAdjustment
-          case l: Long => l.toInt + recordLengthAdjustment
-          case s: String => s.toInt + recordLengthAdjustment
+        val length = cobolSchema.extractPrimitiveField(lengthAST, binaryDataStart, readerProperties.startOffset) match {
+          case i: Int => i
+          case l: Long => l.toInt
+          case s: String =>
+            if (lengthMap.isEmpty) {
+              s.toInt
+            } else {
+              lengthMap.get(s) match {
+                case Some(len) => len
+                case None => throw new IllegalStateException(s"Record length value '$s' is not mapped to a record length.")
+              }
+            }
+
           case _ => throw new IllegalStateException(s"Record length value of the field ${lengthAST.name} must be an integral type.")
         }
+        length + recordLengthAdjustment
       case None => copyBookRecordSize
     }
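Read in isolation, the new length-resolution logic amounts to the following standalone sketch (a restatement of the hunk above, not code from the commit):

```scala
// Sketch: resolve a record length from the extracted field value.
// An empty map preserves the old behavior (numeric strings are parsed);
// otherwise the string must be present in the map. rdw_adjustment is applied last.
def resolveRecordLength(extracted: Any, lengthMap: Map[String, Int], adjustment: Int): Int = {
  val length = extracted match {
    case i: Int    => i
    case l: Long   => l.toInt
    case s: String =>
      if (lengthMap.isEmpty) s.toInt
      else lengthMap.getOrElse(s, throw new IllegalStateException(
        s"Record length value '$s' is not mapped to a record length."))
    case _ => throw new IllegalStateException("Record length value must be an integral type.")
  }
  length + adjustment
}
```

For example, with `record_length_map` set to `{"A":4}` and `rdw_adjustment` 0, a record whose length field contains `"A"` is read as 4 bytes.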

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/validator/ReaderParametersValidator.scala

Lines changed: 11 additions & 11 deletions

@@ -19,22 +19,22 @@ package za.co.absa.cobrix.cobol.reader.validator
 import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.parser.ast.Primitive
 import za.co.absa.cobrix.cobol.parser.expression.NumberExprEvaluator
-import za.co.absa.cobrix.cobol.reader.iterator.RecordLengthExpression
+import za.co.absa.cobrix.cobol.reader.iterator.{RecordLengthExpression, RecordLengthField}
 import za.co.absa.cobrix.cobol.reader.parameters.MultisegmentParameters
 
 import scala.util.Try
 
 object ReaderParametersValidator {
 
-  def getEitherFieldAndExpression(fieldOrExpressionOpt: Option[String], cobolSchema: Copybook): (Option[Primitive], Option[RecordLengthExpression]) = {
+  def getEitherFieldAndExpression(fieldOrExpressionOpt: Option[String], recordLengthMap: Map[String, Int], cobolSchema: Copybook): (Option[RecordLengthField], Option[RecordLengthExpression]) = {
     fieldOrExpressionOpt match {
       case Some(fieldOrExpression) =>
         val canBeExpression = fieldOrExpression.exists(c => "+-*/".contains(c))
 
         if (canBeExpression && Try(cobolSchema.getFieldByName(fieldOrExpression)).isSuccess) {
-          (getLengthField(fieldOrExpression, cobolSchema), None)
+          (getLengthField(fieldOrExpression, recordLengthMap, cobolSchema), None)
         } else {
-          (None, getLengthFieldExpr(fieldOrExpression, cobolSchema))
+          (None, getLengthFieldExpr(fieldOrExpression, recordLengthMap, cobolSchema))
         }
       case None =>
         (None, None)
@@ -43,13 +43,13 @@ object ReaderParametersValidator {
   }
 
   @throws(classOf[IllegalStateException])
-  def getLengthField(recordLengthFieldName: String, cobolSchema: Copybook): Option[Primitive] = {
+  def getLengthField(recordLengthFieldName: String, recordLengthMap: Map[String, Int], cobolSchema: Copybook): Option[RecordLengthField] = {
     val field = cobolSchema.getFieldByName(recordLengthFieldName)
 
     val astNode = field match {
       case s: Primitive =>
-        if (!s.dataType.isInstanceOf[za.co.absa.cobrix.cobol.parser.ast.datatype.Integral]) {
-          throw new IllegalStateException(s"The record length field $recordLengthFieldName must be an integral type.")
+        if (!s.dataType.isInstanceOf[za.co.absa.cobrix.cobol.parser.ast.datatype.Integral] && recordLengthMap.isEmpty) {
+          throw new IllegalStateException(s"The record length field $recordLengthFieldName must be an integral type or a value mapping must be specified.")
         }
         if (s.occurs.isDefined && s.occurs.get > 1) {
           throw new IllegalStateException(s"The record length field '$recordLengthFieldName' cannot be an array.")
@@ -58,17 +58,17 @@
       case _ =>
         throw new IllegalStateException(s"The record length field $recordLengthFieldName must have a primitive integral type.")
     }
-    Some(astNode)
+    Some(RecordLengthField(astNode, recordLengthMap))
   }
 
   @throws(classOf[IllegalStateException])
-  def getLengthFieldExpr(recordLengthFieldExpr: String, cobolSchema: Copybook): Option[RecordLengthExpression] = {
+  def getLengthFieldExpr(recordLengthFieldExpr: String, recordLengthMap: Map[String, Int], cobolSchema: Copybook): Option[RecordLengthExpression] = {
     val evaluator = new NumberExprEvaluator(recordLengthFieldExpr)
     val vars = evaluator.getVariables
     val fields = vars.map { field =>
-      val primitive = getLengthField(field, cobolSchema)
+      val primitive = getLengthField(field, recordLengthMap, cobolSchema)
        .getOrElse(throw new IllegalArgumentException(s"The record length expression '$recordLengthFieldExpr' contains an unknown field '$field'."))
-      (field, primitive)
+      (field, primitive.field)
     }
     val requiredBytesToRead = if (fields.nonEmpty) {
       fields.map { case (_, field) =>
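A rough usage sketch of the changed validator API, mirroring how `VRLRecordReader` consumes it above (the option values and the `cobolSchema` in scope are assumed):

```scala
// Sketch: the validator yields either a plain length field (carrying its
// value map) or a length expression, never both.
val (recordLengthField, lengthFieldExpr) =
  ReaderParametersValidator.getEitherFieldAndExpression(
    Some("SEG-ID"),            // record_length_field (assumed)
    Map("A" -> 4, "B" -> 7),   // record_length_map (assumed)
    cobolSchema)               // parsed copybook, assumed in scope

val lengthField = recordLengthField.map(_.field)
val lengthMap   = recordLengthField.map(_.valueMap).getOrElse(Map.empty[String, Int])
```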

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/reader/iterator/VRLRecordReaderSpec.scala

Lines changed: 1 addition & 1 deletion

@@ -203,7 +203,7 @@ class VRLRecordReaderSpec extends AnyWordSpec {
           lengthFieldExpression = Some("LEN"))
       }
 
-      assert(ex.getMessage == "The record length field LEN must be an integral type.")
+      assert(ex.getMessage == "The record length field LEN must be an integral type or a value mapping must be specified.")
     }
   }

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

Lines changed: 10 additions & 5 deletions

@@ -23,7 +23,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat
 import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
 import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
 import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
-import za.co.absa.cobrix.cobol.parser.policies.{CommentPolicy, DebugFieldsPolicy, FillerNamingPolicy, MetadataPolicy, StringTrimmingPolicy}
+import za.co.absa.cobrix.cobol.parser.policies._
 import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat
 import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat._
 import za.co.absa.cobrix.cobol.reader.parameters._
@@ -926,8 +926,13 @@
   @throws(classOf[IllegalArgumentException])
   def getRecordLengthMappings(recordLengthMapJson: String): Map[String, Int] = {
     val parser = new ParserJson()
-    parser.parseMap(recordLengthMapJson)
-      .toSeq // Converting to a non-lazy sequence first. If .mapValues() is used the map stays lazy and errors pop up later
+    val json = try {
+      parser.parseMap(recordLengthMapJson)
+    } catch {
+      case NonFatal(ex) => throw new IllegalArgumentException(s"Unable to parse record length mapping JSON.", ex)
+    }
+
+    json.toSeq // Converting to a non-lazy sequence first. If .mapValues() is used the map stays lazy and errors pop up later
       .map { case (k, v) =>
         val vInt = v match {
           case num: Int => num
@@ -936,9 +941,9 @@
           try {
             str.toInt
           } catch {
-            case NonFatal(ex) => throw new IllegalArgumentException(s"Unsupported record length value: '$str'. Please, use numeric values only", ex)
+            case NonFatal(ex) => throw new IllegalArgumentException(s"Unsupported record length value: '$str'. Please, use numeric values only.", ex)
           }
-          case any => throw new IllegalArgumentException(s"Unsupported record length value: '$any'. Please, use numeric values only")
+          case any => throw new IllegalArgumentException(s"Unsupported record length value: '$any'. Please, use numeric values only.")
         }
         (k, vInt)
       }.toMap[String, Int]
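The behavior of `getRecordLengthMappings` after this change can be summarized with the sketch below; the exact values are taken from, or modeled on, the tests further down:

```scala
// Valid mappings parse into Map[String, Int]; numeric strings are converted.
CobolParametersParser.getRecordLengthMappings("""{"A":100,"B":"50"}""")
// -> Map("A" -> 100, "B" -> 50)

// Non-numeric values are rejected with a descriptive message.
CobolParametersParser.getRecordLengthMappings("""{"A": "ABC"}""")
// -> IllegalArgumentException: Unsupported record length value: 'ABC'. Please, use numeric values only.

// Malformed JSON is now wrapped into an IllegalArgumentException as well.
CobolParametersParser.getRecordLengthMappings("""Hmm...""")
// -> IllegalArgumentException: Unable to parse record length mapping JSON.
```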

spark-cobol/src/test/resources/log4j.properties

Lines changed: 1 addition & 0 deletions

@@ -32,3 +32,4 @@ log4j.logger.za.co.absa.cobrix.spark.cobol.source.index.IndexBuilder$=ERROR
 log4j.logger.za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer=ERROR
 log4j.logger.za.co.absa.cobrix.spark.cobol.utils.FileUtils$=OFF
 log4j.logger.za.co.absa.cobrix.spark.cobol.utils.FileUtils=OFF
+log4j.logger.za.co.absa.cobrix.cobol.parser.antlr.ParserJson=OFF

spark-cobol/src/test/resources/log4j2.properties

Lines changed: 3 additions & 0 deletions

@@ -39,3 +39,6 @@ logger.cobrix_file_utils1.level = OFF
 
 logger.cobrix_file_utils2.name = za.co.absa.cobrix.cobol.utils.FileUtils
 logger.cobrix_file_utils2.level = OFF
+
+logger.parserjson.name = za.co.absa.cobrix.cobol.parser.antlr.ParserJson
+logger.parserjson.level = OFF

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/ParametersParsingSpec.scala

Lines changed: 9 additions & 3 deletions

@@ -117,16 +117,22 @@ class ParametersParsingSpec extends AnyFunSuite {
     val ex = intercept[IllegalArgumentException] {
       CobolParametersParser.getRecordLengthMappings("""{"A": "ABC"}""")
     }
-    assert(ex.getMessage == "Unsupported record length value: 'ABC'. Please, use numeric values only")
+    assert(ex.getMessage == "Unsupported record length value: 'ABC'. Please, use numeric values only.")
 
     val ex2 = intercept[IllegalArgumentException] {
       CobolParametersParser.getRecordLengthMappings("""{"A": {"B": 12}}""")
     }
-    assert(ex2.getMessage == "Unsupported record length value: 'Map(B -> 12)'. Please, use numeric values only")
+    assert(ex2.getMessage == "Unsupported record length value: 'Map(B -> 12)'. Please, use numeric values only.")
 
-    assertThrows[IllegalArgumentException] {
+    val ex3 = intercept[IllegalArgumentException] {
       CobolParametersParser.getRecordLengthMappings("""{"A": {"B": 5000000000}}""")
     }
+    assert(ex3.getMessage == "Unsupported record length value: 'Map(B -> 5.0E9)'. Please, use numeric values only.")
+
+    val ex4 = intercept[IllegalArgumentException] {
+      CobolParametersParser.getRecordLengthMappings("""Hmm...""")
+    }
+    assert(ex4.getMessage == "Unable to parse record length mapping JSON.")
   }
 
 }
spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test37RecordLengthMappingSpec.scala

Lines changed: 104 additions & 0 deletions

@@ -0,0 +1,104 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.source.integration
+
+import org.apache.spark.SparkException
+import org.scalatest.wordspec.AnyWordSpec
+import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
+import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
+
+class Test37RecordLengthMappingSpec extends AnyWordSpec with SparkTestBase with BinaryFileFixture {
+  private val copybook =
+    """      01  R.
+             03 SEG-ID      PIC X(1).
+             03 TEXT        PIC X(7).
+    """
+
+  val dataSimple: Array[Byte] = Array(
+    0xC1, 0xF1, 0xF2, 0xF3,                                       // record 0 'A123'
+    0xC2, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6,                     // record 1 'B123456'
+    0xC3, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7).map(_.toByte) // record 2 'C1234567'
+
+  val dataWithFileOffsets: Array[Byte] = Array(
+    0x00,                                                         // header
+    0xC1, 0xF1, 0xF2, 0xF3,                                       // record 0 'A123'
+    0xC2, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6,                     // record 1 'B123456'
+    0xC3, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,               // record 2 'C1234567'
+    0x00, 0x00).map(_.toByte)                                     // footer
+
+  "data with record length mapping" should {
+    "work for simple mappings" in {
+      withTempBinFile("rdw_test", ".tmp", dataSimple) { tempFile =>
+        val expected = """{"SEG_ID":"A","TEXT":"123"},{"SEG_ID":"B","TEXT":"123456"},{"SEG_ID":"C","TEXT":"1234567"}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("input_split_records", "2")
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":7,"C":8}""")
+          .load(tempFile)
+
+        val actual = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        assert(actual == expected)
+      }
+    }
+
+    "work for data with offsets" in {
+      withTempBinFile("rdw_test", ".tmp", dataWithFileOffsets) { tempFile =>
+        val expected = """{"SEG_ID":"A","TEXT":"123"},{"SEG_ID":"B","TEXT":"123456"},{"SEG_ID":"C","TEXT":"1234567"}"""
+
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("file_start_offset", 1)
+          .option("file_end_offset", 2)
+          .option("pedantic", "true")
+          .option("record_length_map", """{"A":4,"B":7,"C":8}""")
+          .load(tempFile)
+
+        val actual = df.orderBy("SEG_ID").toJSON.collect().mkString(",")
+
+        assert(actual == expected)
+      }
+    }
+
+    "throw an exception for unknown mapping" in {
+      withTempBinFile("rdw_test", ".tmp", dataSimple) { tempFile =>
+        val df = spark.read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("record_format", "F")
+          .option("record_length_field", "SEG-ID")
+          .option("record_length_map", """{"A":4,"B":7}""")
+          .option("pedantic", "true")
+          .load(tempFile)
+
+        val ex = intercept[SparkException] {
+          df.count()
+        }
+
+        assert(ex.getMessage.contains("Record length value 'C' is not mapped to a record length"))
+      }
+    }
+  }
+}
