
Commit a39c527

#672 Tidy up the code, remove a few instances of code duplications.
1 parent 4c3d79f commit a39c527

File tree: 12 files changed (+36, -162 lines)


cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/FixedLenNestedReader.scala

Lines changed: 8 additions & 32 deletions
@@ -16,44 +16,20 @@
 
 package za.co.absa.cobrix.cobol.reader
 
-import java.nio.charset.{Charset, StandardCharsets}
-import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
-import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
-import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC}
-import za.co.absa.cobrix.cobol.parser.policies.FillerNamingPolicy
-import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
-import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat.{AsciiText, CobrixAsciiText, FixedLength}
-import za.co.absa.cobrix.cobol.parser.{Copybook, CopybookParser}
+import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat.{AsciiText, FixedLength}
 import za.co.absa.cobrix.cobol.reader.extractors.record.RecordHandler
 import za.co.absa.cobrix.cobol.reader.iterator.FixedLenNestedRowIterator
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
-import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 
-import scala.collection.immutable.HashMap
 import scala.reflect.ClassTag
 
 /**
   * The Cobol data reader that produces nested structure schema
   *
   * @param copyBookContents A copybook contents.
-  * @param startOffset Specifies the number of bytes at the beginning of each record that can be ignored.
-  * @param endOffset Specifies the number of bytes at the end of each record that can be ignored.
-  * @param schemaRetentionPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
   */
 class FixedLenNestedReader[T: ClassTag](copyBookContents: Seq[String],
-                                        isEbcdic: Boolean,
-                                        ebcdicCodePage: CodePage,
-                                        floatingPointFormat: FloatingPointFormat,
-                                        startOffset: Int,
-                                        endOffset: Int,
-                                        schemaRetentionPolicy: SchemaRetentionPolicy,
-                                        stringTrimmingPolicy: StringTrimmingPolicy,
-                                        dropGroupFillers: Boolean,
-                                        dropValueFillers: Boolean,
-                                        fillerNamingPolicy: FillerNamingPolicy,
-                                        nonTerminals: Seq[String],
-                                        occursMappings: Map[String, Map[String, Int]],
                                         readerProperties: ReaderParameters,
                                         handler: RecordHandler[T]) extends FixedLenReader with Serializable {
 
@@ -63,22 +39,22 @@ class FixedLenNestedReader[T: ClassTag](copyBookContents: Seq[String],
 
   override def getRecordSize: Int = {
     val recordInternalsSize = readerProperties.recordLength.getOrElse(cobolSchema.getRecordSize)
-    recordInternalsSize + startOffset + endOffset
+    recordInternalsSize + readerProperties.startOffset + readerProperties.endOffset
   }
 
   @throws(classOf[Exception])
   override def getRecordIterator(binaryData: Array[Byte]): Iterator[Seq[Any]] = {
     checkBinaryDataValidity(binaryData)
     val singleRecordIterator = readerProperties.recordFormat == AsciiText || readerProperties.recordFormat == FixedLength
-    new FixedLenNestedRowIterator(binaryData, cobolSchema, readerProperties, schemaRetentionPolicy, startOffset, endOffset, singleRecordIterator, handler)
+    new FixedLenNestedRowIterator(binaryData, cobolSchema, readerProperties, readerProperties.startOffset, readerProperties.endOffset, singleRecordIterator, handler)
   }
 
   def checkBinaryDataValidity(binaryData: Array[Byte]): Unit = {
-    if (startOffset < 0) {
-      throw new IllegalArgumentException(s"Invalid record start offset = $startOffset. A record start offset cannot be negative.")
+    if (readerProperties.startOffset < 0) {
+      throw new IllegalArgumentException(s"Invalid record start offset = ${readerProperties.startOffset}. A record start offset cannot be negative.")
     }
-    if (endOffset < 0) {
-      throw new IllegalArgumentException(s"Invalid record end offset = $endOffset. A record end offset cannot be negative.")
+    if (readerProperties.endOffset < 0) {
+      throw new IllegalArgumentException(s"Invalid record end offset = ${readerProperties.endOffset}. A record end offset cannot be negative.")
     }
     readerProperties.recordLength match {
       case Some(len) =>
@@ -96,7 +72,7 @@ class FixedLenNestedReader[T: ClassTag](copyBookContents: Seq[String],
   }
 
   private def getExpectedLength: Int = {
-    cobolSchema.getRecordSize + startOffset + endOffset
+    cobolSchema.getRecordSize + readerProperties.startOffset + readerProperties.endOffset
   }
 
   private def loadCopyBook(copyBookContents: Seq[String]): CobolSchema = {
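
Note (illustration, not part of the diff): a minimal sketch of how a reader is constructed after this change, mirroring the updated test spec further below. Everything that used to be a separate constructor argument (encoding, offsets, schema retention, trimming and filler policies) is now carried by ReaderParameters; copybookContents and SimpleRecordHandler are assumed to be in scope, as they are in the test suite.

import za.co.absa.cobrix.cobol.reader.FixedLenNestedReader
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat

// Former constructor arguments are now fields of ReaderParameters.
val readerProperties = ReaderParameters(
  isEbcdic = true,
  startOffset = 0,
  endOffset = 0,
  recordFormat = RecordFormat.FixedLength,
  recordLength = None,
  asciiCharset = None
)

// copybookContents (copybook text) and SimpleRecordHandler are assumed to exist,
// as in FixedLenNestedReaderSpec below.
val reader = new FixedLenNestedReader[Array[Any]](
  Seq(copybookContents),
  readerProperties,
  new SimpleRecordHandler
)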

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/FixedLenNestedRowIterator.scala

Lines changed: 1 addition & 2 deletions
@@ -36,7 +36,6 @@ class FixedLenNestedRowIterator[T: ClassTag](
   val binaryData: Array[Byte],
   val cobolSchema: CobolSchema,
   readerProperties: ReaderParameters,
-  policy: SchemaRetentionPolicy,
   startOffset: Int,
   endOffset: Int,
   singleRecordOnly: Boolean,
@@ -89,7 +88,7 @@ class FixedLenNestedRowIterator[T: ClassTag](
       cobolSchema.getCobolSchema.ast,
       binaryData,
       offset,
-      policy,
+      readerProperties.schemaPolicy,
       readerProperties.variableSizeOccurs,
       generateRecordBytes = readerProperties.generateRecordBytes,
       activeSegmentRedefine = activeSegmentRedefine,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ case class CobolParameters(
   isEbcdic: Boolean,
   ebcdicCodePage: String,
   ebcdicCodePageClass: Option[String],
-  asciiCharset: String,
+  asciiCharset: Option[String],
   fieldCodePage: Map[String, String],
   isUtf16BigEndian: Boolean,
   floatingPointFormat: FloatingPointFormat,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ case class ReaderParameters(
   isText: Boolean = false,
   ebcdicCodePage: String = "common",
   ebcdicCodePageClass: Option[String] = None,
-  asciiCharset: String = "",
+  asciiCharset: Option[String] = None,
   fieldCodePage: Map[String, String] = Map.empty[String, String],
   isUtf16BigEndian: Boolean = true,
   floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
@@ -103,7 +103,7 @@ case class ReaderParameters(
   fileEndOffset: Int = 0,
   generateRecordId: Boolean = false,
   generateRecordBytes: Boolean = false,
-  schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.KeepOriginal,
+  schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
   stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
   allowPartialRecords: Boolean = false,
   multisegment: Option[MultisegmentParameters] = None,
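
Note (illustration, not part of the diff): with these defaults a plain ReaderParameters() now carries no ASCII charset and the CollapseRoot schema retention policy, where the previous defaults were an empty string and KeepOriginal.

import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy

val defaults = ReaderParameters()
assert(defaults.asciiCharset.isEmpty)                               // previously asciiCharset = ""
assert(defaults.schemaPolicy == SchemaRetentionPolicy.CollapseRoot) // previously KeepOriginal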

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala

Lines changed: 4 additions & 1 deletion
@@ -77,7 +77,10 @@ object CobolSchema {
     val segmentRedefines = readerParameters.multisegment.map(r => r.segmentIdRedefineMap.values.toList.distinct).getOrElse(Nil)
     val fieldParentMap = readerParameters.multisegment.map(r => r.fieldParentMap).getOrElse(HashMap[String, String]())
     val codePage = getCodePage(readerParameters.ebcdicCodePage, readerParameters.ebcdicCodePageClass)
-    val asciiCharset = if (readerParameters.asciiCharset.isEmpty) StandardCharsets.UTF_8 else Charset.forName(readerParameters.asciiCharset)
+    val asciiCharset = readerParameters.asciiCharset match {
+      case Some(asciiCharset) => Charset.forName(asciiCharset)
+      case None => StandardCharsets.UTF_8
+    }
 
     val schema = if (copyBookContents.size == 1)
       CopybookParser.parseTree(encoding,
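
Note (illustration, not part of the diff): the same Option-based charset resolution as a standalone sketch; resolveAsciiCharset is a hypothetical helper name, not a method in the codebase.

import java.nio.charset.{Charset, StandardCharsets}

// None falls back to UTF-8; Some(name) is resolved via Charset.forName,
// which throws UnsupportedCharsetException for unknown charset names.
def resolveAsciiCharset(asciiCharset: Option[String]): Charset =
  asciiCharset match {
    case Some(name) => Charset.forName(name)
    case None       => StandardCharsets.UTF_8
  }

resolveAsciiCharset(None)             // UTF-8 (default)
resolveAsciiCharset(Some("us-ascii")) // US-ASCII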

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/reader/FixedLenNestedReaderSpec.scala

Lines changed: 5 additions & 14 deletions
@@ -117,7 +117,7 @@ class FixedLenNestedReaderSpec extends AnyWordSpec {
     }
 
     "return an iterator for single ASCII record" in {
-      val reader = getUseCase(Seq(copybookContents), recordFormat = RecordFormat.AsciiText, asciiCharset = "us-ascii")
+      val reader = getUseCase(Seq(copybookContents), recordFormat = RecordFormat.AsciiText, asciiCharset = Some("us-ascii"))
 
       val it = reader.getRecordIterator(fixedLengthDataExample)
 
@@ -206,28 +206,19 @@ class FixedLenNestedReaderSpec extends AnyWordSpec {
                  startOffset: Int = 0,
                  endOffset: Int = 0,
                  recordLength: Option[Int] = None,
-                 asciiCharset: String = ""
+                 asciiCharset: Option[String] = None
                 ): FixedLenNestedReader[scala.Array[Any]] = {
     val readerProperties = za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters(
+      isEbcdic = isEbcdic,
+      startOffset = startOffset,
+      endOffset = endOffset,
       recordFormat = recordFormat,
       recordLength = recordLength,
       asciiCharset = asciiCharset
     )
 
     val reader = new FixedLenNestedReader[scala.Array[Any]](
       copybooks,
-      isEbcdic = isEbcdic,
-      ebcdicCodePage = new CodePageCommon,
-      floatingPointFormat = FloatingPointFormat.IEEE754,
-      startOffset = startOffset,
-      endOffset = endOffset,
-      schemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
-      stringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
-      dropGroupFillers = false,
-      dropValueFillers = false,
-      fillerNamingPolicy = FillerNamingPolicy.SequenceNumbers,
-      nonTerminals = Nil,
-      occursMappings = Map.empty,
       readerProperties = readerProperties,
       handler = new SimpleRecordHandler)
 

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ object CobolParametersParser extends Logging {
     val stringTrimmingPolicy = getStringTrimmingPolicy(params)
     val ebcdicCodePageName = params.getOrElse(PARAM_EBCDIC_CODE_PAGE, "common")
     val ebcdicCodePageClass = params.get(PARAM_EBCDIC_CODE_PAGE_CLASS)
-    val asciiCharset = params.getOrElse(PARAM_ASCII_CHARSET, "")
+    val asciiCharset = params.get(PARAM_ASCII_CHARSET)
 
     val recordFormatDefined = getRecordFormat(params)
 
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/reader/FixedLenNestedReader.scala

Lines changed: 3 additions & 29 deletions
@@ -19,12 +19,7 @@ package za.co.absa.cobrix.spark.cobol.reader
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType
-import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
-import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
-import za.co.absa.cobrix.cobol.parser.policies.FillerNamingPolicy
-import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
-import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
 import za.co.absa.cobrix.cobol.reader.{FixedLenNestedReader => ReaderFixedLenNestedReader}
 import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
 
@@ -33,31 +28,10 @@ import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
  * The Cobol data reader that produces nested structure schema
  *
  * @param copyBookContents A copybook contents.
- * @param startOffset Specifies the number of bytes at the beginning of each record that can be ignored.
- * @param endOffset Specifies the number of bytes at the end of each record that can be ignored.
- * @param schemaRetentionPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param readerProperties Properties reflecting parsing copybooks and decoding data.
  */
-final class FixedLenNestedReader(copyBookContents: Seq[String],
-                                 isEbcdic: Boolean = true,
-                                 ebcdicCodePage: CodePage,
-                                 floatingPointFormat: FloatingPointFormat,
-                                 startOffset: Int = 0,
-                                 endOffset: Int = 0,
-                                 schemaRetentionPolicy: SchemaRetentionPolicy,
-                                 stringTrimmingPolicy: StringTrimmingPolicy,
-                                 dropGroupFillers: Boolean,
-                                 dropValueFillers: Boolean,
-                                 fillerNamingPolicy: FillerNamingPolicy,
-                                 nonTerminals: Seq[String],
-                                 occursMappings: Map[String, Map[String, Int]],
-                                 readerProperties: ReaderParameters
-                                )
-  extends ReaderFixedLenNestedReader[GenericRow](
-    copyBookContents, isEbcdic, ebcdicCodePage, floatingPointFormat,
-    startOffset, endOffset, schemaRetentionPolicy, stringTrimmingPolicy,
-    dropGroupFillers, dropValueFillers, fillerNamingPolicy, nonTerminals, occursMappings, readerProperties,
-    new RowHandler()
-  ) with FixedLenReader with Serializable {
+final class FixedLenNestedReader(copyBookContents: Seq[String], readerProperties: ReaderParameters)
+  extends ReaderFixedLenNestedReader[GenericRow](copyBookContents, readerProperties, new RowHandler()) with FixedLenReader with Serializable {
 
   class RowIterator(iterator: Iterator[Seq[Any]]) extends Iterator[Row] {
     override def hasNext: Boolean = iterator.hasNext

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/reader/FixedLenTextReader.scala

Lines changed: 7 additions & 34 deletions
@@ -19,12 +19,7 @@ package za.co.absa.cobrix.spark.cobol.reader
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType
-import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
-import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
-import za.co.absa.cobrix.cobol.parser.policies.FillerNamingPolicy
-import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
-import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
 import za.co.absa.cobrix.cobol.reader.{FixedLenNestedReader => ReaderFixedLenNestedReader}
 import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
 
@@ -33,32 +28,10 @@ import za.co.absa.cobrix.spark.cobol.schema.CobolSchema
  * The Cobol data reader from text files that produces nested structure schema
  *
  * @param copyBookContents A copybook contents.
- * @param startOffset Specifies the number of bytes at the beginning of each record that can be ignored.
- * @param endOffset Specifies the number of bytes at the end of each record that can be ignored.
- * @param schemaRetentionPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param readerProperties Properties reflecting parsing copybooks and decoding data.
  */
-final class FixedLenTextReader(copyBookContents: Seq[String],
-                               isEbcdic: Boolean = true,
-                               ebcdicCodePage: CodePage,
-                               val asciiCharset: Option[String],
-                               floatingPointFormat: FloatingPointFormat,
-                               startOffset: Int = 0,
-                               endOffset: Int = 0,
-                               schemaRetentionPolicy: SchemaRetentionPolicy,
-                               stringTrimmingPolicy: StringTrimmingPolicy,
-                               dropGroupFillers: Boolean,
-                               dropValueFillers: Boolean,
-                               fillerNamingPolicy: FillerNamingPolicy,
-                               nonTerminals: Seq[String],
-                               occursMappings: Map[String, Map[String, Int]],
-                               readerProperties: ReaderParameters
-                              )
-  extends ReaderFixedLenNestedReader[GenericRow](
-    copyBookContents, isEbcdic, ebcdicCodePage, floatingPointFormat,
-    startOffset, endOffset, schemaRetentionPolicy, stringTrimmingPolicy,
-    dropGroupFillers, dropValueFillers, fillerNamingPolicy, nonTerminals, occursMappings, readerProperties,
-    new RowHandler()
-  ) with FixedLenReader with Serializable {
+final class FixedLenTextReader(copyBookContents: Seq[String], readerProperties: ReaderParameters)
+  extends ReaderFixedLenNestedReader[GenericRow](copyBookContents, readerProperties, new RowHandler()) with FixedLenReader with Serializable {
 
   class RowIterator(iterator: Iterator[Seq[Any]]) extends Iterator[Row] {
     override def hasNext: Boolean = iterator.hasNext
@@ -80,11 +53,11 @@ final class FixedLenTextReader(copyBookContents: Seq[String],
   }
 
   override def checkBinaryDataValidity(binaryData: Array[Byte]): Unit = {
-    if (startOffset < 0) {
-      throw new IllegalArgumentException(s"Invalid record start offset = $startOffset. A record start offset cannot be negative.")
+    if (readerProperties.startOffset < 0) {
+      throw new IllegalArgumentException(s"Invalid record start offset = ${readerProperties.startOffset}. A record start offset cannot be negative.")
     }
-    if (endOffset < 0) {
-      throw new IllegalArgumentException(s"Invalid record end offset = $endOffset. A record end offset cannot be negative.")
+    if (readerProperties.endOffset < 0) {
+      throw new IllegalArgumentException(s"Invalid record end offset = ${readerProperties.endOffset}. A record end offset cannot be negative.")
     }
   }
 }
