Skip to content

Commit 4c3d79f

Browse files
committed
#672 Implement a method that returns the Spark schema for copybooks and use it across the code base.
1 parent ced5908 commit 4c3d79f

File tree

12 files changed

+202
-250
lines changed

12 files changed

+202
-250
lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/FixedLenNestedReader.scala

Lines changed: 1 addition & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -100,57 +100,6 @@ class FixedLenNestedReader[T: ClassTag](copyBookContents: Seq[String],
100100
}
101101

102102
private def loadCopyBook(copyBookContents: Seq[String]): CobolSchema = {
103-
val encoding = if (isEbcdic) EBCDIC else ASCII
104-
val segmentRedefines = readerProperties.multisegment.map(r => r.segmentIdRedefineMap.values.toList.distinct).getOrElse(Nil)
105-
val fieldParentMap = readerProperties.multisegment.map(r => r.fieldParentMap).getOrElse(HashMap[String, String]())
106-
val asciiCharset = if (readerProperties.asciiCharset.isEmpty) StandardCharsets.UTF_8 else Charset.forName(readerProperties.asciiCharset)
107-
108-
val schema = if (copyBookContents.size == 1)
109-
CopybookParser.parseTree(encoding,
110-
copyBookContents.head,
111-
dropGroupFillers,
112-
dropValueFillers,
113-
fillerNamingPolicy,
114-
segmentRedefines,
115-
fieldParentMap,
116-
stringTrimmingPolicy,
117-
readerProperties.commentPolicy,
118-
readerProperties.strictSignOverpunch,
119-
readerProperties.improvedNullDetection,
120-
readerProperties.decodeBinaryAsHex,
121-
ebcdicCodePage,
122-
asciiCharset,
123-
readerProperties.isUtf16BigEndian,
124-
floatingPointFormat,
125-
nonTerminals,
126-
occursMappings,
127-
readerProperties.debugFieldsPolicy,
128-
readerProperties.fieldCodePage)
129-
else
130-
Copybook.merge(
131-
copyBookContents.map(
132-
CopybookParser.parseTree(encoding,
133-
_,
134-
dropGroupFillers,
135-
dropValueFillers,
136-
fillerNamingPolicy,
137-
segmentRedefines,
138-
fieldParentMap,
139-
stringTrimmingPolicy,
140-
readerProperties.commentPolicy,
141-
readerProperties.strictSignOverpunch,
142-
readerProperties.improvedNullDetection,
143-
readerProperties.decodeBinaryAsHex,
144-
ebcdicCodePage,
145-
asciiCharset,
146-
readerProperties.isUtf16BigEndian,
147-
floatingPointFormat,
148-
nonTerminals,
149-
occursMappings,
150-
readerProperties.debugFieldsPolicy,
151-
readerProperties.fieldCodePage)
152-
)
153-
)
154-
new CobolSchema(schema, schemaRetentionPolicy, "", false, readerProperties.generateRecordBytes, metadataPolicy = readerProperties.metadataPolicy)
103+
CobolSchema.fromReaderParameters(copyBookContents, readerProperties)
155104
}
156105
}

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -199,59 +199,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
199199
}
200200

201201
private def loadCopyBook(copyBookContents: Seq[String]): CobolSchema = {
202-
val encoding = if (readerProperties.isEbcdic) EBCDIC else ASCII
203-
val segmentRedefines = readerProperties.multisegment.map(r => r.segmentIdRedefineMap.values.toList.distinct).getOrElse(Nil)
204-
val fieldParentMap = readerProperties.multisegment.map(r => r.fieldParentMap).getOrElse(HashMap[String, String]())
205-
val codePage = getCodePage(readerProperties.ebcdicCodePage, readerProperties.ebcdicCodePageClass)
206-
val asciiCharset = if (readerProperties.asciiCharset.isEmpty) StandardCharsets.US_ASCII else Charset.forName(readerProperties.asciiCharset)
207-
208-
val schema = if (copyBookContents.size == 1)
209-
CopybookParser.parseTree(encoding,
210-
copyBookContents.head,
211-
readerProperties.dropGroupFillers,
212-
readerProperties.dropValueFillers,
213-
readerProperties.fillerNamingPolicy,
214-
segmentRedefines,
215-
fieldParentMap,
216-
readerProperties.stringTrimmingPolicy,
217-
readerProperties.commentPolicy,
218-
readerProperties.strictSignOverpunch,
219-
readerProperties.improvedNullDetection,
220-
readerProperties.decodeBinaryAsHex,
221-
codePage,
222-
asciiCharset,
223-
readerProperties.isUtf16BigEndian,
224-
readerProperties.floatingPointFormat,
225-
readerProperties.nonTerminals,
226-
readerProperties.occursMappings,
227-
readerProperties.debugFieldsPolicy,
228-
readerProperties.fieldCodePage)
229-
else
230-
Copybook.merge(copyBookContents.map(cpb =>
231-
CopybookParser.parseTree(encoding,
232-
cpb,
233-
readerProperties.dropGroupFillers,
234-
readerProperties.dropValueFillers,
235-
readerProperties.fillerNamingPolicy,
236-
segmentRedefines,
237-
fieldParentMap,
238-
readerProperties.stringTrimmingPolicy,
239-
readerProperties.commentPolicy,
240-
readerProperties.strictSignOverpunch,
241-
readerProperties.improvedNullDetection,
242-
readerProperties.decodeBinaryAsHex,
243-
codePage,
244-
asciiCharset,
245-
readerProperties.isUtf16BigEndian,
246-
readerProperties.floatingPointFormat,
247-
nonTerminals = readerProperties.nonTerminals,
248-
readerProperties.occursMappings,
249-
readerProperties.debugFieldsPolicy,
250-
readerProperties.fieldCodePage)
251-
))
252-
val segIdFieldCount = readerProperties.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0)
253-
val segmentIdPrefix = readerProperties.multisegment.map(p => p.segmentIdPrefix).getOrElse("")
254-
new CobolSchema(schema, readerProperties.schemaPolicy, readerProperties.inputFileNameColumn, readerProperties.generateRecordId, readerProperties.generateRecordBytes, segIdFieldCount, segmentIdPrefix)
202+
CobolSchema.fromReaderParameters(copyBookContents, readerProperties)
255203
}
256204

257205
private def checkInputArgumentsValidity(): Unit = {
@@ -271,13 +219,6 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
271219
}
272220
}
273221

274-
private def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = {
275-
codePageClass match {
276-
case Some(c) => CodePage.getCodePageByClass(c)
277-
case None => CodePage.getCodePageByName(codePageName)
278-
}
279-
}
280-
281222
private def getRecordHeaderParser: RecordHeaderParser = {
282223
val adjustment1 = if (readerProperties.isRdwPartRecLength) -4 else 0
283224
val adjustment2 = readerProperties.rdwAdjustment

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,19 @@
1616

1717
package za.co.absa.cobrix.cobol.reader.schema
1818

19+
import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
20+
1921
import java.time.ZonedDateTime
2022
import java.time.format.DateTimeFormatter
21-
import za.co.absa.cobrix.cobol.parser.Copybook
23+
import za.co.absa.cobrix.cobol.parser.{Copybook, CopybookParser}
24+
import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC}
2225
import za.co.absa.cobrix.cobol.parser.policies.MetadataPolicy
26+
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
2327
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaRetentionPolicy
2428

29+
import java.nio.charset.{Charset, StandardCharsets}
30+
import scala.collection.immutable.HashMap
31+
2532

2633
/**
2734
* This class provides a view on a COBOL schema from the perspective of Spark. When provided with a parsed copybook the class
@@ -59,3 +66,72 @@ class CobolSchema(val copybook: Copybook,
5966
timestampFormat.format(now)
6067
}
6168
}
69+
70+
object CobolSchema {
  /**
    * Builds a [[CobolSchema]] from one or more copybooks using the options carried by the reader parameters.
    *
    * A single copybook is parsed directly; several copybooks are parsed one by one with identical
    * options and then combined with `Copybook.merge`.
    *
    * @param copyBookContents The contents of the copybooks. Must contain at least one element.
    * @param readerParameters The reader options that control parsing and the generated schema.
    * @return The schema built from the parsed (and, if needed, merged) copybook.
    * @throws IllegalArgumentException if no copybook contents are provided.
    */
  def fromReaderParameters(copyBookContents: Seq[String], readerParameters: ReaderParameters): CobolSchema = {
    if (copyBookContents.isEmpty) {
      throw new IllegalArgumentException("At least one copybook must be specified.")
    }

    val encoding = if (readerParameters.isEbcdic) EBCDIC else ASCII
    val segmentRedefines = readerParameters.multisegment.map(r => r.segmentIdRedefineMap.values.toList.distinct).getOrElse(Nil)
    val fieldParentMap = readerParameters.multisegment.map(r => r.fieldParentMap).getOrElse(HashMap[String, String]())
    val codePage = getCodePage(readerParameters.ebcdicCodePage, readerParameters.ebcdicCodePageClass)
    // An empty charset name means "not specified"; UTF-8 is used in that case.
    val asciiCharset = if (readerParameters.asciiCharset.isEmpty) StandardCharsets.UTF_8 else Charset.forName(readerParameters.asciiCharset)

    // Single place where parsing options are passed, so that the single-copybook and
    // the multi-copybook paths can never diverge in the options they use.
    def parseCopybook(contents: String) =
      CopybookParser.parseTree(encoding,
        contents,
        readerParameters.dropGroupFillers,
        readerParameters.dropValueFillers,
        readerParameters.fillerNamingPolicy,
        segmentRedefines,
        fieldParentMap,
        readerParameters.stringTrimmingPolicy,
        readerParameters.commentPolicy,
        readerParameters.strictSignOverpunch,
        readerParameters.improvedNullDetection,
        readerParameters.decodeBinaryAsHex,
        codePage,
        asciiCharset,
        readerParameters.isUtf16BigEndian,
        readerParameters.floatingPointFormat,
        readerParameters.nonTerminals,
        readerParameters.occursMappings,
        readerParameters.debugFieldsPolicy,
        readerParameters.fieldCodePage)

    val schema = if (copyBookContents.size == 1)
      parseCopybook(copyBookContents.head)
    else
      Copybook.merge(copyBookContents.map(cpb => parseCopybook(cpb)))

    val segIdFieldCount = readerParameters.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0)
    val segmentIdPrefix = readerParameters.multisegment.map(p => p.segmentIdPrefix).getOrElse("")

    new CobolSchema(schema,
      readerParameters.schemaPolicy,
      readerParameters.inputFileNameColumn,
      readerParameters.generateRecordId,
      readerParameters.generateRecordBytes,
      segIdFieldCount,
      segmentIdPrefix,
      readerParameters.metadataPolicy)
  }

  /**
    * Resolves the EBCDIC code page to use.
    *
    * @param codePageName  The name of the code page; used only when no class is specified.
    * @param codePageClass An optional code page class name; when defined it takes precedence over the name.
    * @return The resolved code page.
    */
  def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = {
    codePageClass match {
      case Some(c) => CodePage.getCodePageByClass(c)
      case None => CodePage.getCodePageByName(codePageName)
    }
  }
}

0 commit comments

Comments
 (0)