Skip to content

Commit 1006331

Browse files
Steffengreiner, KochTobi, wow-such-code
authored
Enable NanoporeParser to validate and parse pod5 based nanopore structures (#127)
* Enable NanoporeParser to check for pod5 and dorado basecaller generated files
* Add Full data structure example
* Add JD

Co-authored-by: Tobias Koch <[email protected]>
Co-authored-by: wow-such-code <[email protected]>
1 parent abfb920 commit 1006331

File tree

110 files changed

+317
-22
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+317
-22
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@
128128
<dependency>
129129
<artifactId>data-model-lib</artifactId>
130130
<groupId>life.qbic</groupId>
131-
<version>2.25.0</version>
131+
<version>2.27.0</version>
132132
</dependency>
133133
<dependency>
134134
<groupId>org.mockito</groupId>

src/main/groovy/life/qbic/utils/NanoporeParser.groovy

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package life.qbic.utils
33
import com.fasterxml.jackson.databind.ObjectMapper
44
import groovy.json.JsonSlurper
55
import groovy.util.logging.Log4j2
6+
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputDoradoMinimal
67
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputMinimal
78
import net.jimblackler.jsonschemafriend.Schema
89
import net.jimblackler.jsonschemafriend.SchemaStore
@@ -14,7 +15,6 @@ import java.nio.file.Path
1415
import java.nio.file.Paths
1516
import java.text.ParseException
1617
import life.qbic.datamodel.datasets.OxfordNanoporeExperiment
17-
1818
import java.util.stream.Collectors
1919

2020
@Log4j2
@@ -95,8 +95,8 @@ class NanoporeParser {
9595
jsonStarted = true
9696
}
9797
if (jsonStarted) {
98-
def split = line.replaceAll("\\s+","").split(":")
99-
if(split.size() == 2 && split[1].replaceAll('"',"").size() <= 1){
98+
def split = line.replaceAll("\\s+", "").split(":")
99+
if (split.size() == 2 && split[1].replaceAll('"', "").size() <= 1) {
100100
log.info("Metadata value ${split[0]} missing in ${reportFile["path"]}")
101101
}
102102
buffer.append(line)
@@ -110,12 +110,11 @@ class NanoporeParser {
110110
new File(Paths.get(root.toString(), summaryFile["path"].toString()) as String)
111111
.readLines().each { line ->
112112
def split = line.split("=")
113-
if(split.size() > 1){
113+
if (split.size() > 1) {
114114
finalMetaData[split[0]] = split[1]
115-
}
116-
else {
115+
} else {
117116
log.info("Metadata value ${split[0]} missing in ${summaryFile["path"]}, defaulting to empty value")
118-
finalMetaData[split[0]] = ""
117+
finalMetaData[split[0]] = ""
119118
}
120119
}
121120
return finalMetaData
@@ -178,18 +177,27 @@ class NanoporeParser {
178177
* @throws net.jimblackler.jsonschemafriend.ValidationException
179178
*/
180179
private static void validateJson(String json) throws ValidationException {
181-
// Step 1: load schema
180+
// Step 1: load json
182181
ObjectMapper objectMapper = new ObjectMapper()
183182
Object jsonObject = objectMapper.readValue(json, Object)
183+
184184
SchemaStore schemaStore = new SchemaStore()
185-
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
186185
Validator validator = new Validator()
187-
validator.validate(schema, jsonObject)
186+
try {
187+
//Validate against Fast5 Based Oxford Measurement
188+
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
189+
validator.validate(schema, jsonObject)
190+
} catch (ValidationException ignored) {
191+
//Validate against Pod5 Based Oxford Measurement
192+
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputDoradoMinimal.getSchemaAsStream())
193+
validator.validate(schema, jsonObject)
194+
}
188195
}
189196

190197
/*
191198
* Converts a file tree into a json object.
192199
*/
200+
193201
private static class DirectoryConverter {
194202
private static final PREDEFINED_EXTENSIONS = ["fastq.gz"]
195203
private static final IGNORED_FOLDERNAMES = ["qc"]
@@ -239,11 +247,11 @@ class NanoporeParser {
239247
List<File> children = currentDirectory.listFiles()
240248

241249
List<File> visibleChildren = children.stream()
242-
.filter(file -> !file.isHidden()).collect(Collectors.toList());
250+
.filter(file -> !file.isHidden()).collect(Collectors.toList())
243251

244252
for (File file : children) {
245253
if (!visibleChildren.contains(file)) {
246-
hiddenFiles.add(file);
254+
hiddenFiles.add(file)
247255
}
248256
}
249257

@@ -252,11 +260,11 @@ class NanoporeParser {
252260
return !IGNORED_FOLDERNAMES.contains(currentFolderName)
253261
}.collect {
254262
file ->
255-
if (file.isFile()) {
256-
convertFile(file.toPath())
257-
} else if (file.isDirectory()) {
258-
convertDirectory(file.toPath())
259-
}
263+
if (file.isFile()) {
264+
convertFile(file.toPath())
265+
} else if (file.isDirectory()) {
266+
convertDirectory(file.toPath())
267+
}
260268
}
261269

262270
def convertedDirectory = [

src/test/groovy/life/qbic/utils/NanoporeParserSpec.groovy

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,41 @@ class NanoporeParserSpec extends Specification {
125125
thrown(ValidationException)
126126
}
127127

128+
def "parsing a valid minimal file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
129+
given:
130+
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_minimal")
131+
when:
132+
def experiment = NanoporeParser.parseFileStructure(pathToDirectory)
133+
then:
134+
assert experiment instanceof OxfordNanoporeExperiment
135+
// Check that the metadata from the report file has been retrieved
136+
assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
137+
// Check that the metadata from the summary file has been retrieved
138+
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
139+
}
140+
141+
def "parsing a valid file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
142+
given:
143+
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_example")
144+
when:
145+
def experiment = NanoporeParser.parseFileStructure(pathToDirectory)
146+
then:
147+
assert experiment instanceof OxfordNanoporeExperiment
148+
// Check that the metadata from the report file has been retrieved
149+
assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
150+
// Check that the metadata from the summary file has been retrieved
151+
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
152+
}
153+
154+
def "parsing an invalid minimal file structure for dorado based basecalling leads to a ValidationException"() {
155+
given:
156+
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder")
157+
when:
158+
def experiment = NanoporeParser.parseFileStructure(pathToDirectory)
159+
then:
160+
thrown(ValidationException)
161+
}
162+
128163
def "parsing the alternative valid file structure with metadata missing returns an OxfordNanoporeExperiment Object"() {
129164
given:
130165
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_new_minimal")

src/test/resources/dummyFileSystem/nanopore-instrument-output/fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder/20200122_1217_1-A1-B1-PAE12345_1234567a/barcode_alignment_.tsv

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is some text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is some text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is some text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is some text

src/test/resources/dummyFileSystem/nanopore-instrument-output/fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder/20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile1.fastq.gz

Whitespace-only changes.

0 commit comments

Comments
 (0)