Skip to content

Commit 9a36671

Browse files
authored
Merge pull request #38 from qbicsoftware/release/1.3
Prepare release 1.3
2 parents 0693826 + e7df522 commit 9a36671

File tree

4 files changed

+206
-25
lines changed

4 files changed

+206
-25
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## 1.3
4+
5+
* Provide metadata schema in JSON for the IMGAG dropbox
6+
* Register checksums for Oxford Nanopore datasets
7+
* Register unclassified read data for Oxford Nanopore datasets
8+
39
## 1.2
410

511
* Provide ETL routine for Oxford Nanopore NGS data
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# IMGAG dropbox
2+
3+
## Expected data structure
4+
The incoming data must be a single root folder that contains a file named `metadata`, which follows the [upload metadata schema](upload-metadata.schema.json). In addition, the folder shall contain files of type `fastq/fastq.gz` and/or `vcf/vcf.gz` and/or `GSvar/GSvar.gz`.
5+
6+
Incoming structure overview:
7+
8+
```
9+
|-QTEST001AE (top level folder name)
10+
|
11+
|- file1.fastq.gz
12+
|- file2.fastq.gz
13+
|- metadata
14+
|- ...
15+
16+
```
17+
18+
openBIS structure overview:
19+
20+
TODO: ER model.
21+
22+
## Expected metadata
23+
Metadata is expected to be written in JSON and to follow the [upload metadata schema](upload-metadata.schema.json). An example JSON entry can look like this:
24+
25+
```
26+
{
27+
"files": [
28+
"reads.1.fastq.gz",
29+
"reads.2.fastq.gz"
30+
],
31+
"type": "dna_seq",
32+
"sample1": {
33+
"genome": "GRCh37",
34+
"id_genetics": "GS000000_01",
35+
"id_qbic": "QTEST002AE",
36+
"processing_system": "Test system",
37+
"tumor": "no"
38+
}
39+
}
40+
```
41+
42+
The sample code given in `id_qbic` can refer to a sample of type `Q_TEST_SAMPLE` or `Q_BIOLOGICAL_SAMPLE`. In the latter case, a new sample of type `Q_TEST_SAMPLE` is created and attached as a child to the biological sample. The dataset will then be registered under this test sample.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{
2+
"$schema": "http://json-schema.org/draft-07/schema",
3+
"$id": "http://qbic.life/v1/upload-metadata.schema.json",
4+
"title": "Upload metadata for data registration at QBiC",
5+
"description": "A full description of mandatory and optional metadata properties that need to/can be included for data registration via QBiC dropboxes.",
6+
"type": "object",
7+
"definitions": {
8+
"qc": {
9+
"type": "object",
10+
"properties": {
11+
"qcml_id": {
12+
"type": "string",
13+
"description": "A qcml id following the qcML specification",
14+
"pattern": "^QC:[0-9]{7}$"
15+
},
16+
"name": {
17+
"type": "string",
18+
"description": "Name of the quality control",
19+
"examples": ["read count", "target region read depth", "Q20 read percentage"]
20+
},
21+
"value": {
22+
"type": "string",
23+
"description": "The actual qc value"
24+
}
25+
}
26+
},
27+
"sample": {
28+
"type": "object",
29+
"properties": {
30+
"genome": {
31+
"type": "string",
32+
"examples": ["GRCh37"]
33+
},
34+
"id_genetics": {
35+
"type": "string",
36+
"description": "A sample URI provided by the human genetics department",
37+
"examples": ["GS000000_01"]
38+
},
39+
"id_qbic": {
40+
"type": "string",
41+
"pattern": "Q\\w{4}\\d{3}[A-X][A-X0-9]",
42+
"description": "QBIC sample code of the analysed biological specimen",
43+
"examples": ["QTEST001AE"]
44+
},
45+
"processing_system": {
46+
"type": "string",
47+
"examples": ["SureSelectXT Human All Exon v5"]
48+
},
49+
"qc": {
50+
"type": "array",
51+
"items": {
52+
"allOf": [
53+
{
54+
"$ref": "#/definitions/qc"
55+
}
56+
]
57+
}
58+
},
59+
"tumor": {
60+
"type": "string",
61+
"enum": ["yes", "no"]
62+
}
63+
}
64+
}
65+
},
66+
"properties": {
67+
"files": {
68+
"type": "array",
69+
"items": { "type": "string" },
70+
"minItems": 1
71+
},
72+
"type": {
73+
"type": "string",
74+
"enum": ["dna_seq", "rna_seq", "dna_seq_somatic"]
75+
},
76+
"sample1": { "$ref": "#/definitions/sample" },
77+
"sample2": { "$ref": "#/definitions/sample" }
78+
},
79+
"required": [
80+
"files",
81+
"type",
82+
"sample1"
83+
]
84+
}

drop-boxes/register-nanopore-dropbox/register-nanopore.py

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@
2929

3030
import sample_tracking_helper_qbic as tracking_helper
3131

32-
######## imports for fastq/5 file validation
33-
#import subprocess
34-
3532
#### Setup Sample Tracking service
3633
SERVICE_CREDENTIALS = ServiceCredentials()
3734
SERVICE_CREDENTIALS.user = tracking_helper.get_service_user()
@@ -58,6 +55,7 @@
5855
# needed for pooled samples with multiple measurements
5956
usedSampleIdentifiers = set()
6057
usedExperimentIdentifiers = set()
58+
checksumMap = {}
6159

6260
def createNewSample(transaction, space, parentSampleCode):
6361
run = 0
@@ -101,8 +99,11 @@ def getTimeStamp():
10199
ts = str(now.minute)+str(now.second)+str(now.microsecond)
102100
return ts
103101

102+
# copies log files from a folder that may contain other files to another path
104103
def copyLogFilesTo(logFiles, filePath, targetFolderPath):
105104
for logFile in logFiles:
105+
sourcePath = os.path.join(filePath, logFile.getName())
106+
shutil.copy2(sourcePath, targetFolderPath)
106107
src = os.path.join(filePath, logFile.getName())
107108
shutil.copy2(src, targetFolderPath)
108109
copiedContent = os.listdir(targetFolderPath)
@@ -139,7 +140,6 @@ def createExperimentFromMeasurement(transaction, currentPath, space, project, me
139140
...
140141
]
141142
"""
142-
# 1.) Create a new experiment in openBIS
143143
runExperiment = createNewExperiment(transaction, space, project)
144144

145145
# 2.) Enrich it with metadata about the sequencing run (base caller, adapter, library kit, etc.)
@@ -154,14 +154,61 @@ def createExperimentFromMeasurement(transaction, currentPath, space, project, me
154154
runExperiment.setPropertyValue("Q_NANOPORE_HOSTNAME", measurement.getMachineHost())
155155
runExperiment.setPropertyValue("Q_DATA_GENERATION_FACILITY", origin)
156156
runExperiment.setPropertyValue("Q_MEASUREMENT_START_DATE", convertTime(measurement.getStartDate()))
157-
for sampleCode in rawDataPerSample.keySet():
158-
datamap = rawDataPerSample.get(sampleCode)
157+
if measurement.getAdapter():
158+
runExperiment.setPropertyValue("Q_SEQUENCING_ADAPTER", measurement.getAdapter())
159+
# handle measured samples
160+
unclassifiedMap = measurement.getUnclassifiedData()
161+
for barcode in rawDataPerSample.keySet():
162+
datamap = rawDataPerSample.get(barcode)
159163
newLogFolder = createLogFolder(currentPath)
160164
# 3.) Aggregate all log files into an own log folder per measurement
161165
copyLogFilesTo(measurement.getLogFiles(), currentPath, newLogFolder)
162-
createSampleWithData(transaction, space, sampleCode, datamap, runExperiment, currentPath, newLogFolder)
163-
164-
def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSample, openbisExperiment, currentPath, absLogPath):
166+
createSampleWithData(transaction, space, barcode, datamap, unclassifiedMap, runExperiment, currentPath, newLogFolder)
167+
168+
# fills the global dictionary containing all checksums for paths from the global checksum file
def fillChecksumMap(checksumFilePath):
    """Read a sha256sum-style checksum file into the global ``checksumMap``.

    Every non-empty line is expected to have the form ``<checksum> *<path>``
    (binary mode marker) or ``<checksum>  <path>`` (text mode). The path is
    stored without the leading asterisk, so it can be compared with relative
    paths later on. Blank lines are ignored.

    :param checksumFilePath: path of the checksum file to read
    :raises ValueError: if a non-empty line does not contain a checksum and a path
    """
    with open(checksumFilePath, 'r') as chf:
        for line in chf:
            entry = line.strip()
            if not entry:
                # tolerate blank lines, e.g. an empty line at the end of the file
                continue
            # split only once, so paths containing blanks (or " *") stay intact
            tokens = entry.split(None, 1)
            if len(tokens) != 2:
                raise ValueError("Malformed checksum entry in " + checksumFilePath + ": " + entry)
            checksum, path = tokens
            # remove asterisk from paths, so they can be compared later on
            if path.startswith("*"):
                path = path[1:]
            checksumMap[path] = checksum
177+
178+
# creates a file containing checksums and paths for files contained in the passed path using the global checksum dictionary
def createChecksumFileForFolder(incomingPath, folderPath):
    """Write ``<folder name>.sha256sum`` into ``folderPath`` from the global ``checksumMap``.

    The file receives one ``<checksum> *<path>`` line for every entry of
    ``checksumMap`` whose path lies inside ``folderPath``. Paths are compared
    relative to ``incomingPath``. An already existing checksum file is left
    untouched.

    :param incomingPath: base path the keys of ``checksumMap`` are relative to
    :param folderPath: folder for which the checksum file is created
    :return: path of the checksum file inside ``folderPath``
    """
    relativePath = os.path.relpath(folderPath, incomingPath)
    # require a separator after the folder name, otherwise "barcode1" would
    # also collect the entries that belong to "barcode10"
    folderPrefix = relativePath.rstrip(os.sep) + os.sep

    pathEnd = os.path.basename(os.path.normpath(folderPath))
    checksumFilePath = os.path.join(folderPath, pathEnd + '.sha256sum')
    if not os.path.isfile(checksumFilePath):
        with open(checksumFilePath, 'w') as f:
            # sorted, so the written file does not depend on dictionary order
            for key in sorted(checksumMap):
                # for each file in our dictionary that lies in the currently handled path, we add the known
                # checksum and the path, along with the asterisk that was removed while reading
                if key == relativePath or key.startswith(folderPrefix):
                    f.write(checksumMap[key] + ' *' + key + '\n')
    return checksumFilePath
192+
193+
# moves a subset of nanopore data to a new target path, needed to add fastq and fast5 subfolders to the same dataset
def prepareDataFolder(incomingPath, currentPath, destinationPath, dataObject, unclassifiedDataObject, suffix):
    """Move one data folder below ``destinationPath`` and attach unclassified data.

    A checksum file is created inside the source folder first, so it is moved
    together with the data. The folder is then renamed to
    ``<destinationPath>/<name>_<suffix>``. If ``unclassifiedDataObject`` is
    truthy, a checksum file is created for its folder as well and that folder
    is copied into an ``unclassified`` subfolder of the moved folder.

    :param incomingPath: base path used to build the relative paths in checksum files
    :param currentPath: path of the handled measurement; data object paths are resolved against its parent folder
    :param destinationPath: folder that receives the moved data folder
    :param dataObject: object providing getName() and getRelativePath() for the data folder
    :param unclassifiedDataObject: same kind of object for unclassified data, or a falsy value if there is none
    :param suffix: text appended to the folder name, separated by an underscore
    """
    folderName = dataObject.getName()
    folderRelativePath = dataObject.getRelativePath()
    parentFolder = os.path.dirname(currentPath)

    # the source path of the currently handled data object (e.g. fast5_fail folder)
    sourcePath = os.path.join(parentFolder, folderRelativePath)
    createChecksumFileForFolder(incomingPath, sourcePath)

    # destination path containing data type (fastq or fast5), as well as the parent sample code, so pooled samples can be handled
    destination = os.path.join(destinationPath, folderName + "_" + suffix)
    os.rename(sourcePath, destination)

    if not unclassifiedDataObject:
        # no unclassified data to add to the (barcoded) data folder
        return

    # the source path of the unclassified data object (e.g. unclassified fast5_fail folder)
    unclassifiedSourcePath = os.path.join(os.path.dirname(currentPath), unclassifiedDataObject.getRelativePath())
    createChecksumFileForFolder(incomingPath, unclassifiedSourcePath)
    shutil.copytree(unclassifiedSourcePath, os.path.join(destination, "unclassified"))
210+
211+
def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSample, unclassifiedDataMap, openbisExperiment, currentPath, absLogPath):
165212
""" Aggregates all measurement related files and registers them in openBIS.
166213
167214
The Map mapWithDataForSample contains all DataFolders created for one sample code:
@@ -172,6 +219,9 @@ def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSam
172219
"fastqpass": DataFolder
173220
]
174221
"""
222+
# needed to create relative path used in checksums file
223+
incomingPath = transaction.getIncoming().getAbsolutePath()
224+
175225
search_service = transaction.getSearchService()
176226
sc = SearchCriteria()
177227
sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, parentSampleCode))
@@ -185,28 +235,25 @@ def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSam
185235
# Aggregate the folders fastqfail and fastqpass under a common folder "<sample code>_fastq"
186236
topFolderFastq = os.path.join(currentPath, parentSampleCode+"_fastq")
187237
os.makedirs(topFolderFastq)
188-
folder = mapWithDataForSample.get("fastqfail")
189-
name = folder.getName()
190-
src = os.path.join(currentPath, name)
191-
os.rename(src, topFolderFastq+'/'+name)
192238

193-
folder = mapWithDataForSample.get("fastqpass")
194-
name = folder.getName()
195-
src = os.path.join(currentPath, folder.getName())
196-
os.rename(src, topFolderFastq+'/'+name)
239+
unclassifiedFastqFail = unclassifiedDataMap.get("fastqfail")
240+
unclassifiedFastqPass = unclassifiedDataMap.get("fastqpass")
241+
unclassifiedFast5Fail = unclassifiedDataMap.get("fast5fail")
242+
unclassifiedFast5Pass = unclassifiedDataMap.get("fast5pass")
243+
244+
fastqFail = mapWithDataForSample.get("fastqfail")
245+
prepareDataFolder(incomingPath, currentPath, topFolderFastq, fastqFail, unclassifiedFastqFail, "fail")
246+
fastqPass = mapWithDataForSample.get("fastqpass")
247+
prepareDataFolder(incomingPath, currentPath, topFolderFastq, fastqPass, unclassifiedFastqPass, "pass")
197248

198249
# Aggregate the folders fast5fail and fast5pass under a common folder "<sample code>_fast5"
199250
topFolderFast5 = os.path.join(currentPath, parentSampleCode+"_fast5")
200251
os.makedirs(topFolderFast5)
201-
folder = mapWithDataForSample.get("fast5pass")
202-
name = folder.getName()
203-
src = os.path.join(currentPath, folder.getName())
204-
os.rename(src, topFolderFast5+'/'+name)
205252

206-
folder = mapWithDataForSample.get("fast5fail")
207-
name = folder.getName()
208-
src = os.path.join(currentPath, folder.getName())
209-
os.rename(src, topFolderFast5+'/'+name)
253+
fast5Fail = mapWithDataForSample.get("fast5fail")
254+
prepareDataFolder(incomingPath, currentPath, topFolderFast5, fast5Fail, unclassifiedFast5Fail, "fail")
255+
fast5Pass = mapWithDataForSample.get("fast5pass")
256+
prepareDataFolder(incomingPath, currentPath, topFolderFast5, fast5Pass, unclassifiedFast5Pass, "pass")
210257

211258
fast5DataSet = transaction.createNewDataSet(NANOPORE_FAST5_CODE)
212259
fastQDataSet = transaction.createNewDataSet(NANOPORE_FASTQ_CODE)
@@ -239,6 +286,8 @@ def process(transaction):
239286
currentPath = os.path.realpath(os.path.join(incomingPath,f))
240287
if os.path.isdir(currentPath):
241288
nanoporeFolder = currentPath
289+
if currentPath.endswith('.sha256sum'):
290+
fillChecksumMap(currentPath)
242291

243292
origin = getDatahandlerMetadata(incomingPath, "source_dropbox.txt")
244293
# Use file structure parser to create structure object

0 commit comments

Comments
 (0)