Merge pull request #38 from qbicsoftware/release/1.3

sven1103 · web-flow · commit 9a3667166236 · 2020-07-16T10:35:01.000+02:00
Prepare release 1.3
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 1.3
+
+* Provide metadata schema in JSON for the IMGAG dropbox
+* Register checksums for Oxford Nanopore datasets
+* Register unclassified read data for Oxford Nanopore datasets
+
 ## 1.2
 
 * Provide ETL routine for Oxford Nanopore NGS data
diff --git a/drop-boxes/register-imgag-dropbox/README.md b/drop-boxes/register-imgag-dropbox/README.md
@@ -0,0 +1,42 @@
+# IMGAG dropbox
+
+## Expected data structure
+The data structure needs to be a root folder, containing a file `metadata` following the [upload metadata schema](upload-metadata.schema.json). In addition, the folder shall contain files of type `fastq/fastq.gz` and/or `vcf/vcf.gz` and/or `GSvar/GSvar.gz`. 
+
+Incoming structure overview:
+
+```
+|-QTEST001AE (top level folder name)
+    |
+    |- file1.fastq.gz
+    |- file2.fastq.gz
+    |- metadata
+    |- ...
+
+```
+
+openBIS structure overview:
+
+TODO: ER model.
+
+## Expected metadata
+Metadata is expected to be noted in JSON and following the [upload metadata schema](upload-metadata.schema.json). An example JSON entry can look like this:
+
+```
+{
+    "files": [
+        "reads.1.fastq.gz",
+        "reads.2.fastq.gz"
+    ],
+    "type": "dna_seq",
+    "sample1": {
+        "genome": "GRCh37",
+        "id_genetics": "GS000000_01",
+        "id_qbic": "QTEST002AE",
+        "processing_system": "Test system",
+        "tumor": "no"
+    }
+}
+```
+
+The sample code for `id_qbic` can be of type `Q_TEST_SAMPLE` or `Q_BIOLOGICAL_SAMPLE`. In the latter case, a new sample of type `Q_TEST_SAMPLE` is created and attached as child to the biological sample. The data-set will be registered under this test sample then.
diff --git a/drop-boxes/register-imgag-dropbox/upload-metadata.schema.json b/drop-boxes/register-imgag-dropbox/upload-metadata.schema.json
@@ -0,0 +1,84 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema",
+    "$id": "http://qbic.life/v1/upload-metadata.schema.json",
+    "title": "Upload metadata for data registration at QBiC",
+    "description": "A full description of mandatory and optional metadata properties that need to/can be included for data registration via QBiC dropboxes.",
+    "type": "object",
+    "definitions": {
+        "qc": {
+            "type": "object",
+            "properties": {
+                "qcml_id": {
+                    "type": "string",
+                    "description": "A qcml id following the qzml specification",
+                    "pattern": "^QC:[0-9]{7}$"
+                }, 
+                "name": {
+                    "type": "string",
+                    "description": "Name of the quality control",
+                    "examples": ["read count", "target region read depth", "Q20 read percentage"]
+                },
+                "value": {
+                    "type": "string",
+                    "description": "The actual qc value"
+                }
+            }
+        },
+        "sample": {
+            "type": "object",
+            "properties": {
+                "genome": {
+                    "type": "string",
+                    "examples": ["GRCh37"]
+                },
+                "id_genetics": {
+                    "type": "string",
+                    "description": "A sample URI provided by the human genetics department",
+                    "examples": ["GS000000_01"]
+                },
+                "id_qbic": {
+                    "type": "string",
+                    "pattern": "Q\\w{4}\\d{3}[A-X][A-X0-9]",
+                    "description": "QBIC sample code of the analysed biological specimen",
+                    "examples": ["QTEST001AE"]
+                },
+                "processing_system": {
+                    "type": "string",
+                    "examples": ["SureSelectXT Human All Exon v5"]
+                },
+                "qc": {
+                    "type": "array",
+                    "items": {
+                        "allOf": [
+                            {
+                                "$ref": "#/definitions/qc"
+                            }
+                        ]
+                    }
+                }, 
+                "tumor": {
+                    "type": "string",
+                    "enum": ["yes", "no"]
+                }
+            }
+        }
+    },
+    "properties": {
+        "files": {
+            "type": "array",
+            "items": { "type": "string" },
+            "minItems": 1
+        },
+        "type": {
+            "type": "string",
+            "enum": ["dna_seq", "rna_seq", "dna_seq_somatic"]
+        },
+        "sample1": { "$ref": "#/definitions/sample" }, 
+        "sample2": { "$ref": "#/definitions/sample" }
+    },
+    "required": [
+        "files", 
+        "type",
+        "sample1"
+    ] 
+}
diff --git a/drop-boxes/register-nanopore-dropbox/register-nanopore.py b/drop-boxes/register-nanopore-dropbox/register-nanopore.py
@@ -29,9 +29,6 @@
 
 import sample_tracking_helper_qbic as tracking_helper
 
-######## imports for fastq/5 file validation
-#import subprocess
-
 #### Setup Sample Tracking service
 SERVICE_CREDENTIALS = ServiceCredentials()
 SERVICE_CREDENTIALS.user = tracking_helper.get_service_user()
@@ -58,6 +55,7 @@
 # needed for pooled samples with multiple measurements
 usedSampleIdentifiers = set()
 usedExperimentIdentifiers = set()
+checksumMap = {}
 
 def createNewSample(transaction, space, parentSampleCode):
     run = 0
@@ -101,8 +99,11 @@ def getTimeStamp():
     ts = str(now.minute)+str(now.second)+str(now.microsecond)
     return ts
 
+# copies log files from a folder that may contain other files to another path
 def copyLogFilesTo(logFiles, filePath, targetFolderPath):
     for logFile in logFiles:
+        sourcePath = os.path.join(filePath, logFile.getName())
+        shutil.copy2(sourcePath, targetFolderPath)
         src = os.path.join(filePath, logFile.getName())
         shutil.copy2(src, targetFolderPath)
     copiedContent = os.listdir(targetFolderPath)
@@ -139,7 +140,6 @@ def createExperimentFromMeasurement(transaction, currentPath, space, project, me
          ...
     ]
     """
-    # 1.) Create a new experiment in openBIS
     runExperiment = createNewExperiment(transaction, space, project)
 
     # 2.) Enrich it with metadata about the sequencing run (base caller, adapter, library kit, etc.)
@@ -154,14 +154,61 @@ def createExperimentFromMeasurement(transaction, currentPath, space, project, me
     runExperiment.setPropertyValue("Q_NANOPORE_HOSTNAME", measurement.getMachineHost())
     runExperiment.setPropertyValue("Q_DATA_GENERATION_FACILITY", origin)
     runExperiment.setPropertyValue("Q_MEASUREMENT_START_DATE", convertTime(measurement.getStartDate()))
-    for sampleCode in rawDataPerSample.keySet():
-        datamap = rawDataPerSample.get(sampleCode)
+    if measurement.getAdapter():
+        runExperiment.setPropertyValue("Q_SEQUENCING_ADAPTER", measurement.getAdapter())
+    # handle measured samples
+    unclassifiedMap = measurement.getUnclassifiedData()
+    for barcode in rawDataPerSample.keySet():
+        datamap = rawDataPerSample.get(barcode)
         newLogFolder = createLogFolder(currentPath)
         # 3.) Aggregate all log files into an own log folder per measurement
         copyLogFilesTo(measurement.getLogFiles(), currentPath, newLogFolder)
-        createSampleWithData(transaction, space, sampleCode, datamap, runExperiment, currentPath, newLogFolder)
-
-def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSample, openbisExperiment, currentPath, absLogPath):
+        createSampleWithData(transaction, space, barcode, datamap, unclassifiedMap, runExperiment, currentPath, newLogFolder)
+
+# fills the global dictionary containing all checksums for paths from the global checksum file
+def fillChecksumMap(checksumFilePath):
+    with open(checksumFilePath, 'r') as chf:
+        for line in chf:
+            # remove asterisk from paths, so they can be compared later on
+            tokens = line.strip().split(" *")
+            path = tokens[1]
+            checksum = tokens[0]
+            checksumMap[path] = checksum
+
+# creates a file containing checksums and paths for files contained in the passed path using the global checksum dictionary
+def createChecksumFileForFolder(incomingPath, folderPath):
+
+    relativePath = os.path.relpath(folderPath, incomingPath)
+
+    pathEnd = os.path.basename(os.path.normpath(folderPath))
+    checksumFilePath = os.path.join(folderPath, pathEnd+'.sha256sum')
+    if not os.path.isfile(checksumFilePath):
+        with open(checksumFilePath, 'w') as f:
+            for key, value in checksumMap.items():
+                # for each file in our dictionary that starts with the currently handled path, we add the known checksums and the paths, along with the asterisk we removed earlier
+                if key.startswith(relativePath):
+                    f.write(value+' *'+key+'\n')
+    return checksumFilePath
+
+# moves a subset of nanopore data to a new target path, needed to add fastq and fast5 subfolders to the same dataset
+def prepareDataFolder(incomingPath, currentPath, destinationPath, dataObject, unclassifiedDataObject, suffix):
+    name = dataObject.getName()
+    relativePath = dataObject.getRelativePath()
+    # the source path of the currently handled data object (e.g. fast5_fail folder)
+    sourcePath = os.path.join(os.path.dirname(currentPath), relativePath)
+    checksumFile = createChecksumFileForFolder(incomingPath, sourcePath)
+    # destination path containing data type (fastq or fast5), as well as the parent sample code, so pooled samples can be handled
+    destination = os.path.join(destinationPath, name + "_" + suffix)
+    os.rename(sourcePath, destination)
+    # if unclassified data exists, create relevant checksums and add them with the data to the expected (barcoded) data folder
+    if unclassifiedDataObject:
+        relativePath = unclassifiedDataObject.getRelativePath()
+        # the source path of the currently handled data object (e.g. unclassified fast5_fail folder)
+        unclassifiedSourcePath = os.path.join(os.path.dirname(currentPath), relativePath)
+        unclassifiedChecksumFile = createChecksumFileForFolder(incomingPath, unclassifiedSourcePath)
+        shutil.copytree(unclassifiedSourcePath, os.path.join(destination,"unclassified"))
+
+def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSample, unclassifiedDataMap, openbisExperiment, currentPath, absLogPath):
     """ Aggregates all measurement related files and registers them in openBIS.
     
     The Map mapWithDataForSample contains all DataFolders created for one sample code:
@@ -172,6 +219,9 @@ def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSam
         "fastqpass": DataFolder
      ]   
     """
+    # needed to create relative path used in checksums file
+    incomingPath = transaction.getIncoming().getAbsolutePath()
+
     search_service = transaction.getSearchService()
     sc = SearchCriteria()
     sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, parentSampleCode))
@@ -185,28 +235,25 @@ def createSampleWithData(transaction, space, parentSampleCode, mapWithDataForSam
     # Aggregate the folders fastqfail and fastqpass under a common folder "<sample code>_fastq"
     topFolderFastq = os.path.join(currentPath, parentSampleCode+"_fastq")
     os.makedirs(topFolderFastq)
-    folder = mapWithDataForSample.get("fastqfail")
-    name = folder.getName()
-    src = os.path.join(currentPath, name)
-    os.rename(src, topFolderFastq+'/'+name)
 
-    folder = mapWithDataForSample.get("fastqpass")
-    name = folder.getName()
-    src = os.path.join(currentPath, folder.getName())
-    os.rename(src, topFolderFastq+'/'+name)
+    unclassifiedFastqFail = unclassifiedDataMap.get("fastqfail")
+    unclassifiedFastqPass = unclassifiedDataMap.get("fastqpass")
+    unclassifiedFast5Fail = unclassifiedDataMap.get("fast5fail")
+    unclassifiedFast5Pass = unclassifiedDataMap.get("fast5pass")
+
+    fastqFail = mapWithDataForSample.get("fastqfail")
+    prepareDataFolder(incomingPath, currentPath, topFolderFastq, fastqFail, unclassifiedFastqFail, "fail")
+    fastqPass = mapWithDataForSample.get("fastqpass")
+    prepareDataFolder(incomingPath, currentPath, topFolderFastq, fastqPass, unclassifiedFastqPass, "pass")
 
     # Aggregate the folders fast5fail and fast5pass under a common folder "<sample code>_fast5"
     topFolderFast5 = os.path.join(currentPath, parentSampleCode+"_fast5")
     os.makedirs(topFolderFast5)
-    folder = mapWithDataForSample.get("fast5pass")
-    name = folder.getName()
-    src = os.path.join(currentPath, folder.getName())
-    os.rename(src, topFolderFast5+'/'+name)
 
-    folder = mapWithDataForSample.get("fast5fail")
-    name = folder.getName()
-    src = os.path.join(currentPath, folder.getName())
-    os.rename(src, topFolderFast5+'/'+name)
+    fast5Fail = mapWithDataForSample.get("fast5fail")
+    prepareDataFolder(incomingPath, currentPath, topFolderFast5, fast5Fail, unclassifiedFast5Fail, "fail")
+    fast5Pass = mapWithDataForSample.get("fast5pass")
+    prepareDataFolder(incomingPath, currentPath, topFolderFast5, fast5Pass, unclassifiedFast5Pass, "pass")
 
     fast5DataSet = transaction.createNewDataSet(NANOPORE_FAST5_CODE)
     fastQDataSet = transaction.createNewDataSet(NANOPORE_FASTQ_CODE)
@@ -239,6 +286,8 @@ def process(transaction):
         currentPath = os.path.realpath(os.path.join(incomingPath,f))
         if os.path.isdir(currentPath):
             nanoporeFolder = currentPath
+        if currentPath.endswith('.sha256sum'):
+            fillChecksumMap(currentPath)
 
     origin = getDatahandlerMetadata(incomingPath, "source_dropbox.txt")
     # Use file structure parser to create structure object