Skip to content

Commit e1cea11

Browse files
authored
Release 1.10.0
Release 1.10.0
2 parents 2e27646 + 50c8590 commit e1cea11

File tree

5 files changed

+836
-12
lines changed

5 files changed

+836
-12
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# Changelog
22

3+
# 1.10.0 2021-07-26
4+
5+
* Provides a new ETL routine for MTB project data that is not supposed to be stored in QUK17 [(#89)](https://github.com/qbicsoftware/etl-scripts/pull/89)
6+
* Allow multiple sequencing lanes for MTB data
7+
38
# 1.9.0 2021-06-28
49

510
* Provides a new ETL routine written in Java that will replace all Jython scripts at some point [(#85)](https://github.com/qbicsoftware/etl-scripts/pull/85)

drop-boxes/register-mtb-data-dropbox/register-mtb-data-dropbox.py

+25-12
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@
108108

109109
# Regex matching the RNAseq sample file naming specification
110110
RNASEQ_REG = re.compile(r'.*tumor_rna.[1,2]{1}.fastq.gz')
111+
# Update from 2021-07-06: Support lanes, indicated by a three-digit number
112+
RNASEQ_REG_LANES = re.compile(r'.*tumor_rna_[0-9]{3}.[1,2]{1}.fastq.gz')
111113

112114
# Path to the openBIS properties file
113115
PROPERTIES = '/etc/openbis.properties'
@@ -214,7 +216,7 @@ def process(transaction):
214216
for in_file in file_list:
215217
if in_file.endswith('origlabfilename') or in_file.endswith('sha256sum') or 'source_dropbox.txt' in in_file:
216218
continue
217-
if RNASEQ_REG.findall(in_file):
219+
if RNASEQ_REG.findall(in_file) or RNASEQ_REG_LANES.findall(in_file):
218220
rna_seq_files.append(in_file)
219221
elif 'fastq' in in_file:
220222
if 'normal' in in_file:
@@ -253,8 +255,10 @@ def execute_vcf_registration(vcf_files, transaction):
253255

254256

255257
def execute_fastq_registration(fastqs_normal, fastqs_tumor, transaction):
256-
if len(fastqs_tumor) != 2 or len(fastqs_normal) != 2:
258+
if len(fastqs_tumor) < 2 or len(fastqs_normal) < 2:
257259
raise mtbutils.MTBdropboxerror("Tumor/normal fastq dataset was not complete. Please check.")
260+
elif len(fastqs_tumor) != len(fastqs_normal):
261+
raise mtbutils.MTBdropboxerror("Tumor/normal fastq dataset dont have the same number of files. Are all lanes provided?")
258262
else:
259263
proc_fastq(fastqs_tumor, transaction)
260264
proc_fastq(fastqs_normal, transaction)
@@ -271,14 +275,19 @@ def get_last_exp_id(experiments):
271275
return exp_ids[-1]
272276

273277

274-
def getNextFreeBarcode(projectcode, numberOfBarcodes):
278+
def getNextFreeBarcode(projectcode, numberOfBarcodes, transaction, space):
275279
letters = string.ascii_uppercase
276-
numberOfBarcodes += 1
277-
278-
currentLetter = letters[numberOfBarcodes / 999]
279-
currentNumber = numberOfBarcodes % 999
280-
code = projectcode + str(currentNumber).zfill(3) + currentLetter
281-
return code + checksum.checksum(code)
280+
sampleExists = True
281+
newSampleCode = None
282+
while sampleExists:
283+
numberOfBarcodes += 1
284+
currentLetter = letters[numberOfBarcodes / 999]
285+
currentNumber = numberOfBarcodes % 999
286+
code = projectcode + str(currentNumber).zfill(3) + currentLetter
287+
newSampleCode = code + checksum.checksum(code)
288+
sampleExists = transaction.getSampleForUpdate(
289+
"/{space}/{sample}".format(space=space, sample=newSampleCode))
290+
return newSampleCode
282291

283292

284293
def register_rnaseq(rna_seq_files, transaction):
@@ -296,7 +305,8 @@ def register_rnaseq(rna_seq_files, transaction):
296305
the reason for the failure.
297306
"""
298307
print(mtbutils.log_stardate('Registering incoming MTB RNAseq data {}'.format(rna_seq_files)))
299-
assert len(rna_seq_files) == 2
308+
# Check if dataset files are paired end and complete
309+
assert len(rna_seq_files) % 2 == 0
300310
file1 = os.path.basename(rna_seq_files[0])
301311
file2 = os.path.basename(rna_seq_files[1])
302312
assert len(set(QCODE_REG.findall(file1))) == 1
@@ -315,7 +325,10 @@ def register_rnaseq(rna_seq_files, transaction):
315325
sc.addSubCriteria(SearchSubCriteria.createExperimentCriteria(pc))
316326
result = search_service.searchForSamples(sc)
317327
print("Found {} samples for project {} in space {}.".format(len(result), project, space))
318-
new_rna_sample_barcode = getNextFreeBarcode(project, numberOfBarcodes=len(result))
328+
new_rna_sample_barcode = getNextFreeBarcode(project,
329+
numberOfBarcodes=len(result),
330+
transaction=transaction,
331+
space=space)
319332

320333
# Now get the parent sample id (tumor sample, type: BIOLOGICAL_SAMPLE)
321334
tumor_dna_sample = getsample(dna_barcode, transaction)
@@ -448,7 +461,7 @@ def proc_fastq(fastq_file, transaction):
448461
"""Register fastq as dataset in openBIS"""
449462

450463
# Check, if there are file pairs present (paired-end data!)
451-
if len(fastq_file) != 2:
464+
if len(fastq_file) % 2 != 0:
452465
raise mtbutils.MTBdropboxerror('Expecting paired end reads files, found only {}'
453466
.format(len(fastq_file)))
454467
qbiccode_f1 = QCODE_REG.findall(os.path.basename(fastq_file[0]))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
""" A utility class, holding helper functions
2+
for the main dropbox """
3+
4+
import datetime
5+
import subprocess as sp
6+
7+
MTB_CONVERTER_PATH = '/home/qeana10/bin/miniconda/bin/mtbconverter'
8+
9+
def mtbconverter(cmds):
10+
"""Runs the mtbconverter executable with the given command-line arguments and returns its exit code"""
11+
command = [MTB_CONVERTER_PATH] + cmds
12+
ret_code = sp.call(command)
13+
return ret_code
14+
15+
def log_stardate(msg):
16+
"""Returns the message prefixed with the current ISO-formatted timestamp and the 'mtbconverter' tag"""
17+
stardate = datetime.datetime.now()
18+
return '{} [{}]: {}'.format(stardate.isoformat(), 'mtbconverter', msg)
19+
20+
class MTBdropboxerror(Exception):
21+
"""A generic Exception class for this dropbox."""
22+
23+
class Counter():
24+
25+
def __init__(self):
26+
self.counter = 1
27+
28+
def newId(self):
29+
self.counter += 1
30+
return self.counter - 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#
2+
# Drop box for registering a fastq file as a data set
3+
#
4+
# Variables:
5+
# incoming-root-dir
6+
# Path to the directory which contains incoming directories for drop boxes.
7+
incoming-dir = ${incoming-root-dir}/QBiC-register-mtb-projects-data
8+
incoming-data-completeness-condition = marker-file
9+
top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2
10+
script-path = register-mtb-projects-dropbox.py
11+
storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor

0 commit comments

Comments
 (0)