From 50e44d735fdf1bfb8120c284c574e70d74a48689 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Wed, 22 Apr 2026 15:04:23 +0200 Subject: [PATCH 01/39] fgumi module added --- MONSDA/Workflows.py | 45 +++++++++--- configs/template.json | 10 ++- configs/template_base_commented.json | 15 +++- configs/template_clean.json | 10 ++- containers/apptainer/fgumi.def | 23 ++++++ envs/fgumi.yaml | 9 +++ workflows/fgumi.nf | 104 +++++++++++++++++++++++++++ workflows/fgumi.smk | 76 ++++++++++++++++++++ workflows/fgumi_dedup.nf | 54 ++++++++++++++ workflows/fgumi_dedup.smk | 15 ++++ 10 files changed, 349 insertions(+), 12 deletions(-) create mode 100644 containers/apptainer/fgumi.def create mode 100644 envs/fgumi.yaml create mode 100644 workflows/fgumi.nf create mode 100644 workflows/fgumi.smk create mode 100644 workflows/fgumi_dedup.nf create mode 100644 workflows/fgumi_dedup.smk diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 88aaf92a..709b95e1 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -901,7 +901,9 @@ def make_sub( and "MAPPING" not in works and toolenv != "rustqc" ): - if "DEDUP" in works and "umitools" in envs: + if "DEDUP" in works and any( + x in envs for x in ["umitools", "fgumi"] + ): subname = toolenv + "_dedup_trim.smk" else: subname = toolenv + "_trim.smk" @@ -912,16 +914,18 @@ def make_sub( and "MAPPING" not in works and toolenv != "rustqc" ): - if "DEDUP" in subworkflows and "umitools" in envs: + if "DEDUP" in subworkflows and any( + x in envs for x in ["umitools", "fgumi"] + ): subname = toolenv + "_dedup.smk" else: subname = toolenv + "_raw.smk" - # Picard tools can be extended here + # Dedup tools can be extended here if works[j] == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.smk" subconf.pop("PREDEDUP", None) - elif works[j] == "DEDUP" and toolenv == "umitools": + elif works[j] == "DEDUP" and toolenv in ["umitools", "fgumi"]: subconf["PREDEDUP"] = "enabled" smkf = os.path.abspath(os.path.join(workflowpath, subname)) @@ -1090,11 +1094,11 @@ def make_sub( else: subname = toolenv + "_trim.smk" - # Picard tools can be extended here + # Dedup tools can be extended here if subwork == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.smk" subconf.pop("PREDEDUP", None) - elif works[j] == "DEDUP" and toolenv == "umitools": + elif subwork == "DEDUP" and toolenv in ["umitools", "fgumi"]: subconf["PREDEDUP"] = "enabled" # Add rulethemall based on chosen workflows add.append( @@ -2913,7 +2917,7 @@ def nf_make_sub( else: if "DEDUP" in subworkflows: flowlist.append("QC_RAW") - if toolenv == "umitools": + if toolenv in ["umitools", "fgumi"]: flowlist.append("DEDUPEXTRACT") if "MAPPING" in works: flowlist.append("QC_MAPPING") @@ -2928,7 +2932,7 @@ def nf_make_sub( flowlist.append("TRIMMING") if works[j] == "DEDUP": - if toolenv == "umitools": + if toolenv in ["umitools", "fgumi"]: flowlist.append("PREDEDUP") subconf["PREDEDUP"] = "enabled" if "QC" in flowlist: @@ -3284,9 +3288,32 @@ def nf_make_sub( subname = toolenv + ".nf" - # Picard tools can be extended here + # Dedup tools can be extended here if subwork == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.nf" + elif subwork == "DEDUP" and toolenv in ["umitools", "fgumi"]: + flowlist.append("PREDEDUP") + subconf["PREDEDUP"] = "enabled" + if "QC" in flowlist: + flowlist.append("QC_DEDUP") + subname = toolenv + ".nf" + nfi = os.path.abspath(os.path.join(workflowpath, subname)) + with open(nfi, "r") as nf: + for line in mu.comment_remover(nf.readlines()): + line = re.sub(condapath, 'conda "' + envpath, line) + if "include {" in line: + line = fixinclude( + line, + loglevel, + condapath, + envpath, + workflowpath, + logfix, + "nfmode", + ) + subjobs.append(line) + subjobs.append("\n\n") + subname = toolenv + "_dedup.nf" nfi = os.path.abspath(os.path.join(workflowpath, subname)) with open(nfi, "r") as nf: diff --git a/configs/template.json b/configs/template.json index 9ab0bb95..374e77b3 100644 --- a/configs/template.json +++ b/configs/template.json @@ -120,7 +120,8 @@ "DEDUP": { #options for deduplication for each sample/condition "TOOLS": { "umitools": "umi_tools", - "picard": "picard" + "picard": "picard", + "fgumi": "fgumi" }, "id": { "condition": { @@ -141,6 +142,13 @@ "JAVA" : "",# options "DEDUP": "" # dedup options } + }, + "fgumi":{ + "OPTIONS": + { + "EXTRACT": "", # fgumi extract options, e.g. --read-structures 8M+T +T + "DEDUP": "" # fgumi dedup options + } } } } diff --git a/configs/template_base_commented.json b/configs/template_base_commented.json index 8b2c1388..7a47a674 100644 --- a/configs/template_base_commented.json +++ b/configs/template_base_commented.json @@ -123,7 +123,8 @@ }, "DEDUP": { "TOOLS": { - "umitools": "umi_tools" + "umitools": "umi_tools", + "fgumi": "fgumi" }, "ENV" : "", "BIN" : "", @@ -150,6 +151,18 @@ "JAVA" : "", "DEDUP": "" } + }, + "fgumi": { + "comment": + { + "EXTRACT": "fgumi extract options, e.g. --read-structures 8M+T +T", + "DEDUP": "fgumi dedup options" + }, + "OPTIONS": + { + "EXTRACT": "", + "DEDUP": "" + } } }, "MAPPING": { diff --git a/configs/template_clean.json b/configs/template_clean.json index 825e7335..a335c204 100644 --- a/configs/template_clean.json +++ b/configs/template_clean.json @@ -110,7 +110,8 @@ }, "DEDUP": { "TOOLS": { - "umitools": "umi_tools" + "umitools": "umi_tools", + "fgumi": "fgumi" }, "id": { "condition": { @@ -128,6 +129,13 @@ "JAVA" : "", "DEDUP": "" } + }, + "fgumi": { + "OPTIONS": + { + "EXTRACT": "", + "DEDUP": "" + } } } } diff --git a/containers/apptainer/fgumi.def b/containers/apptainer/fgumi.def new file mode 100644 index 00000000..4d305696 --- /dev/null +++ b/containers/apptainer/fgumi.def @@ -0,0 +1,23 @@ +Bootstrap: docker +From: continuumio/miniconda3 + +%files + /home/fall/MONSDA/envs/fgumi.yaml /opt/envs/ + ${HOME}/MONSDA/scripts /opt/MONSDA/ + +%environment + +%post + ls -alrt /opt/envs + chmod -R +x /opt/envs/fgumi.yaml + + ENV_NAME=fgumi + echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT + echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT + + . /opt/conda/etc/profile.d/conda.sh + conda env create -f /opt/envs/fgumi.yaml -p /opt/conda/envs/$ENV_NAME + conda clean --all + +%runscript + exec "$@" diff --git a/envs/fgumi.yaml b/envs/fgumi.yaml new file mode 100644 index 00000000..9bffb96d --- /dev/null +++ b/envs/fgumi.yaml @@ -0,0 +1,9 @@ +name: fgumi +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - samtools =1.21 + - fgumi =0.1.3 + - dateutils =0.6.12 diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf new file mode 100644 index 00000000..100de0f8 --- /dev/null +++ b/workflows/fgumi.nf @@ -0,0 +1,104 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +EXTRACTPARAMS = get_always('fgumi_params_EXTRACT') ?: '' +DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: '' + +process extract_fq{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz" + else if (filename.indexOf("_fgumi_extract.bam") > 0) "TMP/FGEX/${COMBO}/${CONDITION}/${filename}" + else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log" + else null + } + + input: + path samples + + output: + path "*_dedup.fastq.gz", emit: extract + path "*_fgumi_extract.bam", emit: ubam + path "ex.log", emit: logs + + script: + if (PAIRED == 'paired'){ + r1 = samples[0] + r2 = samples[1] + sn = samples[0].getSimpleName().replace("_R1","") + ubam = sn+"_fgumi_extract.bam" + outf = samples[0].getSimpleName()+"_dedup.fastq.gz" + outf2 = samples[1].getSimpleName()+"_dedup.fastq.gz" + """ + mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 $r2 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n -1 $outf -2 $outf2 -0 /dev/null -s /dev/null $ubam >> ex.log 2>&1 + """ + } + else{ + r1 = samples[0] + sn = samples[0].getSimpleName().replace(".fastq.gz","") + ubam = sn+"_fgumi_extract.bam" + outf = samples[0].getSimpleName()+"_dedup.fastq.gz" + """ + mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n $ubam | gzip -c > $outf && echo done >> ex.log + """ + } +} + +workflow DEDUPEXTRACT{ + take: + collection + + main: + //SAMPLE CHANNELS + if ( PREDEDUP == 'enabled' ){ + if (PAIRED == 'paired'){ + extract_fq(samples_ch.collate( 2 )) + } else{ + extract_fq(samples_ch.collate( 1 )) + } + }else{ + if (PAIRED == 'paired'){ + extract_fq(collection.collate( 2 )) + } else{ + extract_fq(collection.collate( 1 )) + } + } + + emit: + extract = extract_fq.out.extract + ubam = extract_fq.out.ubam + logs = extract_fq.out.logs +} + +process dedup_bam{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + + publishDir "${workflow.workDir}/../MAPPED/${COMBO}/${CONDITION}" , mode: 'link' + + input: + path mapped_bam + path ubam + + output: + path "${mapped_bam.baseName}_dedup.bam", emit: dedup_bam + path "${mapped_bam.baseName}_dedup.bam.bai", emit: dedup_idx + path "dedup.log", emit: logs + + script: + """ + mkdir -p tmp + $DEDUPBIN zipper --unmapped $ubam --aligned $mapped_bam --output tmp/zippered.bam > dedup.log 2>&1 + $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 + $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 + samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 + rm $ubam + """ +} diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk new file mode 100644 index 00000000..9a5ef72e --- /dev/null +++ b/workflows/fgumi.smk @@ -0,0 +1,76 @@ +DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP') + +wildcard_constraints: + type = "sorted|sorted_unique" + +if paired == 'paired': + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz", + o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz", + ubam = temp("TMP/FGEX/{combo}/{file}_extracted.bam"), + td = temp(directory("TMP/FGEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = DEDUPBIN, + sname = lambda wildcards: os.path.basename(wildcards.file) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} {input.r2} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n -1 {output.o1} -2 {output.o2} -0 /dev/null -s /dev/null {output.ubam} >> {log} 2>&1" +else: + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz", + ubam = temp("TMP/FGEX/{combo}/{file}_extracted.bam"), + td = temp(directory("TMP/FGEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = DEDUPBIN, + sname = lambda wildcards: os.path.basename(wildcards.file) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n {output.ubam} | gzip -c > {output.o1} && echo done >> {log}" + +if paired == 'paired': + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + shell: """mkdir -p {output.td} +{params.dedup} zipper --unmapped {input.ubam} --aligned {input.bam} --output {output.td}/zippered.bam > {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 +samtools index {output.bam} >> {log} 2>&1 +rm {input.ubam}""" +else: + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + shell: """mkdir -p {output.td} +{params.dedup} zipper --unmapped {input.ubam} --aligned {input.bam} --output {output.td}/zippered.bam > {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 +samtools index {output.bam} >> {log} 2>&1 +rm {input.ubam}""" diff --git a/workflows/fgumi_dedup.nf b/workflows/fgumi_dedup.nf new file mode 100644 index 00000000..05ff5ed1 --- /dev/null +++ b/workflows/fgumi_dedup.nf @@ -0,0 +1,54 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: '' + +process dedup_bam{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.endsWith("_dedup.bam")) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("_dedup.bam.bai") > 0) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("dedup.log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/${file(filename).getName()}" + else null + } + + input: + path todedup + path bami + + output: + path "*_dedup.bam", emit: bam + path "*_dedup.bam.bai", emit: bai + path "*_dedup.log", emit: logs + + script: + bams = todedup[0] + bais = todedup[1] + outf = bams.getSimpleName()+"_dedup.bam" + outl = bams.getSimpleName()+"_dedup.log" + """ + mkdir -p TMP && $DEDUPBIN dedup $DEDUPPARAMS --input $bams --output $outf &> $outl && samtools index $outf &>> $outl + """ +} + +workflow DEDUPBAM{ + take: + map + mapi + mapu + mapui + + main: + dedup_bam(map.concat(mapu), mapi.concat(mapui)) + + emit: + dedup = dedup_bam.out.bam + dedupbai = dedup_bam.out.bai + deduplog = dedup_bam.out.logs +} diff --git a/workflows/fgumi_dedup.smk b/workflows/fgumi_dedup.smk new file mode 100644 index 00000000..49d54982 --- /dev/null +++ b/workflows/fgumi_dedup.smk @@ -0,0 +1,15 @@ +DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP') + +rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + shell: "mkdir -p {output.td} && {params.dedup} dedup {params.dpara} --input {input.bam} --output {output.bam} 2> {log} && samtools index {output.bam} 2>> {log}" From 410496496cc1cb1f9a075ea187638bcc85e8ab47 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Wed, 22 Apr 2026 15:06:44 +0200 Subject: [PATCH 02/39] fgumi module update --- workflows/fgumi.nf | 1 - workflows/fgumi.smk | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 100de0f8..227c386a 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -13,7 +13,6 @@ process extract_fq{ publishDir "${workflow.workDir}/../" , mode: 'link', saveAs: {filename -> if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz" - else if (filename.indexOf("_fgumi_extract.bam") > 0) "TMP/FGEX/${COMBO}/${CONDITION}/${filename}" else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log" else null } diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 9a5ef72e..60a197e2 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -9,7 +9,7 @@ if paired == 'paired': r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz", o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz", - ubam = temp("TMP/FGEX/{combo}/{file}_extracted.bam"), + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam", td = temp(directory("TMP/FGEX/{combo}/{file}")) log: "LOGS/{combo}/{file}_dedup_extract.log" conda: ""+DEDUPENV+".yaml" @@ -23,7 +23,7 @@ else: rule extract: input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz", - ubam = temp("TMP/FGEX/{combo}/{file}_extracted.bam"), + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam", td = temp(directory("TMP/FGEX/{combo}/{file}")) log: "LOGS/{combo}/{file}_dedup_extract.log" conda: ""+DEDUPENV+".yaml" From 38213d0dc21f6b8f8b8f9bb26b7e11b56a3e57ba Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 11:01:38 +0200 Subject: [PATCH 03/39] tests and documentation --- README.md | 7 ++++ docs/source/config.rst | 5 +++ docs/source/tutorial.rst | 18 ++++----- docs/source/workflows.rst | 8 ++++ tests/data/README_fgumi_test_data.md | 27 +++++++++++++ tests/data/config_fgumi_test.json | 47 ++++++++++++++++++++++ tests/data/make_fgumi_umi_fixture.py | 60 ++++++++++++++++++++++++++++ tests/test_fgumi_smoke.sh | 22 ++++++++++ 8 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 tests/data/README_fgumi_test_data.md create mode 100644 tests/data/config_fgumi_test.json create mode 100644 tests/data/make_fgumi_umi_fixture.py create mode 100644 tests/test_fgumi_smoke.sh diff --git a/README.md b/README.md index b3dfedfc..a442b81c 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,13 @@ pip install MONSDA More information can be found in the official [documentation](https://monsda.readthedocs.io/en/latest/?badge=latest) +## Notes on newly available tools + +- **rustqc** is available as an alternative QC backend and is designed for fast, integrated RNA-seq QC reporting with MultiQC-compatible outputs. +- **fgumi** is available as an additional UMI-aware deduplication backend in the `DEDUP` step. + +Both tools are available through the shipped conda environments in `envs/` and can be selected through the regular config `TOOLS` sections. + ## How does it work diff --git a/docs/source/config.rst b/docs/source/config.rst index 022fe09d..6be4d1ac 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -34,6 +34,11 @@ You can always define differing ENV/BIN keys for each condition-tree leaf separa The next key-level is the *OPTIONS* key which is where you can define additional parameters for each tool. It is not needed to define anything related to *single-/paired-* end or *singlecell* sequencing, this is done automatically. To add parameters simply add the *OPTION* key which defines a dict where you can set parameters for each defined subworkflow-step. Parameters are here defined as key/value pairs corresponding to the subworkflow-step, e.g. 'INDEX' to generate an index file for mapping and all settings similar to a command line call as values. This should become clear having a look at the different processing steps in the template json. If there are no options just leave the 'OPTIONS' dict empty. +For newly available tools: + +- **QC/rustqc** can be selected with ``"rustqc": "rustqc"`` in the ``QC -> TOOLS`` section and configured via the regular ``OPTIONS`` keys (e.g. ``QC`` and ``MULTI`` entries in tutorial configs). +- **DEDUP/fgumi** can be selected with ``"fgumi": "fgumi"`` in the ``DEDUP -> TOOLS`` section and supports dedicated options under ``OPTIONS`` using ``EXTRACT`` and ``DEDUP`` keys. + .. literalinclude:: ../../configs/template.json :language: json diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 2d1afe53..ebd55264 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -57,10 +57,10 @@ This slightly more complex use case involves multiple input files, two condition Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard The more complex config for this analysis follows @@ -74,7 +74,7 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_toolmix.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc*/*rustqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls. A successful run will show the message 'Workflow finished, no error' at the end. @@ -86,10 +86,10 @@ This postprocessing use case involves multiple input files, two conditions (WT/K Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard - DE: Differential Expression analysis with EdgeR and DESeq2 - DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime) - COUNTING: Read counting with FeatureCounts @@ -108,7 +108,7 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_postprocess.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC*/*RustQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal. @@ -122,10 +122,10 @@ This exhaustive use case involves multiple input files, two conditions (WT/KO) a Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard - DE: Differential Expression analysis with EdgeR and DESeq2 - DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime) - DAS: Differential Alternative Splicing analysis with EdgeR and DIEGO @@ -186,6 +186,6 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_exhaustive.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC/RustQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools/fgumi (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal. \ No newline at end of file diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst index cccf07e6..3295b4f5 100644 --- a/docs/source/workflows.rst +++ b/docs/source/workflows.rst @@ -75,6 +75,8 @@ QUALITY CONTROL I This workflow step can be run as preprocessing step if none of the processing workflows is defined in the config.json. +*rustqc* is intended for mapped BAM-level QC and is therefore generally most useful in processing mode after mapping outputs are available. + .. table:: :widths: 10, 40, 10, 10, 10, 10, 10 :class: tight-table @@ -84,6 +86,8 @@ This workflow step can be run as preprocessing step if none of the processing wo +============================+============================================================+=========+=========+=========================================================================+============+===========+ | FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML | +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ + | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG | + +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ PROCESSING @@ -103,6 +107,8 @@ If any of the below listed processing steps is defined in the config.json, quali +============================+============================================================+=========+=========+=========================================================================+============+===========+ | FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML | +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ + | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG | + +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ Trimming @@ -173,6 +179,8 @@ Deduplicate reads by UMI or based on mapping position and CIGAR string +===============+====================================================================================================================================================+===========+============+====================================================================================================+======================+============+ | UMI-tools | UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes. | umitools | umi_tools | `umitools `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM | +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ + | fgumi | High-performance tools for UMI-tagged sequencing data including UMI extraction and UMI-aware deduplication. | fgumi | fgumi | `fgumi `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ | Picard tools | A better duplication marking algorithm that handles all cases including clipped and gapped alignments. | picard | picard | `picard `_ | BAM | BAM | +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ diff --git a/tests/data/README_fgumi_test_data.md b/tests/data/README_fgumi_test_data.md new file mode 100644 index 00000000..4902a8c1 --- /dev/null +++ b/tests/data/README_fgumi_test_data.md @@ -0,0 +1,27 @@ +# fgumi UMI smoke-test data + +This directory contains a tiny synthetic paired-end fixture generator for `fgumi`: + +- `make_fgumi_umi_fixture.py` writes + - `FASTQ/Test/umi/FGUMI01_R1.fastq.gz` + - `FASTQ/Test/umi/FGUMI01_R2.fastq.gz` +- `config_fgumi_test.json` is a minimal tutorial-style MONSDA config that enables only `DEDUP` with `fgumi`. + +## Why synthetic data? + +Most publicly referenced UMI tutorial datasets are not truly small enough for quick CI-style smoke tests. +The synthetic fixture keeps runtime tiny and deterministic while still exercising UMI extraction behavior. + +## External UMI datasets (if you want real data) + +- Galaxy Training (CEL-Seq2 UMI tutorial data on Zenodo): + - https://zenodo.org/record/2573177 + - Files: + - `test_barcodes_celseq2_R1.fastq.gz` (~243.7 MB) + - `test_barcodes_celseq2_R2.fastq.gz` (~594.9 MB) +- Galaxy tutorial page: + - https://training.galaxyproject.org/training-material/topics/single-cell/tutorials/scrna-umis/tutorial.html + +For nf-core test datasets, use branch discovery/search tooling first: +- https://github.com/nf-core/test-datasets +- https://github.com/nf-core/test-datasets/blob/master/docs/USE_EXISTING_DATA.md diff --git a/tests/data/config_fgumi_test.json b/tests/data/config_fgumi_test.json new file mode 100644 index 00000000..a19d1bc7 --- /dev/null +++ b/tests/data/config_fgumi_test.json @@ -0,0 +1,47 @@ +{ + "WORKFLOWS": "DEDUP", + "BINS": "", + "MAXTHREADS": "2", + "VERSION": "FIXME", + "SETTINGS": { + "Test": { + "umi": { + "SAMPLES": [ + "FGUMI01" + ], + "SEQUENCING": "paired", + "REFERENCE": "GENOME/genome.fa.gz", + "ANNOTATION": { + "GTF": "GENOME/genomic.gtf.gz", + "GFF": "GENOME/genomic.gff.gz" + }, + "GROUPS": [ + "umi" + ], + "TYPES": [ + "fgumi_smoke" + ], + "BATCHES": [ + "1" + ], + "INDEX": "", + "PREFIX": "" + } + } + }, + "DEDUP": { + "TOOLS": { + "fgumi": "fgumi" + }, + "Test": { + "umi": { + "fgumi": { + "OPTIONS": { + "EXTRACT": "--read-structures 6M+T +T", + "DEDUP": "" + } + } + } + } + } +} diff --git a/tests/data/make_fgumi_umi_fixture.py b/tests/data/make_fgumi_umi_fixture.py new file mode 100644 index 00000000..79d9dbf6 --- /dev/null +++ b/tests/data/make_fgumi_umi_fixture.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +"""Create a tiny paired-end FASTQ fixture with synthetic UMIs for fgumi tests. + +This script writes gzipped FASTQ files into: + FASTQ/Test/umi/FGUMI01_R1.fastq.gz + FASTQ/Test/umi/FGUMI01_R2.fastq.gz + +R1 layout is intentionally compatible with a simple fgumi read structure like: + --read-structures 6M+T +T +where the first 6 bases in R1 are a mock UMI. +""" + +from __future__ import annotations + +import gzip +from pathlib import Path + +OUTDIR = Path("FASTQ") / "Test" / "umi" +SAMPLE = "FGUMI01" + +# (umi6, r1_suffix, r2_sequence) +READS = [ + ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"), + ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"), # duplicate + ("TGCACT", "TTTTTTTTTTTTGGCCAATTGGCC", "CGTACGTACGTACGTACGTACGTACGTAC"), + ( + "TGCACT", + "TTTTTTTTTTTTGGCCAATTGGCC", + "CGTACGTACGTACGTACGTACGTACGTAC", + ), # duplicate + ("GGAACC", "TTTTTTTTTTTTCCGGAATTCCGG", "TTGCAATTGCAATTGCAATTGCAATTGCA"), + ("CCTTGG", "TTTTTTTTTTTTAATTCCGGAATT", "AACCGGTTAACCGGTTAACCGGTTAACCG"), + ("TTAAGG", "TTTTTTTTTTTTGGAATTCCGGAA", "GGCCAATTGGCCAATTGGCCAATTGGCCA"), + ("CCGGTT", "TTTTTTTTTTTTAACCTTGGAACC", "ATATCGCGATATCGCGATATCGCGATATC"), +] + + +def _fq_record(name: str, seq: str, qual_char: str = "I") -> str: + return f"@{name}\n{seq}\n+\n{qual_char * len(seq)}\n" + + +def main() -> None: + OUTDIR.mkdir(parents=True, exist_ok=True) + + r1_path = OUTDIR / f"{SAMPLE}_R1.fastq.gz" + r2_path = OUTDIR / f"{SAMPLE}_R2.fastq.gz" + + with gzip.open(r1_path, "wt") as r1h, gzip.open(r2_path, "wt") as r2h: + for i, (umi, r1_suffix, r2_seq) in enumerate(READS, start=1): + read_id = f"{SAMPLE}:{i}" + r1_seq = umi + r1_suffix + r1h.write(_fq_record(read_id + " 1:N:0:TEST", r1_seq)) + r2h.write(_fq_record(read_id + " 2:N:0:TEST", r2_seq)) + + print(f"Wrote {r1_path}") + print(f"Wrote {r2_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_fgumi_smoke.sh b/tests/test_fgumi_smoke.sh new file mode 100644 index 00000000..fabfd17c --- /dev/null +++ b/tests/test_fgumi_smoke.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "${SCRIPT_DIR}" + +# Mirror existing integration style: expose tests/data as working inputs +ln -fs data/* . + +# Generate tiny gzipped UMI FASTQ inputs +python data/make_fgumi_umi_fixture.py + +# Keep version in sync with installed MONSDA +VERSION=$(monsda --version 2>&1 | sed 's/MONSDA version //g') +sed -i "s/\"VERSION\": \"FIXME\"/\"VERSION\": \"${VERSION}\"/g" config_fgumi_test.json + +mkdir -p CONDALIB + +# --save keeps this as a lightweight workflow generation smoke test +monsda -j 2 -c config_fgumi_test.json --directory "${PWD}" --use-conda --conda-prefix CONDALIB --save + +echo "FGUMI smoke test workflow generation completed." From 71297dbe89ce43c57f1aa541aadbb3a7fa9bd22d Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 11:13:25 +0200 Subject: [PATCH 04/39] gracefully exit if no samples found --- MONSDA/Params.py | 12 ++++++++++++ MONSDA/Utils.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/MONSDA/Params.py b/MONSDA/Params.py index 71182733..e59c00ef 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -424,6 +424,18 @@ def get_samples_from_dir(search: str, config: dict, nocheck: str = None) -> list clean.append(c) break log.debug(logid + "checkclean: " + str(clean)) + + # Check if any samples were found in the clean list + if not clean: + search_dir = os.sep.join(["FASTQ"] + search) + error_msg = ( + f"No sample files found for condition {os.sep.join(search)}. " + f"Expected to find files matching samples {samples} " + f"in directory: {search_dir}" + ) + log.error(logid + error_msg) + raise ValueError(error_msg) + paired = checkpaired( [os.sep.join([os.sep.join(search), clean[0].split(os.sep)[-1]])], config ) diff --git a/MONSDA/Utils.py b/MONSDA/Utils.py index 29a8c4d2..3a8bae03 100644 --- a/MONSDA/Utils.py +++ b/MONSDA/Utils.py @@ -182,6 +182,22 @@ def func_wrapper(*args, **kwargs): try: return func(*args, **kwargs) + except ValueError as e: + # Handle sample not found errors gracefully + error_msg = str(e) + if "sample" in error_msg.lower() and "not found" in error_msg.lower(): + log.error(logid + error_msg) + log.error(logid + "STOPPING: Processing stopped due to missing samples") + sys.exit(1) + else: + # Re-raise other ValueError exceptions + exc_type, exc_value, exc_tb = sys.exc_info() + tbe = tb.TracebackException( + exc_type, + exc_value, + exc_tb, + ) + log.error(logid + "".join(tbe.format())) except Exception: exc_type, exc_value, exc_tb = sys.exc_info() tbe = tb.TracebackException( From 888d4d86f7d4450220037f43bdc5e60d9db4dca6 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 11:47:05 +0200 Subject: [PATCH 05/39] docs update --- README.md | 7 ------- docs/source/config.rst | 5 ----- 2 files changed, 12 deletions(-) diff --git a/README.md b/README.md index a442b81c..b3dfedfc 100644 --- a/README.md +++ b/README.md @@ -43,13 +43,6 @@ pip install MONSDA More information can be found in the official [documentation](https://monsda.readthedocs.io/en/latest/?badge=latest) -## Notes on newly available tools - -- **rustqc** is available as an alternative QC backend and is designed for fast, integrated RNA-seq QC reporting with MultiQC-compatible outputs. -- **fgumi** is available as an additional UMI-aware deduplication backend in the `DEDUP` step. - -Both tools are available through the shipped conda environments in `envs/` and can be selected through the regular config `TOOLS` sections. - ## How does it work diff --git a/docs/source/config.rst b/docs/source/config.rst index 6be4d1ac..022fe09d 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -34,11 +34,6 @@ You can always define differing ENV/BIN keys for each condition-tree leaf separa The next key-level is the *OPTIONS* key which is where you can define additional parameters for each tool. It is not needed to define anything related to *single-/paired-* end or *singlecell* sequencing, this is done automatically. To add parameters simply add the *OPTION* key which defines a dict where you can set parameters for each defined subworkflow-step. Parameters are here defined as key/value pairs corresponding to the subworkflow-step, e.g. 'INDEX' to generate an index file for mapping and all settings similar to a command line call as values. This should become clear having a look at the different processing steps in the template json. If there are no options just leave the 'OPTIONS' dict empty. -For newly available tools: - -- **QC/rustqc** can be selected with ``"rustqc": "rustqc"`` in the ``QC -> TOOLS`` section and configured via the regular ``OPTIONS`` keys (e.g. ``QC`` and ``MULTI`` entries in tutorial configs). -- **DEDUP/fgumi** can be selected with ``"fgumi": "fgumi"`` in the ``DEDUP -> TOOLS`` section and supports dedicated options under ``OPTIONS`` using ``EXTRACT`` and ``DEDUP`` keys. - .. literalinclude:: ../../configs/template.json :language: json From 1d43280422de0bce3bbc760104805b10e72b3405 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 11:47:14 +0200 Subject: [PATCH 06/39] fgumi zipper fix --- workflows/fgumi.nf | 21 ++++++++++++++++++++- workflows/fgumi.smk | 10 ++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 227c386a..78b2776d 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -85,6 +85,7 @@ process dedup_bam{ input: path mapped_bam path ubam + path ref output: path "${mapped_bam.baseName}_dedup.bam", emit: dedup_bam @@ -94,10 +95,28 @@ process dedup_bam{ script: """ mkdir -p tmp - $DEDUPBIN zipper --unmapped $ubam --aligned $mapped_bam --output tmp/zippered.bam > dedup.log 2>&1 + $DEDUPBIN zipper --unmapped $ubam --input $mapped_bam --reference $ref --output tmp/zippered.bam > dedup.log 2>&1 $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 rm $ubam """ } + +workflow DEDUPBAM{ + take: + map + mapi + mapu + mapui + ubam + + main: + ref_ch = Channel.fromPath(REFERENCE) + dedup_bam(map.concat(mapu), ubam, ref_ch) + + emit: + dedup = dedup_bam.out.dedup_bam + dedupbai = dedup_bam.out.dedup_idx + deduplog = dedup_bam.out.logs +} diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 60a197e2..8483f96e 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -47,9 +47,10 @@ if paired == 'paired': threads: 1 priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), - dedup = DEDUPBIN + dedup = DEDUPBIN, + ref = REFERENCE shell: """mkdir -p {output.td} -{params.dedup} zipper --unmapped {input.ubam} --aligned {input.bam} --output {output.td}/zippered.bam > {log} 2>&1 +{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam > {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 @@ -67,9 +68,10 @@ else: threads: 1 priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), - dedup = DEDUPBIN + dedup = DEDUPBIN, + ref = REFERENCE shell: """mkdir -p {output.td} -{params.dedup} zipper --unmapped {input.ubam} --aligned {input.bam} --output {output.td}/zippered.bam > {log} 2>&1 +{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam > {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 From 900fdb6979222a3324d7c9452c0db1572f871fc2 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 13:36:03 +0200 Subject: [PATCH 07/39] fgumi ref dict if not availbale --- workflows/fgumi.nf | 4 +++- workflows/fgumi.smk | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 78b2776d..e9cd0907 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -95,7 +95,9 @@ process dedup_bam{ script: """ mkdir -p tmp - $DEDUPBIN zipper --unmapped $ubam --input $mapped_bam --reference $ref --output tmp/zippered.bam > dedup.log 2>&1 + ref_dict=\$(basename $ref .gz).dict + if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> dedup.log 2>&1; fi + $DEDUPBIN zipper --unmapped $ubam --input $mapped_bam --reference $ref --output tmp/zippered.bam >> dedup.log 2>&1 $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 8483f96e..efb4d6b0 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -48,9 +48,11 @@ if paired == 'paired': priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), dedup = DEDUPBIN, - ref = REFERENCE + ref = REFERENCE, + ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" shell: """mkdir -p {output.td} -{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam > {log} 2>&1 +[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 +{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 @@ -69,9 +71,11 @@ else: priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), dedup = DEDUPBIN, - ref = REFERENCE + ref = REFERENCE, + ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" shell: """mkdir -p {output.td} -{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam > {log} 2>&1 +[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 +{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 From 84ec08f6737c4e872e3869b209a204a52a3e84f8 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 13:41:35 +0200 Subject: [PATCH 08/39] sorting by name before fgumi zipper --- workflows/fgumi.nf | 6 ++++-- workflows/fgumi.smk | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index e9cd0907..b0e097ee 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -77,7 +77,7 @@ workflow DEDUPEXTRACT{ process dedup_bam{ conda "$DEDUPENV"+".yaml" container "oras://jfallmann/monsda:"+"$DEDUPENV" - cpus THREADS + cpus 4 cache 'lenient' publishDir "${workflow.workDir}/../MAPPED/${COMBO}/${CONDITION}" , mode: 'link' @@ -97,7 +97,9 @@ process dedup_bam{ mkdir -p tmp ref_dict=\$(basename $ref .gz).dict if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> dedup.log 2>&1; fi - $DEDUPBIN zipper --unmapped $ubam --input $mapped_bam --reference $ref --output tmp/zippered.bam >> dedup.log 2>&1 + samtools sort -n -@ ${task.cpus} -o tmp/ubam_qn.bam $ubam >> dedup.log 2>&1 + samtools sort -n -@ ${task.cpus} -o tmp/mapped_qn.bam $mapped_bam >> dedup.log 2>&1 + $DEDUPBIN zipper --unmapped tmp/ubam_qn.bam --input tmp/mapped_qn.bam --reference $ref --output tmp/zippered.bam >> dedup.log 2>&1 $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index efb4d6b0..06b61ee2 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -44,7 +44,7 @@ if paired == 'paired': log: "LOGS/{combo}/{file}_{type}/dedupbam.log" conda: ""+DEDUPENV+".yaml" container: "oras://jfallmann/monsda:"+DEDUPENV+"" - threads: 1 + threads: 4 priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), dedup = DEDUPBIN, @@ -52,7 +52,9 @@ if paired == 'paired': ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" shell: """mkdir -p {output.td} [[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 -{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 @@ -67,7 +69,7 @@ else: log: "LOGS/{combo}/{file}_{type}/dedupbam.log" conda: ""+DEDUPENV+".yaml" container: "oras://jfallmann/monsda:"+DEDUPENV+"" - threads: 1 + threads: 4 priority: 0 # This should be done after all mapping is done params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), dedup = DEDUPBIN, @@ -75,7 +77,9 @@ else: ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" shell: """mkdir -p {output.td} [[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 -{params.dedup} zipper --unmapped {input.ubam} --input {input.bam} --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 From c1c0cc528906d76d68d85be3dd28faede007fdd9 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Thu, 23 Apr 2026 14:23:08 +0200 Subject: [PATCH 09/39] sorting before index --- workflows/fgumi.nf | 3 ++- workflows/fgumi.smk | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index b0e097ee..0f32f9a6 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -101,7 +101,8 @@ process dedup_bam{ samtools sort -n -@ ${task.cpus} -o tmp/mapped_qn.bam $mapped_bam >> dedup.log 2>&1 $DEDUPBIN zipper --unmapped tmp/ubam_qn.bam --input tmp/mapped_qn.bam --reference $ref --output tmp/zippered.bam >> dedup.log 2>&1 $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 - $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 + $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output tmp/dedup.bam >> dedup.log 2>&1 + samtools sort -@ ${task.cpus} -o ${mapped_bam.baseName}_dedup.bam tmp/dedup.bam >> dedup.log 2>&1 samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 rm $ubam """ diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 06b61ee2..0eb1fe64 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -56,7 +56,8 @@ samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2 samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 {params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 -{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 +samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.bam >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 rm {input.ubam}""" else: @@ -81,6 +82,7 @@ samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2 samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 {params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 -{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.bam} >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 +samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.bam >> {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 rm {input.ubam}""" From b07955b311fcf1a9fd209415b8ab121e41e0de2e Mon Sep 17 00:00:00 2001 From: jfallmann Date: Fri, 24 Apr 2026 14:31:14 +0200 Subject: [PATCH 10/39] fgumi update --- workflows/fgumi.nf | 8 ++++---- workflows/fgumi.smk | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 0f32f9a6..566abfc3 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -99,11 +99,11 @@ process dedup_bam{ if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> dedup.log 2>&1; fi samtools sort -n -@ ${task.cpus} -o tmp/ubam_qn.bam $ubam >> dedup.log 2>&1 samtools sort -n -@ ${task.cpus} -o tmp/mapped_qn.bam $mapped_bam >> dedup.log 2>&1 - $DEDUPBIN zipper --unmapped tmp/ubam_qn.bam --input tmp/mapped_qn.bam --reference $ref --output tmp/zippered.bam >> dedup.log 2>&1 - $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam >> dedup.log 2>&1 + samtools view -h tmp/mapped_qn.bam | awk 'BEGIN{FS=OFS="\t"} /^@/{print; next} {f=\$2+0; if (!and(f,256) && !and(f,2048)) {k=\$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next} print}' | samtools view -b -o tmp/mapped_qn_primaryuniq.bam - >> dedup.log 2>&1 + $DEDUPBIN zipper --unmapped tmp/ubam_qn.bam --input tmp/mapped_qn_primaryuniq.bam --reference $ref --output tmp/zippered.bam --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 + $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output tmp/dedup.bam >> dedup.log 2>&1 - samtools sort -@ ${task.cpus} -o ${mapped_bam.baseName}_dedup.bam tmp/dedup.bam >> dedup.log 2>&1 - samtools index ${mapped_bam.baseName}_dedup.bam >> dedup.log 2>&1 + $DEDUPBIN sort --order coordinate --input tmp/dedup.bam --output ${mapped_bam.baseName}_dedup.bam --write-index --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 rm $ubam """ } diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 0eb1fe64..04d25ba1 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -54,11 +54,11 @@ if paired == 'paired': [[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 -{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 -{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 +samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 -samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.bam >> {log} 2>&1 -samtools index {output.bam} >> {log} 2>&1 +{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 rm {input.ubam}""" else: rule dedupbam: @@ -80,9 +80,9 @@ else: [[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 -{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn.bam --reference {params.ref} --output {output.td}/zippered.bam >> {log} 2>&1 -{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam >> {log} 2>&1 +samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 -samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.bam >> {log} 2>&1 -samtools index {output.bam} >> {log} 2>&1 +{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 rm {input.ubam}""" From 0d8460f87dbc68a19ae158e67ee8cbaa1fc74afe Mon Sep 17 00:00:00 2001 From: jfallmann Date: Fri, 24 Apr 2026 14:52:07 +0200 Subject: [PATCH 11/39] fgumi update --- workflows/fgumi.nf | 1 - workflows/fgumi.smk | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 566abfc3..190c583f 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -104,7 +104,6 @@ process dedup_bam{ $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output tmp/dedup.bam >> dedup.log 2>&1 $DEDUPBIN sort --order coordinate --input tmp/dedup.bam --output ${mapped_bam.baseName}_dedup.bam --write-index --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 - rm $ubam """ } diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk index 04d25ba1..edeacf1e 100644 --- a/workflows/fgumi.smk +++ b/workflows/fgumi.smk @@ -59,7 +59,7 @@ samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{pri {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 {params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 -rm {input.ubam}""" +""" else: rule dedupbam: input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam", @@ -85,4 +85,4 @@ samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{pri {params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 {params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 {params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 -rm {input.ubam}""" +""" From ffb8e3dd5121e24e823d3248b77f9a8ba1692183 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Fri, 24 Apr 2026 15:04:21 +0200 Subject: [PATCH 12/39] fgumi nf split/update --- MONSDA/Workflows.py | 40 ++++++++++++++++++++++++-------- workflows/fgumi.nf | 50 ---------------------------------------- workflows/fgumi_dedup.nf | 45 ++++++++++++++++++++++++++++++------ 3 files changed, 68 insertions(+), 67 deletions(-) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 709b95e1..3b93851f 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -2712,6 +2712,9 @@ def nf_make_pre( subjobs.append(line) subjobs.append("\n\n") + if "DEDUP" in works: + flowlist.append("DEDUPBAM") + tp.append( nf_tool_params( subsamples[0], @@ -2932,6 +2935,7 @@ def nf_make_sub( flowlist.append("TRIMMING") if works[j] == "DEDUP": + deduptool = toolenv if toolenv in ["umitools", "fgumi"]: flowlist.append("PREDEDUP") subconf["PREDEDUP"] = "enabled" @@ -3101,11 +3105,18 @@ def nf_make_sub( " " * 4 + "POSTMAPPING(MAPPING.out.mapped)\n" ) elif w == "DEDUPBAM": - subjobs.append( - " " * 4 - + w - + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" - ) + if deduptool == "fgumi": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai, DEDUPEXTRACT.out.ubam)\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" + ) elif w == "QC_MAPPING": if "DEDUPBAM" in flowlist: subjobs.append( @@ -3205,6 +3216,7 @@ def nf_make_sub( subjobs = list() subconf = mu.NestedDefaultDict() tp = list() + deduptool = None for subwork in subworkflows: log.debug(logid + "PREPARING " + str(subwork) + " " + str(condition)) @@ -3292,6 +3304,7 @@ def nf_make_sub( if subwork == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.nf" elif subwork == "DEDUP" and toolenv in ["umitools", "fgumi"]: + deduptool = toolenv flowlist.append("PREDEDUP") subconf["PREDEDUP"] = "enabled" if "QC" in flowlist: @@ -3440,11 +3453,18 @@ def nf_make_sub( subjobs.append(" " * 4 + w + "(TRIMMING.out.trimmed)\n") subjobs.append(" " * 4 + "POSTMAPPING(MAPPING.out.mapped)\n") elif w == "DEDUPBAM": - subjobs.append( - " " * 4 - + w - + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" - ) + if deduptool == "fgumi": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai, DEDUPEXTRACT.out.ubam)\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" + ) elif w == "QC_MAPPING": if "DEDUPBAM" in flowlist: subjobs.append( diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf index 190c583f..9e9547b9 100644 --- a/workflows/fgumi.nf +++ b/workflows/fgumi.nf @@ -74,53 +74,3 @@ workflow DEDUPEXTRACT{ logs = extract_fq.out.logs } -process dedup_bam{ - conda "$DEDUPENV"+".yaml" - container "oras://jfallmann/monsda:"+"$DEDUPENV" - cpus 4 - cache 'lenient' - - publishDir "${workflow.workDir}/../MAPPED/${COMBO}/${CONDITION}" , mode: 'link' - - input: - path mapped_bam - path ubam - path ref - - output: - path "${mapped_bam.baseName}_dedup.bam", emit: dedup_bam - path "${mapped_bam.baseName}_dedup.bam.bai", emit: dedup_idx - path "dedup.log", emit: logs - - script: - """ - mkdir -p tmp - ref_dict=\$(basename $ref .gz).dict - if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> dedup.log 2>&1; fi - samtools sort -n -@ ${task.cpus} -o tmp/ubam_qn.bam $ubam >> dedup.log 2>&1 - samtools sort -n -@ ${task.cpus} -o tmp/mapped_qn.bam $mapped_bam >> dedup.log 2>&1 - samtools view -h tmp/mapped_qn.bam | awk 'BEGIN{FS=OFS="\t"} /^@/{print; next} {f=\$2+0; if (!and(f,256) && !and(f,2048)) {k=\$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next} print}' | samtools view -b -o tmp/mapped_qn_primaryuniq.bam - >> dedup.log 2>&1 - $DEDUPBIN zipper --unmapped tmp/ubam_qn.bam --input tmp/mapped_qn_primaryuniq.bam --reference $ref --output tmp/zippered.bam --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 - $DEDUPBIN sort --order template-coordinate --input tmp/zippered.bam --output tmp/sorted.bam --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 - $DEDUPBIN dedup $DEDUPPARAMS --input tmp/sorted.bam --output tmp/dedup.bam >> dedup.log 2>&1 - $DEDUPBIN sort --order coordinate --input tmp/dedup.bam --output ${mapped_bam.baseName}_dedup.bam --write-index --threads ${task.cpus} --compression-level 1 >> dedup.log 2>&1 - """ -} - -workflow DEDUPBAM{ - take: - map - mapi - mapu - mapui - ubam - - main: - ref_ch = Channel.fromPath(REFERENCE) - dedup_bam(map.concat(mapu), ubam, ref_ch) - - emit: - dedup = dedup_bam.out.dedup_bam - dedupbai = dedup_bam.out.dedup_idx - deduplog = dedup_bam.out.logs -} diff --git a/workflows/fgumi_dedup.nf b/workflows/fgumi_dedup.nf index 05ff5ed1..608b288b 100644 --- a/workflows/fgumi_dedup.nf +++ b/workflows/fgumi_dedup.nf @@ -6,7 +6,7 @@ DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: '' process dedup_bam{ conda "$DEDUPENV"+".yaml" container "oras://jfallmann/monsda:"+"$DEDUPENV" - cpus THREADS + cpus 4 cache 'lenient' //validExitStatus 0,1 @@ -19,8 +19,8 @@ process dedup_bam{ } input: - path todedup - path bami + tuple val(sample_id), path(mapped_bam), path(ubam) + path ref output: path "*_dedup.bam", emit: bam @@ -28,12 +28,20 @@ process dedup_bam{ path "*_dedup.log", emit: logs script: - bams = todedup[0] - bais = todedup[1] + bams = mapped_bam outf = bams.getSimpleName()+"_dedup.bam" outl = bams.getSimpleName()+"_dedup.log" """ - mkdir -p TMP && $DEDUPBIN dedup $DEDUPPARAMS --input $bams --output $outf &> $outl && samtools index $outf &>> $outl + mkdir -p TMP + ref_dict=\$(basename $ref .gz).dict + if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> $outl 2>&1; fi + samtools sort -n -@ ${task.cpus} -o TMP/ubam_qn.bam $ubam >> $outl 2>&1 + samtools sort -n -@ ${task.cpus} -o TMP/mapped_qn.bam $bams >> $outl 2>&1 + samtools view -h TMP/mapped_qn.bam | awk 'BEGIN{FS=OFS="\t"} /^@/{print; next} {f=\$2+0; if (!and(f,256) && !and(f,2048)) {k=\$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next} print}' | samtools view -b -o TMP/mapped_qn_primaryuniq.bam - >> $outl 2>&1 + $DEDUPBIN zipper --unmapped TMP/ubam_qn.bam --input TMP/mapped_qn_primaryuniq.bam --reference $ref --output TMP/zippered.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 + $DEDUPBIN sort --order template-coordinate --input TMP/zippered.bam --output TMP/sorted.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 + $DEDUPBIN dedup $DEDUPPARAMS --input TMP/sorted.bam --output TMP/dedup.bam >> $outl 2>&1 + $DEDUPBIN sort --order coordinate --input TMP/dedup.bam --output $outf --write-index --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 """ } @@ -43,9 +51,32 @@ workflow DEDUPBAM{ mapi mapu mapui + ubam main: - dedup_bam(map.concat(mapu), mapi.concat(mapui)) + mapped_ch = map.concat(mapu).map { b -> + def n = file(b).getName() + def key = n + .replaceFirst(/_mapped_sorted_unique\.bam$/, '') + .replaceFirst(/_mapped_sorted\.bam$/, '') + .replaceFirst(/_R1_dedup_trimmed$/, '') + .replaceFirst(/_dedup_trimmed$/, '') + .replaceFirst(/_R1_trimmed$/, '') + .replaceFirst(/_trimmed$/, '') + tuple(key, b) + } + ubam_ch = ubam.map { u -> + def n = file(u).getName() + def key = n + .replaceFirst(/_fgumi_extract\.bam$/, '') + .replaceFirst(/_extracted\.bam$/, '') + .replaceFirst(/_R1$/, '') + tuple(key, u) + } + paired_ch = mapped_ch.combine(ubam_ch, by: 0).map { key, mb, ub -> tuple(key, mb, ub) } + + ref_ch = channel.value(file(REFERENCE)) + dedup_bam(paired_ch, ref_ch) emit: dedup = dedup_bam.out.bam From cbc335fe1af4bc80e415aef2f145290ff7ba7e6e Mon Sep 17 00:00:00 2001 From: jfallmann Date: Fri, 24 Apr 2026 15:52:40 +0200 Subject: [PATCH 13/39] counttable verbosity fix --- scripts/Analysis/build_count_table.py | 4 ++++ workflows/bwa.nf | 4 ++-- workflows/hisat2.nf | 4 ++-- workflows/star.nf | 6 +++--- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/Analysis/build_count_table.py b/scripts/Analysis/build_count_table.py index 64401669..5a220a8a 100755 --- a/scripts/Analysis/build_count_table.py +++ b/scripts/Analysis/build_count_table.py @@ -383,6 +383,8 @@ def prepare_table( exc_tb, ) log.error(logid + "".join(tbe.format())) + print("".join(tbe.format()), file=sys.stderr) + raise SystemExit(1) def make_sample_list(group_name): @@ -433,6 +435,8 @@ def make_sample_list(group_name): exc_tb, ) log.error(logid + "".join(tbe.format())) + print("".join(tbe.format()), file=sys.stderr) + raise SystemExit(1) # # build_count_table_simple.py ends here diff --git a/workflows/bwa.nf b/workflows/bwa.nf index be07f149..e436f6f0 100644 --- a/workflows/bwa.nf +++ b/workflows/bwa.nf @@ -89,7 +89,7 @@ process bwa_mapping{ if (PAIRED == 'paired'){ r1 = reads[1] r2 = reads[2] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam.gz" uf1 = fn+"_R1_unmapped.fastq.gz" uf2 = fn+"_R2_unmapped.fastq.gz" @@ -99,7 +99,7 @@ process bwa_mapping{ """ }else{ read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"") + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam.gz" uf = fn+"_unmapped.fastq.gz" lf = "bwa_"+fn+".log" diff --git a/workflows/hisat2.nf b/workflows/hisat2.nf index 55bcffc4..cf3d3f89 100644 --- a/workflows/hisat2.nf +++ b/workflows/hisat2.nf @@ -105,7 +105,7 @@ process hisat2_mapping{ r1 = reads[1] r2 = reads[2] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam" ufo = fn+"_R1_unmapped.fastq.gz" uft = fn+"_R2_unmapped.fastq.gz" @@ -122,7 +122,7 @@ process hisat2_mapping{ stranded = '' } read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"") + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam" uf = fn+"_unmapped.fastq.gz" lf = "hisat2_"+fn+".log" diff --git a/workflows/star.nf b/workflows/star.nf index 83681e95..62c3563a 100644 --- a/workflows/star.nf +++ b/workflows/star.nf @@ -96,7 +96,7 @@ process star_mapping{ r1 = reads[1] r2 = reads[2] a = "Trimming_report.txt" - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") of = fn+'.Aligned.out.sam' gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz") """ @@ -106,7 +106,7 @@ process star_mapping{ else{ if (PAIRED != 'singlecell'){ read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"")+"." + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"")+"." of = fn+'Aligned.out.sam' gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz") """ @@ -122,7 +122,7 @@ process star_mapping{ stranded = '--soloStrand Unstranded' } r1 = reads[1] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") r2 = "${workflow.workDir}/../FASTQ/${CONDITION}/"+file(reads[2]).getSimpleName().replaceAll(/\QR2_trimmed\E/,"R2.fastq.gz") if (MAPPARAMS.contains('--soloBarcodeMate 1')){ t = r2 From 2e13a34283dfba3bc685e7f0229fbe99ce1856a6 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 13:33:46 +0200 Subject: [PATCH 14/39] docs fix --- docs/source/workflows.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst index 3295b4f5..008756bc 100644 --- a/docs/source/workflows.rst +++ b/docs/source/workflows.rst @@ -75,8 +75,6 @@ QUALITY CONTROL I This workflow step can be run as preprocessing step if none of the processing workflows is defined in the config.json. -*rustqc* is intended for mapped BAM-level QC and is therefore generally most useful in processing mode after mapping outputs are available. - .. table:: :widths: 10, 40, 10, 10, 10, 10, 10 :class: tight-table From 6a0d2aa77644f8b445d454fc3a1c02c12b5a3a75 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 13:34:01 +0200 Subject: [PATCH 15/39] counttable build from nf dedup fix --- workflows/countreads.nf | 3 ++- workflows/deseq2_DE.nf | 3 ++- workflows/dexseq_DEU.nf | 3 ++- workflows/edger_DAS.nf | 3 ++- workflows/edger_DE.nf | 3 ++- workflows/edger_DEU.nf | 3 ++- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/workflows/countreads.nf b/workflows/countreads.nf index b542cfab..b54cf7b2 100644 --- a/workflows/countreads.nf +++ b/workflows/countreads.nf @@ -252,7 +252,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/deseq2_DE.nf b/workflows/deseq2_DE.nf index e39a645c..d555c913 100644 --- a/workflows/deseq2_DE.nf +++ b/workflows/deseq2_DE.nf @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/dexseq_DEU.nf b/workflows/dexseq_DEU.nf index 8c954541..8d938a17 100644 --- a/workflows/dexseq_DEU.nf +++ b/workflows/dexseq_DEU.nf @@ -133,7 +133,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEUREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEUREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/edger_DAS.nf b/workflows/edger_DAS.nf index 9fa62d2b..22806a0a 100644 --- a/workflows/edger_DAS.nf +++ b/workflows/edger_DAS.nf @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DASREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DASREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/edger_DE.nf b/workflows/edger_DE.nf index 43e0160e..a4683376 100644 --- a/workflows/edger_DE.nf +++ b/workflows/edger_DE.nf @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/edger_DEU.nf b/workflows/edger_DEU.nf index 7891d95d..aa1e5146 100644 --- a/workflows/edger_DEU.nf +++ b/workflows/edger_DEU.nf @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEUREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEUREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } From 6ad833bb8755cc1e0bd5f8a26b89dc0aa38b6829 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 15:05:50 +0200 Subject: [PATCH 16/39] fix in output path generation, we never split QC --- MONSDA/Params.py | 10 +++++++--- MONSDA/Workflows.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/MONSDA/Params.py b/MONSDA/Params.py index e59c00ef..f342292e 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -1558,9 +1558,13 @@ def get_combo_name(combinations: list) -> mu.NestedDefaultDict: envs = list() works = list() for step in combi: - for work, env in step.items(): - envs.append(env) - works.append(work) + # A step can be a single dict (default) or a list of dicts + # (QC all-tools grouping from get_combo). + grouped_steps = step if isinstance(step, list) else [step] + for grouped_step in grouped_steps: + for work, env in grouped_step.items(): + envs.append(env) + works.append(work) combname[condition]["envs"].append(str.join("-", envs)) combname[condition]["works"].append(str.join("-", works)) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 3b93851f..b2f01153 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -192,7 +192,12 @@ def get_combo(wfs, config, conditions): + " with Tool: " + str(tools) ) - ret.append(tools) + # For QC, run all configured QC tools in the same workflow combo + # (instead of creating one combo per QC tool). + if subwork == "QC": + ret.append([tools]) + else: + ret.append(tools) log.debug(f"{logid} Itertools {ret}") combos[condition] = itertools.product(*ret) From 1a2bc9f157fb7433ae5deb987ede0628eab3a7f1 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 20:04:11 +0200 Subject: [PATCH 17/39] split in pre and postqc workflows --- MONSDA/Params.py | 10 +--- MONSDA/Workflows.py | 101 ++++++++++++++++++++++++-------- workflows/fastqc.nf | 4 +- workflows/fastqc.smk | 2 +- workflows/fastqc_dedup.nf | 4 +- workflows/fastqc_dedup.smk | 2 +- workflows/fastqc_dedup_trim.nf | 4 +- workflows/fastqc_dedup_trim.smk | 2 +- workflows/fastqc_raw.nf | 4 +- workflows/fastqc_raw.smk | 2 +- workflows/fastqc_trim.nf | 4 +- workflows/fastqc_trim.smk | 2 +- workflows/multiqc.nf | 4 +- workflows/multiqc_rustqc.nf | 65 ++++++++++++++++++++ workflows/rustqc.nf | 4 +- workflows/rustqc.smk | 2 +- 16 files changed, 165 insertions(+), 51 deletions(-) create mode 100644 workflows/multiqc_rustqc.nf diff --git a/MONSDA/Params.py b/MONSDA/Params.py index f342292e..e59c00ef 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -1558,13 +1558,9 @@ def get_combo_name(combinations: list) -> mu.NestedDefaultDict: envs = list() works = list() for step in combi: - # A step can be a single dict (default) or a list of dicts - # (QC all-tools grouping from get_combo). - grouped_steps = step if isinstance(step, list) else [step] - for grouped_step in grouped_steps: - for work, env in grouped_step.items(): - envs.append(env) - works.append(work) + for work, env in step.items(): + envs.append(env) + works.append(work) combname[condition]["envs"].append(str.join("-", envs)) combname[condition]["works"].append(str.join("-", works)) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index b2f01153..4eae67c3 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -192,12 +192,7 @@ def get_combo(wfs, config, conditions): + " with Tool: " + str(tools) ) - # For QC, run all configured QC tools in the same workflow combo - # (instead of creating one combo per QC tool). - if subwork == "QC": - ret.append([tools]) - else: - ret.append(tools) + ret.append(tools) log.debug(f"{logid} Itertools {ret}") combos[condition] = itertools.product(*ret) @@ -569,8 +564,15 @@ def make_pre( toolenv, toolbin = map(str, listoftools[a]) if toolenv != envs[j] or toolbin is None: continue - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.update(sconf) subname = toolenv + ".smk" @@ -695,8 +697,15 @@ def make_pre( if toolenv is None or toolbin is None: continue subconf = mu.NestedDefaultDict() - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.update(sconf) subname = toolenv + ".smk" @@ -884,8 +893,15 @@ def make_sub( toolenv.replace("bisulfite", "_bisulfite") + ".smk" ) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.update(sconf) log.debug(logid + f"SCONF:{sconf}, SUBCONF:{subconf}") @@ -1071,8 +1087,15 @@ def make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".smk" subconf = mu.NestedDefaultDict() - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.update(sconf) # RustQC is post-alignment QC only; skip if no MAPPING @@ -2475,8 +2498,15 @@ def nf_make_pre( continue subsamples = mp.get_samples(sconf) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.merge(sconf) subconf[works[j]] = mu.add_to_innermost_key_by_list( @@ -2666,8 +2696,15 @@ def nf_make_pre( toolenv, toolbin = map(str, listoftools[i]) if toolenv is None or toolbin is None: continue - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.merge(sconf) subconf[subwork] = mu.add_to_innermost_key_by_list( @@ -2888,8 +2925,15 @@ def nf_make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".nf" subsamples = mp.get_samples(sconf) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.merge(sconf) subconf[works[j]] = mu.add_to_innermost_key_by_list( @@ -3054,7 +3098,8 @@ def nf_make_sub( if "QC" in works: flowlist.append("MULTIQC") - nfi = os.path.abspath(os.path.join(workflowpath, "multiqc.nf")) + _mqc_nf = "multiqc_rustqc.nf" if "QC_MAPPING" in flowlist and "QC_RAW" not in flowlist else "multiqc.nf" + nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): line = re.sub(condapath, 'conda "' + envpath, line) @@ -3250,8 +3295,15 @@ def nf_make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".nf" - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.merge(sconf) subconf[subwork] = mu.add_to_innermost_key_by_list( @@ -3406,7 +3458,8 @@ def nf_make_sub( if "QC" in subworkflows: flowlist.append("MULTIQC") - nfi = os.path.abspath(os.path.join(workflowpath, "multiqc.nf")) + _mqc_nf = "multiqc_rustqc.nf" if "QC_MAPPING" in flowlist and "QC_RAW" not in flowlist else "multiqc.nf" + nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): line = re.sub(condapath, 'conda "' + envpath, line) diff --git a/workflows/fastqc.nf b/workflows/fastqc.nf index 9ef084e1..77bd87d0 100644 --- a/workflows/fastqc.nf +++ b/workflows/fastqc.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' //QC RAW diff --git a/workflows/fastqc.smk b/workflows/fastqc.smk index b25b1787..7694bbaa 100644 --- a/workflows/fastqc.smk +++ b/workflows/fastqc.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') if paired == 'paired': log.info('Running paired mode QC') diff --git a/workflows/fastqc_dedup.nf b/workflows/fastqc_dedup.nf index 62cf54a3..85665236 100644 --- a/workflows/fastqc_dedup.nf +++ b/workflows/fastqc_dedup.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' // RAW QC diff --git a/workflows/fastqc_dedup.smk b/workflows/fastqc_dedup.smk index b9f8676e..923115c0 100644 --- a/workflows/fastqc_dedup.smk +++ b/workflows/fastqc_dedup.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/fastqc_dedup_trim.nf b/workflows/fastqc_dedup_trim.nf index d875ba3e..2e53b7f2 100644 --- a/workflows/fastqc_dedup_trim.nf +++ b/workflows/fastqc_dedup_trim.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' //QC RAW diff --git a/workflows/fastqc_dedup_trim.smk b/workflows/fastqc_dedup_trim.smk index c5b12743..08d5f183 100644 --- a/workflows/fastqc_dedup_trim.smk +++ b/workflows/fastqc_dedup_trim.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config( config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/fastqc_raw.nf b/workflows/fastqc_raw.nf index d83e612c..70e1bb81 100644 --- a/workflows/fastqc_raw.nf +++ b/workflows/fastqc_raw.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' process qc_raw{ diff --git a/workflows/fastqc_raw.smk b/workflows/fastqc_raw.smk index b9dfe662..1fe7ab37 100644 --- a/workflows/fastqc_raw.smk +++ b/workflows/fastqc_raw.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') if paired == 'paired': log.info('Running paired mode QC') diff --git a/workflows/fastqc_trim.nf b/workflows/fastqc_trim.nf index 5aa1f262..badcea20 100644 --- a/workflows/fastqc_trim.nf +++ b/workflows/fastqc_trim.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' // RAW QC diff --git a/workflows/fastqc_trim.smk b/workflows/fastqc_trim.smk index c4088665..5446e889 100644 --- a/workflows/fastqc_trim.smk +++ b/workflows/fastqc_trim.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config( config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/multiqc.nf b/workflows/multiqc.nf index e82010ce..f51c1c0d 100644 --- a/workflows/multiqc.nf +++ b/workflows/multiqc.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_MULTI') ?: '' process mqc{ diff --git a/workflows/multiqc_rustqc.nf b/workflows/multiqc_rustqc.nf new file mode 100644 index 00000000..71cfe752 --- /dev/null +++ b/workflows/multiqc_rustqc.nf @@ -0,0 +1,65 @@ +QCENV=get_always('POSTQCENV') +QCBIN=get_always('POSTQCBIN') +QCPARAMS = get_always('rustqc_params_MULTI') ?: '' + +process mqc{ + conda "$QCENV"+".yaml" + container "oras://jfallmann/monsda:"+"$QCENV" + cpus THREADS + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("zip") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.zip" + else if (filename.indexOf("html") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.html" + else "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getName()}" + } + + input: + path others, stageAs: 'mqc_input??/*' + //path samples + + output: + path "*.zip", emit: mqc + path "*.html", emit: html + + script: + """ + touch $others + OUT=\${PWD} + LIST=multiqc_inputs.txt + TMP_LIST=multiqc_inputs_unique.txt + BASE_QC_DIR="${workflow.workDir}/../QC" + COMBO_VAL="${COMBO}" + CONDITION_VAL="${CONDITION}" + + for i in $others; do + dirname "\$i" >> "\$LIST" + done + + # If the corresponding fastqc combo exists, include its output in the MultiQC report. + FQ_COMBO="\${COMBO_VAL/rustqc/fastqc}" + FQ_DIR="\${BASE_QC_DIR}/\${FQ_COMBO}/\${CONDITION_VAL}" + if [[ -d "\$FQ_DIR" ]]; then + echo "\$FQ_DIR" >> "\$LIST" + fi + + sort -u "\$LIST" > "\$TMP_LIST" + export LC_ALL=en_US.utf8 + export LC_ALL=C.UTF-8 + multiqc -f --exclude picard --exclude gatk -k json -z -s -o "\$OUT" -l "\$TMP_LIST" + """ +} + +workflow MULTIQC{ + take: + otherqcs + + main: + + mqc(otherqcs.collect()) + + emit: + mqcres = mqc.out.mqc +} diff --git a/workflows/rustqc.nf b/workflows/rustqc.nf index 1505b0a3..7808bf23 100644 --- a/workflows/rustqc.nf +++ b/workflows/rustqc.nf @@ -1,5 +1,5 @@ -QCENV = get_always('QCENV') -QCBIN = get_always('QCBIN') +QCENV = get_always('POSTQCENV') +QCBIN = get_always('POSTQCBIN') QCPARAMS = get_always('rustqc_params_QC') ?: '' MAPANNO = get_always('MAPPINGANNO') diff --git a/workflows/rustqc.smk b/workflows/rustqc.smk index 67fa3e93..d80863e5 100644 --- a/workflows/rustqc.smk +++ b/workflows/rustqc.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'POSTQC') # Map MONSDA strandedness to RustQC strandedness def rustqc_stranded(stranded): From 4e2e2081c41a7a58a1744ca6cb59a9f521d85f48 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 20:15:47 +0200 Subject: [PATCH 18/39] merge QC into one stage --- MONSDA/Params.py | 12 +++++++++--- MONSDA/Workflows.py | 7 ++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/MONSDA/Params.py b/MONSDA/Params.py index e59c00ef..dfa66e28 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -1558,9 +1558,15 @@ def get_combo_name(combinations: list) -> mu.NestedDefaultDict: envs = list() works = list() for step in combi: - for work, env in step.items(): - envs.append(env) - works.append(work) + if isinstance(step, list): + for substep in step: + for work, env in substep.items(): + envs.append(env) + works.append(work) + else: + for work, env in step.items(): + envs.append(env) + works.append(work) combname[condition]["envs"].append(str.join("-", envs)) combname[condition]["works"].append(str.join("-", works)) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 4eae67c3..6e3f2938 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -192,7 +192,12 @@ def get_combo(wfs, config, conditions): + " with Tool: " + str(tools) ) - ret.append(tools) + # Group all QC tools into a single combo position so pre-QC + # (fastqc) and post-QC (rustqc) both appear in one combo name. + if subwork == "QC" and len(tools) > 1: + ret.append([tools]) # one option = the entire list of QC tools + else: + ret.append(tools) log.debug(f"{logid} Itertools {ret}") combos[condition] = itertools.product(*ret) From 3e8d4ee98d978a16270de8f8105043500a096ba1 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 20:25:21 +0200 Subject: [PATCH 19/39] nf merge qc fix --- MONSDA/Workflows.py | 95 +++++++++++++++++++++++++++++++++++++-------- workflows/rustqc.nf | 2 +- 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 6e3f2938..650d612e 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -2958,7 +2958,7 @@ def nf_make_sub( if toolenv == "rustqc": # RustQC is post-alignment QC only if "MAPPING" in works: - flowlist.append("QC_MAPPING") + flowlist.append("RUSTQC_MAPPING") else: log.warning( logid @@ -3103,7 +3103,14 @@ def nf_make_sub( if "QC" in works: flowlist.append("MULTIQC") - _mqc_nf = "multiqc_rustqc.nf" if "QC_MAPPING" in flowlist and "QC_RAW" not in flowlist else "multiqc.nf" + _mqc_nf = ( + "multiqc_rustqc.nf" + if ( + ("QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist) + and "QC_RAW" not in flowlist + ) + else "multiqc.nf" + ) nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): @@ -3124,6 +3131,14 @@ def nf_make_sub( # workflow merger log.debug("FLOWLIST: " + str(flowlist)) + map_qc_chan = "QC_MAPPING.out.qc" + if "RUSTQC_MAPPING" in flowlist: + map_qc_chan = ( + "QC_MAPPING.out.qc.concat(RUSTQC_MAPPING.out.qc)" + if "QC_MAPPING" in flowlist + else "RUSTQC_MAPPING.out.qc" + ) + subjobs.append("\n\n" + "workflow {\n") for w in [ "QC_RAW", @@ -3134,6 +3149,7 @@ def nf_make_sub( "MAPPING", "DEDUPBAM", "QC_MAPPING", + "RUSTQC_MAPPING", "MULTIQC", ]: if w in flowlist: @@ -3185,17 +3201,33 @@ def nf_make_sub( + w + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" ) + elif w == "RUSTQC_MAPPING": + if "DEDUPBAM" in flowlist: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni.concat(DEDUPBAM.out.dedup)))\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" + ) elif w == "MULTIQC": - if "QC_RAW" not in flowlist and "QC_MAPPING" in flowlist: + if "QC_RAW" not in flowlist and ( + "QC_MAPPING" in flowlist + or "RUSTQC_MAPPING" in flowlist + ): # RustQC: only BAM-level QC, no FASTQ QC subjobs.append( - " " * 4 + w + "(QC_MAPPING.out.qc.collect())\n" + " " * 4 + w + f"({map_qc_chan}.collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs))).collect())\n" ) elif ( "DEDUPBAM" in flowlist and "QC_TRIMMING" not in flowlist @@ -3203,13 +3235,13 @@ def nf_make_sub( subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs)).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni))).collect())\n" ) elif ( "MAPPING" in flowlist and "QC_TRIMMING" not in flowlist @@ -3217,7 +3249,7 @@ def nf_make_sub( subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni)).collect())\n" ) elif "TRIMMING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( @@ -3328,7 +3360,7 @@ def nf_make_sub( if toolenv == "rustqc": # RustQC is post-alignment QC only if "MAPPING" in subworkflows: - flowlist.append("QC_MAPPING") + flowlist.append("RUSTQC_MAPPING") else: log.warning( logid @@ -3463,7 +3495,14 @@ def nf_make_sub( if "QC" in subworkflows: flowlist.append("MULTIQC") - _mqc_nf = "multiqc_rustqc.nf" if "QC_MAPPING" in flowlist and "QC_RAW" not in flowlist else "multiqc.nf" + _mqc_nf = ( + "multiqc_rustqc.nf" + if ( + ("QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist) + and "QC_RAW" not in flowlist + ) + else "multiqc.nf" + ) nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): @@ -3484,6 +3523,14 @@ def nf_make_sub( # workflow merger log.debug("FLOWLIST: " + str(flowlist)) + map_qc_chan = "QC_MAPPING.out.qc" + if "RUSTQC_MAPPING" in flowlist: + map_qc_chan = ( + "QC_MAPPING.out.qc.concat(RUSTQC_MAPPING.out.qc)" + if "QC_MAPPING" in flowlist + else "RUSTQC_MAPPING.out.qc" + ) + subjobs.append("\n\n" + "workflow {\n") for w in [ "QC_RAW", @@ -3494,6 +3541,7 @@ def nf_make_sub( "MAPPING", "DEDUPBAM", "QC_MAPPING", + "RUSTQC_MAPPING", "MULTIQC", ]: if w in flowlist: @@ -3541,35 +3589,50 @@ def nf_make_sub( + w + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" ) + elif w == "RUSTQC_MAPPING": + if "DEDUPBAM" in flowlist: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni.concat(DEDUPBAM.out.dedup)))\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" + ) elif w == "MULTIQC": - if "QC_RAW" not in flowlist and "QC_MAPPING" in flowlist: + if "QC_RAW" not in flowlist and ( + "QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist + ): # RustQC: only BAM-level QC, no FASTQ QC subjobs.append( - " " * 4 + w + "(QC_MAPPING.out.qc.collect())\n" + " " * 4 + w + f"({map_qc_chan}.collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs))).collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" not in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs)).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni))).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" not in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni)).collect())\n" ) elif "TRIMMING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( diff --git a/workflows/rustqc.nf b/workflows/rustqc.nf index 7808bf23..914cd8c4 100644 --- a/workflows/rustqc.nf +++ b/workflows/rustqc.nf @@ -44,7 +44,7 @@ process rustqc_mapped{ """ } -workflow QC_MAPPING{ +workflow RUSTQC_MAPPING{ take: collection main: From 411fcc9a216252c19067af2289d1f1abd7de0cd6 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 20:29:15 +0200 Subject: [PATCH 20/39] deterministic qc order --- MONSDA/Workflows.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 650d612e..e07c630c 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -195,6 +195,14 @@ def get_combo(wfs, config, conditions): # Group all QC tools into a single combo position so pre-QC # (fastqc) and post-QC (rustqc) both appear in one combo name. if subwork == "QC" and len(tools) > 1: + qc_prio = {"fastqc": 0, "rustqc": 1} + tools = sorted( + tools, + key=lambda item: ( + qc_prio.get(list(item.values())[0], 99), + list(item.values())[0], + ), + ) ret.append([tools]) # one option = the entire list of QC tools else: ret.append(tools) From 490b27b6f08c8f7c9246742fbfa4fce71dbd9b95 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 20:36:59 +0200 Subject: [PATCH 21/39] nf key fix --- MONSDA/Workflows.py | 66 ++++++++++++++++++++++++++++++++++----------- workflows/rustqc.nf | 12 ++++----- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index e07c630c..af51f7e3 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -2291,13 +2291,35 @@ def nf_tool_params( if toolenv else mu.sub_dict(config[subwork], x)["OPTIONS"] ) - tp.append( - "--" + subwork + "ENV " + toolenv + " --" + subwork + "BIN " + toolbin + " " - ) + if subwork == "QC": + qcprefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + tp.append( + "--" + + qcprefix + + "ENV " + + toolenv + + " --" + + qcprefix + + "BIN " + + toolbin + + " " + ) + else: + tp.append( + "--" + + subwork + + "ENV " + + toolenv + + " --" + + subwork + + "BIN " + + toolbin + + " " + ) toolpar = list() if "star" in [toolenv, toolbin]: - np['INDEX'] = mp.fixRunParameters(config, toolenv, sample, None, 'MAPPING', 'INDEX', "--sjdbGTFfile", "--sjdbGTFfile tmp_anno") + np['INDEX'] = mp.fixRunParameters(config, toolenv, sample, None, 'MAPPING', 'INDEX', "--sjdbGTFfile", "--sjdbGTFfile tmp_anno") for key, val in np.items(): pars = val if val and val != "" else None if pars: @@ -2310,17 +2332,31 @@ def nf_tool_params( toolenv, toolbin = map(str, [sd["ENV"], sd["BIN"]]) np = sd[toolenv.split("_")[0]]["OPTIONS"] if toolenv else sd["OPTIONS"] - tp.append( - "--" - + subwork - + "ENV " - + toolenv - + " --" - + subwork - + "BIN " - + toolbin - + " " - ) + if subwork == "QC": + qcprefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + tp.append( + "--" + + qcprefix + + "ENV " + + toolenv + + " --" + + qcprefix + + "BIN " + + toolbin + + " " + ) + else: + tp.append( + "--" + + subwork + + "ENV " + + toolenv + + " --" + + subwork + + "BIN " + + toolbin + + " " + ) toolpar = list() for key, val in np.items(): diff --git a/workflows/rustqc.nf b/workflows/rustqc.nf index 914cd8c4..9ce3a806 100644 --- a/workflows/rustqc.nf +++ b/workflows/rustqc.nf @@ -1,6 +1,6 @@ -QCENV = get_always('POSTQCENV') -QCBIN = get_always('POSTQCBIN') -QCPARAMS = get_always('rustqc_params_QC') ?: '' +RUSTQCENV = get_always('POSTQCENV') +RUSTQCBIN = get_always('POSTQCBIN') +RUSTQCPARAMS = get_always('rustqc_params_QC') ?: '' MAPANNO = get_always('MAPPINGANNO') @@ -17,8 +17,8 @@ RUSTQC_PAIRED = (PAIRED == 'paired') ? '-p' : '' //RUSTQC on mapped BAMs process rustqc_mapped{ - conda "$QCENV"+".yaml" - container "oras://jfallmann/monsda:"+"$QCENV" + conda "$RUSTQCENV"+".yaml" + container "oras://jfallmann/monsda:"+"$RUSTQCENV" cpus THREADS cache 'lenient' label 'big_mem' @@ -40,7 +40,7 @@ process rustqc_mapped{ fn = file(bam).getSimpleName() anno = file("${workflow.workDir}/../${MAPANNO}") """ - rustqc rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $QCPARAMS + $RUSTQCBIN rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $RUSTQCPARAMS """ } From 160b107971773303f42e2ccaad72b9692800ba36 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 27 Apr 2026 21:16:16 +0200 Subject: [PATCH 22/39] enable samplesheet injection into config json --- MONSDA/Params.py | 192 +++++++++++++++++++++++++++++++ MONSDA/RunMONSDA.py | 16 +++ configs/samplesheet_template.csv | 7 ++ 3 files changed, 215 insertions(+) create mode 100644 configs/samplesheet_template.csv diff --git a/MONSDA/Params.py b/MONSDA/Params.py index dfa66e28..c592146d 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -61,10 +61,12 @@ # # __file__ fails if someone does os.chdir() before. # # sys.argv[0] also fails, because it doesn't not always contain the path. +import csv import datetime import glob import inspect import itertools +import json import os import re import shutil @@ -73,6 +75,7 @@ from collections import OrderedDict, defaultdict from natsort import natsorted +from snakemake.common.configfile import load_configfile as _load_configfile import MONSDA.Utils as mu from MONSDA.Utils import check_run as check_run @@ -101,6 +104,195 @@ print("".join(tbe.format()), file=sys.stderr) +def samplesheet_to_settings(samplesheet_path: str) -> dict: + """Read a CSV/TSV samplesheet and return a SETTINGS dict compatible with MONSDA config. + + Expected columns (case-insensitive header row): + CONDITION - slash-separated condition path, e.g. ``Ecoli/WT`` or ``Ecoli/WT/dummylevel`` + SAMPLE - sample name / accession + GROUP - group label for differential analysis + SEQUENCING - e.g. ``paired`` or ``single`` + REFERENCE - path to genome FASTA (.fa.gz) + GTF - path to GTF annotation (.gtf.gz) [optional] + GFF - path to GFF annotation (.gff.gz) [optional] + INDEX - path to pre-built index [optional] + PREFIX - mapper index prefix [optional] + DECOY - path to decoy file [optional] + TYPE - sample type label [optional] + BATCH - batch label [optional] + IP - IP protocol info (for PEAKS) [optional] + + Per-condition metadata (SEQUENCING, REFERENCE, …) only needs to be present + on the first row for that condition; subsequent rows may leave those cells + empty (fill-down behaviour). + + Parameters + ---------- + samplesheet_path : str + Absolute or relative path to the samplesheet file. + + Returns + ------- + dict + Nested dict suitable for assigning to ``config["SETTINGS"]``. + """ + logid = scriptname + ".samplesheet_to_settings: " + + # --- detect delimiter --- + with open(samplesheet_path, newline="") as fh: + sample = fh.read(4096) + + delimiter = None + # 1. trust the file extension + ext = os.path.splitext(samplesheet_path)[1].lower() + if ext in (".tsv", ".txt"): + delimiter = "\t" + elif ext == ".csv": + delimiter = "," + else: + # 2. try the sniffer + try: + dialect = csv.Sniffer().sniff(sample, delimiters=",\t;") + delimiter = dialect.delimiter + except csv.Error: + pass + # 3. manual probe: whichever candidate appears more on the first line + if delimiter is None: + first_line = sample.splitlines()[0] if sample else "" + delimiter = "\t" if first_line.count("\t") >= first_line.count(",") else "," + + settings: dict = {} + + # per-condition accumulator for fill-down metadata + cond_meta: dict = {} + + with open(samplesheet_path, newline="") as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + # normalise header keys to upper-case + if reader.fieldnames is None: + raise ValueError( + "Samplesheet appears to be empty or has no header row: " + + samplesheet_path + ) + reader.fieldnames = [f.strip().upper() for f in reader.fieldnames] + + for row in reader: + row = { + k.strip().upper(): (v.strip() if v is not None else "") + for k, v in row.items() + if k + } + + condition_str = row.get("CONDITION", "").strip() + sample_name = row.get("SAMPLE", "").strip() + if not condition_str or not sample_name: + log.warning( + logid + "Skipping row with missing CONDITION or SAMPLE: " + str(row) + ) + continue + + # fill-down: carry over non-empty metadata from previous rows of same condition + if condition_str not in cond_meta: + cond_meta[condition_str] = {} + for key in ( + "SEQUENCING", + "REFERENCE", + "GTF", + "GFF", + "INDEX", + "PREFIX", + "DECOY", + "IP", + ): + val = row.get(key, "") + if val: + cond_meta[condition_str][key] = val + + meta = cond_meta[condition_str] + + # --- navigate / create nested dict path --- + path_parts = [p.strip() for p in condition_str.split("/") if p.strip()] + node = settings + for part in path_parts: + node = node.setdefault(part, {}) + + # --- initialise leaf node on first encounter --- + if "SAMPLES" not in node: + node["SAMPLES"] = [] + node["GROUPS"] = [] + node["TYPES"] = [] + node["BATCHES"] = [] + node["SEQUENCING"] = meta.get("SEQUENCING", "") + node["REFERENCE"] = meta.get("REFERENCE", "") + node["INDEX"] = meta.get("INDEX", "") + node["PREFIX"] = meta.get("PREFIX", "") + node["IP"] = meta.get("IP", "") + gtf = meta.get("GTF", "") + gff = meta.get("GFF", "") + node["ANNOTATION"] = {"GTF": gtf, "GFF": gff} + decoy = meta.get("DECOY", "") + node["DECOY"] = {decoy: ""} if decoy else {} + + node["SAMPLES"].append(sample_name) + node["GROUPS"].append(row.get("GROUP", "")) + node["TYPES"].append(row.get("TYPE", "")) + node["BATCHES"].append(row.get("BATCH", "")) + + log.info(logid + "Built SETTINGS from samplesheet: " + str(list(settings.keys()))) + return settings + + +def inject_samplesheet_settings(configfile: str, samplesheet_path: str) -> str: + """Load *configfile*, populate ``SETTINGS`` from *samplesheet_path* if absent, + write the augmented config to ``_with_settings.json`` and return that path. + + Parameters + ---------- + configfile : str + Path to the original MONSDA JSON config. + samplesheet_path : str + Path to the CSV/TSV samplesheet. + + Returns + ------- + str + Path to the written (augmented) config file. + """ + logid = scriptname + ".inject_samplesheet_settings: " + + config = _load_configfile(configfile) + + existing = config.get("SETTINGS", {}) + # strip comment-only SETTINGS that have no SAMPLES anywhere + has_samples = ( + any( + isinstance(v, dict) and "SAMPLES" in v + for cond in existing.values() + if isinstance(cond, dict) + for v in cond.values() + ) + if existing + else False + ) + + if has_samples: + log.info( + logid + + "Config already contains SETTINGS with sample data; samplesheet will be ignored." + ) + return configfile + + log.info(logid + "Populating SETTINGS from samplesheet: " + samplesheet_path) + config["SETTINGS"] = samplesheet_to_settings(samplesheet_path) + + base, ext = os.path.splitext(configfile) + out_path = base + "_with_settings" + (ext if ext else ".json") + with open(out_path, "w") as fh: + json.dump(config, fh, indent=4) + log.info(logid + "Augmented config written to: " + out_path) + return out_path + + @check_run def get_samples(config: dict) -> list(): """Check and return samples according to sample list on config.json diff --git a/MONSDA/RunMONSDA.py b/MONSDA/RunMONSDA.py index 571fde34..9635a1f7 100755 --- a/MONSDA/RunMONSDA.py +++ b/MONSDA/RunMONSDA.py @@ -129,6 +129,15 @@ def parseargs(): action="store_true", help="Print version and exit", ) + parser.add_argument( + "--samplesheet", + type=str, + default=None, + metavar="FILE", + help="CSV or TSV samplesheet to populate the SETTINGS section. " + "Used when the config JSON lacks a SETTINGS block. " + "On the first run an augmented config (_with_settings.json) is written for reuse.", + ) if len(sys.argv) == 1: parser.print_help(sys.stderr) @@ -923,6 +932,13 @@ def main(): for i in range(1, len(knownargs.config)): optionalargs[0].extend(list(["-c", str(knownargs.config[i].pop())])) + # --- samplesheet injection (before any other config use) --- + if knownargs.samplesheet: + knownargs.configfile = mp.inject_samplesheet_settings( + knownargs.configfile, + os.path.abspath(knownargs.samplesheet), + ) + log.debug( f"{logid} ARGS: {args} {type(args)} KNOWNARGS: {knownargs} {type(knownargs)} OPTIONALARGS: {optionalargs} {type(optionalargs)}" ) diff --git a/configs/samplesheet_template.csv b/configs/samplesheet_template.csv new file mode 100644 index 00000000..2e95bea4 --- /dev/null +++ b/configs/samplesheet_template.csv @@ -0,0 +1,7 @@ +CONDITION,SAMPLE,GROUP,SEQUENCING,REFERENCE,GTF,GFF,INDEX,PREFIX,DECOY,TYPE,BATCH,IP +Ecoli/WT,SRR16324019,ctrl,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +Ecoli/WT,SRR16324018,ctrl,,,,,,,,, +Ecoli/WT,SRR16324017,ctrl,,,,,,,,, +Ecoli/KO,SRR16324016,ko,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +Ecoli/KO,SRR16324015,ko,,,,,,,,, +Ecoli/KO,SRR16324014,ko,,,,,,,,, From d7fc1041b27b0d708b2ebb7365675db4d89b02d5 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 4 May 2026 14:41:51 +0200 Subject: [PATCH 23/39] samplesheet update --- configs/samplesheet_template.csv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/samplesheet_template.csv b/configs/samplesheet_template.csv index 2e95bea4..0d553361 100644 --- a/configs/samplesheet_template.csv +++ b/configs/samplesheet_template.csv @@ -1,7 +1,7 @@ CONDITION,SAMPLE,GROUP,SEQUENCING,REFERENCE,GTF,GFF,INDEX,PREFIX,DECOY,TYPE,BATCH,IP -Ecoli/WT,SRR16324019,ctrl,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, -Ecoli/WT,SRR16324018,ctrl,,,,,,,,, -Ecoli/WT,SRR16324017,ctrl,,,,,,,,, -Ecoli/KO,SRR16324016,ko,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, -Ecoli/KO,SRR16324015,ko,,,,,,,,, -Ecoli/KO,SRR16324014,ko,,,,,,,,, +FGUMI/WT/dummylevel,Sample1,ctrl,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +FGUMI/WT/dummylevel,Sample2,ctrl,,,,,,,,, +FGUMI/WT/dummylevel,Sample3,ctrl,,,,,,,,, +FGUMI/KO,Sample4,ko,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +FGUMI/KO,Sample5,ko,,,,,,,,, +FGUMI/KO,Sample6,ko,,,,,,,,, From 9499a4393f685a0bba4b3a44aebf2815372a76c0 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 4 May 2026 14:57:54 +0200 Subject: [PATCH 24/39] config updates --- configs/template_base_commented.json | 44 ++++++++++++++++++++++++---- configs/tutorial_exhaustive.json | 15 +++++++++- configs/tutorial_postprocess.json | 15 +++++++++- configs/tutorial_toolmix.json | 15 +++++++++- 4 files changed, 81 insertions(+), 8 deletions(-) diff --git a/configs/template_base_commented.json b/configs/template_base_commented.json index 7a47a674..0c526f7e 100644 --- a/configs/template_base_commented.json +++ b/configs/template_base_commented.json @@ -58,22 +58,36 @@ }, "BASECALL": { "TOOLS": { - "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller" + "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller", + "dorado": "dorado" }, "ENV" : "", "BIN" : "", "guppy": { "comment": { - "BASECALL": "Guppy options here if any, paired is not required, will be resolved by rules" + "BASECALL": "Guppy caller options here if any", + "MODEL": "Guppy model options here if any" }, "OPTIONS": { - "BASECALL": "" + "BASECALL": "", + "MODEL": "" + } + }, + "dorado": { + "comment": { + "CALLER": "Dorado caller options here if any", + "MODEL": "Dorado model options here if any" + }, + "OPTIONS": { + "CALLER": "", + "MODEL": "" } } }, "QC": { "TOOLS": { - "fastqc": "fastqc" + "fastqc": "fastqc", + "rustqc": "rustqc" }, "ENV" : "", "BIN" : "", @@ -86,13 +100,24 @@ "QC": "", "MULTI": "" } + }, + "rustqc": { + "comment": { + "QC": "RustQC options here if any, post-alignment QC", + "MULTI": "MultiQC options for rustqc if any" + }, + "OPTIONS": { + "QC": "", + "MULTI": "" + } } }, "TRIMMING": { "TOOLS": { "trimgalore": "trim_galore", "cutadapt": "cutadapt", - "bbduk": "bbmap" + "bbduk": "bbmap", + "fastp": "fastp" }, "ENV" : "", "BIN" : "", @@ -119,11 +144,20 @@ "OPTIONS": { "TRIM": "-q 15 --length 8 -e 0.15" } + }, + "fastp": { + "comment": { + "TRIM": "Trimming options here, --paired is not required, will be resolved by rules" + }, + "OPTIONS": { + "TRIM": "-q 15 -l 8" + } } }, "DEDUP": { "TOOLS": { "umitools": "umi_tools", + "picard": "picard", "fgumi": "fgumi" }, "ENV" : "", diff --git a/configs/tutorial_exhaustive.json b/configs/tutorial_exhaustive.json index e811dec3..da2cecac 100644 --- a/configs/tutorial_exhaustive.json +++ b/configs/tutorial_exhaustive.json @@ -128,7 +128,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -143,6 +144,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -158,6 +165,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } diff --git a/configs/tutorial_postprocess.json b/configs/tutorial_postprocess.json index aa5ba370..79ca138d 100644 --- a/configs/tutorial_postprocess.json +++ b/configs/tutorial_postprocess.json @@ -122,7 +122,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -137,6 +138,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -152,6 +159,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } diff --git a/configs/tutorial_toolmix.json b/configs/tutorial_toolmix.json index e3b080fa..2f91f471 100644 --- a/configs/tutorial_toolmix.json +++ b/configs/tutorial_toolmix.json @@ -120,7 +120,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -135,6 +136,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -150,6 +157,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } From f272103c1ea4621bec5e6f1ea6280ae24e312795 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Mon, 4 May 2026 15:01:35 +0200 Subject: [PATCH 25/39] configurator update wip --- MONSDA/Configurator.py | 88 ++++- MONSDA/Utils.py | 1 + MONSDA/web_configurator.py | 778 ++++++++++++++++++++++++++++--------- 3 files changed, 688 insertions(+), 179 deletions(-) diff --git a/MONSDA/Configurator.py b/MONSDA/Configurator.py index 847d9f65..97bd7be8 100755 --- a/MONSDA/Configurator.py +++ b/MONSDA/Configurator.py @@ -28,12 +28,10 @@ os.sep.join(["lib", pythonversion, "site-packages", "MONSDA"]), "share" ) except: - installpath = os.path.cwd() + installpath = os.getcwd() configpath = os.path.join(installpath, "MONSDA", "configs") current_path = os.getcwd() -dir_path = os.path.dirname(os.path.realpath(__file__)) -os.chdir(dir_path) template = load_configfile(os.sep.join([configpath, "template_base_commented.json"])) none_workflow_keys = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] @@ -70,6 +68,13 @@ help="takes configuration file to modify", ) +parser.add_argument( + "--samplesheet", + type=str, + default=False, + help="CSV or TSV samplesheet to populate SETTINGS and condition tree for new configs/projects", +) + args = parser.parse_args() @@ -315,6 +320,21 @@ def get_conditions_from_dict(root, keylist=[]): keylist.pop() +def get_conditions_from_settings(root, keylist=[]): + """Yield condition paths from SETTINGS leaves that contain a SAMPLES key.""" + if not isinstance(root, dict): + return + if "SAMPLES" in root and isinstance(root.get("SAMPLES"), list): + yield ":".join(keylist) + return + for k, v in root.items(): + if not isinstance(v, dict): + continue + keylist.append(k) + yield from get_conditions_from_settings(v, keylist) + keylist.pop() + + def getPathesFromDict(d, value=None): def yield_func(d): q = [(d, [])] @@ -790,6 +810,8 @@ def create_condition_tree(): def add_sample_dirs(only_conditions=None): pickle_unfinished("add_sample_dirs") # project.current_func_arg = only_conditions + if args.samplesheet and only_conditions is None and project.mode in ["project", "config"]: + return assign_samplesheet() if "FETCH" in project.workflowsDict.keys(): return assign_SRA(only_conditions) print("\n FASTQ files:") @@ -876,6 +898,58 @@ def add_sample_dirs(only_conditions=None): return assign_samples(only_conditions) +def assign_samplesheet(): + pickle_unfinished("assign_samplesheet") + prCyan("\n Sample Assignment: samplesheet\n") + + samplesheet = str(args.samplesheet) + if not os.path.isfile(samplesheet): + prRed(f"Could not find samplesheet file: {samplesheet}") + exit(1) + + cwd = os.getcwd() + try: + # Params initializes logging at import time via Utils.setup_logger and expects + # a writable LOGS/ directory in the current working directory. + os.makedirs(os.path.join(current_path, "LOGS"), exist_ok=True) + os.chdir(current_path) + from .Params import samplesheet_to_settings + + sheet_settings = samplesheet_to_settings(samplesheet) + except Exception as e: + prRed(f"Failed to parse samplesheet '{samplesheet}': {e}") + exit(1) + finally: + os.chdir(cwd) + + if not sheet_settings: + prRed(f"Samplesheet '{samplesheet}' did not produce SETTINGS entries") + exit(1) + + project.settingsDict = decouple(sheet_settings) + project.conditionsDict = NestedDefaultDict() + project.samplesDict = NestedDefaultDict() + + condition_paths = [ + x.split(":") for x in get_conditions_from_settings(project.settingsDict) + ] + if not condition_paths: + prRed( + "No condition leaves with SAMPLES found in parsed samplesheet SETTINGS" + ) + exit(1) + + for path in condition_paths: + setInDict(project.conditionsDict, path, {}) + + prGreen("Loaded condition tree and SETTINGS from samplesheet:") + print_dict(project.conditionsDict, gap=" ") + print("") + print_dict(project.settingsDict, gap=" ") + show_settings() + return select_conditioning() + + def assign_SRA(only_conditions=None): pickle_unfinished("assign_SRA") prCyan("\n Sample Assignment: SRA Accession Numbers\n") @@ -1882,6 +1956,14 @@ def main(): global guide project = PROJECT() guide = GUIDE() + if args.samplesheet: + if not str(args.samplesheet).lower().endswith((".csv", ".tsv", ".txt")): + print("Samplesheet flag requires a .csv/.tsv/.txt file") + exit() + args.samplesheet = os.path.abspath(args.samplesheet) + if not os.path.isfile(args.samplesheet): + print(f"Samplesheet file not found: {args.samplesheet}") + exit() if args.test: guide.testing = True if args.config: diff --git a/MONSDA/Utils.py b/MONSDA/Utils.py index 3a8bae03..fd2f4d4b 100644 --- a/MONSDA/Utils.py +++ b/MONSDA/Utils.py @@ -90,6 +90,7 @@ def setup_logger(scriptname): for handler in log.handlers[:]: handler.close() log.removeHandler(handler) + os.makedirs("LOGS", exist_ok=True) handler = logging.FileHandler("LOGS/MONSDA.log", mode="a") handler.setFormatter( logging.Formatter( diff --git a/MONSDA/web_configurator.py b/MONSDA/web_configurator.py index 7f8425a2..474f1bca 100644 --- a/MONSDA/web_configurator.py +++ b/MONSDA/web_configurator.py @@ -1,19 +1,23 @@ +import copy import json import os -from typing import Any, Dict +from typing import Any, Dict, List, Optional from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import FileResponse -from pydantic import BaseModel +from fastapi.responses import HTMLResponse +from pydantic import BaseModel, Field +from snakemake.common.configfile import load_configfile + +from .Params import samplesheet_to_settings TEMPLATE_PATH = os.path.join( os.path.dirname(__file__), "../configs/template_base_commented.json" ) +NONE_WORKFLOW_KEYS = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] -app = FastAPI(title="MONSDA Configurator Web Service") +app = FastAPI(title="MONSDA Configurator Web") -# Allow CORS for local development app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -23,224 +27,646 @@ ) -class ConfigRequest(BaseModel): +class SamplesheetRequest(BaseModel): + samplesheet_path: str + + +class BuildConfigRequest(BaseModel): + config_name: str + output_dir: str + workflows: List[str] = Field(default_factory=list) + tools: Dict[str, List[str]] = Field(default_factory=dict) + maxthreads: str = "16" + settings: Optional[Dict[str, Any]] = None + samplesheet_path: Optional[str] = None + + +class ProjectRequest(BaseModel): + project_dir: str + workflows: List[str] = Field(default_factory=list) + + +class SaveConfigRequest(BaseModel): config_name: str - config: Dict[str, Any] output_dir: str + config: Dict[str, Any] def load_template() -> Dict[str, Any]: - with open(TEMPLATE_PATH, "r") as f: - return json.load(f) + return load_configfile(TEMPLATE_PATH) -def strip_comments(d): +def strip_comments(d: Any) -> Any: if isinstance(d, dict): return {k: strip_comments(v) for k, v in d.items() if k != "comment"} - elif isinstance(d, list): + if isinstance(d, list): return [strip_comments(x) for x in d] + return d + + +def set_in_path(root: Dict[str, Any], path: List[str], value: Any) -> None: + node = root + for key in path[:-1]: + if key not in node or not isinstance(node[key], dict): + node[key] = {} + node = node[key] + node[path[-1]] = value + + +def get_condition_paths_from_settings(settings: Dict[str, Any]) -> List[List[str]]: + out: List[List[str]] = [] + + def _walk(node: Any, path: List[str]) -> None: + if not isinstance(node, dict): + return + if "SAMPLES" in node and isinstance(node.get("SAMPLES"), list): + out.append(path.copy()) + return + for k, v in node.items(): + if isinstance(v, dict): + _walk(v, path + [k]) + + _walk(settings, []) + return out + + +def build_workflow_block( + workflow_name: str, + workflow_template: Dict[str, Any], + condition_paths: List[List[str]], + selected_tools: List[str], +) -> Dict[str, Any]: + block: Dict[str, Any] = {} + + all_tools = workflow_template.get("TOOLS", {}) + if not selected_tools: + selected_tools = list(all_tools.keys()) + + if all_tools: + block["TOOLS"] = {k: all_tools[k] for k in selected_tools if k in all_tools} + + # carry workflow-level defaults if present + for passthrough in ["FEATURES", "CUTOFFS", "COMPARABLE", "EXCLUDE"]: + if passthrough in workflow_template: + block[passthrough] = copy.deepcopy(workflow_template[passthrough]) + + # populate per-condition tool settings like CLI configurator does + for cond_path in condition_paths: + for tool in selected_tools: + tool_def = workflow_template.get(tool) + if not isinstance(tool_def, dict): + continue + tool_def_no_comment = strip_comments(tool_def) + if not tool_def_no_comment: + continue + set_in_path(block, cond_path + [tool], copy.deepcopy(tool_def_no_comment)) + + return block + + +def build_config(req: BuildConfigRequest) -> Dict[str, Any]: + template = strip_comments(load_template()) + + if req.settings is not None: + settings = req.settings + elif req.samplesheet_path: + samplesheet_path = os.path.abspath(req.samplesheet_path) + if not os.path.isfile(samplesheet_path): + raise HTTPException( + status_code=400, + detail=f"Samplesheet does not exist: {samplesheet_path}", + ) + settings = samplesheet_to_settings(samplesheet_path) else: - return d + raise HTTPException( + status_code=400, + detail="Provide either settings JSON or samplesheet_path.", + ) - config_name: str - config: Dict[str, Any] - output_dir: str + condition_paths = get_condition_paths_from_settings(settings) + if not condition_paths: + raise HTTPException( + status_code=400, + detail="No condition leaves with SAMPLES found in SETTINGS.", + ) + + workflows = req.workflows or [] + if not workflows: + raise HTTPException( + status_code=400, + detail="At least one workflow must be selected.", + ) + + invalid = [w for w in workflows if w not in template or w in NONE_WORKFLOW_KEYS] + if invalid: + raise HTTPException( + status_code=400, + detail=f"Unknown workflow(s): {', '.join(invalid)}", + ) + + final_config: Dict[str, Any] = { + "WORKFLOWS": ", ".join(workflows), + "BINS": template.get("BINS", ""), + "MAXTHREADS": str(req.maxthreads), + "VERSION": template.get("VERSION", ""), + "SETTINGS": settings, + } + + for wf in workflows: + wf_template = template.get(wf, {}) + final_config[wf] = build_workflow_block( + wf, + wf_template, + condition_paths, + req.tools.get(wf, []), + ) + + return final_config @app.get("/template", response_model=Dict[str, Any]) -def get_template(): - """Get the config template (with comments).""" - return load_template() +def get_template() -> Dict[str, Any]: + try: + return load_template() + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to load template: {e}") @app.get("/template/fields", response_model=Dict[str, Any]) -def get_template_fields(): - """Get the config template (without comments).""" - return strip_comments(load_template()) +def get_template_fields() -> Dict[str, Any]: + try: + return strip_comments(load_template()) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to load template fields: {e}" + ) + + +def _normalize_path(path: str) -> str: + if path: + return os.path.abspath(os.path.expanduser(path.strip())) + return os.getcwd() + + +@app.get("/fs/roots", response_model=Dict[str, Any]) +def fs_roots() -> Dict[str, Any]: + cwd = os.getcwd() + home = os.path.expanduser("~") + roots = [] + for p in [cwd, home, os.path.sep]: + if p and p not in roots and os.path.isdir(p): + roots.append(p) + return {"roots": roots} + + +@app.get("/fs/list", response_model=Dict[str, Any]) +def fs_list(path: str = "", mode: str = "dirs") -> Dict[str, Any]: + mode = (mode or "dirs").lower() + if mode not in {"dirs", "all", "samplesheets"}: + raise HTTPException( + status_code=400, detail="mode must be dirs, all, or samplesheets" + ) + + current = _normalize_path(path) + if not os.path.isdir(current): + raise HTTPException(status_code=400, detail=f"Not a directory: {current}") + + try: + entries = list(os.scandir(current)) + except PermissionError: + raise HTTPException(status_code=403, detail=f"Permission denied: {current}") + + dirs = [] + files = [] + for e in sorted(entries, key=lambda x: x.name.lower()): + if e.name in {".", ".."}: + continue + if e.is_dir(follow_symlinks=False): + dirs.append({"name": e.name, "path": e.path}) + continue + if not e.is_file(follow_symlinks=False): + continue + if mode in {"all", "samplesheets"}: + if mode == "samplesheets": + lower = e.name.lower() + if not lower.endswith((".csv", ".tsv", ".txt")): + continue + files.append({"name": e.name, "path": e.path}) + + parent = ( + os.path.dirname(current.rstrip(os.sep)) + if current != os.path.sep + else os.path.sep + ) + return { + "current": current, + "parent": parent, + "dirs": dirs, + "files": files, + "mode": mode, + } + + +@app.post("/samplesheet/parse", response_model=Dict[str, Any]) +def parse_samplesheet(req: SamplesheetRequest) -> Dict[str, Any]: + path = os.path.abspath(req.samplesheet_path.strip()) + if not os.path.isfile(path): + raise HTTPException(status_code=400, detail=f"Samplesheet not found: {path}") + settings = samplesheet_to_settings(path) + return { + "samplesheet": path, + "settings": settings, + "conditions": [ + "/".join(p) for p in get_condition_paths_from_settings(settings) + ], + } + + +@app.post("/config/preview", response_model=Dict[str, Any]) +def preview_config(req: BuildConfigRequest) -> Dict[str, Any]: + return {"config": build_config(req)} -@app.post("/generate_config") -def generate_config(req: ConfigRequest): - """Generate a config file from user input. User specifies output_dir.""" +@app.post("/config/save", response_model=Dict[str, Any]) +def save_config(req: SaveConfigRequest) -> Dict[str, Any]: config_name = req.config_name.strip() - output_dir = os.path.abspath(req.output_dir.strip()) if not config_name or any(c in config_name for c in "/\\"): raise HTTPException(status_code=400, detail="Invalid config name.") - if not output_dir or not os.path.isdir(output_dir): - raise HTTPException(status_code=400, detail="Invalid output directory.") - config_path = os.path.join(output_dir, f"config_{config_name}.json") - with open(config_path, "w") as f: - json.dump(req.config, f, indent=4) - return {"message": "Config generated.", "path": config_path} - - -# Download config by full path (for security, restrict to files under allowed parent dir) -@app.get("/download_config/") -def download_config(config_path: str): - config_path = os.path.abspath(config_path) - if not os.path.exists(config_path): - raise HTTPException(status_code=404, detail="Config not found.") - if not config_path.endswith(".json"): - raise HTTPException(status_code=400, detail="Only .json files allowed.") - return FileResponse(config_path, filename=os.path.basename(config_path)) - - -class DirRequest(BaseModel): - config: Dict[str, Any] - project_dir: str + output_dir = os.path.abspath(req.output_dir.strip()) + if not os.path.isdir(output_dir): + raise HTTPException(status_code=400, detail="Output directory does not exist.") -def safe_makedirs(path): - os.makedirs(path, exist_ok=True) + path = os.path.join(output_dir, f"config_{config_name}.json") + with open(path, "w") as fh: + json.dump(req.config, fh, indent=4) + return {"message": "Config written.", "path": path} -def create_project_structure(config: Dict[str, Any], project_dir: str): - # Example: create folders for workflows, logs, counts, etc. - safe_makedirs(project_dir) - for wf in config.get("WORKFLOWS", "").split(","): - wf = wf.strip() - if wf: - safe_makedirs(os.path.join(project_dir, wf)) - safe_makedirs(os.path.join(project_dir, "LOGS")) - safe_makedirs(os.path.join(project_dir, "COUNTS")) - # Add more as needed +@app.post("/project/create", response_model=Dict[str, Any]) +def create_project(req: ProjectRequest) -> Dict[str, Any]: + project_dir = os.path.abspath(req.project_dir.strip()) + os.makedirs(project_dir, exist_ok=True) -@app.post("/generate_project_dir") -def generate_project_dir(req: DirRequest): - project_dir = os.path.abspath(req.project_dir) - if not project_dir.startswith(os.getcwd()): - raise HTTPException( - status_code=400, detail="Project dir must be inside working directory." - ) - if os.path.exists(project_dir) and os.listdir(project_dir): - raise HTTPException( - status_code=400, detail="Project dir already exists and is not empty." - ) - create_project_structure(req.config, project_dir) - return {"message": "Project directory structure created.", "path": project_dir} + # Basic MONSDA project skeleton + os.makedirs(os.path.join(project_dir, "FASTQ"), exist_ok=True) + os.makedirs(os.path.join(project_dir, "GENOMES"), exist_ok=True) + os.makedirs(os.path.join(project_dir, "LOGS"), exist_ok=True) + for wf in req.workflows: + os.makedirs(os.path.join(project_dir, wf), exist_ok=True) -from fastapi.responses import HTMLResponse + return {"message": "Project directory prepared.", "path": project_dir} @app.get("/", response_class=HTMLResponse) -def root(): +def root() -> str: return """ - - MONSDA Configurator - + + MONSDA Web Configurator + -

MONSDA Configurator

-
-

Step 1: Edit Configuration

- - - +

MONSDA Web Configurator

+

Interactive builder for MONSDA configs with samplesheet support.

+ +
+

1) Samplesheet → SETTINGS

+ +
+ + +
+ +
+
+
+ +
+

2) Workflow + Tool selection

+ +
+
+
+ +
+

3) Build config

+
+
+ + +
+
+ +
+ + +
+
-
-

Step 2: Generate Config File

- - - - - - -
+ + + + + + + + +
+ + + +
+ +
+

4) Create project skeleton

+ +
+ +
-
-

Step 3: Create Project Directory

- - - - + +
+
+ +
+

Path browser

+
+
+
+ + + +
- + const r = await fetch('/config/save', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + config_name: document.getElementById('configName').value.trim(), + output_dir: document.getElementById('outputDir').value.trim(), + config: lastConfig + }) + }); + const data = await r.json(); + if (r.ok) { + setStatus('buildStatus', `Saved: ${data.path}`); + } else { + setStatus('buildStatus', data.detail || 'Failed to save config.', false); + } + } catch (e) { + setStatus('buildStatus', 'Failed to save config.', false); + } +} + +async function createProject() { + try { + const workflows = (lastConfig && lastConfig.WORKFLOWS) ? lastConfig.WORKFLOWS.split(',').map(x => x.trim()).filter(Boolean) : []; + const r = await fetch('/project/create', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + project_dir: document.getElementById('projectDir').value.trim(), + workflows: workflows + }) + }); + const data = await r.json(); + if (r.ok) { + setStatus('projectStatus', `Created: ${data.path}`); + } else { + setStatus('projectStatus', data.detail || 'Failed to create project.', false); + } + } catch (e) { + setStatus('projectStatus', 'Failed to create project.', false); + } +} + +window.onload = loadTemplate; + """ From 81845384ef08c0690077c99f81bbaf5bb4681b61 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Tue, 5 May 2026 11:17:54 +0200 Subject: [PATCH 26/39] rustqc container recipe --- containers/apptainer/rustqc.def | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 containers/apptainer/rustqc.def diff --git a/containers/apptainer/rustqc.def b/containers/apptainer/rustqc.def new file mode 100644 index 00000000..2d9f57b8 --- /dev/null +++ b/containers/apptainer/rustqc.def @@ -0,0 +1,23 @@ +Bootstrap: docker +From: continuumio/miniconda3 + +%files + /home/fall/MONSDA/envs/rustqc.yaml /opt/envs/ + ${HOME}/MONSDA/scripts /opt/MONSDA/ + +%environment + +%post + ls -alrt /opt/envs + chmod -R +x /opt/envs/rustqc.yaml + + ENV_NAME=rustqc + echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT + echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT + + . /opt/conda/etc/profile.d/conda.sh + conda env create -f /opt/envs/rustqc.yaml -p /opt/conda/envs/$ENV_NAME + conda clean --all + +%runscript + exec "$@" From 7ddd4290f59c3d19877a0aa8806ca936e4662e0a Mon Sep 17 00:00:00 2001 From: jfallmann Date: Tue, 5 May 2026 11:28:06 +0200 Subject: [PATCH 27/39] web_config update --- MONSDA/web_configurator.py | 1028 +++++++++++++++++++++++++++++++----- 1 file changed, 910 insertions(+), 118 deletions(-) diff --git a/MONSDA/web_configurator.py b/MONSDA/web_configurator.py index 474f1bca..6b2fb5c8 100644 --- a/MONSDA/web_configurator.py +++ b/MONSDA/web_configurator.py @@ -1,6 +1,7 @@ import copy import json import os +import sys from typing import Any, Dict, List, Optional from fastapi import FastAPI, HTTPException @@ -11,9 +12,7 @@ from .Params import samplesheet_to_settings -TEMPLATE_PATH = os.path.join( - os.path.dirname(__file__), "../configs/template_base_commented.json" -) +TEMPLATE_FILE = "template_base_commented.json" NONE_WORKFLOW_KEYS = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] app = FastAPI(title="MONSDA Configurator Web") @@ -41,9 +40,25 @@ class BuildConfigRequest(BaseModel): samplesheet_path: Optional[str] = None +class ConditionFiles(BaseModel): + """Per-condition file selections from the wizard.""" + + condition: str # slash-separated condition path e.g. "Ecoli/WT" + fastq_dir: str = "" # directory containing FASTQ files for this condition + fastq_files: List[str] = Field(default_factory=list) # explicit file paths + sequencing: str = "paired" # paired | single + reference: str = "" # absolute path to genome FASTA + gtf: str = "" # absolute path to GTF annotation + gff: str = "" # absolute path to GFF annotation (optional) + decoy: str = "" # absolute path to decoy file (optional) + + class ProjectRequest(BaseModel): project_dir: str - workflows: List[str] = Field(default_factory=list) + project_name: str = "monsda" + condition_files: List[ConditionFiles] = Field(default_factory=list) + config: Optional[Dict[str, Any]] = None + settings: Optional[Dict[str, Any]] = None class SaveConfigRequest(BaseModel): @@ -51,9 +66,53 @@ class SaveConfigRequest(BaseModel): output_dir: str config: Dict[str, Any] +def _template_candidates() -> List[str]: + base_dir = os.path.abspath(os.path.dirname(__file__)) + pythonversion = f"python{sys.version_info.major}.{sys.version_info.minor}" + install_share = base_dir.replace( + os.sep.join(["lib", pythonversion, "site-packages", "MONSDA"]), "share" + ) + + candidates = [ + # explicit override if user/admin wants to force a template location + os.environ.get("MONSDA_TEMPLATE_PATH", ""), + # local checkout layout (repo root/configs) + os.path.abspath(os.path.join(base_dir, "..", "configs", TEMPLATE_FILE)), + # package-relative layout if configs were bundled next to package + os.path.abspath(os.path.join(base_dir, "configs", TEMPLATE_FILE)), + # Configurator.py install layout logic: /share/MONSDA/configs + os.path.abspath( + os.path.join(install_share, "MONSDA", "configs", TEMPLATE_FILE) + ), + # generic venv/conda prefix share path + os.path.abspath( + os.path.join(sys.prefix, "share", "MONSDA", "configs", TEMPLATE_FILE) + ), + # last-resort: run directory configs + os.path.abspath(os.path.join(os.getcwd(), "configs", TEMPLATE_FILE)), + ] + + # de-duplicate while preserving order and dropping empty values + deduped: List[str] = [] + for c in candidates: + c = (c or "").strip() + if c and c not in deduped: + deduped.append(c) + return deduped + + +def resolve_template_path() -> str: + for path in _template_candidates(): + if os.path.isfile(path): + return path + tried = "\n - ".join(_template_candidates()) + raise FileNotFoundError( + "Could not find template_base_commented.json. Tried:\n - " + tried + ) + def load_template() -> Dict[str, Any]: - return load_configfile(TEMPLATE_PATH) + return load_configfile(resolve_template_path()) def strip_comments(d: Any) -> Any: @@ -241,16 +300,12 @@ def fs_list(path: str = "", mode: str = "dirs") -> Dict[str, Any]: for e in sorted(entries, key=lambda x: x.name.lower()): if e.name in {".", ".."}: continue - if e.is_dir(follow_symlinks=False): + if e.is_dir(follow_symlinks=True): dirs.append({"name": e.name, "path": e.path}) continue - if not e.is_file(follow_symlinks=False): + if not e.is_file(follow_symlinks=True): continue if mode in {"all", "samplesheets"}: - if mode == "samplesheets": - lower = e.name.lower() - if not lower.endswith((".csv", ".tsv", ".txt")): - continue files.append({"name": e.name, "path": e.path}) parent = ( @@ -306,131 +361,543 @@ def save_config(req: SaveConfigRequest) -> Dict[str, Any]: @app.post("/project/create", response_model=Dict[str, Any]) def create_project(req: ProjectRequest) -> Dict[str, Any]: - project_dir = os.path.abspath(req.project_dir.strip()) + base_dir = os.path.abspath(req.project_dir.strip()) + project_name = req.project_name.strip() or "monsda" + + # Project name becomes a subdirectory under the chosen path + project_dir = os.path.join(base_dir, project_name) os.makedirs(project_dir, exist_ok=True) - # Basic MONSDA project skeleton - os.makedirs(os.path.join(project_dir, "FASTQ"), exist_ok=True) - os.makedirs(os.path.join(project_dir, "GENOMES"), exist_ok=True) - os.makedirs(os.path.join(project_dir, "LOGS"), exist_ok=True) + # FASTQ directly under project dir (no extra project_name level) + fastq_dir = os.path.join(project_dir, "FASTQ") + gen_dir = os.path.join(project_dir, "GENOMES") + os.makedirs(fastq_dir, exist_ok=True) + os.makedirs(gen_dir, exist_ok=True) + + settings = req.settings or (req.config.get("SETTINGS") if req.config else None) + linked_files: List[str] = [] + warnings: List[str] = [] + + # Build lookup: condition_path -> ConditionFiles + cond_lookup: Dict[str, ConditionFiles] = {} + for cf in req.condition_files: + cond_lookup[cf.condition] = cf + + if settings and isinstance(settings, dict): + condition_paths = get_condition_paths_from_settings(settings) + for cond_path in condition_paths: + cond_key = "/".join(cond_path) + cond_info = cond_lookup.get(cond_key) + + # Create condition subdirectories under FASTQ + cond_dir = os.path.join(fastq_dir, *cond_path) + os.makedirs(cond_dir, exist_ok=True) + + # Walk to the leaf node to find SAMPLES + node = settings + for key in cond_path: + node = node.get(key, {}) + + samples = node.get("SAMPLES", []) + + # Link FASTQ files: prefer explicit file list, fall back to directory+sample matching + if cond_info and cond_info.fastq_files: + # Explicit file selection mode - link exactly the files the user chose + for fpath in cond_info.fastq_files: + src_file = os.path.abspath(fpath) + if not os.path.isfile(src_file): + warnings.append(f"FASTQ file not found: {fpath}") + continue + dst = os.path.join(cond_dir, os.path.basename(src_file)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(src_file), dst) + linked_files.append(dst) + # Handle paired-end mate + if cond_info.sequencing == "paired": + basename = os.path.basename(src_file) + if "_R1" in basename: + mate_name = basename.replace("_R1", "_R2") + elif "_1." in basename: + mate_name = basename.replace("_1.", "_2.") + else: + mate_name = None + if mate_name: + mate_src = os.path.join( + os.path.dirname(src_file), mate_name + ) + mate_dst = os.path.join(cond_dir, mate_name) + if os.path.isfile(mate_src) and not os.path.exists( + mate_dst + ): + os.symlink(os.path.realpath(mate_src), mate_dst) + linked_files.append(mate_dst) + + elif cond_info and cond_info.fastq_dir and isinstance(samples, list): + # Directory mode - find files matching sample names from samplesheet + src_dir = os.path.abspath(cond_info.fastq_dir) + if os.path.isdir(src_dir): + for sample in samples: + # Find matching files (sample name may or may not have extension) + matched = _find_fastq_files(src_dir, sample) + if not matched: + warnings.append( + f"No FASTQ file found for sample '{sample}' in {src_dir}" + ) + continue + for src_file in matched: + dst = os.path.join(cond_dir, os.path.basename(src_file)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(src_file), dst) + linked_files.append(dst) + # Handle paired-end mate + if cond_info.sequencing == "paired": + basename = os.path.basename(src_file) + if "_R1" in basename: + mate_name = basename.replace("_R1", "_R2") + elif "_1." in basename: + mate_name = basename.replace("_1.", "_2.") + elif "_R2" in basename: + mate_name = basename.replace("_R2", "_R1") + elif "_2." in basename: + mate_name = basename.replace("_2.", "_1.") + else: + mate_name = None + if mate_name: + mate_src = os.path.join(src_dir, mate_name) + mate_dst = os.path.join(cond_dir, mate_name) + if os.path.isfile(mate_src) and not os.path.exists( + mate_dst + ): + os.symlink(os.path.realpath(mate_src), mate_dst) + linked_files.append(mate_dst) + else: + warnings.append( + f"FASTQ directory not found for condition '{cond_key}': {src_dir}" + ) + + # Link genome files: REFERENCE, DECOY, GTF, GFF into GENOMES/ + ref_path = (cond_info.reference if cond_info else "") or node.get( + "REFERENCE", "" + ) + if ref_path and os.path.isfile(ref_path): + dst = os.path.join(gen_dir, os.path.basename(ref_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(ref_path), dst) + linked_files.append(dst) + # Update settings path to relative + rel = os.path.relpath(dst, start=project_dir) + node["REFERENCE"] = rel + elif ref_path: + warnings.append(f"REFERENCE not found: {ref_path}") + + decoy_path = (cond_info.decoy if cond_info else "") or node.get("DECOY", "") + if decoy_path and os.path.isfile(decoy_path): + dst = os.path.join(gen_dir, os.path.basename(decoy_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(decoy_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + node["DECOY"] = rel + + gtf_path = (cond_info.gtf if cond_info else "") or "" + anno = node.get("ANNOTATION", {}) + if isinstance(anno, dict): + gtf_path = gtf_path or anno.get("GTF", "") + if gtf_path and os.path.isfile(gtf_path): + dst = os.path.join(gen_dir, os.path.basename(gtf_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(gtf_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + if isinstance(anno, dict): + anno["GTF"] = rel + node["ANNOTATION"] = anno + elif gtf_path: + warnings.append(f"GTF not found: {gtf_path}") + + gff_path = (cond_info.gff if cond_info else "") or "" + if isinstance(anno, dict): + gff_path = gff_path or anno.get("GFF", "") + if gff_path and os.path.isfile(gff_path): + dst = os.path.join(gen_dir, os.path.basename(gff_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(gff_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + if isinstance(anno, dict): + anno["GFF"] = rel + node["ANNOTATION"] = anno + elif gff_path: + warnings.append(f"GFF not found: {gff_path}") + + # Write config if provided (with updated relative paths in SETTINGS) + config_path = "" + if req.config: + if settings: + req.config["SETTINGS"] = settings + config_file = f"config_{project_name}.json" + config_path = os.path.join(project_dir, config_file) + with open(config_path, "w") as fh: + json.dump(req.config, fh, indent=4) + + return { + "message": "Project created.", + "path": project_dir, + "config_path": config_path, + "linked_files": len(linked_files), + "warnings": warnings, + } - for wf in req.workflows: - os.makedirs(os.path.join(project_dir, wf), exist_ok=True) - return {"message": "Project directory prepared.", "path": project_dir} +def _find_fastq_files(src_dir: str, sample_name: str) -> List[str]: + """Find FASTQ files in src_dir matching a sample name. + + Matches: exact filename, or sample_name*.fastq.gz / .fq.gz / .fastq / .fq + Only returns R1 (or unpaired) to avoid double-linking. + """ + import glob as _glob + + candidates: List[str] = [] + # Try exact match first + exact = os.path.join(src_dir, sample_name) + if os.path.isfile(exact): + return [exact] + + # Try common FASTQ extensions + for ext in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]: + path = os.path.join(src_dir, sample_name + ext) + if os.path.isfile(path): + candidates.append(path) + # Try with _R1 suffix + path_r1 = os.path.join(src_dir, sample_name + "_R1" + ext) + if os.path.isfile(path_r1): + candidates.append(path_r1) + # Try with _1 suffix + path_1 = os.path.join(src_dir, sample_name + "_1" + ext) + if os.path.isfile(path_1): + candidates.append(path_1) + + if candidates: + return candidates + + # Glob fallback: any file starting with sample_name + pattern = os.path.join(src_dir, sample_name + "*") + matches = _glob.glob(pattern) + # Filter to only fastq-like files and only R1/unpaired + for m in sorted(matches): + if os.path.isfile(m): + lower = m.lower() + if any(lower.endswith(e) for e in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]): + # Skip R2/_2 to avoid double-linking + base = os.path.basename(m) + if "_R2" in base or "_2." in base: + continue + candidates.append(m) + + return candidates @app.get("/", response_class=HTMLResponse) def root() -> str: return """ - + - + MONSDA Web Configurator + +

MONSDA Web Configurator

-

Interactive builder for MONSDA configs with samplesheet support.

+

Interactive builder for MONSDA pipeline configurations.

+ +
+
+

Quick Start

+ +
+
+ 1. Pick and parse a samplesheet.
+ 2. Select workflows & configure tools.
+ 3. Preview and save config.
+ 4. Create project skeleton with symlinked data. +
+
-
-

1) Samplesheet → SETTINGS

+
+
+

1. Samplesheet → Settings

+ +
+
+ Choose your CSV/TSV samplesheet, then click Parse Samplesheet. This fills the SETTINGS JSON automatically. +
- +
- +
-

2) Workflow + Tool selection

- +
+

2. Workflows & Tools

+ +
+
+ Select the workflows you need. Tools are only configured for workflows you select. Click Configure tools to customize per workflow. +
+
+ + +
-
+
+
-
-

3) Build config

-
+
+
+

3. Output & Finalize

+ +
+
+ Choose one mode:
+ Save config only — generates and saves the JSON config to a directory of your choice.
+ Create project — builds a full MONSDA project skeleton with FASTQ & GENOMES symlinks, plus the config. Select FASTQ files per condition either by directory (filtered by sample names) or by picking individual files. +
+ + +
+ + +
+ + +
- - + +
- -
- - -
+ +
- - - + + + +
+ +
+ + +
+
+ + +
+
- - -
+ + + +
+
- +
-
-

4) Create project skeleton

- -
- - + -
-

Path browser

-
-
-
- - - - +
+
+
+

Tool selection for active workflows

+ +
+
By default all tools are selected when you enable a workflow. Uncheck tools you do not want.
+
-
-
+ """ From 3b8ee8342260a65d9631ea3db8eab66d9553950d Mon Sep 17 00:00:00 2001 From: jfallmann Date: Tue, 5 May 2026 15:18:54 +0200 Subject: [PATCH 28/39] readme update --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b3dfedfc..9271ea91 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,12 @@ More information can be found in the official [documentation](https://monsda.rea ## How does it work -This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake``` and the ```config.json``` file. +This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake|Nextflow``` and the ```config.json``` file. The ```config.json``` holds all the information that is needed to run the jobs and will be parsed by ```MONSDA.py``` and split into sub-configs that can later be found in the directory ```SubSnakes``` or ```SubFlows``` respectively. To successfully run an analysis pipeline, a few steps have to be followed: * Directory structure: The structure for the directories is dictated by the condition-tree in the config file - * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake/Nextflow``` workflow files to run each subworkflow until all processing steps are done. + * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake|Nextflow``` workflow files to run each subworkflow until all processing steps are done. ## Run the pipeline From c040c49b8c4f1c94841a8ecf703ceece654f10b0 Mon Sep 17 00:00:00 2001 From: jfallmann Date: Tue, 5 May 2026 15:34:47 +0200 Subject: [PATCH 29/39] webconfig --- MONSDA/web_configurator.py | 120 +++++++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/MONSDA/web_configurator.py b/MONSDA/web_configurator.py index 6b2fb5c8..ce964980 100644 --- a/MONSDA/web_configurator.py +++ b/MONSDA/web_configurator.py @@ -10,8 +10,11 @@ from pydantic import BaseModel, Field from snakemake.common.configfile import load_configfile +from . import _version from .Params import samplesheet_to_settings +__version__ = _version.get_versions()["version"] + TEMPLATE_FILE = "template_base_commented.json" NONE_WORKFLOW_KEYS = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] @@ -227,7 +230,7 @@ def build_config(req: BuildConfigRequest) -> Dict[str, Any]: "WORKFLOWS": ", ".join(workflows), "BINS": template.get("BINS", ""), "MAXTHREADS": str(req.maxthreads), - "VERSION": template.get("VERSION", ""), + "VERSION": __version__, "SETTINGS": settings, } @@ -855,6 +858,7 @@ def root() -> str:
+
@@ -875,11 +879,13 @@ def root() -> str: