diff --git a/MONSDA/Configurator.py b/MONSDA/Configurator.py index 847d9f65..97bd7be8 100755 --- a/MONSDA/Configurator.py +++ b/MONSDA/Configurator.py @@ -28,12 +28,10 @@ os.sep.join(["lib", pythonversion, "site-packages", "MONSDA"]), "share" ) except: - installpath = os.path.cwd() + installpath = os.getcwd() configpath = os.path.join(installpath, "MONSDA", "configs") current_path = os.getcwd() -dir_path = os.path.dirname(os.path.realpath(__file__)) -os.chdir(dir_path) template = load_configfile(os.sep.join([configpath, "template_base_commented.json"])) none_workflow_keys = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] @@ -70,6 +68,13 @@ help="takes configuration file to modify", ) +parser.add_argument( + "--samplesheet", + type=str, + default=False, + help="CSV or TSV samplesheet to populate SETTINGS and condition tree for new configs/projects", +) + args = parser.parse_args() @@ -315,6 +320,21 @@ def get_conditions_from_dict(root, keylist=[]): keylist.pop() +def get_conditions_from_settings(root, keylist=[]): + """Yield condition paths from SETTINGS leaves that contain a SAMPLES key.""" + if not isinstance(root, dict): + return + if "SAMPLES" in root and isinstance(root.get("SAMPLES"), list): + yield ":".join(keylist) + return + for k, v in root.items(): + if not isinstance(v, dict): + continue + keylist.append(k) + yield from get_conditions_from_settings(v, keylist) + keylist.pop() + + def getPathesFromDict(d, value=None): def yield_func(d): q = [(d, [])] @@ -790,6 +810,8 @@ def create_condition_tree(): def add_sample_dirs(only_conditions=None): pickle_unfinished("add_sample_dirs") # project.current_func_arg = only_conditions + if args.samplesheet and only_conditions is None and project.mode in ["project", "config"]: + return assign_samplesheet() if "FETCH" in project.workflowsDict.keys(): return assign_SRA(only_conditions) print("\n FASTQ files:") @@ -876,6 +898,58 @@ def add_sample_dirs(only_conditions=None): return assign_samples(only_conditions) +def assign_samplesheet(): + pickle_unfinished("assign_samplesheet") + prCyan("\n Sample Assignment: samplesheet\n") + + samplesheet = str(args.samplesheet) + if not os.path.isfile(samplesheet): + prRed(f"Could not find samplesheet file: {samplesheet}") + exit(1) + + cwd = os.getcwd() + try: + # Params initializes logging at import time via Utils.setup_logger and expects + # a writable LOGS/ directory in the current working directory. + os.makedirs(os.path.join(current_path, "LOGS"), exist_ok=True) + os.chdir(current_path) + from .Params import samplesheet_to_settings + + sheet_settings = samplesheet_to_settings(samplesheet) + except Exception as e: + prRed(f"Failed to parse samplesheet '{samplesheet}': {e}") + exit(1) + finally: + os.chdir(cwd) + + if not sheet_settings: + prRed(f"Samplesheet '{samplesheet}' did not produce SETTINGS entries") + exit(1) + + project.settingsDict = decouple(sheet_settings) + project.conditionsDict = NestedDefaultDict() + project.samplesDict = NestedDefaultDict() + + condition_paths = [ + x.split(":") for x in get_conditions_from_settings(project.settingsDict) + ] + if not condition_paths: + prRed( + "No condition leaves with SAMPLES found in parsed samplesheet SETTINGS" + ) + exit(1) + + for path in condition_paths: + setInDict(project.conditionsDict, path, {}) + + prGreen("Loaded condition tree and SETTINGS from samplesheet:") + print_dict(project.conditionsDict, gap=" ") + print("") + print_dict(project.settingsDict, gap=" ") + show_settings() + return select_conditioning() + + def assign_SRA(only_conditions=None): pickle_unfinished("assign_SRA") prCyan("\n Sample Assignment: SRA Accession Numbers\n") @@ -1882,6 +1956,14 @@ def main(): global guide project = PROJECT() guide = GUIDE() + if args.samplesheet: + if not str(args.samplesheet).lower().endswith((".csv", ".tsv", ".txt")): + print("Samplesheet flag requires a .csv/.tsv/.txt file") + exit() + args.samplesheet = os.path.abspath(args.samplesheet) + if not os.path.isfile(args.samplesheet): + print(f"Samplesheet file not found: {args.samplesheet}") + exit() if args.test: guide.testing = True if args.config: diff --git a/MONSDA/Logger.py b/MONSDA/Logger.py index b4b4bd9f..6d9fbafd 100755 --- a/MONSDA/Logger.py +++ b/MONSDA/Logger.py @@ -1,47 +1,5 @@ # Logger.py --- -# -# Filename: Logger.py -# Description: -# Author: Joerg Fallmann -# Maintainer: -# Created: Mon Aug 12 10:26:55 2019 (+0200) -# Version: -# Package-Requires: () -# Last-Updated: Wed Apr 29 16:42:40 2020 (+0200) -# By: Joerg Fallmann -# Update #: 91 -# URL: -# Doc URL: -# Keywords: -# Compatibility: -# -# - -# Commentary: -# -# -# -# - -# Change Log: -# -# -# -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at -# your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Emacs. If not, see . -# -# + # Code: import logging diff --git a/MONSDA/Params.py b/MONSDA/Params.py index 71182733..6a62a918 100644 --- a/MONSDA/Params.py +++ b/MONSDA/Params.py @@ -1,46 +1,4 @@ # Params.py --- -# -# Filename: Params.py -# Description: -# Author: Joerg Fallmann -# Maintainer: -# Created: Tue Sep 18 15:39:06 2018 (+0200) -# Version: -# Package-Requires: () -# Last-Updated: Thu Feb 4 18:01:07 2021 (+0100) -# By: Joerg Fallmann -# Update #: 2888 -# URL: -# Doc URL: -# Keywords: -# Compatibility: -# -# - -# Commentary: -# -# -# - -# Change Log: -# -# -# -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at -# your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Emacs. If not, see . -# -# # Code: # import os, sys, inspect @@ -61,18 +19,23 @@ # # __file__ fails if someone does os.chdir() before. # # sys.argv[0] also fails, because it doesn't not always contain the path. +import csv import datetime import glob import inspect import itertools +import json import os import re import shutil import sys import traceback as tb from collections import OrderedDict, defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Optional from natsort import natsorted +from snakemake.common.configfile import load_configfile as _load_configfile import MONSDA.Utils as mu from MONSDA.Utils import check_run as check_run @@ -101,6 +64,397 @@ print("".join(tbe.format()), file=sys.stderr) +def _is_lane_split_file(filename: str, sample: Optional[str] = None) -> bool: + """Return True if filename matches lane split naming. + + Accepted examples (both orderings): + Sample_L001_R1.fastq.gz + Sample_R1_L001.fastq.gz + Sample_R1_L001_001.fastq.gz + """ + + if sample: + # Match: SAMPLE_[L###_]R[12][_###] or SAMPLE_R[12][_L###][_###] + pattern = rf"^{re.escape(sample)}(?:_L\d{{1,3}})?_[Rr][12](?:_L\d{{1,3}})?(?:_\d+)?\.fastq\.gz$" + else: + pattern = r"^.+_[Rr][12]_L\d{1,3}(?:_\d+)?\.fastq\.gz$|^.+_L\d{1,3}_[Rr][12](?:_\d+)?\.fastq\.gz$" + return bool(re.match(pattern, filename)) + + +def _matches_sample_fastq(filename: str, sample: str) -> bool: + """Match configured sample against accepted FASTQ naming schemes. + + Handles both lane orderings: + - SAMPLE_L###_R[12] (standard Illumina) + - SAMPLE_R[12]_L### (alternative) + """ + return ( + bool( + re.match( + rf"^{re.escape(sample)}(?:_L\d{{1,3}})?_[Rr][12](?:_L\d{{1,3}})?(?:_\d+)?\.fastq\.gz$", + filename, + ) + ) + or filename == sample + ".fastq.gz" + ) + + +def _strip_fastq_sample_name(filename: str) -> str: + """Strip read/lane suffix from FASTQ filename and return sample basename. + + Handles both orderings: + - SAMPLE_L###_R[12][_###] + - SAMPLE_R[12]_L###[_###] + """ + base = os.path.basename(filename) + return re.sub( + r"(?:_L\d{1,3})?_[Rr][12](?:_L\d{1,3})?(?:_\d+)?\.fastq\.gz$|\.fastq\.gz$", + "", + base, + ) + + +def _filter_lane_files_when_merged_exists(files: list) -> list: + """Drop lane-split FASTQs if canonical merged FASTQ exists for same sample/read. + + Keeps existing behavior for non-lane files and for samples without merged targets. + Handles both lane orderings: SAMPLE_L###_R# and SAMPLE_R#_L###. + """ + if not files: + return files + + # Match both orderings: _L###_R# or _R#_L### + lane_patterns = [ + re.compile(r"^(?P.+)_L\d{1,3}_[Rr](?P[12])(?:_\d+)?\.fastq\.gz$"), + re.compile(r"^(?P.+)_[Rr](?P[12])_L\d{1,3}(?:_\d+)?\.fastq\.gz$"), + ] + base_names = {os.path.basename(f) for f in files} + keep = [] + + for fp in files: + bn = os.path.basename(fp) + m = None + for pat in lane_patterns: + m = pat.match(bn) + if m: + break + + if not m: + keep.append(fp) + continue + + canonical_bn = f"{m.group('sample')}_R{m.group('read')}.fastq.gz" + canonical_fp = os.path.join(os.path.dirname(fp), canonical_bn) + if canonical_bn in base_names or os.path.exists(canonical_fp): + continue + keep.append(fp) + + return keep + + +def samplesheet_to_settings(samplesheet_path: str) -> dict: + """Read a CSV/TSV samplesheet and return a SETTINGS dict compatible with MONSDA config. + + Expected columns (case-insensitive header row): + CONDITION - slash-separated condition path, e.g. ``Ecoli/WT`` or ``Ecoli/WT/dummylevel`` + SAMPLE - sample name / accession + GROUP - group label for differential analysis + SEQUENCING - e.g. ``paired`` or ``single`` + REFERENCE - path to genome FASTA (.fa.gz) + GTF - path to GTF annotation (.gtf.gz) [optional] + GFF - path to GFF annotation (.gff.gz) [optional] + INDEX - path to pre-built index [optional] + PREFIX - mapper index prefix [optional] + DECOY - path to decoy file [optional] + TYPE - sample type label [optional] + BATCH - batch label [optional] + IP - IP protocol info (for PEAKS) [optional] + + Per-condition metadata (SEQUENCING, REFERENCE, …) only needs to be present + on the first row for that condition; subsequent rows may leave those cells + empty (fill-down behaviour). + + Parameters + ---------- + samplesheet_path : str + Absolute or relative path to the samplesheet file. + + Returns + ------- + dict + Nested dict suitable for assigning to ``config["SETTINGS"]``. + """ + logid = scriptname + ".samplesheet_to_settings: " + + # --- detect delimiter --- + with open(samplesheet_path, newline="") as fh: + sample = fh.read(4096) + + delimiter = None + # 1. trust the file extension + ext = os.path.splitext(samplesheet_path)[1].lower() + if ext in (".tsv", ".txt"): + delimiter = "\t" + elif ext == ".csv": + delimiter = "," + else: + # 2. try the sniffer + try: + dialect = csv.Sniffer().sniff(sample, delimiters=",\t;") + delimiter = dialect.delimiter + except csv.Error: + pass + # 3. manual probe: whichever candidate appears more on the first line + if delimiter is None: + first_line = sample.splitlines()[0] if sample else "" + delimiter = "\t" if first_line.count("\t") >= first_line.count(",") else "," + + settings: dict = {} + + # per-condition accumulator for fill-down metadata + cond_meta: dict = {} + + with open(samplesheet_path, newline="") as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + # normalise header keys to upper-case + if reader.fieldnames is None: + raise ValueError( + "Samplesheet appears to be empty or has no header row: " + + samplesheet_path + ) + reader.fieldnames = [f.strip().upper() for f in reader.fieldnames] + + for row in reader: + row = { + k.strip().upper(): (v.strip() if v is not None else "") + for k, v in row.items() + if k + } + + condition_str = row.get("CONDITION", "").strip() + sample_name = row.get("SAMPLE", "").strip() + if not condition_str or not sample_name: + log.warning( + logid + "Skipping row with missing CONDITION or SAMPLE: " + str(row) + ) + continue + + # fill-down: carry over non-empty metadata from previous rows of same condition + if condition_str not in cond_meta: + cond_meta[condition_str] = {} + for key in ( + "SEQUENCING", + "REFERENCE", + "GTF", + "GFF", + "INDEX", + "PREFIX", + "DECOY", + "IP", + ): + val = row.get(key, "") + if val: + cond_meta[condition_str][key] = val + + meta = cond_meta[condition_str] + + # --- navigate / create nested dict path --- + path_parts = [p.strip() for p in condition_str.split("/") if p.strip()] + node = settings + for part in path_parts: + node = node.setdefault(part, {}) + + # --- initialise leaf node on first encounter --- + if "SAMPLES" not in node: + node["SAMPLES"] = [] + node["GROUPS"] = [] + node["TYPES"] = [] + node["BATCHES"] = [] + node["SEQUENCING"] = meta.get("SEQUENCING", "") + node["REFERENCE"] = meta.get("REFERENCE", "") + node["INDEX"] = meta.get("INDEX", "") + node["PREFIX"] = meta.get("PREFIX", "") + node["IP"] = meta.get("IP", "") + gtf = meta.get("GTF", "") + gff = meta.get("GFF", "") + node["ANNOTATION"] = {"GTF": gtf, "GFF": gff} + decoy = meta.get("DECOY", "") + node["DECOY"] = {decoy: ""} if decoy else {} + + node["SAMPLES"].append(sample_name) + node["GROUPS"].append(row.get("GROUP", "")) + node["TYPES"].append(row.get("TYPE", "")) + node["BATCHES"].append(row.get("BATCH", "")) + + log.info(logid + "Built SETTINGS from samplesheet: " + str(list(settings.keys()))) + return settings + + +def inject_samplesheet_settings(configfile: str, samplesheet_path: str) -> str: + """Load *configfile*, populate ``SETTINGS`` from *samplesheet_path* if absent, + write the augmented config to ``_with_settings.json`` and return that path. + + Parameters + ---------- + configfile : str + Path to the original MONSDA JSON config. + samplesheet_path : str + Path to the CSV/TSV samplesheet. + + Returns + ------- + str + Path to the written (augmented) config file. + """ + logid = scriptname + ".inject_samplesheet_settings: " + + config = _load_configfile(configfile) + + existing = config.get("SETTINGS", {}) + # strip comment-only SETTINGS that have no SAMPLES anywhere + has_samples = ( + any( + isinstance(v, dict) and "SAMPLES" in v + for cond in existing.values() + if isinstance(cond, dict) + for v in cond.values() + ) + if existing + else False + ) + + if has_samples: + log.info( + logid + + "Config already contains SETTINGS with sample data; samplesheet will be ignored." + ) + return configfile + + log.info(logid + "Populating SETTINGS from samplesheet: " + samplesheet_path) + config["SETTINGS"] = samplesheet_to_settings(samplesheet_path) + + base, ext = os.path.splitext(configfile) + out_path = base + "_with_settings" + (ext if ext else ".json") + with open(out_path, "w") as fh: + json.dump(config, fh, indent=4) + log.info(logid + "Augmented config written to: " + out_path) + return out_path + + +@check_run +def _merge_lane_files(target: str, lane_candidates: list, logid: str) -> str: + """Concatenate *lane_candidates* gzip files into *target*. + + Returns *target* on success; raises on any I/O error. + """ + with open(target, "wb") as outfh: + for lane_file in lane_candidates: + with open(lane_file, "rb") as infh: + shutil.copyfileobj(infh, outfh) + return target + + +def prepare_lane_split_fastqs(config: dict, max_workers: Optional[int] = None) -> int: + """Concatenate lane-split FASTQs into canonical _R1/_R2 files when needed. + + Merges are executed in parallel using a ``ThreadPoolExecutor`` so that all + samples/reads are processed concurrently (I/O-bound work, thread-safe). + + Parameters + ---------- + config : dict + Parsed MONSDA config. + max_workers : int, optional + Maximum number of parallel merge threads. When ``None`` (default), + the value is taken from ``config["MAXTHREADS"]``; if that is also + absent the ``ThreadPoolExecutor`` built-in default is used. + + This is intentionally additive: existing canonical files are kept untouched. + """ + logid = scriptname + ".Params_prepare_lane_split_fastqs: " + merged_files = 0 + + if max_workers is None: + try: + max_workers = int(config["MAXTHREADS"]) + except (KeyError, TypeError, ValueError): + max_workers = None # fall back to ThreadPoolExecutor default + + samples = [os.path.join(x) for x in sampleslong(config, nocheck="1")] + log.debug(logid + "Checking lane split files for samples: " + str(samples)) + + # --- collect all (target, lane_candidates) pairs first --- + merge_tasks: list = [] # list of (target, lane_candidates) + + for sample in samples: + paired = checkpaired([sample], config) + if not paired or not any(x in paired for x in ["paired", "singlecell"]): + continue + + sample_dir = os.path.join("FASTQ", os.path.dirname(sample)) + sample_name = os.path.basename(sample).replace(".fastq.gz", "") + if not os.path.isdir(sample_dir): + continue + + for read in ["1", "2"]: + # Find lane files in either format: SAMPLE_L###_R# or SAMPLE_R#_L### + lane_candidates = sorted( + set( + f + for pattern_glob in [ + os.path.join(sample_dir, f"{sample_name}_L*_R{read}*.fastq.gz"), + os.path.join(sample_dir, f"{sample_name}_R{read}_L*.fastq.gz"), + ] + for f in glob.glob(pattern_glob) + if _is_lane_split_file(os.path.basename(f), sample_name) + ) + ) + + if len(lane_candidates) < 1: + continue + + target = os.path.join(sample_dir, f"{sample_name}_R{read}.fastq.gz") + if os.path.exists(target): + log.info( + logid + + f"Found lane-split files for {sample_name} R{read}, but target already exists: {target} (keeping existing file)" + ) + continue + + log.info( + logid + + f"Concatenating {len(lane_candidates)} lane files into {target}" + ) + merge_tasks.append((target, lane_candidates)) + + if not merge_tasks: + log.debug(logid + "No lane-split FASTQ files required concatenation") + return 0 + + # --- run all merges in parallel --- + with ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = { + pool.submit(_merge_lane_files, target, lanes, logid): target + for target, lanes in merge_tasks + } + for future in as_completed(futures): + target = futures[future] + try: + future.result() + merged_files += 1 + except Exception: + exc_type, exc_value, exc_tb = sys.exc_info() + tbe = tb.TracebackException(exc_type, exc_value, exc_tb) + log.error( + logid + + f"Failed to merge lane files into {target}: " + + "".join(tbe.format()) + ) + + log.info(logid + f"Created {merged_files} concatenated lane-merged FASTQ files") + return merged_files + + @check_run def get_samples(config: dict) -> list(): """Check and return samples according to sample list on config.json @@ -129,6 +483,7 @@ def get_samples(config: dict) -> list(): paired = checkpaired([SAMPLES[i]], config) log.debug(logid + "PAIRED: " + str(paired)) f = glob.glob(s) + f = _filter_lane_files_when_merged_exists(f) log.debug(logid + "SAMPLECHECK: " + str(f)) if f: f = list(set([str.join(os.sep, s.split(os.sep)[1:]) for s in f])) @@ -201,6 +556,7 @@ def get_samples_postprocess(config: dict, subwork: str) -> list: paired = checkpaired([SAMPLES[i]], config) log.debug(logid + "PAIRED: " + str(paired)) f = glob.glob(s) + f = _filter_lane_files_when_merged_exists(f) log.debug(logid + "SAMPLECHECK: " + str(f)) if f: f = sorted(list(set([str.join(os.sep, s.split(os.sep)[1:]) for s in f]))) @@ -395,6 +751,7 @@ def get_samples_from_dir(search: str, config: dict, nocheck: str = None) -> list pat = os.sep.join(["FASTQ", os.sep.join(search[0:x]), "*.fastq.gz"]) log.debug(logid + "REGEX: " + str(pat) + "\t" + "SAMPLES: " + str(samples)) check = natsorted(glob.glob(pat), key=lambda y: y.lower()) + check = _filter_lane_files_when_merged_exists(check) log.debug(logid + "check: " + str(check)) if len(check) > 0: ret = list() @@ -409,21 +766,30 @@ def get_samples_from_dir(search: str, config: dict, nocheck: str = None) -> list for s in samples: log.debug(logid + "x: " + str(x)) log.debug(logid + "sample: " + str(s)) - if re.match(f"^{s}_R", x) or x == s + ".fastq.gz": + if _matches_sample_fastq(x, s): log.debug( logid + "FOUND: " + s - + "_R" - + " or " - + s - + ".fastq.gz" + + " matching accepted FASTQ naming" + " in " + x ) clean.append(c) break log.debug(logid + "checkclean: " + str(clean)) + + # Check if any samples were found in the clean list + if not clean: + search_dir = os.sep.join(["FASTQ"] + search) + error_msg = ( + f"No sample files found for condition {os.sep.join(search)}. " + f"Expected to find files matching samples {samples} " + f"in directory: {search_dir}" + ) + log.error(logid + error_msg) + raise ValueError(error_msg) + paired = checkpaired( [os.sep.join([os.sep.join(search), clean[0].split(os.sep)[-1]])], config ) @@ -436,11 +802,7 @@ def get_samples_from_dir(search: str, config: dict, nocheck: str = None) -> list os.sep.join( [ os.sep.join(os.path.dirname(s).split(os.sep)[1:]), - re.sub( - r"_r1.fastq.gz|_R1.fastq.gz|_r2.fastq.gz|_R2.fastq.gz|.fastq.gz", - "", - os.path.basename(s), - ), + _strip_fastq_sample_name(os.path.basename(s)), ] ) for s in clean @@ -456,10 +818,8 @@ def get_samples_from_dir(search: str, config: dict, nocheck: str = None) -> list os.sep.join( os.path.dirname(s).split(os.sep)[1:] ), - re.sub( - r"_r1.fastq.gz|_R1.fastq.gz|_r2.fastq.gz|_R2.fastq.gz|.fastq.gz", - "", - os.path.basename(s), + _strip_fastq_sample_name( + os.path.basename(s) ), ] ) @@ -1546,9 +1906,15 @@ def get_combo_name(combinations: list) -> mu.NestedDefaultDict: envs = list() works = list() for step in combi: - for work, env in step.items(): - envs.append(env) - works.append(work) + if isinstance(step, list): + for substep in step: + for work, env in substep.items(): + envs.append(env) + works.append(work) + else: + for work, env in step.items(): + envs.append(env) + works.append(work) combname[condition]["envs"].append(str.join("-", envs)) combname[condition]["works"].append(str.join("-", works)) diff --git a/MONSDA/RunMONSDA.py b/MONSDA/RunMONSDA.py index 571fde34..422c82d3 100755 --- a/MONSDA/RunMONSDA.py +++ b/MONSDA/RunMONSDA.py @@ -2,6 +2,7 @@ import argparse import os +import re import shlex import shutil import subprocess @@ -129,6 +130,23 @@ def parseargs(): action="store_true", help="Print version and exit", ) + parser.add_argument( + "--samplesheet", + type=str, + default=None, + metavar="FILE", + help="CSV or TSV samplesheet to populate the SETTINGS section. " + "Used when the config JSON lacks a SETTINGS block. " + "On the first run an augmented config (_with_settings.json) is written for reuse.", + ) + parser.add_argument( + "--oras-registry", + type=str, + default="docker.io", + metavar="HOST", + help="Container registry hostname for ORAS image pulls (default: docker.io). " + "Use e.g. ghcr.io for GitHub Container Registry.", + ) if len(sys.argv) == 1: parser.print_help(sys.stderr) @@ -275,6 +293,12 @@ def run_snakemake( ONCE FILES ARE DOWNLOADED WE CAN START OTHER PREPROCESSING STEPS """ mu.makeoutdir("TMP") + merged_lane_fastqs = mp.prepare_lane_split_fastqs(config) + if merged_lane_fastqs: + log.info( + logid + + f"Prepared {merged_lane_fastqs} lane-merged FASTQ files before sample collection" + ) SAMPLES = mp.get_samples(config) log.info(logid + "SAMPLES: " + str(SAMPLES)) @@ -617,6 +641,12 @@ def run_nextflow( ONCE FILES ARE DOWNLOAD WE CAN START OTHER PREPROCESSING STEPS """ mu.makeoutdir("TMP") + merged_lane_fastqs = mp.prepare_lane_split_fastqs(config) + if merged_lane_fastqs: + log.info( + logid + + f"Prepared {merged_lane_fastqs} lane-merged FASTQ files before sample collection" + ) SAMPLES = mp.get_samples(config) log.info(logid + "SAMPLES: " + str(SAMPLES)) conditions = mp.get_conditions(config) @@ -819,9 +849,19 @@ def run_nextflow( def runjob(jobtorun): try: logid = scriptname + ".runjob: " + run_cmd = jobtorun + if ( + re.search(r"(^|\s)nextflow(\s|$)", jobtorun) + and "NXF_SYNTAX_PARSER" not in jobtorun + ): + run_cmd = "NXF_SYNTAX_PARSER=v1 " + jobtorun + log.warning( + logid + + "Nextflow 26.04+ strict parser detected; forcing legacy parser (NXF_SYNTAX_PARSER=v1) for compatibility with generated workflows." + ) # return subprocess.run(jobtorun, shell=True, universal_newlines=True, capture_output=True) # python >= 3.7 job = subprocess.Popen( - jobtorun, + run_cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, @@ -923,6 +963,16 @@ def main(): for i in range(1, len(knownargs.config)): optionalargs[0].extend(list(["-c", str(knownargs.config[i].pop())])) + # --- samplesheet injection (before any other config use) --- + if knownargs.samplesheet: + knownargs.configfile = mp.inject_samplesheet_settings( + knownargs.configfile, + os.path.abspath(knownargs.samplesheet), + ) + + # --- set ORAS registry for container image pulls --- + mw.set_oras_registry(knownargs.oras_registry) + log.debug( f"{logid} ARGS: {args} {type(args)} KNOWNARGS: {knownargs} {type(knownargs)} OPTIONALARGS: {optionalargs} {type(optionalargs)}" ) diff --git a/MONSDA/Utils.py b/MONSDA/Utils.py index 29a8c4d2..8dd741f5 100644 --- a/MONSDA/Utils.py +++ b/MONSDA/Utils.py @@ -1,46 +1,4 @@ # Utils.py --- -# -# Filename: Utils.py -# Description: -# Author: Joerg Fallmann -# Maintainer: -# Created: Tue Sep 18 15:39:06 2018 (+0200) -# Version: -# Package-Requires: () -# Last-Updated: Thu Feb 4 18:01:07 2021 (+0100) -# By: Joerg Fallmann -# Update #: 2888 -# URL: -# Doc URL: -# Keywords: -# Compatibility: -# -# - -# Commentary: -# -# -# - -# Change Log: -# -# -# -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at -# your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Emacs. If not, see . -# -# # Code: # import os, sys, inspect @@ -90,6 +48,7 @@ def setup_logger(scriptname): for handler in log.handlers[:]: handler.close() log.removeHandler(handler) + os.makedirs("LOGS", exist_ok=True) handler = logging.FileHandler("LOGS/MONSDA.log", mode="a") handler.setFormatter( logging.Formatter( @@ -182,6 +141,22 @@ def func_wrapper(*args, **kwargs): try: return func(*args, **kwargs) + except ValueError as e: + # Handle sample not found errors gracefully + error_msg = str(e) + if "sample" in error_msg.lower() and "not found" in error_msg.lower(): + log.error(logid + error_msg) + log.error(logid + "STOPPING: Processing stopped due to missing samples") + sys.exit(1) + else: + # Re-raise other ValueError exceptions + exc_type, exc_value, exc_tb = sys.exc_info() + tbe = tb.TracebackException( + exc_type, + exc_value, + exc_tb, + ) + log.error(logid + "".join(tbe.format())) except Exception: exc_type, exc_value, exc_tb = sys.exc_info() tbe = tb.TracebackException( diff --git a/MONSDA/Workflows.py b/MONSDA/Workflows.py index 88aaf92a..2206e7bb 100755 --- a/MONSDA/Workflows.py +++ b/MONSDA/Workflows.py @@ -1,45 +1,4 @@ # Workflows.py --- -# -# Filename: Workflows.py -# Description: -# Author: Joerg Fallmann -# Maintainer: -# Created: Tue Sep 18 15:39:06 2018 (+0200) -# Version: -# Package-Requires: () -# Last-Updated: Thu Feb 4 18:01:07 2021 (+0100) -# By: Joerg Fallmann -# Update #: 2888 -# URL: -# Doc URL: -# Keywords: -# Compatibility: -# -# - -# Commentary: -# -# -# - -# Change Log: -# -# -# -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at -# your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Emacs. If not, see . -# -# # Code: # import os, sys, inspect @@ -94,6 +53,27 @@ binpath = os.path.join(installpath, "MONSDA", "scripts") condapath = re.compile(r'conda:\s+"') logfix = re.compile(r'loglevel="INFO"') +# Matches oras:// NOT already followed by a registry host (i.e. followed directly by a +# Docker Hub namespace like "jfallmann/" without dots before the first slash) +oraspath = re.compile(r'oras://(?![\w.-]+\.[\w.-]+/)') +oras_registry = "docker.io" + + +def set_oras_registry(registry: str): + """Set the ORAS registry used for container image URIs.""" + global oras_registry + oras_registry = registry.strip().rstrip("/") + + +def _fix_oras(line: str) -> str: + """Insert oras_registry into unqualified oras:// URIs.""" + return re.sub(oraspath, "oras://" + oras_registry + "/", line) + + +def _write_workflow(filepath, content): + """Write workflow file with ORAS registry fix applied.""" + content = re.sub(oraspath, "oras://" + oras_registry + "/", content) + write_if_different(filepath, content) try: scriptname = ( @@ -192,7 +172,20 @@ def get_combo(wfs, config, conditions): + " with Tool: " + str(tools) ) - ret.append(tools) + # Group all QC tools into a single combo position so pre-QC + # (fastqc) and post-QC (rustqc) both appear in one combo name. + if subwork == "QC" and len(tools) > 1: + qc_prio = {"fastqc": 0, "rustqc": 1} + tools = sorted( + tools, + key=lambda item: ( + qc_prio.get(list(item.values())[0], 99), + list(item.values())[0], + ), + ) + ret.append([tools]) # one option = the entire list of QC tools + else: + ret.append(tools) log.debug(f"{logid} Itertools {ret}") combos[condition] = itertools.product(*ret) @@ -564,8 +557,15 @@ def make_pre( toolenv, toolbin = map(str, listoftools[a]) if toolenv != envs[j] or toolbin is None: continue - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.update(sconf) subname = toolenv + ".smk" @@ -622,7 +622,7 @@ def make_pre( ), ) ) - write_if_different(smko, "".join(add) + "".join(subjobs)) + _write_workflow(smko, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -690,8 +690,15 @@ def make_pre( if toolenv is None or toolbin is None: continue subconf = mu.NestedDefaultDict() - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.update(sconf) subname = toolenv + ".smk" @@ -756,7 +763,7 @@ def make_pre( ) ) - write_if_different(smko, str.join("", add) + str.join("", subjobs)) + _write_workflow(smko, str.join("", add) + str.join("", subjobs)) confo = os.path.abspath( os.path.join( @@ -879,8 +886,15 @@ def make_sub( toolenv.replace("bisulfite", "_bisulfite") + ".smk" ) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.update(sconf) log.debug(logid + f"SCONF:{sconf}, SUBCONF:{subconf}") @@ -901,7 +915,9 @@ def make_sub( and "MAPPING" not in works and toolenv != "rustqc" ): - if "DEDUP" in works and "umitools" in envs: + if "DEDUP" in works and any( + x in envs for x in ["umitools", "fgumi", "umicollapse"] + ): subname = toolenv + "_dedup_trim.smk" else: subname = toolenv + "_trim.smk" @@ -912,16 +928,18 @@ def make_sub( and "MAPPING" not in works and toolenv != "rustqc" ): - if "DEDUP" in subworkflows and "umitools" in envs: + if "DEDUP" in subworkflows and any( + x in envs for x in ["umitools", "fgumi", "umicollapse"] + ): subname = toolenv + "_dedup.smk" else: subname = toolenv + "_raw.smk" - # Picard tools can be extended here + # Dedup tools can be extended here if works[j] == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.smk" subconf.pop("PREDEDUP", None) - elif works[j] == "DEDUP" and toolenv == "umitools": + elif works[j] == "DEDUP" and toolenv in ["umitools", "fgumi", "umicollapse"]: subconf["PREDEDUP"] = "enabled" smkf = os.path.abspath(os.path.join(workflowpath, subname)) @@ -998,7 +1016,7 @@ def make_sub( ) ) - write_if_different(smko, "".join(add) + "".join(subjobs)) + _write_workflow(smko, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -1062,8 +1080,15 @@ def make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".smk" subconf = mu.NestedDefaultDict() - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.update(sconf) # RustQC is post-alignment QC only; skip if no MAPPING @@ -1090,11 +1115,11 @@ def make_sub( else: subname = toolenv + "_trim.smk" - # Picard tools can be extended here + # Dedup tools can be extended here if subwork == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.smk" subconf.pop("PREDEDUP", None) - elif works[j] == "DEDUP" and toolenv == "umitools": + elif subwork == "DEDUP" and toolenv in ["umitools", "fgumi", "umicollapse"]: subconf["PREDEDUP"] = "enabled" # Add rulethemall based on chosen workflows add.append( @@ -1178,7 +1203,7 @@ def make_sub( os.path.join(subdir, "_".join(["_".join(condition), "subsnake.smk"])) ) - write_if_different(smko, str.join("", add) + str.join("", subjobs)) + _write_workflow(smko, str.join("", add) + str.join("", subjobs)) confo = os.path.abspath( os.path.join(subdir, "_".join(["_".join(condition), "subconfig.json"])) ) @@ -1391,7 +1416,7 @@ def make_post( ) ) - write_if_different(smko, "".join(add) + "".join(subjobs)) + _write_workflow(smko, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -1524,7 +1549,7 @@ def make_post( ) ) - write_if_different(smko, "".join(add) + "".join(subjobs)) + _write_workflow(smko, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -1616,7 +1641,7 @@ def make_summary(config, subdir, loglevel, combinations=None): subjobs.append("\n\n") smko = os.path.abspath(os.path.join(subdir, "summary_subsnake.smk")) - write_if_different(smko, "".join(subjobs)) + _write_workflow(smko, "".join(subjobs)) subconf = mu.NestedDefaultDict() for key in ["BINS", "MAXTHREADS", "SETTINGS"]: @@ -1724,6 +1749,7 @@ def fixinclude( for line in incl.readlines(): line = re.sub(logfix, "loglevel='" + loglevel + "'", line) line = re.sub(condapath, condaline + envpath, line) + line = _fix_oras(line) if includeline in line: line = fixinclude( line, loglevel, condapath, envpath, workflowpath, logfix @@ -2246,13 +2272,35 @@ def nf_tool_params( if toolenv else mu.sub_dict(config[subwork], x)["OPTIONS"] ) - tp.append( - "--" + subwork + "ENV " + toolenv + " --" + subwork + "BIN " + toolbin + " " - ) + if subwork == "QC": + qcprefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + tp.append( + "--" + + qcprefix + + "ENV " + + toolenv + + " --" + + qcprefix + + "BIN " + + toolbin + + " " + ) + else: + tp.append( + "--" + + subwork + + "ENV " + + toolenv + + " --" + + subwork + + "BIN " + + toolbin + + " " + ) toolpar = list() if "star" in [toolenv, toolbin]: - np['INDEX'] = mp.fixRunParameters(config, toolenv, sample, None, 'MAPPING', 'INDEX', "--sjdbGTFfile", "--sjdbGTFfile tmp_anno") + np['INDEX'] = mp.fixRunParameters(config, toolenv, sample, None, 'MAPPING', 'INDEX', "--sjdbGTFfile", "--sjdbGTFfile tmp_anno") for key, val in np.items(): pars = val if val and val != "" else None if pars: @@ -2265,17 +2313,31 @@ def nf_tool_params( toolenv, toolbin = map(str, [sd["ENV"], sd["BIN"]]) np = sd[toolenv.split("_")[0]]["OPTIONS"] if toolenv else sd["OPTIONS"] - tp.append( - "--" - + subwork - + "ENV " - + toolenv - + " --" - + subwork - + "BIN " - + toolbin - + " " - ) + if subwork == "QC": + qcprefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + tp.append( + "--" + + qcprefix + + "ENV " + + toolenv + + " --" + + qcprefix + + "BIN " + + toolbin + + " " + ) + else: + tp.append( + "--" + + subwork + + "ENV " + + toolenv + + " --" + + subwork + + "BIN " + + toolbin + + " " + ) toolpar = list() for key, val in np.items(): @@ -2466,8 +2528,15 @@ def nf_make_pre( continue subsamples = mp.get_samples(sconf) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.merge(sconf) subconf[works[j]] = mu.add_to_innermost_key_by_list( @@ -2569,7 +2638,7 @@ def nf_make_pre( ) ) if writeout: - write_if_different(nfo, "".join(subjobs)) + _write_workflow(nfo, "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -2657,8 +2726,15 @@ def nf_make_pre( toolenv, toolbin = map(str, listoftools[i]) if toolenv is None or toolbin is None: continue - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.merge(sconf) subconf[subwork] = mu.add_to_innermost_key_by_list( @@ -2708,6 +2784,9 @@ def nf_make_pre( subjobs.append(line) subjobs.append("\n\n") + if "DEDUP" in works: + flowlist.append("DEDUPBAM") + tp.append( nf_tool_params( subsamples[0], @@ -2764,7 +2843,7 @@ def nf_make_pre( ) ) if writeout: - write_if_different(nfo, "".join(subjobs)) + _write_workflow(nfo, "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -2876,8 +2955,15 @@ def nf_make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".nf" subsamples = mp.get_samples(sconf) - sconf[works[j] + "ENV"] = toolenv - sconf[works[j] + "BIN"] = toolbin + if works[j] == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[works[j] + "ENV"] = toolenv + sconf[works[j] + "BIN"] = toolbin subconf.merge(sconf) subconf[works[j]] = mu.add_to_innermost_key_by_list( @@ -2897,7 +2983,7 @@ def nf_make_sub( if toolenv == "rustqc": # RustQC is post-alignment QC only if "MAPPING" in works: - flowlist.append("QC_MAPPING") + flowlist.append("RUSTQC_MAPPING") else: log.warning( logid @@ -2913,7 +2999,7 @@ def nf_make_sub( else: if "DEDUP" in subworkflows: flowlist.append("QC_RAW") - if toolenv == "umitools": + if toolenv in ["umitools", "fgumi", "umicollapse"]: flowlist.append("DEDUPEXTRACT") if "MAPPING" in works: flowlist.append("QC_MAPPING") @@ -2928,7 +3014,8 @@ def nf_make_sub( flowlist.append("TRIMMING") if works[j] == "DEDUP": - if toolenv == "umitools": + deduptool = toolenv + if toolenv in ["umitools", "fgumi", "umicollapse"]: flowlist.append("PREDEDUP") subconf["PREDEDUP"] = "enabled" if "QC" in flowlist: @@ -3041,7 +3128,15 @@ def nf_make_sub( if "QC" in works: flowlist.append("MULTIQC") - nfi = os.path.abspath(os.path.join(workflowpath, "multiqc.nf")) + _mqc_nf = ( + "multiqc_rustqc.nf" + if ( + ("QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist) + and "QC_RAW" not in flowlist + ) + else "multiqc.nf" + ) + nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): line = re.sub(condapath, 'conda "' + envpath, line) @@ -3061,6 +3156,14 @@ def nf_make_sub( # workflow merger log.debug("FLOWLIST: " + str(flowlist)) + map_qc_chan = "QC_MAPPING.out.qc" + if "RUSTQC_MAPPING" in flowlist: + map_qc_chan = ( + "QC_MAPPING.out.qc.concat(RUSTQC_MAPPING.out.qc)" + if "QC_MAPPING" in flowlist + else "RUSTQC_MAPPING.out.qc" + ) + subjobs.append("\n\n" + "workflow {\n") for w in [ "QC_RAW", @@ -3071,6 +3174,7 @@ def nf_make_sub( "MAPPING", "DEDUPBAM", "QC_MAPPING", + "RUSTQC_MAPPING", "MULTIQC", ]: if w in flowlist: @@ -3097,11 +3201,18 @@ def nf_make_sub( " " * 4 + "POSTMAPPING(MAPPING.out.mapped)\n" ) elif w == "DEDUPBAM": - subjobs.append( - " " * 4 - + w - + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" - ) + if deduptool == "fgumi": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai, DEDUPEXTRACT.out.ubam)\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" + ) elif w == "QC_MAPPING": if "DEDUPBAM" in flowlist: subjobs.append( @@ -3115,17 +3226,26 @@ def nf_make_sub( + w + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" ) + elif w == "RUSTQC_MAPPING": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" + ) elif w == "MULTIQC": - if "QC_RAW" not in flowlist and "QC_MAPPING" in flowlist: + if "QC_RAW" not in flowlist and ( + "QC_MAPPING" in flowlist + or "RUSTQC_MAPPING" in flowlist + ): # RustQC: only BAM-level QC, no FASTQ QC subjobs.append( - " " * 4 + w + "(QC_MAPPING.out.qc.collect())\n" + " " * 4 + w + f"({map_qc_chan}.collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs))).collect())\n" ) elif ( "DEDUPBAM" in flowlist and "QC_TRIMMING" not in flowlist @@ -3133,13 +3253,13 @@ def nf_make_sub( subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs)).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni))).collect())\n" ) elif ( "MAPPING" in flowlist and "QC_TRIMMING" not in flowlist @@ -3147,7 +3267,7 @@ def nf_make_sub( subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni)).collect())\n" ) elif "TRIMMING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( @@ -3178,7 +3298,7 @@ def nf_make_sub( ) ) if writeout: - write_if_different(nfo, "".join(add) + "".join(subjobs)) + _write_workflow(nfo, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -3201,6 +3321,7 @@ def nf_make_sub( subjobs = list() subconf = mu.NestedDefaultDict() tp = list() + deduptool = None for subwork in subworkflows: log.debug(logid + "PREPARING " + str(subwork) + " " + str(condition)) @@ -3229,8 +3350,15 @@ def nf_make_sub( ): # Here we can add tool specific extra cases, like e.g segehmehl bisulfite mode subname = toolenv.replace("bisulfite", "_bisulfite") + ".nf" - sconf[subwork + "ENV"] = toolenv - sconf[subwork + "BIN"] = toolbin + if subwork == "QC": + _qc_prefix = "POSTQC" if toolenv == "rustqc" else "PREQC" + sconf[_qc_prefix + "ENV"] = toolenv + sconf[_qc_prefix + "BIN"] = toolbin + sconf.pop("QCENV", None) + sconf.pop("QCBIN", None) + else: + sconf[subwork + "ENV"] = toolenv + sconf[subwork + "BIN"] = toolbin subconf.merge(sconf) subconf[subwork] = mu.add_to_innermost_key_by_list( @@ -3250,7 +3378,7 @@ def nf_make_sub( if toolenv == "rustqc": # RustQC is post-alignment QC only if "MAPPING" in subworkflows: - flowlist.append("QC_MAPPING") + flowlist.append("RUSTQC_MAPPING") else: log.warning( logid @@ -3284,9 +3412,33 @@ def nf_make_sub( subname = toolenv + ".nf" - # Picard tools can be extended here + # Dedup tools can be extended here if subwork == "DEDUP" and toolenv == "picard": subname = toolenv + "_dedup.nf" + elif subwork == "DEDUP" and toolenv in ["umitools", "fgumi", "umicollapse"]: + deduptool = toolenv + flowlist.append("PREDEDUP") + subconf["PREDEDUP"] = "enabled" + if "QC" in flowlist: + flowlist.append("QC_DEDUP") + subname = toolenv + ".nf" + nfi = os.path.abspath(os.path.join(workflowpath, subname)) + with open(nfi, "r") as nf: + for line in mu.comment_remover(nf.readlines()): + line = re.sub(condapath, 'conda "' + envpath, line) + if "include {" in line: + line = fixinclude( + line, + loglevel, + condapath, + envpath, + workflowpath, + logfix, + "nfmode", + ) + subjobs.append(line) + subjobs.append("\n\n") + subname = toolenv + "_dedup.nf" nfi = os.path.abspath(os.path.join(workflowpath, subname)) with open(nfi, "r") as nf: @@ -3361,7 +3513,15 @@ def nf_make_sub( if "QC" in subworkflows: flowlist.append("MULTIQC") - nfi = os.path.abspath(os.path.join(workflowpath, "multiqc.nf")) + _mqc_nf = ( + "multiqc_rustqc.nf" + if ( + ("QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist) + and "QC_RAW" not in flowlist + ) + else "multiqc.nf" + ) + nfi = os.path.abspath(os.path.join(workflowpath, _mqc_nf)) with open(nfi, "r") as nf: for line in mu.comment_remover(nf.readlines()): line = re.sub(condapath, 'conda "' + envpath, line) @@ -3381,6 +3541,14 @@ def nf_make_sub( # workflow merger log.debug("FLOWLIST: " + str(flowlist)) + map_qc_chan = "QC_MAPPING.out.qc" + if "RUSTQC_MAPPING" in flowlist: + map_qc_chan = ( + "QC_MAPPING.out.qc.concat(RUSTQC_MAPPING.out.qc)" + if "QC_MAPPING" in flowlist + else "RUSTQC_MAPPING.out.qc" + ) + subjobs.append("\n\n" + "workflow {\n") for w in [ "QC_RAW", @@ -3391,6 +3559,7 @@ def nf_make_sub( "MAPPING", "DEDUPBAM", "QC_MAPPING", + "RUSTQC_MAPPING", "MULTIQC", ]: if w in flowlist: @@ -3413,11 +3582,18 @@ def nf_make_sub( subjobs.append(" " * 4 + w + "(TRIMMING.out.trimmed)\n") subjobs.append(" " * 4 + "POSTMAPPING(MAPPING.out.mapped)\n") elif w == "DEDUPBAM": - subjobs.append( - " " * 4 - + w - + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" - ) + if deduptool == "fgumi": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai, DEDUPEXTRACT.out.ubam)\n" + ) + else: + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap, POSTMAPPING.out.postbai, POSTMAPPING.out.postmapuni, POSTMAPPING.out.postunibai)\n" + ) elif w == "QC_MAPPING": if "DEDUPBAM" in flowlist: subjobs.append( @@ -3431,35 +3607,43 @@ def nf_make_sub( + w + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" ) + elif w == "RUSTQC_MAPPING": + subjobs.append( + " " * 4 + + w + + "(POSTMAPPING.out.postmap.concat(POSTMAPPING.out.postmapuni))\n" + ) elif w == "MULTIQC": - if "QC_RAW" not in flowlist and "QC_MAPPING" in flowlist: + if "QC_RAW" not in flowlist and ( + "QC_MAPPING" in flowlist or "RUSTQC_MAPPING" in flowlist + ): # RustQC: only BAM-level QC, no FASTQ QC subjobs.append( - " " * 4 + w + "(QC_MAPPING.out.qc.collect())\n" + " " * 4 + w + f"({map_qc_chan}.collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs))).collect())\n" ) elif "DEDUPBAM" in flowlist and "QC_TRIMMING" not in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(MAPPING.out.logs)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(MAPPING.out.logs)).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni))).collect())\n" + + f"(QC_RAW.out.qc.concat(QC_TRIMMING.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni))).collect())\n" ) elif "MAPPING" in flowlist and "QC_TRIMMING" not in flowlist: subjobs.append( " " * 4 + w - + "(QC_RAW.out.qc.concat(QC_MAPPING.out.qc.concat(POSTMAPPING.out.postmapuni)).collect())\n" + + f"(QC_RAW.out.qc.concat({map_qc_chan}.concat(POSTMAPPING.out.postmapuni)).collect())\n" ) elif "TRIMMING" in flowlist and "QC_TRIMMING" in flowlist: subjobs.append( @@ -3485,7 +3669,7 @@ def nf_make_sub( os.path.join(subdir, "_".join(["_".join(condition), "subflow.nf"])) ) if writeout: - write_if_different(nfo, "".join(subjobs)) + _write_workflow(nfo, "".join(subjobs)) confo = os.path.abspath( os.path.join(subdir, "_".join(["_".join(condition), "subconfig.json"])) @@ -3685,7 +3869,7 @@ def nf_make_post( ) ) if writeout: - write_if_different(nfo, "".join(add) + "".join(subjobs)) + _write_workflow(nfo, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -3831,7 +4015,7 @@ def nf_make_post( ) ) if writeout: - write_if_different(nfo, "".join(add) + "".join(subjobs)) + _write_workflow(nfo, "".join(add) + "".join(subjobs)) confo = os.path.abspath( os.path.join( @@ -3961,7 +4145,7 @@ def nf_make_summary(config, subdir, loglevel, combinations=None): subjobs.append("\n\n") nfo = os.path.abspath(os.path.join(subdir, "summary_subflow.nf")) - write_if_different(nfo, "".join(subjobs)) + _write_workflow(nfo, "".join(subjobs)) subconf = mu.NestedDefaultDict() for key in ["BINS", "MAXTHREADS", "SETTINGS"]: diff --git a/MONSDA/web_configurator.py b/MONSDA/web_configurator.py index 7f8425a2..ce964980 100644 --- a/MONSDA/web_configurator.py +++ b/MONSDA/web_configurator.py @@ -1,19 +1,25 @@ +import copy import json import os -from typing import Any, Dict +import sys +from typing import Any, Dict, List, Optional from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import FileResponse -from pydantic import BaseModel +from fastapi.responses import HTMLResponse +from pydantic import BaseModel, Field +from snakemake.common.configfile import load_configfile -TEMPLATE_PATH = os.path.join( - os.path.dirname(__file__), "../configs/template_base_commented.json" -) +from . import _version +from .Params import samplesheet_to_settings + +__version__ = _version.get_versions()["version"] + +TEMPLATE_FILE = "template_base_commented.json" +NONE_WORKFLOW_KEYS = ["WORKFLOWS", "BINS", "MAXTHREADS", "SETTINGS", "VERSION"] -app = FastAPI(title="MONSDA Configurator Web Service") +app = FastAPI(title="MONSDA Configurator Web") -# Allow CORS for local development app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -23,224 +29,1482 @@ ) -class ConfigRequest(BaseModel): +class SamplesheetRequest(BaseModel): + samplesheet_path: str + + +class BuildConfigRequest(BaseModel): + config_name: str + output_dir: str + workflows: List[str] = Field(default_factory=list) + tools: Dict[str, List[str]] = Field(default_factory=dict) + maxthreads: str = "16" + settings: Optional[Dict[str, Any]] = None + samplesheet_path: Optional[str] = None + + +class ConditionFiles(BaseModel): + """Per-condition file selections from the wizard.""" + + condition: str # slash-separated condition path e.g. "Ecoli/WT" + fastq_dir: str = "" # directory containing FASTQ files for this condition + fastq_files: List[str] = Field(default_factory=list) # explicit file paths + sequencing: str = "paired" # paired | single + reference: str = "" # absolute path to genome FASTA + gtf: str = "" # absolute path to GTF annotation + gff: str = "" # absolute path to GFF annotation (optional) + decoy: str = "" # absolute path to decoy file (optional) + + +class ProjectRequest(BaseModel): + project_dir: str + project_name: str = "monsda" + condition_files: List[ConditionFiles] = Field(default_factory=list) + config: Optional[Dict[str, Any]] = None + settings: Optional[Dict[str, Any]] = None + + +class SaveConfigRequest(BaseModel): config_name: str - config: Dict[str, Any] output_dir: str + config: Dict[str, Any] + +def _template_candidates() -> List[str]: + base_dir = os.path.abspath(os.path.dirname(__file__)) + pythonversion = f"python{sys.version_info.major}.{sys.version_info.minor}" + install_share = base_dir.replace( + os.sep.join(["lib", pythonversion, "site-packages", "MONSDA"]), "share" + ) + + candidates = [ + # explicit override if user/admin wants to force a template location + os.environ.get("MONSDA_TEMPLATE_PATH", ""), + # local checkout layout (repo root/configs) + os.path.abspath(os.path.join(base_dir, "..", "configs", TEMPLATE_FILE)), + # package-relative layout if configs were bundled next to package + os.path.abspath(os.path.join(base_dir, "configs", TEMPLATE_FILE)), + # Configurator.py install layout logic: /share/MONSDA/configs + os.path.abspath( + os.path.join(install_share, "MONSDA", "configs", TEMPLATE_FILE) + ), + # generic venv/conda prefix share path + os.path.abspath( + os.path.join(sys.prefix, "share", "MONSDA", "configs", TEMPLATE_FILE) + ), + # last-resort: run directory configs + os.path.abspath(os.path.join(os.getcwd(), "configs", TEMPLATE_FILE)), + ] + + # de-duplicate while preserving order and dropping empty values + deduped: List[str] = [] + for c in candidates: + c = (c or "").strip() + if c and c not in deduped: + deduped.append(c) + return deduped + + +def resolve_template_path() -> str: + for path in _template_candidates(): + if os.path.isfile(path): + return path + tried = "\n - ".join(_template_candidates()) + raise FileNotFoundError( + "Could not find template_base_commented.json. Tried:\n - " + tried + ) def load_template() -> Dict[str, Any]: - with open(TEMPLATE_PATH, "r") as f: - return json.load(f) + return load_configfile(resolve_template_path()) -def strip_comments(d): +def strip_comments(d: Any) -> Any: if isinstance(d, dict): return {k: strip_comments(v) for k, v in d.items() if k != "comment"} - elif isinstance(d, list): + if isinstance(d, list): return [strip_comments(x) for x in d] + return d + + +def set_in_path(root: Dict[str, Any], path: List[str], value: Any) -> None: + node = root + for key in path[:-1]: + if key not in node or not isinstance(node[key], dict): + node[key] = {} + node = node[key] + node[path[-1]] = value + + +def get_condition_paths_from_settings(settings: Dict[str, Any]) -> List[List[str]]: + out: List[List[str]] = [] + + def _walk(node: Any, path: List[str]) -> None: + if not isinstance(node, dict): + return + if "SAMPLES" in node and isinstance(node.get("SAMPLES"), list): + out.append(path.copy()) + return + for k, v in node.items(): + if isinstance(v, dict): + _walk(v, path + [k]) + + _walk(settings, []) + return out + + +def build_workflow_block( + workflow_name: str, + workflow_template: Dict[str, Any], + condition_paths: List[List[str]], + selected_tools: List[str], +) -> Dict[str, Any]: + block: Dict[str, Any] = {} + + all_tools = workflow_template.get("TOOLS", {}) + if not selected_tools: + selected_tools = list(all_tools.keys()) + + if all_tools: + block["TOOLS"] = {k: all_tools[k] for k in selected_tools if k in all_tools} + + # carry workflow-level defaults if present + for passthrough in ["FEATURES", "CUTOFFS", "COMPARABLE", "EXCLUDE"]: + if passthrough in workflow_template: + block[passthrough] = copy.deepcopy(workflow_template[passthrough]) + + # populate per-condition tool settings like CLI configurator does + for cond_path in condition_paths: + for tool in selected_tools: + tool_def = workflow_template.get(tool) + if not isinstance(tool_def, dict): + continue + tool_def_no_comment = strip_comments(tool_def) + if not tool_def_no_comment: + continue + set_in_path(block, cond_path + [tool], copy.deepcopy(tool_def_no_comment)) + + return block + + +def build_config(req: BuildConfigRequest) -> Dict[str, Any]: + template = strip_comments(load_template()) + + if req.settings is not None: + settings = req.settings + elif req.samplesheet_path: + samplesheet_path = os.path.abspath(req.samplesheet_path) + if not os.path.isfile(samplesheet_path): + raise HTTPException( + status_code=400, + detail=f"Samplesheet does not exist: {samplesheet_path}", + ) + settings = samplesheet_to_settings(samplesheet_path) else: - return d + raise HTTPException( + status_code=400, + detail="Provide either settings JSON or samplesheet_path.", + ) - config_name: str - config: Dict[str, Any] - output_dir: str + condition_paths = get_condition_paths_from_settings(settings) + if not condition_paths: + raise HTTPException( + status_code=400, + detail="No condition leaves with SAMPLES found in SETTINGS.", + ) + + workflows = req.workflows or [] + if not workflows: + raise HTTPException( + status_code=400, + detail="At least one workflow must be selected.", + ) + + invalid = [w for w in workflows if w not in template or w in NONE_WORKFLOW_KEYS] + if invalid: + raise HTTPException( + status_code=400, + detail=f"Unknown workflow(s): {', '.join(invalid)}", + ) + + final_config: Dict[str, Any] = { + "WORKFLOWS": ", ".join(workflows), + "BINS": template.get("BINS", ""), + "MAXTHREADS": str(req.maxthreads), + "VERSION": __version__, + "SETTINGS": settings, + } + + for wf in workflows: + wf_template = template.get(wf, {}) + final_config[wf] = build_workflow_block( + wf, + wf_template, + condition_paths, + req.tools.get(wf, []), + ) + + return final_config @app.get("/template", response_model=Dict[str, Any]) -def get_template(): - """Get the config template (with comments).""" - return load_template() +def get_template() -> Dict[str, Any]: + try: + return load_template() + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to load template: {e}") @app.get("/template/fields", response_model=Dict[str, Any]) -def get_template_fields(): - """Get the config template (without comments).""" - return strip_comments(load_template()) +def get_template_fields() -> Dict[str, Any]: + try: + return strip_comments(load_template()) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to load template fields: {e}" + ) + + +def _normalize_path(path: str) -> str: + if path: + return os.path.abspath(os.path.expanduser(path.strip())) + return os.getcwd() + + +@app.get("/fs/roots", response_model=Dict[str, Any]) +def fs_roots() -> Dict[str, Any]: + cwd = os.getcwd() + home = os.path.expanduser("~") + roots = [] + for p in [cwd, home, os.path.sep]: + if p and p not in roots and os.path.isdir(p): + roots.append(p) + return {"roots": roots} + + +@app.get("/fs/list", response_model=Dict[str, Any]) +def fs_list(path: str = "", mode: str = "dirs") -> Dict[str, Any]: + mode = (mode or "dirs").lower() + if mode not in {"dirs", "all", "samplesheets"}: + raise HTTPException( + status_code=400, detail="mode must be dirs, all, or samplesheets" + ) + + current = _normalize_path(path) + if not os.path.isdir(current): + raise HTTPException(status_code=400, detail=f"Not a directory: {current}") + + try: + entries = list(os.scandir(current)) + except PermissionError: + raise HTTPException(status_code=403, detail=f"Permission denied: {current}") + + dirs = [] + files = [] + for e in sorted(entries, key=lambda x: x.name.lower()): + if e.name in {".", ".."}: + continue + if e.is_dir(follow_symlinks=True): + dirs.append({"name": e.name, "path": e.path}) + continue + if not e.is_file(follow_symlinks=True): + continue + if mode in {"all", "samplesheets"}: + files.append({"name": e.name, "path": e.path}) + + parent = ( + os.path.dirname(current.rstrip(os.sep)) + if current != os.path.sep + else os.path.sep + ) + return { + "current": current, + "parent": parent, + "dirs": dirs, + "files": files, + "mode": mode, + } + + +@app.post("/samplesheet/parse", response_model=Dict[str, Any]) +def parse_samplesheet(req: SamplesheetRequest) -> Dict[str, Any]: + path = os.path.abspath(req.samplesheet_path.strip()) + if not os.path.isfile(path): + raise HTTPException(status_code=400, detail=f"Samplesheet not found: {path}") + settings = samplesheet_to_settings(path) + return { + "samplesheet": path, + "settings": settings, + "conditions": [ + "/".join(p) for p in get_condition_paths_from_settings(settings) + ], + } + +@app.post("/config/preview", response_model=Dict[str, Any]) +def preview_config(req: BuildConfigRequest) -> Dict[str, Any]: + return {"config": build_config(req)} -@app.post("/generate_config") -def generate_config(req: ConfigRequest): - """Generate a config file from user input. User specifies output_dir.""" + +@app.post("/config/save", response_model=Dict[str, Any]) +def save_config(req: SaveConfigRequest) -> Dict[str, Any]: config_name = req.config_name.strip() - output_dir = os.path.abspath(req.output_dir.strip()) if not config_name or any(c in config_name for c in "/\\"): raise HTTPException(status_code=400, detail="Invalid config name.") - if not output_dir or not os.path.isdir(output_dir): - raise HTTPException(status_code=400, detail="Invalid output directory.") - config_path = os.path.join(output_dir, f"config_{config_name}.json") - with open(config_path, "w") as f: - json.dump(req.config, f, indent=4) - return {"message": "Config generated.", "path": config_path} - - -# Download config by full path (for security, restrict to files under allowed parent dir) -@app.get("/download_config/") -def download_config(config_path: str): - config_path = os.path.abspath(config_path) - if not os.path.exists(config_path): - raise HTTPException(status_code=404, detail="Config not found.") - if not config_path.endswith(".json"): - raise HTTPException(status_code=400, detail="Only .json files allowed.") - return FileResponse(config_path, filename=os.path.basename(config_path)) - - -class DirRequest(BaseModel): - config: Dict[str, Any] - project_dir: str + output_dir = os.path.abspath(req.output_dir.strip()) + if not os.path.isdir(output_dir): + raise HTTPException(status_code=400, detail="Output directory does not exist.") -def safe_makedirs(path): - os.makedirs(path, exist_ok=True) + path = os.path.join(output_dir, f"config_{config_name}.json") + with open(path, "w") as fh: + json.dump(req.config, fh, indent=4) + return {"message": "Config written.", "path": path} -def create_project_structure(config: Dict[str, Any], project_dir: str): - # Example: create folders for workflows, logs, counts, etc. - safe_makedirs(project_dir) - for wf in config.get("WORKFLOWS", "").split(","): - wf = wf.strip() - if wf: - safe_makedirs(os.path.join(project_dir, wf)) - safe_makedirs(os.path.join(project_dir, "LOGS")) - safe_makedirs(os.path.join(project_dir, "COUNTS")) - # Add more as needed +@app.post("/project/create", response_model=Dict[str, Any]) +def create_project(req: ProjectRequest) -> Dict[str, Any]: + base_dir = os.path.abspath(req.project_dir.strip()) + project_name = req.project_name.strip() or "monsda" -@app.post("/generate_project_dir") -def generate_project_dir(req: DirRequest): - project_dir = os.path.abspath(req.project_dir) - if not project_dir.startswith(os.getcwd()): - raise HTTPException( - status_code=400, detail="Project dir must be inside working directory." - ) - if os.path.exists(project_dir) and os.listdir(project_dir): - raise HTTPException( - status_code=400, detail="Project dir already exists and is not empty." - ) - create_project_structure(req.config, project_dir) - return {"message": "Project directory structure created.", "path": project_dir} + # Project name becomes a subdirectory under the chosen path + project_dir = os.path.join(base_dir, project_name) + os.makedirs(project_dir, exist_ok=True) + # FASTQ directly under project dir (no extra project_name level) + fastq_dir = os.path.join(project_dir, "FASTQ") + gen_dir = os.path.join(project_dir, "GENOMES") + os.makedirs(fastq_dir, exist_ok=True) + os.makedirs(gen_dir, exist_ok=True) + + settings = req.settings or (req.config.get("SETTINGS") if req.config else None) + linked_files: List[str] = [] + warnings: List[str] = [] + + # Build lookup: condition_path -> ConditionFiles + cond_lookup: Dict[str, ConditionFiles] = {} + for cf in req.condition_files: + cond_lookup[cf.condition] = cf + + if settings and isinstance(settings, dict): + condition_paths = get_condition_paths_from_settings(settings) + for cond_path in condition_paths: + cond_key = "/".join(cond_path) + cond_info = cond_lookup.get(cond_key) + + # Create condition subdirectories under FASTQ + cond_dir = os.path.join(fastq_dir, *cond_path) + os.makedirs(cond_dir, exist_ok=True) + + # Walk to the leaf node to find SAMPLES + node = settings + for key in cond_path: + node = node.get(key, {}) + + samples = node.get("SAMPLES", []) + + # Link FASTQ files: prefer explicit file list, fall back to directory+sample matching + if cond_info and cond_info.fastq_files: + # Explicit file selection mode - link exactly the files the user chose + for fpath in cond_info.fastq_files: + src_file = os.path.abspath(fpath) + if not os.path.isfile(src_file): + warnings.append(f"FASTQ file not found: {fpath}") + continue + dst = os.path.join(cond_dir, os.path.basename(src_file)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(src_file), dst) + linked_files.append(dst) + # Handle paired-end mate + if cond_info.sequencing == "paired": + basename = os.path.basename(src_file) + if "_R1" in basename: + mate_name = basename.replace("_R1", "_R2") + elif "_1." in basename: + mate_name = basename.replace("_1.", "_2.") + else: + mate_name = None + if mate_name: + mate_src = os.path.join( + os.path.dirname(src_file), mate_name + ) + mate_dst = os.path.join(cond_dir, mate_name) + if os.path.isfile(mate_src) and not os.path.exists( + mate_dst + ): + os.symlink(os.path.realpath(mate_src), mate_dst) + linked_files.append(mate_dst) + + elif cond_info and cond_info.fastq_dir and isinstance(samples, list): + # Directory mode - find files matching sample names from samplesheet + src_dir = os.path.abspath(cond_info.fastq_dir) + if os.path.isdir(src_dir): + for sample in samples: + # Find matching files (sample name may or may not have extension) + matched = _find_fastq_files(src_dir, sample) + if not matched: + warnings.append( + f"No FASTQ file found for sample '{sample}' in {src_dir}" + ) + continue + for src_file in matched: + dst = os.path.join(cond_dir, os.path.basename(src_file)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(src_file), dst) + linked_files.append(dst) + # Handle paired-end mate + if cond_info.sequencing == "paired": + basename = os.path.basename(src_file) + if "_R1" in basename: + mate_name = basename.replace("_R1", "_R2") + elif "_1." in basename: + mate_name = basename.replace("_1.", "_2.") + elif "_R2" in basename: + mate_name = basename.replace("_R2", "_R1") + elif "_2." in basename: + mate_name = basename.replace("_2.", "_1.") + else: + mate_name = None + if mate_name: + mate_src = os.path.join(src_dir, mate_name) + mate_dst = os.path.join(cond_dir, mate_name) + if os.path.isfile(mate_src) and not os.path.exists( + mate_dst + ): + os.symlink(os.path.realpath(mate_src), mate_dst) + linked_files.append(mate_dst) + else: + warnings.append( + f"FASTQ directory not found for condition '{cond_key}': {src_dir}" + ) + + # Link genome files: REFERENCE, DECOY, GTF, GFF into GENOMES/ + ref_path = (cond_info.reference if cond_info else "") or node.get( + "REFERENCE", "" + ) + if ref_path and os.path.isfile(ref_path): + dst = os.path.join(gen_dir, os.path.basename(ref_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(ref_path), dst) + linked_files.append(dst) + # Update settings path to relative + rel = os.path.relpath(dst, start=project_dir) + node["REFERENCE"] = rel + elif ref_path: + warnings.append(f"REFERENCE not found: {ref_path}") + + decoy_path = (cond_info.decoy if cond_info else "") or node.get("DECOY", "") + if decoy_path and os.path.isfile(decoy_path): + dst = os.path.join(gen_dir, os.path.basename(decoy_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(decoy_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + node["DECOY"] = rel + + gtf_path = (cond_info.gtf if cond_info else "") or "" + anno = node.get("ANNOTATION", {}) + if isinstance(anno, dict): + gtf_path = gtf_path or anno.get("GTF", "") + if gtf_path and os.path.isfile(gtf_path): + dst = os.path.join(gen_dir, os.path.basename(gtf_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(gtf_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + if isinstance(anno, dict): + anno["GTF"] = rel + node["ANNOTATION"] = anno + elif gtf_path: + warnings.append(f"GTF not found: {gtf_path}") + + gff_path = (cond_info.gff if cond_info else "") or "" + if isinstance(anno, dict): + gff_path = gff_path or anno.get("GFF", "") + if gff_path and os.path.isfile(gff_path): + dst = os.path.join(gen_dir, os.path.basename(gff_path)) + if not os.path.exists(dst): + os.symlink(os.path.realpath(gff_path), dst) + linked_files.append(dst) + rel = os.path.relpath(dst, start=project_dir) + if isinstance(anno, dict): + anno["GFF"] = rel + node["ANNOTATION"] = anno + elif gff_path: + warnings.append(f"GFF not found: {gff_path}") + + # Write config if provided (with updated relative paths in SETTINGS) + config_path = "" + if req.config: + if settings: + req.config["SETTINGS"] = settings + config_file = f"config_{project_name}.json" + config_path = os.path.join(project_dir, config_file) + with open(config_path, "w") as fh: + json.dump(req.config, fh, indent=4) + + return { + "message": "Project created.", + "path": project_dir, + "config_path": config_path, + "linked_files": len(linked_files), + "warnings": warnings, + } -from fastapi.responses import HTMLResponse + +def _find_fastq_files(src_dir: str, sample_name: str) -> List[str]: + """Find FASTQ files in src_dir matching a sample name. + + Matches: exact filename, or sample_name*.fastq.gz / .fq.gz / .fastq / .fq + Only returns R1 (or unpaired) to avoid double-linking. + """ + import glob as _glob + + candidates: List[str] = [] + # Try exact match first + exact = os.path.join(src_dir, sample_name) + if os.path.isfile(exact): + return [exact] + + # Try common FASTQ extensions + for ext in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]: + path = os.path.join(src_dir, sample_name + ext) + if os.path.isfile(path): + candidates.append(path) + # Try with _R1 suffix + path_r1 = os.path.join(src_dir, sample_name + "_R1" + ext) + if os.path.isfile(path_r1): + candidates.append(path_r1) + # Try with _1 suffix + path_1 = os.path.join(src_dir, sample_name + "_1" + ext) + if os.path.isfile(path_1): + candidates.append(path_1) + + if candidates: + return candidates + + # Glob fallback: any file starting with sample_name + pattern = os.path.join(src_dir, sample_name + "*") + matches = _glob.glob(pattern) + # Filter to only fastq-like files and only R1/unpaired + for m in sorted(matches): + if os.path.isfile(m): + lower = m.lower() + if any(lower.endswith(e) for e in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]): + # Skip R2/_2 to avoid double-linking + base = os.path.basename(m) + if "_R2" in base or "_2." in base: + continue + candidates.append(m) + + return candidates @app.get("/", response_class=HTMLResponse) -def root(): +def root() -> str: return """ - + - - MONSDA Configurator - + + MONSDA Web Configurator + + + -

MONSDA Configurator

-
-

Step 1: Edit Configuration

- - - -
-
-

Step 2: Generate Config File

- - - - - - -
-
-
-

Step 3: Create Project Directory

- - - - -
- + } + walkSettings(settings, []); + + if (!conditions.length) { + setStatus('projectStatus', 'No conditions with SAMPLES found in settings.', false); + return; + } + + const wizard = document.getElementById('conditionWizard'); + wizard.innerHTML = `
${conditions.length} condition(s) found. For each, select FASTQ files by directory (auto-matched by sample name) or pick individual files.
`; + + conditions.forEach((c, idx) => { + const id = `cond_${idx}`; + const hasSamples = c.samples && c.samples.length > 0; + const samplesStr = hasSamples + ? c.samples.slice(0, 5).join(', ') + (c.samples.length > 5 ? ` (+${c.samples.length - 5} more)` : '') + : '(no samples defined)'; + const card = document.createElement('div'); + card.className = 'cond-card'; + card.dataset.condition = c.path; + card.innerHTML = ` +
+ ${escHtml(c.path)} + ${escHtml(samplesStr)} +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+ +
+ + +
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+
+ `; + wizard.appendChild(card); + }); + setStatus('projectStatus', `Loaded ${conditions.length} condition(s). Fill in file paths and click Create Project.`); +} + +function toggleFqMode(id, mode) { + document.getElementById(id + '_dirpanel').style.display = mode === 'dir' ? '' : 'none'; + document.getElementById(id + '_filespanel').style.display = mode === 'files' ? '' : 'none'; +} + +function addFastqFile(id) { + openPathBrowser('', 'multifiles', (paths) => { + const list = document.getElementById(id + '_filelist'); + paths.forEach(path => { + // Avoid duplicates + let exists = false; + list.querySelectorAll('li').forEach(li => { if (li.dataset.path === path) exists = true; }); + if (exists) return; + const li = document.createElement('li'); + const span = document.createElement('span'); + span.textContent = path; + const rmBtn = document.createElement('button'); + rmBtn.type = 'button'; + rmBtn.textContent = '\u00d7'; + rmBtn.title = 'Remove'; + rmBtn.addEventListener('click', () => li.remove()); + li.appendChild(span); + li.appendChild(rmBtn); + li.dataset.path = path; + list.appendChild(li); + }); + }); +} + +window.onload = () => { + loadTemplate(); + switchOutputMode('config'); +}; + + """ diff --git a/README.md b/README.md index b3dfedfc..9271ea91 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,12 @@ More information can be found in the official [documentation](https://monsda.rea ## How does it work -This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake``` and the ```config.json``` file. +This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake|Nextflow``` and the ```config.json``` file. The ```config.json``` holds all the information that is needed to run the jobs and will be parsed by ```MONSDA.py``` and split into sub-configs that can later be found in the directory ```SubSnakes``` or ```SubFlows``` respectively. To successfully run an analysis pipeline, a few steps have to be followed: * Directory structure: The structure for the directories is dictated by the condition-tree in the config file - * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake/Nextflow``` workflow files to run each subworkflow until all processing steps are done. + * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake|Nextflow``` workflow files to run each subworkflow until all processing steps are done. ## Run the pipeline diff --git a/configs/samplesheet_template.csv b/configs/samplesheet_template.csv new file mode 100644 index 00000000..0d553361 --- /dev/null +++ b/configs/samplesheet_template.csv @@ -0,0 +1,7 @@ +CONDITION,SAMPLE,GROUP,SEQUENCING,REFERENCE,GTF,GFF,INDEX,PREFIX,DECOY,TYPE,BATCH,IP +FGUMI/WT/dummylevel,Sample1,ctrl,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +FGUMI/WT/dummylevel,Sample2,ctrl,,,,,,,,, +FGUMI/WT/dummylevel,Sample3,ctrl,,,,,,,,, +FGUMI/KO,Sample4,ko,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,, +FGUMI/KO,Sample5,ko,,,,,,,,, +FGUMI/KO,Sample6,ko,,,,,,,,, diff --git a/configs/template.json b/configs/template.json index 9ab0bb95..ad52f0ed 100644 --- a/configs/template.json +++ b/configs/template.json @@ -120,7 +120,8 @@ "DEDUP": { #options for deduplication for each sample/condition "TOOLS": { "umitools": "umi_tools", - "picard": "picard" + "picard": "picard", + "fgumi": "fgumi" }, "id": { "condition": { @@ -141,6 +142,13 @@ "JAVA" : "",# options "DEDUP": "" # dedup options } + }, + "fgumi":{ + "OPTIONS": + { + "EXTRACT": "", # fgumi extract options, e.g. --read-structures 8M+T +T + "DEDUP": "" # fgumi dedup options + } } } } @@ -233,8 +241,8 @@ "salmon": { "OPTIONS": { - "INDEX": "-l A --gencode", # salmon index options - "QUANT": "--gcBias", # salmon quant options + "INDEX": "", # salmon index options + "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options } } } @@ -454,8 +462,8 @@ { "OPTIONS": { - "INDEX": "-l A --gencode", # salmon index options - "QUANT": "--gcBias", # salmon quant options + "INDEX": "", # salmon index options + "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options "DTU": "" # Options for Analysis } }, @@ -463,8 +471,8 @@ { "OPTIONS": { - "INDEX": "-l A --gencode", # salmon index options - "QUANT": "--gcBias", # salmon quant options + "INDEX": "", # salmon index options + "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options "DTU": "" # Options for Analysis } } diff --git a/configs/template_base_commented.json b/configs/template_base_commented.json index 8b2c1388..93928bb8 100644 --- a/configs/template_base_commented.json +++ b/configs/template_base_commented.json @@ -58,22 +58,36 @@ }, "BASECALL": { "TOOLS": { - "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller" + "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller", + "dorado": "dorado" }, "ENV" : "", "BIN" : "", "guppy": { "comment": { - "BASECALL": "Guppy options here if any, paired is not required, will be resolved by rules" + "BASECALL": "Guppy caller options here if any", + "MODEL": "Guppy model options here if any" }, "OPTIONS": { - "BASECALL": "" + "BASECALL": "", + "MODEL": "" + } + }, + "dorado": { + "comment": { + "CALLER": "Dorado caller options here if any", + "MODEL": "Dorado model options here if any" + }, + "OPTIONS": { + "CALLER": "", + "MODEL": "" } } }, "QC": { "TOOLS": { - "fastqc": "fastqc" + "fastqc": "fastqc", + "rustqc": "rustqc" }, "ENV" : "", "BIN" : "", @@ -86,13 +100,24 @@ "QC": "", "MULTI": "" } + }, + "rustqc": { + "comment": { + "QC": "RustQC options here if any, post-alignment QC", + "MULTI": "MultiQC options for rustqc if any" + }, + "OPTIONS": { + "QC": "", + "MULTI": "" + } } }, "TRIMMING": { "TOOLS": { "trimgalore": "trim_galore", "cutadapt": "cutadapt", - "bbduk": "bbmap" + "bbduk": "bbmap", + "fastp": "fastp" }, "ENV" : "", "BIN" : "", @@ -119,11 +144,21 @@ "OPTIONS": { "TRIM": "-q 15 --length 8 -e 0.15" } + }, + "fastp": { + "comment": { + "TRIM": "Trimming options here, --paired is not required, will be resolved by rules" + }, + "OPTIONS": { + "TRIM": "-q 15 -l 8" + } } }, "DEDUP": { "TOOLS": { - "umitools": "umi_tools" + "umitools": "umi_tools", + "picard": "picard", + "fgumi": "fgumi" }, "ENV" : "", "BIN" : "", @@ -150,6 +185,18 @@ "JAVA" : "", "DEDUP": "" } + }, + "fgumi": { + "comment": + { + "EXTRACT": "fgumi extract options, e.g. --read-structures 8M+T +T", + "DEDUP": "fgumi dedup options" + }, + "OPTIONS": + { + "EXTRACT": "", + "DEDUP": "" + } } }, "MAPPING": { @@ -275,8 +322,8 @@ }, "OPTIONS": { - "INDEX": "-l A --gencode", - "QUANT": "--gcBias" + "INDEX": "", + "QUANT": "--validateMappings --seqBias --gcBias" } } }, @@ -481,8 +528,8 @@ "DTU": "Options for DTU Analysis, leave blank unless you know exactly what you are doing, can be used to set cutoffs for filtering, e.g. min_samps_feature_expr, min_gene_expr ..." }, "OPTIONS": { - "INDEX": "-l A --gencode -d GENOMES/salmon_decoy", - "QUANT": "--gcBias", + "INDEX": "-d GENOMES/salmon_decoy", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } }, @@ -493,8 +540,8 @@ "DTU": "Options for DTU Analysis, leave blank unless you know exactly what you are doing, , can be used to set cutoffs for filtering, e.g. min_samps_feature_expr, min_gene_expr ..." }, "OPTIONS": { - "INDEX": "-l A --gencode -d GENOMES/salmon_decoy", - "QUANT": "--gcBias", + "INDEX": "-d GENOMES/salmon_decoy", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } } diff --git a/configs/template_clean.json b/configs/template_clean.json index 825e7335..78ea39f1 100644 --- a/configs/template_clean.json +++ b/configs/template_clean.json @@ -110,7 +110,8 @@ }, "DEDUP": { "TOOLS": { - "umitools": "umi_tools" + "umitools": "umi_tools", + "fgumi": "fgumi" }, "id": { "condition": { @@ -128,6 +129,13 @@ "JAVA" : "", "DEDUP": "" } + }, + "fgumi": { + "OPTIONS": + { + "EXTRACT": "", + "DEDUP": "" + } } } } @@ -224,8 +232,8 @@ "salmon": { "OPTIONS": { - "INDEX": "-l A --gencode", - "QUANT": "--gcBias" + "INDEX": "", + "QUANT": "--validateMappings --seqBias --gcBias" } } } @@ -428,8 +436,8 @@ { "OPTIONS": { - "INDEX": "-l A --gencode", - "QUANT": "--gcBias", + "INDEX": "", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } }, @@ -437,8 +445,8 @@ { "OPTIONS": { - "INDEX": "-l A --gencode", - "QUANT": "--gcBias", + "INDEX": "", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } } diff --git a/configs/template_umicollapse.json b/configs/template_umicollapse.json new file mode 100644 index 00000000..cff24128 --- /dev/null +++ b/configs/template_umicollapse.json @@ -0,0 +1,118 @@ +{ + "WORKFLOWS": "QC,MAPPING,DEDUP,COUNTING", + "BINS": "", + "MAXTHREADS": "20", + "VERSION": "1.4.0", + "SETTINGS": { + "id": { + "condition": { + "SAMPLES": [ + "rep_1", + "rep_2" + ], + "GROUPS": [ + "WT", + "WT" + ], + "TYPES": [ + "standard", + "standard" + ], + "BATCHES": [ + "1", + "1" + ], + "SEQUENCING": "paired", + "REFERENCE": "GENOMES/Dm6/dm6.fa.gz", + "INDEX": "GENOMES/Dm6/INDICES/star", + "PREFIX": "idx", + "ANNOTATION": { + "GTF": "GENOMES/Dm6/dm6.gtf.gz", + "GFF": "GENOMES/Dm6/dm6.gff3.gz" + } + } + } + }, + "QC": { + "TOOLS": { + "fastqc": "fastqc" + }, + "id": { + "condition": { + "fastqc": { + "OPTIONS": { + "QC": "", + "MULTI": "" + } + } + } + } + }, + "MAP": { + "TOOLS": { + "star": "STAR" + }, + "id": { + "condition": { + "star": { + "OPTIONS": { + "INDEX": "--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --genomeSAindexNbases 13", + "MAP": "--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --outSAMprimaryFlag AllBestScore --outSAMattributes NH HI NM MD AS nM jM jI XS", + "EXTENSION": "" + } + } + } + } + }, + "DEDUP": { + "TOOLS": { + "umicollapse": "umicollapse", + "umitools": "umi_tools", + "fgumi": "fgumi", + "picard": "picard" + }, + "id": { + "condition": { + "umicollapse": { + "OPTIONS": { + "DEDUP": "--algo dir --umi-sep _" + } + }, + "umitools": { + "OPTIONS": { + "WHITELIST": "--extract-method string --bc-pattern 'XNNNNX'", + "EXTRACT": "--extract-umi-method read_id", + "DEDUP": "" + } + }, + "fgumi": { + "OPTIONS": { + "EXTRACT": "", + "DEDUP": "" + } + }, + "picard": { + "OPTIONS": { + "JAVA": "", + "DEDUP": "" + } + } + } + } + }, + "COUNTING": { + "TOOLS": { + "salmon": "salmon" + }, + "id": { + "condition": { + "salmon": { + "OPTIONS": { + "INDEX": "-k 31", + "COUNTING": "-l U --validateMappings" + } + } + } + } + } +} diff --git a/configs/tutorial_exhaustive.json b/configs/tutorial_exhaustive.json index e811dec3..da2cecac 100644 --- a/configs/tutorial_exhaustive.json +++ b/configs/tutorial_exhaustive.json @@ -128,7 +128,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -143,6 +144,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -158,6 +165,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } diff --git a/configs/tutorial_postprocess.json b/configs/tutorial_postprocess.json index aa5ba370..79ca138d 100644 --- a/configs/tutorial_postprocess.json +++ b/configs/tutorial_postprocess.json @@ -122,7 +122,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -137,6 +138,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -152,6 +159,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } diff --git a/configs/tutorial_toolmix.json b/configs/tutorial_toolmix.json index e3b080fa..2f91f471 100644 --- a/configs/tutorial_toolmix.json +++ b/configs/tutorial_toolmix.json @@ -120,7 +120,8 @@ "TOOLS" : { "trimgalore": "trim_galore", - "cutadapt": "cutadapt" + "cutadapt": "cutadapt", + "fastp": "fastp" }, "Ecoli": { "KO": { @@ -135,6 +136,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } }, "WT": { @@ -150,6 +157,12 @@ { "TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules } + }, + "fastp":{ + "OPTIONS": + { + "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules + } } } } diff --git a/containers/apptainer/fgumi.def b/containers/apptainer/fgumi.def new file mode 100644 index 00000000..4d305696 --- /dev/null +++ b/containers/apptainer/fgumi.def @@ -0,0 +1,23 @@ +Bootstrap: docker +From: continuumio/miniconda3 + +%files + /home/fall/MONSDA/envs/fgumi.yaml /opt/envs/ + ${HOME}/MONSDA/scripts /opt/MONSDA/ + +%environment + +%post + ls -alrt /opt/envs + chmod -R +x /opt/envs/fgumi.yaml + + ENV_NAME=fgumi + echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT + echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT + + . /opt/conda/etc/profile.d/conda.sh + conda env create -f /opt/envs/fgumi.yaml -p /opt/conda/envs/$ENV_NAME + conda clean --all + +%runscript + exec "$@" diff --git a/containers/apptainer/rustqc.def b/containers/apptainer/rustqc.def new file mode 100644 index 00000000..2d9f57b8 --- /dev/null +++ b/containers/apptainer/rustqc.def @@ -0,0 +1,23 @@ +Bootstrap: docker +From: continuumio/miniconda3 + +%files + /home/fall/MONSDA/envs/rustqc.yaml /opt/envs/ + ${HOME}/MONSDA/scripts /opt/MONSDA/ + +%environment + +%post + ls -alrt /opt/envs + chmod -R +x /opt/envs/rustqc.yaml + + ENV_NAME=rustqc + echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT + echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT + + . /opt/conda/etc/profile.d/conda.sh + conda env create -f /opt/envs/rustqc.yaml -p /opt/conda/envs/$ENV_NAME + conda clean --all + +%runscript + exec "$@" diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 2d1afe53..ebd55264 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -57,10 +57,10 @@ This slightly more complex use case involves multiple input files, two condition Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard The more complex config for this analysis follows @@ -74,7 +74,7 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_toolmix.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc*/*rustqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls. A successful run will show the message 'Workflow finished, no error' at the end. @@ -86,10 +86,10 @@ This postprocessing use case involves multiple input files, two conditions (WT/K Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard - DE: Differential Expression analysis with EdgeR and DESeq2 - DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime) - COUNTING: Read counting with FeatureCounts @@ -108,7 +108,7 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_postprocess.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC*/*RustQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal. @@ -122,10 +122,10 @@ This exhaustive use case involves multiple input files, two conditions (WT/KO) a Workflows include: - FETCH: Download from SRA - - QC: FASTQC of input and output + - QC: FASTQC and RustQC of input and output - TRIMMING: Adaptor removal with cutadapt/trimgalore - MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2 - - DEDUP: Read deduplication with umi_tools and picard + - DEDUP: Read deduplication with umi_tools, fgumi and picard - DE: Differential Expression analysis with EdgeR and DESeq2 - DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime) - DAS: Differential Alternative Splicing analysis with EdgeR and DIEGO @@ -186,6 +186,6 @@ Starting the run with 12 cores (defining more will be capped by the config file monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_exhaustive.json --directory ${PWD} -Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. +Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC/RustQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools/fgumi (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls. A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal. \ No newline at end of file diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst index cccf07e6..008756bc 100644 --- a/docs/source/workflows.rst +++ b/docs/source/workflows.rst @@ -84,6 +84,8 @@ This workflow step can be run as preprocessing step if none of the processing wo +============================+============================================================+=========+=========+=========================================================================+============+===========+ | FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML | +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ + | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG | + +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ PROCESSING @@ -103,6 +105,8 @@ If any of the below listed processing steps is defined in the config.json, quali +============================+============================================================+=========+=========+=========================================================================+============+===========+ | FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML | +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ + | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG | + +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+ Trimming @@ -173,6 +177,8 @@ Deduplicate reads by UMI or based on mapping position and CIGAR string +===============+====================================================================================================================================================+===========+============+====================================================================================================+======================+============+ | UMI-tools | UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes. | umitools | umi_tools | `umitools `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM | +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ + | fgumi | High-performance tools for UMI-tagged sequencing data including UMI extraction and UMI-aware deduplication. | fgumi | fgumi | `fgumi `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ | Picard tools | A better duplication marking algorithm that handles all cases including clipped and gapped alignments. | picard | picard | `picard `_ | BAM | BAM | +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+ diff --git a/envs/fgumi.yaml b/envs/fgumi.yaml new file mode 100644 index 00000000..9bffb96d --- /dev/null +++ b/envs/fgumi.yaml @@ -0,0 +1,9 @@ +name: fgumi +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - samtools =1.21 + - fgumi =0.1.3 + - dateutils =0.6.12 diff --git a/envs/monsda.yaml b/envs/monsda.yaml index 39538a1d..697dc095 100644 --- a/envs/monsda.yaml +++ b/envs/monsda.yaml @@ -4,29 +4,29 @@ channels: - bioconda - nodefaults dependencies: - - biopython =1.86 - - fastapi =0.128.5 + - biopython =1.87 + - fastapi =0.136.1 - grep >=3.4 - - isort =7.0.0 + - isort =8.0.1 - monsda =1.4.0 - natsort =8.4.0 - - nextflow =25.10.3 - - numpy =2.4.2 + - nextflow =26.04.0 + - numpy =2.4.3 - pandas =2.3.3 - perl =5.32.1 - pip >=25.3 - python =3.12.2 - pyyaml =6.0.3 - - scipy =1.17.0 - - snakemake =9.16.3 - - snakemake-executor-plugin-slurm =2.1.0 + - scipy =1.17.1 + - snakemake =9.20.0 + - snakemake-executor-plugin-slurm =2.6.1 - snakemake-executor-plugin-cluster-generic =1.0.9 - - snakemake-interface-common =1.22.0 - - snakemake-interface-executor-plugins =9.3.9 + - snakemake-interface-common =1.23.0 + - snakemake-interface-executor-plugins =9.4.0 - snakemake-interface-report-plugins =1.3.0 - - snakemake-interface-storage-plugins =4.3.2 + - snakemake-interface-storage-plugins =4.4.1 - snakemake-storage-plugin-s3 =0.3.6 - snakemake-storage-plugin-ftp =0.1.3 - - snakemake-storage-plugin-http =0.3.0 + - snakemake-storage-plugin-http =0.3.1 - snakemake-storage-plugin-zenodo =0.1.5 - - uvicorn =0.40.0 \ No newline at end of file + - uvicorn =0.46.0 \ No newline at end of file diff --git a/envs/monsda_min.yaml b/envs/monsda_min.yaml index 5b0f3cdb..b2939fa2 100644 --- a/envs/monsda_min.yaml +++ b/envs/monsda_min.yaml @@ -4,28 +4,28 @@ channels: - bioconda - nodefaults dependencies: - - biopython =1.86 - - fastapi =0.128.5 + - biopython =1.87 + - fastapi =0.136.1 - grep >=3.4 - - isort =7.0.0 + - isort =8.0.1 - natsort =8.4.0 - - nextflow =25.10.3 - - numpy =2.4.2 + - nextflow =26.04.0 + - numpy =2.4.3 - pandas =2.3.3 - perl =5.32.1 - pip >=25.3 - python =3.12.2 - pyyaml =6.0.3 - - scipy =1.17.0 - - snakemake =9.16.3 - - snakemake-executor-plugin-slurm =2.1.0 + - scipy =1.17.1 + - snakemake =9.20.0 + - snakemake-executor-plugin-slurm =2.6.1 - snakemake-executor-plugin-cluster-generic =1.0.9 - - snakemake-interface-common =1.22.0 - - snakemake-interface-executor-plugins =9.3.9 + - snakemake-interface-common =1.23.0 + - snakemake-interface-executor-plugins =9.4.0 - snakemake-interface-report-plugins =1.3.0 - - snakemake-interface-storage-plugins =4.3.2 + - snakemake-interface-storage-plugins =4.4.1 - snakemake-storage-plugin-s3 =0.3.6 - snakemake-storage-plugin-ftp =0.1.3 - - snakemake-storage-plugin-http =0.3.0 + - snakemake-storage-plugin-http =0.3.1 - snakemake-storage-plugin-zenodo =0.1.5 - uvicorn =0.40.0 diff --git a/envs/salmon.yaml b/envs/salmon.yaml index 071b81f9..cfff7361 100644 --- a/envs/salmon.yaml +++ b/envs/salmon.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - bioconda dependencies: - - salmon =1.10.3 \ No newline at end of file + - salmon =1.11.4 \ No newline at end of file diff --git a/envs/umicollapse.yaml b/envs/umicollapse.yaml new file mode 100644 index 00000000..06018058 --- /dev/null +++ b/envs/umicollapse.yaml @@ -0,0 +1,10 @@ +name: umicollapse +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - samtools =1.21 + - umicollapse + - umi_tools =1.1.2 + - dateutils =0.6.12 diff --git a/profile_nextflow/nextflow.config b/profile_nextflow/nextflow.config index aec407b5..6d15e289 100644 --- a/profile_nextflow/nextflow.config +++ b/profile_nextflow/nextflow.config @@ -1,27 +1,82 @@ +// Queue-agnostic fallback defaults for ALL processes. +// These apply regardless of which executor/profile is active. +// Inline process directives (e.g. memory 16.GB in sortsam) take precedence. +process { + cpus = 1 + errorStrategy = 'retry' + maxRetries = 2 + memory = { 20.GB * (1 << ((task.attempt ?: 1) - 1)) } + time = { 8.h * (1 << ((task.attempt ?: 1) - 1)) } +} + profiles { - slurm { - process.executor = 'slurm' - process.memory = '10 GB' - process.queue = 'main' - withName: '_idx|_map' { - memory = '160GB' + slurm { + process.executor = 'slurm' + process.queue = 'c' // default partition – override per-rule below + + // Non-default overrides only – rules that need more than the global defaults. + process { + withName: 'bwa_idx|bwa2_idx|bwameth_idx|hisat2_idx|minimap_idx|segemehl_idx|segemehl3_idx|star_idx' { + memory = { 80.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'bwa_mapping|bwa2_mapping|bwameth_mapping|hisat2_mapping|minimap_mapping|segemehl_mapping|segemehl3_mapping|star_mapping' { + memory = { 80.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'salmon_idx|kallisto_idx' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'mqc' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'run_deseq2|run_edger' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + queue = 'medium' + time = { 18.h * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'run_drimseq|run_dexseq' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + queue = 'long' + time = { 120.h * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'atacorrect' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'bindetect' { + memory = { 60.GB * (1 << ((task.attempt ?: 1) - 1)) } + } + withName: 'gridss_call|gridss_setupreference' { + memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) } + cpus = 8 + } + } } - } - local { - process.executor = 'local' - } + local { + process.executor = 'local' + } - apptainer { - conda.enabled = true - apptainer.enabled = true - apptainer.autoMounts = true - docker.enabled = false - process { - withName: '_idx|_map' { - memory = '160GB' + apptainer { + conda.enabled = true + apptainer.enabled = true + apptainer.autoMounts = true + docker.enabled = false + process { + beforeScript = ''' + if [[ -n "${CONDA_PREFIX:-}" && -f "${CONDA_PREFIX}/etc/profile.d/conda.sh" ]]; then + source "${CONDA_PREFIX}/etc/profile.d/conda.sh" + elif [[ -n "${CONDA_EXE:-}" ]]; then + _conda_base="$(dirname "$(dirname "${CONDA_EXE}")")" + source "${_conda_base}/etc/profile.d/conda.sh" + elif [[ -f "${HOME}/miniconda3/etc/profile.d/conda.sh" ]]; then + source "${HOME}/miniconda3/etc/profile.d/conda.sh" + elif [[ -f "${HOME}/anaconda3/etc/profile.d/conda.sh" ]]; then + source "${HOME}/anaconda3/etc/profile.d/conda.sh" + else + echo "Could not locate conda.sh" >&2 + exit 1 + fi + conda activate apptainer + ''' } - beforeScript = 'source ~/anaconda3/etc/profile.d/conda.sh; conda activate apptainer; export PATH=${PATH}:${CONDA_PREFIX}/bin' } - } } \ No newline at end of file diff --git a/profile_snakemake/cluster_config.yaml b/profile_snakemake/cluster_config.yaml index 226538a4..f9f7b122 100644 --- a/profile_snakemake/cluster_config.yaml +++ b/profile_snakemake/cluster_config.yaml @@ -1,6 +1,6 @@ __default__: - account: user # your account name - partition: main # the partition to use + account: ${USER} # your account name + partition: c # the partition to use time: 1500 # default time (minutes) nodes: 1 output: "/SLURMLOG/{rule}.{wildcards}.out" @@ -13,3 +13,6 @@ generate_index: mapping: mem: 200GB + +multiqc: + mem: 30GB diff --git a/profile_snakemake/config.v8+.yaml b/profile_snakemake/config.v8+.yaml old mode 100644 new mode 100755 index 548731c7..a984a4d2 --- a/profile_snakemake/config.v8+.yaml +++ b/profile_snakemake/config.v8+.yaml @@ -1,41 +1,52 @@ executor: slurm -cluster-generic-submit-cmd: - mkdir -p LOGS/SLURM/{rule} && - sbatch - --partition={resources.partition} - --cpus-per-task={threads} - --mem={resources.mem} - --account={resources.account} - --job-name=smk-{rule}-{wildcards} - --output=LOGS/SLURM/{rule}/{rule}-{wildcards}-%j.out - --parsable + default-resources: - slurm_account: "user" # your account name - slurm_partition: "c" # the partition to use + slurm_account: "${SLURM_ACCOUNT}" # your account name + slurm_partition: "${SLURM_PARTITION}" # the partition to use nodes: 1 - runtime: "8h" - mem: "20GB" # default memory + mem_mb: "20000 * (2 ** (attempt - 1))" + runtime: "480 * (2 ** (attempt - 1))" set-resources: generate_index: - mem: "160GB" + mem_mb: "80000 * (2 ** (attempt - 1))" mapping: - mem: "160GB" - sortsam: - mem: "30GB" - count_mappers: - mem: "40GB" + mem_mb: "80000 * (2 ** (attempt - 1))" multiqc: - mem: "100GB" + mem_mb: "40000 * (2 ** (attempt - 1))" run_deseq2: - mem: "40GB" - slurm_extra: "'--qos=medium'" + mem_mb: "40000 * (2 ** (attempt - 1))" + slurm_qos: "medium" + runtime: "18h" + run_DTU: + mem_mb: "40000 * (2 ** (attempt - 1))" + slurm_qos: "long" + runtime: "120h" + run_edger: + mem_mb: "40000 * (2 ** (attempt - 1))" + slurm_qos: "medium" runtime: "18h" + salmon_index: + mem_mb: "40000 * (2 ** (attempt - 1))" + sortsam: + mem_mb: "16000 * (2 ** (attempt - 1))" + slurm_qos: "rapid" + runtime: "1h" + atacorrect: + mem_mb: "40000 * (2 ** (attempt - 1))" + bindetect: + mem_mb: "60000 * (2 ** (attempt - 1))" + gridss_call: + mem_mb: "40000 * (2 ** (attempt - 1))" + threads: 8 + gridss_setupreference: + mem_mb: "40000 * (2 ** (attempt - 1))" + threads: 8 jobs: 1000 keep-going: True rerun-incomplete: True scheduler: greedy max-jobs-per-second: 10 max-status-checks-per-second: 10 -restart-times: 3 +restart-times: 1 local-cores: 1 latency-wait: 600 diff --git a/profile_snakemake/config.yaml b/profile_snakemake/config.yaml new file mode 100644 index 00000000..a4e4edb5 --- /dev/null +++ b/profile_snakemake/config.yaml @@ -0,0 +1,12 @@ +restart-times: 3 +jobscript: "slurm-jobscript.sh" +cluster-generic-submit-cmd: "slurm-submit.py" +#cluster-status: "slurm-status.py" +max-jobs-per-second: 1 +max-status-checks-per-second: 3 +local-cores: 1 +latency-wait: 600 +#use-conda: True +keep-going: True +rerun-incomplete: True +#printshellcmds: True diff --git a/scripts/Analysis/build_count_table.py b/scripts/Analysis/build_count_table.py index 64401669..5a220a8a 100755 --- a/scripts/Analysis/build_count_table.py +++ b/scripts/Analysis/build_count_table.py @@ -383,6 +383,8 @@ def prepare_table( exc_tb, ) log.error(logid + "".join(tbe.format())) + print("".join(tbe.format()), file=sys.stderr) + raise SystemExit(1) def make_sample_list(group_name): @@ -433,6 +435,8 @@ def make_sample_list(group_name): exc_tb, ) log.error(logid + "".join(tbe.format())) + print("".join(tbe.format()), file=sys.stderr) + raise SystemExit(1) # # build_count_table_simple.py ends here diff --git a/tests/data/README_fgumi_test_data.md b/tests/data/README_fgumi_test_data.md new file mode 100644 index 00000000..4902a8c1 --- /dev/null +++ b/tests/data/README_fgumi_test_data.md @@ -0,0 +1,27 @@ +# fgumi UMI smoke-test data + +This directory contains a tiny synthetic paired-end fixture generator for `fgumi`: + +- `make_fgumi_umi_fixture.py` writes + - `FASTQ/Test/umi/FGUMI01_R1.fastq.gz` + - `FASTQ/Test/umi/FGUMI01_R2.fastq.gz` +- `config_fgumi_test.json` is a minimal tutorial-style MONSDA config that enables only `DEDUP` with `fgumi`. + +## Why synthetic data? + +Most publicly referenced UMI tutorial datasets are not truly small enough for quick CI-style smoke tests. +The synthetic fixture keeps runtime tiny and deterministic while still exercising UMI extraction behavior. + +## External UMI datasets (if you want real data) + +- Galaxy Training (CEL-Seq2 UMI tutorial data on Zenodo): + - https://zenodo.org/record/2573177 + - Files: + - `test_barcodes_celseq2_R1.fastq.gz` (~243.7 MB) + - `test_barcodes_celseq2_R2.fastq.gz` (~594.9 MB) +- Galaxy tutorial page: + - https://training.galaxyproject.org/training-material/topics/single-cell/tutorials/scrna-umis/tutorial.html + +For nf-core test datasets, use branch discovery/search tooling first: +- https://github.com/nf-core/test-datasets +- https://github.com/nf-core/test-datasets/blob/master/docs/USE_EXISTING_DATA.md diff --git a/tests/data/config_Test.json b/tests/data/config_Test.json index f002306c..e7eededd 100644 --- a/tests/data/config_Test.json +++ b/tests/data/config_Test.json @@ -254,8 +254,8 @@ "Test": { "drimseq": { "OPTIONS": { - "INDEX": "-l A --gencode -d GENOMES/transcripts.fa.gz", - "QUANT": "--gcBias", + "INDEX": "-d GENOMES/transcripts.fa.gz", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } } diff --git a/tests/data/config_Test_local.json b/tests/data/config_Test_local.json index 56d924aa..19bc01a5 100644 --- a/tests/data/config_Test_local.json +++ b/tests/data/config_Test_local.json @@ -254,8 +254,8 @@ "Test": { "drimseq": { "OPTIONS": { - "INDEX": "-l A --gencode -d GENOMES/transcripts.fa.gz", - "QUANT": "--gcBias", + "INDEX": "-d GENOMES/transcripts.fa.gz", + "QUANT": "--validateMappings --seqBias --gcBias", "DTU": "" } } diff --git a/tests/data/config_fgumi_test.json b/tests/data/config_fgumi_test.json new file mode 100644 index 00000000..a19d1bc7 --- /dev/null +++ b/tests/data/config_fgumi_test.json @@ -0,0 +1,47 @@ +{ + "WORKFLOWS": "DEDUP", + "BINS": "", + "MAXTHREADS": "2", + "VERSION": "FIXME", + "SETTINGS": { + "Test": { + "umi": { + "SAMPLES": [ + "FGUMI01" + ], + "SEQUENCING": "paired", + "REFERENCE": "GENOME/genome.fa.gz", + "ANNOTATION": { + "GTF": "GENOME/genomic.gtf.gz", + "GFF": "GENOME/genomic.gff.gz" + }, + "GROUPS": [ + "umi" + ], + "TYPES": [ + "fgumi_smoke" + ], + "BATCHES": [ + "1" + ], + "INDEX": "", + "PREFIX": "" + } + } + }, + "DEDUP": { + "TOOLS": { + "fgumi": "fgumi" + }, + "Test": { + "umi": { + "fgumi": { + "OPTIONS": { + "EXTRACT": "--read-structures 6M+T +T", + "DEDUP": "" + } + } + } + } + } +} diff --git a/tests/data/make_fgumi_umi_fixture.py b/tests/data/make_fgumi_umi_fixture.py new file mode 100644 index 00000000..79d9dbf6 --- /dev/null +++ b/tests/data/make_fgumi_umi_fixture.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +"""Create a tiny paired-end FASTQ fixture with synthetic UMIs for fgumi tests. + +This script writes gzipped FASTQ files into: + FASTQ/Test/umi/FGUMI01_R1.fastq.gz + FASTQ/Test/umi/FGUMI01_R2.fastq.gz + +R1 layout is intentionally compatible with a simple fgumi read structure like: + --read-structures 6M+T +T +where the first 6 bases in R1 are a mock UMI. +""" + +from __future__ import annotations + +import gzip +from pathlib import Path + +OUTDIR = Path("FASTQ") / "Test" / "umi" +SAMPLE = "FGUMI01" + +# (umi6, r1_suffix, r2_sequence) +READS = [ + ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"), + ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"), # duplicate + ("TGCACT", "TTTTTTTTTTTTGGCCAATTGGCC", "CGTACGTACGTACGTACGTACGTACGTAC"), + ( + "TGCACT", + "TTTTTTTTTTTTGGCCAATTGGCC", + "CGTACGTACGTACGTACGTACGTACGTAC", + ), # duplicate + ("GGAACC", "TTTTTTTTTTTTCCGGAATTCCGG", "TTGCAATTGCAATTGCAATTGCAATTGCA"), + ("CCTTGG", "TTTTTTTTTTTTAATTCCGGAATT", "AACCGGTTAACCGGTTAACCGGTTAACCG"), + ("TTAAGG", "TTTTTTTTTTTTGGAATTCCGGAA", "GGCCAATTGGCCAATTGGCCAATTGGCCA"), + ("CCGGTT", "TTTTTTTTTTTTAACCTTGGAACC", "ATATCGCGATATCGCGATATCGCGATATC"), +] + + +def _fq_record(name: str, seq: str, qual_char: str = "I") -> str: + return f"@{name}\n{seq}\n+\n{qual_char * len(seq)}\n" + + +def main() -> None: + OUTDIR.mkdir(parents=True, exist_ok=True) + + r1_path = OUTDIR / f"{SAMPLE}_R1.fastq.gz" + r2_path = OUTDIR / f"{SAMPLE}_R2.fastq.gz" + + with gzip.open(r1_path, "wt") as r1h, gzip.open(r2_path, "wt") as r2h: + for i, (umi, r1_suffix, r2_seq) in enumerate(READS, start=1): + read_id = f"{SAMPLE}:{i}" + r1_seq = umi + r1_suffix + r1h.write(_fq_record(read_id + " 1:N:0:TEST", r1_seq)) + r2h.write(_fq_record(read_id + " 2:N:0:TEST", r2_seq)) + + print(f"Wrote {r1_path}") + print(f"Wrote {r2_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_fgumi_smoke.sh b/tests/test_fgumi_smoke.sh new file mode 100644 index 00000000..fabfd17c --- /dev/null +++ b/tests/test_fgumi_smoke.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "${SCRIPT_DIR}" + +# Mirror existing integration style: expose tests/data as working inputs +ln -fs data/* . + +# Generate tiny gzipped UMI FASTQ inputs +python data/make_fgumi_umi_fixture.py + +# Keep version in sync with installed MONSDA +VERSION=$(monsda --version 2>&1 | sed 's/MONSDA version //g') +sed -i "s/\"VERSION\": \"FIXME\"/\"VERSION\": \"${VERSION}\"/g" config_fgumi_test.json + +mkdir -p CONDALIB + +# --save keeps this as a lightweight workflow generation smoke test +monsda -j 2 -c config_fgumi_test.json --directory "${PWD}" --use-conda --conda-prefix CONDALIB --save + +echo "FGUMI smoke test workflow generation completed." diff --git a/workflows/bwa.nf b/workflows/bwa.nf index be07f149..e436f6f0 100644 --- a/workflows/bwa.nf +++ b/workflows/bwa.nf @@ -89,7 +89,7 @@ process bwa_mapping{ if (PAIRED == 'paired'){ r1 = reads[1] r2 = reads[2] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam.gz" uf1 = fn+"_R1_unmapped.fastq.gz" uf2 = fn+"_R2_unmapped.fastq.gz" @@ -99,7 +99,7 @@ process bwa_mapping{ """ }else{ read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"") + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam.gz" uf = fn+"_unmapped.fastq.gz" lf = "bwa_"+fn+".log" diff --git a/workflows/countreads.nf b/workflows/countreads.nf index b542cfab..377410ba 100644 --- a/workflows/countreads.nf +++ b/workflows/countreads.nf @@ -147,7 +147,7 @@ process count_mappers{ conda "samtools.yaml" container "oras://jfallmann/monsda:"+"samtools" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -178,7 +178,7 @@ process featurecount{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -252,7 +252,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/deseq2_DE.nf b/workflows/deseq2_DE.nf index e39a645c..0c446a12 100644 --- a/workflows/deseq2_DE.nf +++ b/workflows/deseq2_DE.nf @@ -22,7 +22,7 @@ process featurecount_deseq{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/dexseq_DEU.nf b/workflows/dexseq_DEU.nf index 8c954541..0fb02c75 100644 --- a/workflows/dexseq_DEU.nf +++ b/workflows/dexseq_DEU.nf @@ -21,7 +21,7 @@ process prepare_deu_annotation{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -61,7 +61,7 @@ process featurecount_dexseq{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -133,7 +133,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEUREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEUREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/diego_DAS.nf b/workflows/diego_DAS.nf index a2a43a73..b0a40373 100644 --- a/workflows/diego_DAS.nf +++ b/workflows/diego_DAS.nf @@ -24,7 +24,7 @@ process featurecount_diego{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 diff --git a/workflows/edger_DAS.nf b/workflows/edger_DAS.nf index 9fa62d2b..cedb3a93 100644 --- a/workflows/edger_DAS.nf +++ b/workflows/edger_DAS.nf @@ -22,7 +22,7 @@ process featurecount_edger{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DASREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DASREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/edger_DE.nf b/workflows/edger_DE.nf index 43e0160e..5a55648a 100644 --- a/workflows/edger_DE.nf +++ b/workflows/edger_DE.nf @@ -22,7 +22,7 @@ process featurecount_edger{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/edger_DEU.nf b/workflows/edger_DEU.nf index 7891d95d..a0a27db1 100644 --- a/workflows/edger_DEU.nf +++ b/workflows/edger_DEU.nf @@ -22,7 +22,7 @@ process featurecount_edger{ conda "$COUNTENV"+".yaml" container "oras://jfallmann/monsda:"+"$COUNTENV" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -94,7 +94,8 @@ process prepare_count_table{ script: """ - ${BINS}/Analysis/build_count_table.py $DEUREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log + reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -) + ${BINS}/Analysis/build_count_table.py $DEUREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log """ } diff --git a/workflows/fastqc.nf b/workflows/fastqc.nf index 9ef084e1..77bd87d0 100644 --- a/workflows/fastqc.nf +++ b/workflows/fastqc.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' //QC RAW diff --git a/workflows/fastqc.smk b/workflows/fastqc.smk index b25b1787..7694bbaa 100644 --- a/workflows/fastqc.smk +++ b/workflows/fastqc.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') if paired == 'paired': log.info('Running paired mode QC') diff --git a/workflows/fastqc_dedup.nf b/workflows/fastqc_dedup.nf index 62cf54a3..85665236 100644 --- a/workflows/fastqc_dedup.nf +++ b/workflows/fastqc_dedup.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' // RAW QC diff --git a/workflows/fastqc_dedup.smk b/workflows/fastqc_dedup.smk index b9f8676e..923115c0 100644 --- a/workflows/fastqc_dedup.smk +++ b/workflows/fastqc_dedup.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/fastqc_dedup_trim.nf b/workflows/fastqc_dedup_trim.nf index d875ba3e..2e53b7f2 100644 --- a/workflows/fastqc_dedup_trim.nf +++ b/workflows/fastqc_dedup_trim.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' //QC RAW diff --git a/workflows/fastqc_dedup_trim.smk b/workflows/fastqc_dedup_trim.smk index c5b12743..08d5f183 100644 --- a/workflows/fastqc_dedup_trim.smk +++ b/workflows/fastqc_dedup_trim.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config( config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/fastqc_raw.nf b/workflows/fastqc_raw.nf index d83e612c..70e1bb81 100644 --- a/workflows/fastqc_raw.nf +++ b/workflows/fastqc_raw.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' process qc_raw{ diff --git a/workflows/fastqc_raw.smk b/workflows/fastqc_raw.smk index b9dfe662..1fe7ab37 100644 --- a/workflows/fastqc_raw.smk +++ b/workflows/fastqc_raw.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') if paired == 'paired': log.info('Running paired mode QC') diff --git a/workflows/fastqc_trim.nf b/workflows/fastqc_trim.nf index 5aa1f262..badcea20 100644 --- a/workflows/fastqc_trim.nf +++ b/workflows/fastqc_trim.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_QC') ?: '' // RAW QC diff --git a/workflows/fastqc_trim.smk b/workflows/fastqc_trim.smk index c4088665..5446e889 100644 --- a/workflows/fastqc_trim.smk +++ b/workflows/fastqc_trim.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config( config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'PREQC') #outdir = 'QC/'+str(QCENV)+'/' #moutdir = 'QC/Multi/'+str(QCENV)+'/' diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf new file mode 100644 index 00000000..9e9547b9 --- /dev/null +++ b/workflows/fgumi.nf @@ -0,0 +1,76 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +EXTRACTPARAMS = get_always('fgumi_params_EXTRACT') ?: '' +DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: '' + +process extract_fq{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz" + else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log" + else null + } + + input: + path samples + + output: + path "*_dedup.fastq.gz", emit: extract + path "*_fgumi_extract.bam", emit: ubam + path "ex.log", emit: logs + + script: + if (PAIRED == 'paired'){ + r1 = samples[0] + r2 = samples[1] + sn = samples[0].getSimpleName().replace("_R1","") + ubam = sn+"_fgumi_extract.bam" + outf = samples[0].getSimpleName()+"_dedup.fastq.gz" + outf2 = samples[1].getSimpleName()+"_dedup.fastq.gz" + """ + mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 $r2 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n -1 $outf -2 $outf2 -0 /dev/null -s /dev/null $ubam >> ex.log 2>&1 + """ + } + else{ + r1 = samples[0] + sn = samples[0].getSimpleName().replace(".fastq.gz","") + ubam = sn+"_fgumi_extract.bam" + outf = samples[0].getSimpleName()+"_dedup.fastq.gz" + """ + mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n $ubam | gzip -c > $outf && echo done >> ex.log + """ + } +} + +workflow DEDUPEXTRACT{ + take: + collection + + main: + //SAMPLE CHANNELS + if ( PREDEDUP == 'enabled' ){ + if (PAIRED == 'paired'){ + extract_fq(samples_ch.collate( 2 )) + } else{ + extract_fq(samples_ch.collate( 1 )) + } + }else{ + if (PAIRED == 'paired'){ + extract_fq(collection.collate( 2 )) + } else{ + extract_fq(collection.collate( 1 )) + } + } + + emit: + extract = extract_fq.out.extract + ubam = extract_fq.out.ubam + logs = extract_fq.out.logs +} + diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk new file mode 100644 index 00000000..edeacf1e --- /dev/null +++ b/workflows/fgumi.smk @@ -0,0 +1,88 @@ +DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP') + +wildcard_constraints: + type = "sorted|sorted_unique" + +if paired == 'paired': + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz", + o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam", + td = temp(directory("TMP/FGEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = DEDUPBIN, + sname = lambda wildcards: os.path.basename(wildcards.file) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} {input.r2} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n -1 {output.o1} -2 {output.o2} -0 /dev/null -s /dev/null {output.ubam} >> {log} 2>&1" +else: + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam", + td = temp(directory("TMP/FGEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = DEDUPBIN, + sname = lambda wildcards: os.path.basename(wildcards.file) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n {output.ubam} | gzip -c > {output.o1} && echo done >> {log}" + +if paired == 'paired': + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 4 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN, + ref = REFERENCE, + ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" + shell: """mkdir -p {output.td} +[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 +samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 +{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 +""" +else: + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam", + ubam = "TMP/FGEX/{combo}/{file}_extracted.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 4 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN, + ref = REFERENCE, + ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict" + shell: """mkdir -p {output.td} +[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1 +samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1 +samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1 +{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1 +{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1 +{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1 +""" diff --git a/workflows/fgumi_dedup.nf b/workflows/fgumi_dedup.nf new file mode 100644 index 00000000..608b288b --- /dev/null +++ b/workflows/fgumi_dedup.nf @@ -0,0 +1,85 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: '' + +process dedup_bam{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus 4 + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.endsWith("_dedup.bam")) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("_dedup.bam.bai") > 0) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("dedup.log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/${file(filename).getName()}" + else null + } + + input: + tuple val(sample_id), path(mapped_bam), path(ubam) + path ref + + output: + path "*_dedup.bam", emit: bam + path "*_dedup.bam.bai", emit: bai + path "*_dedup.log", emit: logs + + script: + bams = mapped_bam + outf = bams.getSimpleName()+"_dedup.bam" + outl = bams.getSimpleName()+"_dedup.log" + """ + mkdir -p TMP + ref_dict=\$(basename $ref .gz).dict + if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> $outl 2>&1; fi + samtools sort -n -@ ${task.cpus} -o TMP/ubam_qn.bam $ubam >> $outl 2>&1 + samtools sort -n -@ ${task.cpus} -o TMP/mapped_qn.bam $bams >> $outl 2>&1 + samtools view -h TMP/mapped_qn.bam | awk 'BEGIN{FS=OFS="\t"} /^@/{print; next} {f=\$2+0; if (!and(f,256) && !and(f,2048)) {k=\$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next} print}' | samtools view -b -o TMP/mapped_qn_primaryuniq.bam - >> $outl 2>&1 + $DEDUPBIN zipper --unmapped TMP/ubam_qn.bam --input TMP/mapped_qn_primaryuniq.bam --reference $ref --output TMP/zippered.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 + $DEDUPBIN sort --order template-coordinate --input TMP/zippered.bam --output TMP/sorted.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 + $DEDUPBIN dedup $DEDUPPARAMS --input TMP/sorted.bam --output TMP/dedup.bam >> $outl 2>&1 + $DEDUPBIN sort --order coordinate --input TMP/dedup.bam --output $outf --write-index --threads ${task.cpus} --compression-level 1 >> $outl 2>&1 + """ +} + +workflow DEDUPBAM{ + take: + map + mapi + mapu + mapui + ubam + + main: + mapped_ch = map.concat(mapu).map { b -> + def n = file(b).getName() + def key = n + .replaceFirst(/_mapped_sorted_unique\.bam$/, '') + .replaceFirst(/_mapped_sorted\.bam$/, '') + .replaceFirst(/_R1_dedup_trimmed$/, '') + .replaceFirst(/_dedup_trimmed$/, '') + .replaceFirst(/_R1_trimmed$/, '') + .replaceFirst(/_trimmed$/, '') + tuple(key, b) + } + ubam_ch = ubam.map { u -> + def n = file(u).getName() + def key = n + .replaceFirst(/_fgumi_extract\.bam$/, '') + .replaceFirst(/_extracted\.bam$/, '') + .replaceFirst(/_R1$/, '') + tuple(key, u) + } + paired_ch = mapped_ch.combine(ubam_ch, by: 0).map { key, mb, ub -> tuple(key, mb, ub) } + + ref_ch = channel.value(file(REFERENCE)) + dedup_bam(paired_ch, ref_ch) + + emit: + dedup = dedup_bam.out.bam + dedupbai = dedup_bam.out.bai + deduplog = dedup_bam.out.logs +} diff --git a/workflows/fgumi_dedup.smk b/workflows/fgumi_dedup.smk new file mode 100644 index 00000000..49d54982 --- /dev/null +++ b/workflows/fgumi_dedup.smk @@ -0,0 +1,15 @@ +DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP') + +rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 # This should be done after all mapping is done + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + shell: "mkdir -p {output.td} && {params.dedup} dedup {params.dpara} --input {input.bam} --output {output.bam} 2> {log} && samtools index {output.bam} 2>> {log}" diff --git a/workflows/footer.smk b/workflows/footer.smk index 04b84b4b..17081eeb 100644 --- a/workflows/footer.smk +++ b/workflows/footer.smk @@ -1,3 +1,36 @@ +## Queue-agnostic fallback resources for all rules. +## These defaults apply when a rule has no explicit resource set. +## They keep local/non-Slurm runs from lacking baseline resource values. +_fallback_rule_resources = { + "mem_mb": 20000, + "runtime": 480, + "nodes": 1, +} + +_user_rule_defaults = config.get("DEFAULT_RULE_RESOURCES", {}) +if isinstance(_user_rule_defaults, dict): + _fallback_rule_resources.update(_user_rule_defaults) + + +def _has_resource(rule_obj, resource_name): + try: + return resource_name in rule_obj.resources.keys() + except Exception: + return hasattr(rule_obj.resources, resource_name) + + +def _set_resource(rule_obj, resource_name, value): + try: + rule_obj.resources[resource_name] = value + except Exception: + setattr(rule_obj.resources, resource_name, value) + + +for _rule in workflow.rules: + for _res_name, _res_value in _fallback_rule_resources.items(): + if not _has_resource(_rule, _res_name): + _set_resource(_rule, _res_name, _res_value) + onsuccess: print("Workflow finished, no error") onerror: diff --git a/workflows/header.nf b/workflows/header.nf index 576c43d2..7aaa6158 100644 --- a/workflows/header.nf +++ b/workflows/header.nf @@ -7,8 +7,8 @@ // ALWAYS COMMENT LINES WITH '//', DO NOT USE MULTI LINE COMMENTS AS THE PARSER WILL NOT IGNORE MIDDLE LINES AND THIS WILL CAUSE CHAOS //Version Check -nextflowVersion = '>=20.01.0.5264' nextflow.enable.dsl=2 +//nextflowVersion = '>=20.01.0.5264' //define unset Params def get_always(parameter){ diff --git a/workflows/hisat2.nf b/workflows/hisat2.nf index 55bcffc4..cf3d3f89 100644 --- a/workflows/hisat2.nf +++ b/workflows/hisat2.nf @@ -105,7 +105,7 @@ process hisat2_mapping{ r1 = reads[1] r2 = reads[2] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam" ufo = fn+"_R1_unmapped.fastq.gz" uft = fn+"_R2_unmapped.fastq.gz" @@ -122,7 +122,7 @@ process hisat2_mapping{ stranded = '' } read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"") + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"") pf = fn+"_mapped.sam" uf = fn+"_unmapped.fastq.gz" lf = "hisat2_"+fn+".log" diff --git a/workflows/mapping.nf b/workflows/mapping.nf index 62b06ffa..b455eba4 100644 --- a/workflows/mapping.nf +++ b/workflows/mapping.nf @@ -4,7 +4,7 @@ process sortsam{ conda "samtools.yaml" container "oras://jfallmann/monsda:"+"samtools" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -34,7 +34,7 @@ process sam2bam{ conda "samtools.yaml" container "oras://jfallmann/monsda:"+"samtools" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -66,7 +66,7 @@ process uniqsam{ conda "samtools.yaml" container "oras://jfallmann/monsda:"+"samtools" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 @@ -101,7 +101,7 @@ process sam2bamuniq{ conda "samtools.yaml" container "oras://jfallmann/monsda:"+"samtools" cpus THREADS - memory 16.GB + memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) } cache 'lenient' //validExitStatus 0,1 diff --git a/workflows/multiqc.nf b/workflows/multiqc.nf index e82010ce..f51c1c0d 100644 --- a/workflows/multiqc.nf +++ b/workflows/multiqc.nf @@ -1,5 +1,5 @@ -QCENV=get_always('QCENV') -QCBIN=get_always('QCBIN') +QCENV=get_always('PREQCENV') +QCBIN=get_always('PREQCBIN') QCPARAMS = get_always('fastqc_params_MULTI') ?: '' process mqc{ diff --git a/workflows/multiqc_rustqc.nf b/workflows/multiqc_rustqc.nf new file mode 100644 index 00000000..71cfe752 --- /dev/null +++ b/workflows/multiqc_rustqc.nf @@ -0,0 +1,65 @@ +QCENV=get_always('POSTQCENV') +QCBIN=get_always('POSTQCBIN') +QCPARAMS = get_always('rustqc_params_MULTI') ?: '' + +process mqc{ + conda "$QCENV"+".yaml" + container "oras://jfallmann/monsda:"+"$QCENV" + cpus THREADS + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("zip") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.zip" + else if (filename.indexOf("html") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.html" + else "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getName()}" + } + + input: + path others, stageAs: 'mqc_input??/*' + //path samples + + output: + path "*.zip", emit: mqc + path "*.html", emit: html + + script: + """ + touch $others + OUT=\${PWD} + LIST=multiqc_inputs.txt + TMP_LIST=multiqc_inputs_unique.txt + BASE_QC_DIR="${workflow.workDir}/../QC" + COMBO_VAL="${COMBO}" + CONDITION_VAL="${CONDITION}" + + for i in $others; do + dirname "\$i" >> "\$LIST" + done + + # If the corresponding fastqc combo exists, include its output in the MultiQC report. + FQ_COMBO="\${COMBO_VAL/rustqc/fastqc}" + FQ_DIR="\${BASE_QC_DIR}/\${FQ_COMBO}/\${CONDITION_VAL}" + if [[ -d "\$FQ_DIR" ]]; then + echo "\$FQ_DIR" >> "\$LIST" + fi + + sort -u "\$LIST" > "\$TMP_LIST" + export LC_ALL=en_US.utf8 + export LC_ALL=C.UTF-8 + multiqc -f --exclude picard --exclude gatk -k json -z -s -o "\$OUT" -l "\$TMP_LIST" + """ +} + +workflow MULTIQC{ + take: + otherqcs + + main: + + mqc(otherqcs.collect()) + + emit: + mqcres = mqc.out.mqc +} diff --git a/workflows/multiqc_rustqc.smk b/workflows/multiqc_rustqc.smk index 1ecd2b1f..17022c50 100644 --- a/workflows/multiqc_rustqc.smk +++ b/workflows/multiqc_rustqc.smk @@ -1,109 +1,31 @@ -if rundedup: - if paired == 'paired': - if prededup: - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"]) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" - else: - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"]) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" - - else: - if prededup: - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"]) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" - else: - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"]) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" +if paired == 'paired': + rule multiqc: + input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo) + output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), + tmp = temp("QC/Multi/{combo}/{condition}/tmp"), + lst = "QC/Multi/{combo}/{condition}/qclist.txt" + log: "LOGS/{combo}/{condition}_multiqc.log" + conda: "rustqc.yaml" + container: "oras://jfallmann/monsda:rustqc" + threads: 1 + params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") + shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" else: - if paired == 'paired': - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" - - else: - rule multiqc: - input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), - expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo) - output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), - tmp = temp("QC/Multi/{combo}/{condition}/tmp"), - lst = "QC/Multi/{combo}/{condition}/qclist.txt" - log: "LOGS/{combo}/{condition}_multiqc.log" - conda: "rustqc.yaml" - container: "oras://jfallmann/monsda:rustqc" - threads: 1 - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") - shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" + rule multiqc: + input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo), + expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo) + output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"), + tmp = temp("QC/Multi/{combo}/{condition}/tmp"), + lst = "QC/Multi/{combo}/{condition}/qclist.txt" + log: "LOGS/{combo}/{condition}_multiqc.log" + conda: "rustqc.yaml" + container: "oras://jfallmann/monsda:rustqc" + threads: 1 + params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "") + shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}" diff --git a/workflows/rustqc.nf b/workflows/rustqc.nf index 1505b0a3..9ce3a806 100644 --- a/workflows/rustqc.nf +++ b/workflows/rustqc.nf @@ -1,6 +1,6 @@ -QCENV = get_always('QCENV') -QCBIN = get_always('QCBIN') -QCPARAMS = get_always('rustqc_params_QC') ?: '' +RUSTQCENV = get_always('POSTQCENV') +RUSTQCBIN = get_always('POSTQCBIN') +RUSTQCPARAMS = get_always('rustqc_params_QC') ?: '' MAPANNO = get_always('MAPPINGANNO') @@ -17,8 +17,8 @@ RUSTQC_PAIRED = (PAIRED == 'paired') ? '-p' : '' //RUSTQC on mapped BAMs process rustqc_mapped{ - conda "$QCENV"+".yaml" - container "oras://jfallmann/monsda:"+"$QCENV" + conda "$RUSTQCENV"+".yaml" + container "oras://jfallmann/monsda:"+"$RUSTQCENV" cpus THREADS cache 'lenient' label 'big_mem' @@ -40,11 +40,11 @@ process rustqc_mapped{ fn = file(bam).getSimpleName() anno = file("${workflow.workDir}/../${MAPANNO}") """ - rustqc rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $QCPARAMS + $RUSTQCBIN rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $RUSTQCPARAMS """ } -workflow QC_MAPPING{ +workflow RUSTQC_MAPPING{ take: collection main: diff --git a/workflows/rustqc.smk b/workflows/rustqc.smk index 67fa3e93..579715c0 100644 --- a/workflows/rustqc.smk +++ b/workflows/rustqc.smk @@ -1,4 +1,4 @@ -QCBIN, QCENV = env_bin_from_config(config, 'QC') +QCBIN, QCENV = env_bin_from_config(config, 'POSTQC') # Map MONSDA strandedness to RustQC strandedness def rustqc_stranded(stranded): @@ -40,33 +40,3 @@ rule rustqc_uniquemapped: paired = RUSTQC_PAIRED, stranded = RUSTQC_STRANDED shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} --skip-dup-check -j {output.js} -o {output.o1} {params.qpara} 2> {log}" - -rule rustqc_dedupmapped: - input: r1 = "MAPPED/{combo}/{file}_mapped_sorted_dedup.bam", - r2 = "MAPPED/{combo}/{file}_mapped_sorted_dedup.bam.bai" - output: o1 = directory("QC/{combo}/{file}_mapped_sorted_dedup"), - js = "QC/{combo}/{file}_mapped_sorted_dedup/rustqc_summary.json" - log: "LOGS/{combo}/{file}_rustqc_dedupmapped.log" - conda: ""+QCENV+".yaml" - container: "oras://jfallmann/monsda:"+QCENV+"" - threads: MAXTHREAD - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('QC', ""), - anno = ANNOTATION, - paired = RUSTQC_PAIRED, - stranded = RUSTQC_STRANDED - shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} -j {output.js} -o {output.o1} {params.qpara} 2> {log}" - -rule rustqc_uniquededup: - input: r1 = "MAPPED/{combo}/{file}_mapped_sorted_unique_dedup.bam", - r2 = "MAPPED/{combo}/{file}_mapped_sorted_unique_dedup.bam.bai" - output: o1 = directory("QC/{combo}/{file}_mapped_sorted_unique_dedup"), - js = "QC/{combo}/{file}_mapped_sorted_unique_dedup/rustqc_summary.json" - log: "LOGS/{combo}/{file}_rustqc_uniquededup.log" - conda: ""+QCENV+".yaml" - container: "oras://jfallmann/monsda:"+QCENV+"" - threads: MAXTHREAD - params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('QC', ""), - anno = ANNOTATION, - paired = RUSTQC_PAIRED, - stranded = RUSTQC_STRANDED - shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} -j {output.js} -o {output.o1} {params.qpara} 2> {log}" diff --git a/workflows/star.nf b/workflows/star.nf index 83681e95..62c3563a 100644 --- a/workflows/star.nf +++ b/workflows/star.nf @@ -96,7 +96,7 @@ process star_mapping{ r1 = reads[1] r2 = reads[2] a = "Trimming_report.txt" - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") of = fn+'.Aligned.out.sam' gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz") """ @@ -106,7 +106,7 @@ process star_mapping{ else{ if (PAIRED != 'singlecell'){ read = reads[1] - fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"")+"." + fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"")+"." of = fn+'Aligned.out.sam' gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz") """ @@ -122,7 +122,7 @@ process star_mapping{ stranded = '--soloStrand Unstranded' } r1 = reads[1] - fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"") + fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"") r2 = "${workflow.workDir}/../FASTQ/${CONDITION}/"+file(reads[2]).getSimpleName().replaceAll(/\QR2_trimmed\E/,"R2.fastq.gz") if (MAPPARAMS.contains('--soloBarcodeMate 1')){ t = r2 diff --git a/workflows/umicollapse.nf b/workflows/umicollapse.nf new file mode 100644 index 00000000..2968c5b2 --- /dev/null +++ b/workflows/umicollapse.nf @@ -0,0 +1,135 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +WHITELISTPARAMS = get_always('umicollapse_params_WHITELIST') ?: '' +EXTRACTPARAMS = get_always('umicollapse_params_EXTRACT') ?: '' + +// UMI extraction uses umi_tools extract (bundled in umicollapse env); +// BAM deduplication is handled by umicollapse_dedup.nf. + +process whitelist{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("_whitelist") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}_whitelist" + else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_whitelist.log" + else null + } + + input: + path samples + + output: + path "*_whitelist", emit: wl + + script: + if (WHITELISTPARAMS == ''){ + outf = samples[0].getSimpleName().replace("_R1","")+"_dummy_whitelist" + """ + touch $outf + """ + } else { + if (PAIRED == 'paired'){ + r1 = samples[0] + r2 = samples[1] + outf = samples[0].getSimpleName().replace("_R1","")+"_whitelist" + """ + mkdir tmp && umi_tools whitelist $WHITELISTPARAMS --temp-dir tmp --log=wl.log --stdin=$r1 --read2-in=$r2 --stdout=$outf + """ + } + else{ + outf = samples.getSimpleName()+"_whitelist" + """ + mkdir tmp && umi_tools whitelist $WHITELISTPARAMS --temp-dir tmp --log=wl.log --stdin=$samples --stdout=$outf + """ + } + } +} + +process extract_fq{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus THREADS + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz" + else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log" + else null + } + + input: + path wl + path samples + + output: + path "*_dedup.fastq.gz", emit: extract + path "ex.log", emit: logs + + script: + if (PAIRED == 'paired'){ + r1 = samples[0] + r2 = samples[1] + outf = samples[0].getSimpleName()+"_dedup.fastq.gz" + outf2 = samples[1].getSimpleName()+"_dedup.fastq.gz" + if (!!(wl =~ /dummy_whitelist/)){ + """ + mkdir tmp && umi_tools extract $EXTRACTPARAMS --temp-dir tmp --log=ex.log --stdin=$r1 --read2-in=$r2 --stdout=$outf --read2-out=$outf2 + """ + } + else{ + """ + mkdir tmp && umi_tools extract $EXTRACTPARAMS --whitelist=$wl --temp-dir tmp --log=ex.log --stdin=$r1 --read2-in=$r2 --stdout=$outf --read2-out=$outf2 + """ + } + } + else{ + outf = samples.getSimpleName()+"_dedup.fastq.gz" + if (!!(wl =~ /dummy_whitelist/)){ + """ + mkdir tmp && umi_tools extract $EXTRACTPARAMS --temp-dir tmp --log=ex.log --stdin=$samples --stdout=$outf + """ + } + else{ + """ + mkdir tmp && umi_tools extract $EXTRACTPARAMS --whitelist=$wl --temp-dir tmp --log=ex.log --stdin=$samples --stdout=$outf + """ + } + } +} + +workflow DEDUPEXTRACT{ + take: + collection + + main: + //SAMPLE CHANNELS + if ( PREDEDUP == 'enabled' ){ + if (PAIRED == 'paired'){ + whitelist(samples_ch.collate( 2 )) + extract_fq(whitelist.out.wl, samples_ch.collate( 2 )) + } else{ + whitelist(samples_ch.collate( 1 )) + extract_fq(whitelist.out.wl, samples_ch.collate( 1 )) + } + }else{ + if (PAIRED == 'paired'){ + whitelist(collection.collate(2)) + extract_fq(whitelist.out.wl, collection.collate( 2 )) + } else{ + whitelist(collection.collate( 1 )) + extract_fq(whitelist.out.wl, collection.collate( 1 )) + } + } + + emit: + extract = extract_fq.out.extract + logs = extract_fq.out.logs +} diff --git a/workflows/umicollapse.smk b/workflows/umicollapse.smk new file mode 100644 index 00000000..e7a83c9e --- /dev/null +++ b/workflows/umicollapse.smk @@ -0,0 +1,146 @@ +DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP') + +wlparams = tool_params(SAMPLES[0], None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST') + +wildcard_constraints: + type = "sorted|sorted_unique" + +# UMI extraction uses umi_tools extract (bundled in umicollapse env); +# BAM deduplication uses umicollapse bam for significantly faster dedup. + +if paired == 'paired': + if wlparams: + rule whitelist: + input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: wl = "DEDUP_FASTQ/{combo}/{file}_whitelist", + td = temp(directory("TMP/UMIWL/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_whitelist.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} whitelist {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --read2-in={input.r2} --stdout={output.wl}" + + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + wl = rules.whitelist.output.wl + output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz", + o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz", + td = temp(directory("TMP/UMIEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --error-correct-cell --whitelist={input.wl} --stdin={input.r1} --read2-in={input.r2} --stdout={output.o1} --read2-out={output.o2}" + else: + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz", + o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz", + td = temp(directory("TMP/UMIEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --read2-in={input.r2} --stdout={output.o1} --read2-out={output.o2}" + +else: + if wlparams: + rule whitelist: + input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: wl = "DEDUP_FASTQ/{combo}/{file}_whitelist", + td = temp(directory("TMP/UMIWL/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_whitelist.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} whitelist {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --stdout={output.wl}" + + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]), + wl = rules.whitelist.output.wl + output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz", + td = temp(directory("TMP/UMIEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --error-correct-cell --whitelist={input.wl} --stdin={input.r1} --stdout={output.o1}" + + else: + rule extract: + input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]) + output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz", + td = temp(directory("TMP/UMIEX/{combo}/{file}")) + log: "LOGS/{combo}/{file}_dedup_extract.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""), + dedup = "umi_tools" + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --stdout={output.o1}" + +if paired == 'paired': + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} bam -i {input.bam} -o {output.td}/dedup.unsorted.bam --paired {params.dpara} > {log} 2>&1 && samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.unsorted.bam >> {log} 2>&1 && samtools index {output.bam} >> {log} 2>&1" +else: + rule dedupbam: + input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam" + output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"), + bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"), + td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}")) + log: "LOGS/{combo}/{file}_{type}/dedupbam.log" + conda: ""+DEDUPENV+".yaml" + container: "oras://jfallmann/monsda:"+DEDUPENV+"" + threads: 1 + priority: 0 + params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""), + dedup = DEDUPBIN + resources: + mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)), + runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1)) + shell: "mkdir -p {output.td} && {params.dedup} bam -i {input.bam} -o {output.td}/dedup.unsorted.bam {params.dpara} > {log} 2>&1 && samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.unsorted.bam >> {log} 2>&1 && samtools index {output.bam} >> {log} 2>&1" diff --git a/workflows/umicollapse_dedup.nf b/workflows/umicollapse_dedup.nf new file mode 100644 index 00000000..8a77bc60 --- /dev/null +++ b/workflows/umicollapse_dedup.nf @@ -0,0 +1,64 @@ +DEDUPENV=get_always('DEDUPENV') +DEDUPBIN=get_always('DEDUPBIN') + +DEDUPPARAMS = get_always('umicollapse_params_DEDUP') ?: '' + +process dedup_bam{ + conda "$DEDUPENV"+".yaml" + container "oras://jfallmann/monsda:"+"$DEDUPENV" + cpus 1 + cache 'lenient' + //validExitStatus 0,1 + + publishDir "${workflow.workDir}/../" , mode: 'link', + saveAs: {filename -> + if (filename.endsWith("_dedup.bam")) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("_dedup.bam.bai") > 0) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}" + else if (filename.indexOf("dedup.log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/${file(filename).getName()}" + else null + } + + input: + path todedup + path bami + + output: + path "*_dedup.bam", emit: bam + path "*_dedup.bam.bai", emit: bai + path "*_dedup.log", emit: logs + + memory { 20.GB * (1 << ((task.attempt ?: 1) - 1)) } + time { 8.h * (1 << ((task.attempt ?: 1) - 1)) } + + script: + bams = todedup[0] + bais = todedup[1] + outf = bams.getSimpleName()+"_dedup.bam" + outl = bams.getSimpleName()+"_dedup.log" + if (PAIRED == 'paired'){ + """ + mkdir -p TMP && $DEDUPBIN bam -i $bams -o TMP/dedup.unsorted.bam --paired $DEDUPPARAMS > $outl 2>&1 && samtools sort -@ ${task.cpus} -o $outf TMP/dedup.unsorted.bam >> $outl 2>&1 && samtools index $outf >> $outl 2>&1 + """ + } + else{ + """ + mkdir -p TMP && $DEDUPBIN bam -i $bams -o TMP/dedup.unsorted.bam $DEDUPPARAMS > $outl 2>&1 && samtools sort -@ ${task.cpus} -o $outf TMP/dedup.unsorted.bam >> $outl 2>&1 && samtools index $outf >> $outl 2>&1 + """ + } +} + +workflow DEDUPBAM{ + take: + map + mapi + mapu + mapui + + main: + dedup_bam(map.concat(mapu), mapi.concat(mapui)) + + emit: + dedup = dedup_bam.out.bam + dedupbai = dedup_bam.out.bai + deduplog = dedup_bam.out.logs +}