/share/MONSDA/configs
+ os.path.abspath(
+ os.path.join(install_share, "MONSDA", "configs", TEMPLATE_FILE)
+ ),
+ # generic venv/conda prefix share path
+ os.path.abspath(
+ os.path.join(sys.prefix, "share", "MONSDA", "configs", TEMPLATE_FILE)
+ ),
+ # last-resort: run directory configs
+ os.path.abspath(os.path.join(os.getcwd(), "configs", TEMPLATE_FILE)),
+ ]
+
+ # de-duplicate while preserving order and dropping empty values
+ deduped: List[str] = []
+ for c in candidates:
+ c = (c or "").strip()
+ if c and c not in deduped:
+ deduped.append(c)
+ return deduped
+
+
+def resolve_template_path() -> str:
+ for path in _template_candidates():
+ if os.path.isfile(path):
+ return path
+ tried = "\n - ".join(_template_candidates())
+ raise FileNotFoundError(
+ "Could not find template_base_commented.json. Tried:\n - " + tried
+ )
def load_template() -> Dict[str, Any]:
- with open(TEMPLATE_PATH, "r") as f:
- return json.load(f)
+ return load_configfile(resolve_template_path())
-def strip_comments(d):
+def strip_comments(d: Any) -> Any:
if isinstance(d, dict):
return {k: strip_comments(v) for k, v in d.items() if k != "comment"}
- elif isinstance(d, list):
+ if isinstance(d, list):
return [strip_comments(x) for x in d]
+ return d
+
+
+def set_in_path(root: Dict[str, Any], path: List[str], value: Any) -> None:
+ node = root
+ for key in path[:-1]:
+ if key not in node or not isinstance(node[key], dict):
+ node[key] = {}
+ node = node[key]
+ node[path[-1]] = value
+
+
+def get_condition_paths_from_settings(settings: Dict[str, Any]) -> List[List[str]]:
+ out: List[List[str]] = []
+
+ def _walk(node: Any, path: List[str]) -> None:
+ if not isinstance(node, dict):
+ return
+ if "SAMPLES" in node and isinstance(node.get("SAMPLES"), list):
+ out.append(path.copy())
+ return
+ for k, v in node.items():
+ if isinstance(v, dict):
+ _walk(v, path + [k])
+
+ _walk(settings, [])
+ return out
+
+
+def build_workflow_block(
+ workflow_name: str,
+ workflow_template: Dict[str, Any],
+ condition_paths: List[List[str]],
+ selected_tools: List[str],
+) -> Dict[str, Any]:
+ block: Dict[str, Any] = {}
+
+ all_tools = workflow_template.get("TOOLS", {})
+ if not selected_tools:
+ selected_tools = list(all_tools.keys())
+
+ if all_tools:
+ block["TOOLS"] = {k: all_tools[k] for k in selected_tools if k in all_tools}
+
+ # carry workflow-level defaults if present
+ for passthrough in ["FEATURES", "CUTOFFS", "COMPARABLE", "EXCLUDE"]:
+ if passthrough in workflow_template:
+ block[passthrough] = copy.deepcopy(workflow_template[passthrough])
+
+ # populate per-condition tool settings like CLI configurator does
+ for cond_path in condition_paths:
+ for tool in selected_tools:
+ tool_def = workflow_template.get(tool)
+ if not isinstance(tool_def, dict):
+ continue
+ tool_def_no_comment = strip_comments(tool_def)
+ if not tool_def_no_comment:
+ continue
+ set_in_path(block, cond_path + [tool], copy.deepcopy(tool_def_no_comment))
+
+ return block
+
+
+def build_config(req: BuildConfigRequest) -> Dict[str, Any]:
+ template = strip_comments(load_template())
+
+ if req.settings is not None:
+ settings = req.settings
+ elif req.samplesheet_path:
+ samplesheet_path = os.path.abspath(req.samplesheet_path)
+ if not os.path.isfile(samplesheet_path):
+ raise HTTPException(
+ status_code=400,
+ detail=f"Samplesheet does not exist: {samplesheet_path}",
+ )
+ settings = samplesheet_to_settings(samplesheet_path)
else:
- return d
+ raise HTTPException(
+ status_code=400,
+ detail="Provide either settings JSON or samplesheet_path.",
+ )
- config_name: str
- config: Dict[str, Any]
- output_dir: str
+ condition_paths = get_condition_paths_from_settings(settings)
+ if not condition_paths:
+ raise HTTPException(
+ status_code=400,
+ detail="No condition leaves with SAMPLES found in SETTINGS.",
+ )
+
+ workflows = req.workflows or []
+ if not workflows:
+ raise HTTPException(
+ status_code=400,
+ detail="At least one workflow must be selected.",
+ )
+
+ invalid = [w for w in workflows if w not in template or w in NONE_WORKFLOW_KEYS]
+ if invalid:
+ raise HTTPException(
+ status_code=400,
+ detail=f"Unknown workflow(s): {', '.join(invalid)}",
+ )
+
+ final_config: Dict[str, Any] = {
+ "WORKFLOWS": ", ".join(workflows),
+ "BINS": template.get("BINS", ""),
+ "MAXTHREADS": str(req.maxthreads),
+ "VERSION": __version__,
+ "SETTINGS": settings,
+ }
+
+ for wf in workflows:
+ wf_template = template.get(wf, {})
+ final_config[wf] = build_workflow_block(
+ wf,
+ wf_template,
+ condition_paths,
+ req.tools.get(wf, []),
+ )
+
+ return final_config
@app.get("/template", response_model=Dict[str, Any])
-def get_template():
- """Get the config template (with comments)."""
- return load_template()
+def get_template() -> Dict[str, Any]:
+ try:
+ return load_template()
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Failed to load template: {e}")
@app.get("/template/fields", response_model=Dict[str, Any])
-def get_template_fields():
- """Get the config template (without comments)."""
- return strip_comments(load_template())
+def get_template_fields() -> Dict[str, Any]:
+ try:
+ return strip_comments(load_template())
+ except Exception as e:
+ raise HTTPException(
+ status_code=500, detail=f"Failed to load template fields: {e}"
+ )
+
+
+def _normalize_path(path: str) -> str:
+ if path:
+ return os.path.abspath(os.path.expanduser(path.strip()))
+ return os.getcwd()
+
+
+@app.get("/fs/roots", response_model=Dict[str, Any])
+def fs_roots() -> Dict[str, Any]:
+ cwd = os.getcwd()
+ home = os.path.expanduser("~")
+ roots = []
+ for p in [cwd, home, os.path.sep]:
+ if p and p not in roots and os.path.isdir(p):
+ roots.append(p)
+ return {"roots": roots}
+
+
+@app.get("/fs/list", response_model=Dict[str, Any])
+def fs_list(path: str = "", mode: str = "dirs") -> Dict[str, Any]:
+ mode = (mode or "dirs").lower()
+ if mode not in {"dirs", "all", "samplesheets"}:
+ raise HTTPException(
+ status_code=400, detail="mode must be dirs, all, or samplesheets"
+ )
+
+ current = _normalize_path(path)
+ if not os.path.isdir(current):
+ raise HTTPException(status_code=400, detail=f"Not a directory: {current}")
+
+ try:
+ entries = list(os.scandir(current))
+ except PermissionError:
+ raise HTTPException(status_code=403, detail=f"Permission denied: {current}")
+
+ dirs = []
+ files = []
+ for e in sorted(entries, key=lambda x: x.name.lower()):
+ if e.name in {".", ".."}:
+ continue
+ if e.is_dir(follow_symlinks=True):
+ dirs.append({"name": e.name, "path": e.path})
+ continue
+ if not e.is_file(follow_symlinks=True):
+ continue
+ if mode in {"all", "samplesheets"}:
+ files.append({"name": e.name, "path": e.path})
+
+ parent = (
+ os.path.dirname(current.rstrip(os.sep))
+ if current != os.path.sep
+ else os.path.sep
+ )
+ return {
+ "current": current,
+ "parent": parent,
+ "dirs": dirs,
+ "files": files,
+ "mode": mode,
+ }
+
+
+@app.post("/samplesheet/parse", response_model=Dict[str, Any])
+def parse_samplesheet(req: SamplesheetRequest) -> Dict[str, Any]:
+ path = os.path.abspath(req.samplesheet_path.strip())
+ if not os.path.isfile(path):
+ raise HTTPException(status_code=400, detail=f"Samplesheet not found: {path}")
+ settings = samplesheet_to_settings(path)
+ return {
+ "samplesheet": path,
+ "settings": settings,
+ "conditions": [
+ "/".join(p) for p in get_condition_paths_from_settings(settings)
+ ],
+ }
+
+@app.post("/config/preview", response_model=Dict[str, Any])
+def preview_config(req: BuildConfigRequest) -> Dict[str, Any]:
+ return {"config": build_config(req)}
-@app.post("/generate_config")
-def generate_config(req: ConfigRequest):
- """Generate a config file from user input. User specifies output_dir."""
+
+@app.post("/config/save", response_model=Dict[str, Any])
+def save_config(req: SaveConfigRequest) -> Dict[str, Any]:
config_name = req.config_name.strip()
- output_dir = os.path.abspath(req.output_dir.strip())
if not config_name or any(c in config_name for c in "/\\"):
raise HTTPException(status_code=400, detail="Invalid config name.")
- if not output_dir or not os.path.isdir(output_dir):
- raise HTTPException(status_code=400, detail="Invalid output directory.")
- config_path = os.path.join(output_dir, f"config_{config_name}.json")
- with open(config_path, "w") as f:
- json.dump(req.config, f, indent=4)
- return {"message": "Config generated.", "path": config_path}
-
-
-# Download config by full path (for security, restrict to files under allowed parent dir)
-@app.get("/download_config/")
-def download_config(config_path: str):
- config_path = os.path.abspath(config_path)
- if not os.path.exists(config_path):
- raise HTTPException(status_code=404, detail="Config not found.")
- if not config_path.endswith(".json"):
- raise HTTPException(status_code=400, detail="Only .json files allowed.")
- return FileResponse(config_path, filename=os.path.basename(config_path))
-
-
-class DirRequest(BaseModel):
- config: Dict[str, Any]
- project_dir: str
+ output_dir = os.path.abspath(req.output_dir.strip())
+ if not os.path.isdir(output_dir):
+ raise HTTPException(status_code=400, detail="Output directory does not exist.")
-def safe_makedirs(path):
- os.makedirs(path, exist_ok=True)
+ path = os.path.join(output_dir, f"config_{config_name}.json")
+ with open(path, "w") as fh:
+ json.dump(req.config, fh, indent=4)
+ return {"message": "Config written.", "path": path}
-def create_project_structure(config: Dict[str, Any], project_dir: str):
- # Example: create folders for workflows, logs, counts, etc.
- safe_makedirs(project_dir)
- for wf in config.get("WORKFLOWS", "").split(","):
- wf = wf.strip()
- if wf:
- safe_makedirs(os.path.join(project_dir, wf))
- safe_makedirs(os.path.join(project_dir, "LOGS"))
- safe_makedirs(os.path.join(project_dir, "COUNTS"))
- # Add more as needed
+@app.post("/project/create", response_model=Dict[str, Any])
+def create_project(req: ProjectRequest) -> Dict[str, Any]:
+ base_dir = os.path.abspath(req.project_dir.strip())
+ project_name = req.project_name.strip() or "monsda"
-@app.post("/generate_project_dir")
-def generate_project_dir(req: DirRequest):
- project_dir = os.path.abspath(req.project_dir)
- if not project_dir.startswith(os.getcwd()):
- raise HTTPException(
- status_code=400, detail="Project dir must be inside working directory."
- )
- if os.path.exists(project_dir) and os.listdir(project_dir):
- raise HTTPException(
- status_code=400, detail="Project dir already exists and is not empty."
- )
- create_project_structure(req.config, project_dir)
- return {"message": "Project directory structure created.", "path": project_dir}
+ # Project name becomes a subdirectory under the chosen path
+ project_dir = os.path.join(base_dir, project_name)
+ os.makedirs(project_dir, exist_ok=True)
+ # FASTQ directly under project dir (no extra project_name level)
+ fastq_dir = os.path.join(project_dir, "FASTQ")
+ gen_dir = os.path.join(project_dir, "GENOMES")
+ os.makedirs(fastq_dir, exist_ok=True)
+ os.makedirs(gen_dir, exist_ok=True)
+
+ settings = req.settings or (req.config.get("SETTINGS") if req.config else None)
+ linked_files: List[str] = []
+ warnings: List[str] = []
+
+ # Build lookup: condition_path -> ConditionFiles
+ cond_lookup: Dict[str, ConditionFiles] = {}
+ for cf in req.condition_files:
+ cond_lookup[cf.condition] = cf
+
+ if settings and isinstance(settings, dict):
+ condition_paths = get_condition_paths_from_settings(settings)
+ for cond_path in condition_paths:
+ cond_key = "/".join(cond_path)
+ cond_info = cond_lookup.get(cond_key)
+
+ # Create condition subdirectories under FASTQ
+ cond_dir = os.path.join(fastq_dir, *cond_path)
+ os.makedirs(cond_dir, exist_ok=True)
+
+ # Walk to the leaf node to find SAMPLES
+ node = settings
+ for key in cond_path:
+ node = node.get(key, {})
+
+ samples = node.get("SAMPLES", [])
+
+ # Link FASTQ files: prefer explicit file list, fall back to directory+sample matching
+ if cond_info and cond_info.fastq_files:
+ # Explicit file selection mode - link exactly the files the user chose
+ for fpath in cond_info.fastq_files:
+ src_file = os.path.abspath(fpath)
+ if not os.path.isfile(src_file):
+ warnings.append(f"FASTQ file not found: {fpath}")
+ continue
+ dst = os.path.join(cond_dir, os.path.basename(src_file))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(src_file), dst)
+ linked_files.append(dst)
+ # Handle paired-end mate
+ if cond_info.sequencing == "paired":
+ basename = os.path.basename(src_file)
+ if "_R1" in basename:
+ mate_name = basename.replace("_R1", "_R2")
+ elif "_1." in basename:
+ mate_name = basename.replace("_1.", "_2.")
+ else:
+ mate_name = None
+ if mate_name:
+ mate_src = os.path.join(
+ os.path.dirname(src_file), mate_name
+ )
+ mate_dst = os.path.join(cond_dir, mate_name)
+ if os.path.isfile(mate_src) and not os.path.exists(
+ mate_dst
+ ):
+ os.symlink(os.path.realpath(mate_src), mate_dst)
+ linked_files.append(mate_dst)
+
+ elif cond_info and cond_info.fastq_dir and isinstance(samples, list):
+ # Directory mode - find files matching sample names from samplesheet
+ src_dir = os.path.abspath(cond_info.fastq_dir)
+ if os.path.isdir(src_dir):
+ for sample in samples:
+ # Find matching files (sample name may or may not have extension)
+ matched = _find_fastq_files(src_dir, sample)
+ if not matched:
+ warnings.append(
+ f"No FASTQ file found for sample '{sample}' in {src_dir}"
+ )
+ continue
+ for src_file in matched:
+ dst = os.path.join(cond_dir, os.path.basename(src_file))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(src_file), dst)
+ linked_files.append(dst)
+ # Handle paired-end mate
+ if cond_info.sequencing == "paired":
+ basename = os.path.basename(src_file)
+ if "_R1" in basename:
+ mate_name = basename.replace("_R1", "_R2")
+ elif "_1." in basename:
+ mate_name = basename.replace("_1.", "_2.")
+ elif "_R2" in basename:
+ mate_name = basename.replace("_R2", "_R1")
+ elif "_2." in basename:
+ mate_name = basename.replace("_2.", "_1.")
+ else:
+ mate_name = None
+ if mate_name:
+ mate_src = os.path.join(src_dir, mate_name)
+ mate_dst = os.path.join(cond_dir, mate_name)
+ if os.path.isfile(mate_src) and not os.path.exists(
+ mate_dst
+ ):
+ os.symlink(os.path.realpath(mate_src), mate_dst)
+ linked_files.append(mate_dst)
+ else:
+ warnings.append(
+ f"FASTQ directory not found for condition '{cond_key}': {src_dir}"
+ )
+
+ # Link genome files: REFERENCE, DECOY, GTF, GFF into GENOMES/
+ ref_path = (cond_info.reference if cond_info else "") or node.get(
+ "REFERENCE", ""
+ )
+ if ref_path and os.path.isfile(ref_path):
+ dst = os.path.join(gen_dir, os.path.basename(ref_path))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(ref_path), dst)
+ linked_files.append(dst)
+ # Update settings path to relative
+ rel = os.path.relpath(dst, start=project_dir)
+ node["REFERENCE"] = rel
+ elif ref_path:
+ warnings.append(f"REFERENCE not found: {ref_path}")
+
+ decoy_path = (cond_info.decoy if cond_info else "") or node.get("DECOY", "")
+ if decoy_path and os.path.isfile(decoy_path):
+ dst = os.path.join(gen_dir, os.path.basename(decoy_path))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(decoy_path), dst)
+ linked_files.append(dst)
+ rel = os.path.relpath(dst, start=project_dir)
+ node["DECOY"] = rel
+
+ gtf_path = (cond_info.gtf if cond_info else "") or ""
+ anno = node.get("ANNOTATION", {})
+ if isinstance(anno, dict):
+ gtf_path = gtf_path or anno.get("GTF", "")
+ if gtf_path and os.path.isfile(gtf_path):
+ dst = os.path.join(gen_dir, os.path.basename(gtf_path))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(gtf_path), dst)
+ linked_files.append(dst)
+ rel = os.path.relpath(dst, start=project_dir)
+ if isinstance(anno, dict):
+ anno["GTF"] = rel
+ node["ANNOTATION"] = anno
+ elif gtf_path:
+ warnings.append(f"GTF not found: {gtf_path}")
+
+ gff_path = (cond_info.gff if cond_info else "") or ""
+ if isinstance(anno, dict):
+ gff_path = gff_path or anno.get("GFF", "")
+ if gff_path and os.path.isfile(gff_path):
+ dst = os.path.join(gen_dir, os.path.basename(gff_path))
+ if not os.path.exists(dst):
+ os.symlink(os.path.realpath(gff_path), dst)
+ linked_files.append(dst)
+ rel = os.path.relpath(dst, start=project_dir)
+ if isinstance(anno, dict):
+ anno["GFF"] = rel
+ node["ANNOTATION"] = anno
+ elif gff_path:
+ warnings.append(f"GFF not found: {gff_path}")
+
+ # Write config if provided (with updated relative paths in SETTINGS)
+ config_path = ""
+ if req.config:
+ if settings:
+ req.config["SETTINGS"] = settings
+ config_file = f"config_{project_name}.json"
+ config_path = os.path.join(project_dir, config_file)
+ with open(config_path, "w") as fh:
+ json.dump(req.config, fh, indent=4)
+
+ return {
+ "message": "Project created.",
+ "path": project_dir,
+ "config_path": config_path,
+ "linked_files": len(linked_files),
+ "warnings": warnings,
+ }
-from fastapi.responses import HTMLResponse
+
+def _find_fastq_files(src_dir: str, sample_name: str) -> List[str]:
+ """Find FASTQ files in src_dir matching a sample name.
+
+ Matches: exact filename, or sample_name*.fastq.gz / .fq.gz / .fastq / .fq
+ Only returns R1 (or unpaired) to avoid double-linking.
+ """
+ import glob as _glob
+
+ candidates: List[str] = []
+ # Try exact match first
+ exact = os.path.join(src_dir, sample_name)
+ if os.path.isfile(exact):
+ return [exact]
+
+ # Try common FASTQ extensions
+ for ext in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]:
+ path = os.path.join(src_dir, sample_name + ext)
+ if os.path.isfile(path):
+ candidates.append(path)
+ # Try with _R1 suffix
+ path_r1 = os.path.join(src_dir, sample_name + "_R1" + ext)
+ if os.path.isfile(path_r1):
+ candidates.append(path_r1)
+ # Try with _1 suffix
+ path_1 = os.path.join(src_dir, sample_name + "_1" + ext)
+ if os.path.isfile(path_1):
+ candidates.append(path_1)
+
+ if candidates:
+ return candidates
+
+ # Glob fallback: any file starting with sample_name
+ pattern = os.path.join(src_dir, sample_name + "*")
+ matches = _glob.glob(pattern)
+ # Filter to only fastq-like files and only R1/unpaired
+ for m in sorted(matches):
+ if os.path.isfile(m):
+ lower = m.lower()
+ if any(lower.endswith(e) for e in [".fastq.gz", ".fq.gz", ".fastq", ".fq"]):
+ # Skip R2/_2 to avoid double-linking
+ base = os.path.basename(m)
+ if "_R2" in base or "_2." in base:
+ continue
+ candidates.append(m)
+
+ return candidates
@app.get("/", response_class=HTMLResponse)
-def root():
+def root() -> str:
return """
-
+
-
- MONSDA Configurator
-
+
+ MONSDA Web Configurator
+
+
+
- MONSDA Configurator
-
-
Step 1: Edit Configuration
-
-
-
-
-
-
Step 2: Generate Config File
-
-
-
-
-
-
-
-
-
-
Step 3: Create Project Directory
-
-
-
-
-
-
+ }
+ walkSettings(settings, []);
+
+ if (!conditions.length) {
+ setStatus('projectStatus', 'No conditions with SAMPLES found in settings.', false);
+ return;
+ }
+
+ const wizard = document.getElementById('conditionWizard');
+ wizard.innerHTML = `${conditions.length} condition(s) found. For each, select FASTQ files by directory (auto-matched by sample name) or pick individual files.
`;
+
+ conditions.forEach((c, idx) => {
+ const id = `cond_${idx}`;
+ const hasSamples = c.samples && c.samples.length > 0;
+ const samplesStr = hasSamples
+ ? c.samples.slice(0, 5).join(', ') + (c.samples.length > 5 ? ` (+${c.samples.length - 5} more)` : '')
+ : '(no samples defined)';
+ const card = document.createElement('div');
+ card.className = 'cond-card';
+ card.dataset.condition = c.path;
+ card.innerHTML = `
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `;
+ wizard.appendChild(card);
+ });
+ setStatus('projectStatus', `Loaded ${conditions.length} condition(s). Fill in file paths and click Create Project.`);
+}
+
+function toggleFqMode(id, mode) {
+ document.getElementById(id + '_dirpanel').style.display = mode === 'dir' ? '' : 'none';
+ document.getElementById(id + '_filespanel').style.display = mode === 'files' ? '' : 'none';
+}
+
+function addFastqFile(id) {
+ openPathBrowser('', 'multifiles', (paths) => {
+ const list = document.getElementById(id + '_filelist');
+ paths.forEach(path => {
+ // Avoid duplicates
+ let exists = false;
+ list.querySelectorAll('li').forEach(li => { if (li.dataset.path === path) exists = true; });
+ if (exists) return;
+ const li = document.createElement('li');
+ const span = document.createElement('span');
+ span.textContent = path;
+ const rmBtn = document.createElement('button');
+ rmBtn.type = 'button';
+ rmBtn.textContent = '\u00d7';
+ rmBtn.title = 'Remove';
+ rmBtn.addEventListener('click', () => li.remove());
+ li.appendChild(span);
+ li.appendChild(rmBtn);
+ li.dataset.path = path;
+ list.appendChild(li);
+ });
+ });
+}
+
+window.onload = () => {
+ loadTemplate();
+ switchOutputMode('config');
+};
+
+
"""
diff --git a/README.md b/README.md
index b3dfedfc..9271ea91 100644
--- a/README.md
+++ b/README.md
@@ -46,12 +46,12 @@ More information can be found in the official [documentation](https://monsda.rea
## How does it work
-This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake``` and the ```config.json``` file.
+This repository hosts the executable ```MONSDA.py``` which acts a wrapper around ```Snakemake|Nextflow``` and the ```config.json``` file.
The ```config.json``` holds all the information that is needed to run the jobs and will be parsed by ```MONSDA.py``` and split into sub-configs that can later be found in the directory ```SubSnakes``` or ```SubFlows``` respectively.
To successfully run an analysis pipeline, a few steps have to be followed:
* Directory structure: The structure for the directories is dictated by the condition-tree in the config file
- * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake/Nextflow``` workflow files to run each subworkflow until all processing steps are done.
+ * Config file: This is the central part of the analysis. Depending on this file ```MONSDA.py``` will determine processing steps and generate according config and ```Snakemake|Nextflow``` workflow files to run each subworkflow until all processing steps are done.
## Run the pipeline
diff --git a/configs/samplesheet_template.csv b/configs/samplesheet_template.csv
new file mode 100644
index 00000000..0d553361
--- /dev/null
+++ b/configs/samplesheet_template.csv
@@ -0,0 +1,7 @@
+CONDITION,SAMPLE,GROUP,SEQUENCING,REFERENCE,GTF,GFF,INDEX,PREFIX,DECOY,TYPE,BATCH,IP
+FGUMI/WT/dummylevel,Sample1,ctrl,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,,
+FGUMI/WT/dummylevel,Sample2,ctrl,,,,,,,,,
+FGUMI/WT/dummylevel,Sample3,ctrl,,,,,,,,,
+FGUMI/KO,Sample4,ko,paired,GENOMES/Ecoli/ecoli.fa.gz,GENOMES/Ecoli/ecoli.gtf.gz,GENOMES/Ecoli/ecoli.gff.gz,,,,,,
+FGUMI/KO,Sample5,ko,,,,,,,,,
+FGUMI/KO,Sample6,ko,,,,,,,,,
diff --git a/configs/template.json b/configs/template.json
index 9ab0bb95..ad52f0ed 100644
--- a/configs/template.json
+++ b/configs/template.json
@@ -120,7 +120,8 @@
"DEDUP": { #options for deduplication for each sample/condition
"TOOLS": {
"umitools": "umi_tools",
- "picard": "picard"
+ "picard": "picard",
+ "fgumi": "fgumi"
},
"id": {
"condition": {
@@ -141,6 +142,13 @@
"JAVA" : "",# options
"DEDUP": "" # dedup options
}
+ },
+ "fgumi":{
+ "OPTIONS":
+ {
+ "EXTRACT": "", # fgumi extract options, e.g. --read-structures 8M+T +T
+ "DEDUP": "" # fgumi dedup options
+ }
}
}
}
@@ -233,8 +241,8 @@
"salmon": {
"OPTIONS":
{
- "INDEX": "-l A --gencode", # salmon index options
- "QUANT": "--gcBias", # salmon quant options
+ "INDEX": "", # salmon index options
+ "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options
}
}
}
@@ -454,8 +462,8 @@
{
"OPTIONS":
{
- "INDEX": "-l A --gencode", # salmon index options
- "QUANT": "--gcBias", # salmon quant options
+ "INDEX": "", # salmon index options
+ "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options
"DTU": "" # Options for Analysis
}
},
@@ -463,8 +471,8 @@
{
"OPTIONS":
{
- "INDEX": "-l A --gencode", # salmon index options
- "QUANT": "--gcBias", # salmon quant options
+ "INDEX": "", # salmon index options
+ "QUANT": "--validateMappings --seqBias --gcBias", # salmon quant options
"DTU": "" # Options for Analysis
}
}
diff --git a/configs/template_base_commented.json b/configs/template_base_commented.json
index 8b2c1388..93928bb8 100644
--- a/configs/template_base_commented.json
+++ b/configs/template_base_commented.json
@@ -58,22 +58,36 @@
},
"BASECALL": {
"TOOLS": {
- "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller"
+ "guppy": "~/.local/bin/guppy-cpu/bin/guppy_basecaller",
+ "dorado": "dorado"
},
"ENV" : "",
"BIN" : "",
"guppy": {
"comment": {
- "BASECALL": "Guppy options here if any, paired is not required, will be resolved by rules"
+ "BASECALL": "Guppy caller options here if any",
+ "MODEL": "Guppy model options here if any"
},
"OPTIONS": {
- "BASECALL": ""
+ "BASECALL": "",
+ "MODEL": ""
+ }
+ },
+ "dorado": {
+ "comment": {
+ "CALLER": "Dorado caller options here if any",
+ "MODEL": "Dorado model options here if any"
+ },
+ "OPTIONS": {
+ "CALLER": "",
+ "MODEL": ""
}
}
},
"QC": {
"TOOLS": {
- "fastqc": "fastqc"
+ "fastqc": "fastqc",
+ "rustqc": "rustqc"
},
"ENV" : "",
"BIN" : "",
@@ -86,13 +100,24 @@
"QC": "",
"MULTI": ""
}
+ },
+ "rustqc": {
+ "comment": {
+ "QC": "RustQC options here if any, post-alignment QC",
+ "MULTI": "MultiQC options for rustqc if any"
+ },
+ "OPTIONS": {
+ "QC": "",
+ "MULTI": ""
+ }
}
},
"TRIMMING": {
"TOOLS": {
"trimgalore": "trim_galore",
"cutadapt": "cutadapt",
- "bbduk": "bbmap"
+ "bbduk": "bbmap",
+ "fastp": "fastp"
},
"ENV" : "",
"BIN" : "",
@@ -119,11 +144,21 @@
"OPTIONS": {
"TRIM": "-q 15 --length 8 -e 0.15"
}
+ },
+ "fastp": {
+ "comment": {
+ "TRIM": "Trimming options here, --paired is not required, will be resolved by rules"
+ },
+ "OPTIONS": {
+ "TRIM": "-q 15 -l 8"
+ }
}
},
"DEDUP": {
"TOOLS": {
- "umitools": "umi_tools"
+ "umitools": "umi_tools",
+ "picard": "picard",
+ "fgumi": "fgumi"
},
"ENV" : "",
"BIN" : "",
@@ -150,6 +185,18 @@
"JAVA" : "",
"DEDUP": ""
}
+ },
+ "fgumi": {
+ "comment":
+ {
+ "EXTRACT": "fgumi extract options, e.g. --read-structures 8M+T +T",
+ "DEDUP": "fgumi dedup options"
+ },
+ "OPTIONS":
+ {
+ "EXTRACT": "",
+ "DEDUP": ""
+ }
}
},
"MAPPING": {
@@ -275,8 +322,8 @@
},
"OPTIONS":
{
- "INDEX": "-l A --gencode",
- "QUANT": "--gcBias"
+ "INDEX": "",
+ "QUANT": "--validateMappings --seqBias --gcBias"
}
}
},
@@ -481,8 +528,8 @@
"DTU": "Options for DTU Analysis, leave blank unless you know exactly what you are doing, can be used to set cutoffs for filtering, e.g. min_samps_feature_expr, min_gene_expr ..."
},
"OPTIONS": {
- "INDEX": "-l A --gencode -d GENOMES/salmon_decoy",
- "QUANT": "--gcBias",
+ "INDEX": "-d GENOMES/salmon_decoy",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
},
@@ -493,8 +540,8 @@
"DTU": "Options for DTU Analysis, leave blank unless you know exactly what you are doing, , can be used to set cutoffs for filtering, e.g. min_samps_feature_expr, min_gene_expr ..."
},
"OPTIONS": {
- "INDEX": "-l A --gencode -d GENOMES/salmon_decoy",
- "QUANT": "--gcBias",
+ "INDEX": "-d GENOMES/salmon_decoy",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
}
diff --git a/configs/template_clean.json b/configs/template_clean.json
index 825e7335..78ea39f1 100644
--- a/configs/template_clean.json
+++ b/configs/template_clean.json
@@ -110,7 +110,8 @@
},
"DEDUP": {
"TOOLS": {
- "umitools": "umi_tools"
+ "umitools": "umi_tools",
+ "fgumi": "fgumi"
},
"id": {
"condition": {
@@ -128,6 +129,13 @@
"JAVA" : "",
"DEDUP": ""
}
+ },
+ "fgumi": {
+ "OPTIONS":
+ {
+ "EXTRACT": "",
+ "DEDUP": ""
+ }
}
}
}
@@ -224,8 +232,8 @@
"salmon": {
"OPTIONS":
{
- "INDEX": "-l A --gencode",
- "QUANT": "--gcBias"
+ "INDEX": "",
+ "QUANT": "--validateMappings --seqBias --gcBias"
}
}
}
@@ -428,8 +436,8 @@
{
"OPTIONS":
{
- "INDEX": "-l A --gencode",
- "QUANT": "--gcBias",
+ "INDEX": "",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
},
@@ -437,8 +445,8 @@
{
"OPTIONS":
{
- "INDEX": "-l A --gencode",
- "QUANT": "--gcBias",
+ "INDEX": "",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
}
diff --git a/configs/template_umicollapse.json b/configs/template_umicollapse.json
new file mode 100644
index 00000000..cff24128
--- /dev/null
+++ b/configs/template_umicollapse.json
@@ -0,0 +1,118 @@
+{
+ "WORKFLOWS": "QC,MAPPING,DEDUP,COUNTING",
+ "BINS": "",
+ "MAXTHREADS": "20",
+ "VERSION": "1.4.0",
+ "SETTINGS": {
+ "id": {
+ "condition": {
+ "SAMPLES": [
+ "rep_1",
+ "rep_2"
+ ],
+ "GROUPS": [
+ "WT",
+ "WT"
+ ],
+ "TYPES": [
+ "standard",
+ "standard"
+ ],
+ "BATCHES": [
+ "1",
+ "1"
+ ],
+ "SEQUENCING": "paired",
+ "REFERENCE": "GENOMES/Dm6/dm6.fa.gz",
+ "INDEX": "GENOMES/Dm6/INDICES/star",
+ "PREFIX": "idx",
+ "ANNOTATION": {
+ "GTF": "GENOMES/Dm6/dm6.gtf.gz",
+ "GFF": "GENOMES/Dm6/dm6.gff3.gz"
+ }
+ }
+ }
+ },
+ "QC": {
+ "TOOLS": {
+ "fastqc": "fastqc"
+ },
+ "id": {
+ "condition": {
+ "fastqc": {
+ "OPTIONS": {
+ "QC": "",
+ "MULTI": ""
+ }
+ }
+ }
+ }
+ },
+ "MAP": {
+ "TOOLS": {
+ "star": "STAR"
+ },
+ "id": {
+ "condition": {
+ "star": {
+ "OPTIONS": {
+ "INDEX": "--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --genomeSAindexNbases 13",
+ "MAP": "--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --outSAMprimaryFlag AllBestScore --outSAMattributes NH HI NM MD AS nM jM jI XS",
+ "EXTENSION": ""
+ }
+ }
+ }
+ }
+ },
+ "DEDUP": {
+ "TOOLS": {
+ "umicollapse": "umicollapse",
+ "umitools": "umi_tools",
+ "fgumi": "fgumi",
+ "picard": "picard"
+ },
+ "id": {
+ "condition": {
+ "umicollapse": {
+ "OPTIONS": {
+ "DEDUP": "--algo dir --umi-sep _"
+ }
+ },
+ "umitools": {
+ "OPTIONS": {
+ "WHITELIST": "--extract-method string --bc-pattern 'XNNNNX'",
+ "EXTRACT": "--extract-umi-method read_id",
+ "DEDUP": ""
+ }
+ },
+ "fgumi": {
+ "OPTIONS": {
+ "EXTRACT": "",
+ "DEDUP": ""
+ }
+ },
+ "picard": {
+ "OPTIONS": {
+ "JAVA": "",
+ "DEDUP": ""
+ }
+ }
+ }
+ }
+ },
+ "COUNTING": {
+ "TOOLS": {
+ "salmon": "salmon"
+ },
+ "id": {
+ "condition": {
+ "salmon": {
+ "OPTIONS": {
+ "INDEX": "-k 31",
+ "COUNTING": "-l U --validateMappings"
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/configs/tutorial_exhaustive.json b/configs/tutorial_exhaustive.json
index e811dec3..da2cecac 100644
--- a/configs/tutorial_exhaustive.json
+++ b/configs/tutorial_exhaustive.json
@@ -128,7 +128,8 @@
"TOOLS" :
{
"trimgalore": "trim_galore",
- "cutadapt": "cutadapt"
+ "cutadapt": "cutadapt",
+ "fastp": "fastp"
},
"Ecoli": {
"KO": {
@@ -143,6 +144,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
},
"WT": {
@@ -158,6 +165,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
}
}
diff --git a/configs/tutorial_postprocess.json b/configs/tutorial_postprocess.json
index aa5ba370..79ca138d 100644
--- a/configs/tutorial_postprocess.json
+++ b/configs/tutorial_postprocess.json
@@ -122,7 +122,8 @@
"TOOLS" :
{
"trimgalore": "trim_galore",
- "cutadapt": "cutadapt"
+ "cutadapt": "cutadapt",
+ "fastp": "fastp"
},
"Ecoli": {
"KO": {
@@ -137,6 +138,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
},
"WT": {
@@ -152,6 +159,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
}
}
diff --git a/configs/tutorial_toolmix.json b/configs/tutorial_toolmix.json
index e3b080fa..2f91f471 100644
--- a/configs/tutorial_toolmix.json
+++ b/configs/tutorial_toolmix.json
@@ -120,7 +120,8 @@
"TOOLS" :
{
"trimgalore": "trim_galore",
- "cutadapt": "cutadapt"
+ "cutadapt": "cutadapt",
+ "fastp": "fastp"
},
"Ecoli": {
"KO": {
@@ -135,6 +136,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
},
"WT": {
@@ -150,6 +157,12 @@
{
"TRIM": "-m 8 -e 0.15" # trimming options here, --KO is not required, will be resolved by rules
}
+ },
+ "fastp":{
+ "OPTIONS":
+ {
+ "TRIM": "-q 15 -l 8" # trimming options here, --KO is not required, will be resolved by rules
+ }
}
}
}
diff --git a/containers/apptainer/fgumi.def b/containers/apptainer/fgumi.def
new file mode 100644
index 00000000..4d305696
--- /dev/null
+++ b/containers/apptainer/fgumi.def
@@ -0,0 +1,23 @@
+Bootstrap: docker
+From: continuumio/miniconda3
+
+%files
+ /home/fall/MONSDA/envs/fgumi.yaml /opt/envs/
+ ${HOME}/MONSDA/scripts /opt/MONSDA/
+
+%environment
+
+%post
+ ls -alrt /opt/envs
+ chmod -R +x /opt/envs/fgumi.yaml
+
+ ENV_NAME=fgumi
+ echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT
+ echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT
+
+ . /opt/conda/etc/profile.d/conda.sh
+ conda env create -f /opt/envs/fgumi.yaml -p /opt/conda/envs/$ENV_NAME
+ conda clean --all
+
+%runscript
+ exec "$@"
diff --git a/containers/apptainer/rustqc.def b/containers/apptainer/rustqc.def
new file mode 100644
index 00000000..2d9f57b8
--- /dev/null
+++ b/containers/apptainer/rustqc.def
@@ -0,0 +1,23 @@
+Bootstrap: docker
+From: continuumio/miniconda3
+
+%files
+ /home/fall/MONSDA/envs/rustqc.yaml /opt/envs/
+ ${HOME}/MONSDA/scripts /opt/MONSDA/
+
+%environment
+
+%post
+ ls -alrt /opt/envs
+ chmod -R +x /opt/envs/rustqc.yaml
+
+ ENV_NAME=rustqc
+ echo ". /opt/conda/etc/profile.d/conda.sh" >> $APPTAINER_ENVIRONMENT
+ echo "conda activate $ENV_NAME" >> $APPTAINER_ENVIRONMENT
+
+ . /opt/conda/etc/profile.d/conda.sh
+ conda env create -f /opt/envs/rustqc.yaml -p /opt/conda/envs/$ENV_NAME
+ conda clean --all
+
+%runscript
+ exec "$@"
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 2d1afe53..ebd55264 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -57,10 +57,10 @@ This slightly more complex use case involves multiple input files, two condition
Workflows include:
- FETCH: Download from SRA
- - QC: FASTQC of input and output
+ - QC: FASTQC and RustQC of input and output
- TRIMMING: Adaptor removal with cutadapt/trimgalore
- MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2
- - DEDUP: Read deduplication with umi_tools and picard
+ - DEDUP: Read deduplication with umi_tools, fgumi and picard
The more complex config for this analysis follows
@@ -74,7 +74,7 @@ Starting the run with 12 cores (defining more will be capped by the config file
monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_toolmix.json --directory ${PWD}
-Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls.
+Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/INDICES" directory containing the built index, a "QC" directory containing all *fastqc*/*rustqc* reports and *multiqc* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, a "DE" directory will be created which will hold output from counting with featurecounts and DE input and output from *EdgeR* and *DESeq2*. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and the "JOBS" directory with all command-line calls.
A successful run will show the message 'Workflow finished, no error' at the end.
@@ -86,10 +86,10 @@ This postprocessing use case involves multiple input files, two conditions (WT/K
Workflows include:
- FETCH: Download from SRA
- - QC: FASTQC of input and output
+ - QC: FASTQC and RustQC of input and output
- TRIMMING: Adaptor removal with cutadapt/trimgalore
- MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2
- - DEDUP: Read deduplication with umi_tools and picard
+ - DEDUP: Read deduplication with umi_tools, fgumi and picard
- DE: Differential Expression analysis with EdgeR and DESeq2
- DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime)
- COUNTING: Read counting with FeatureCounts
@@ -108,7 +108,7 @@ Starting the run with 12 cores (defining more will be capped by the config file
monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_postprocess.json --directory ${PWD}
-Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls.
+Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for *salmon* later on, a "QC" directory containing all *FastQC*/*RustQC* reports and *MultiQC* output, a "TRIMMED_FASTQ" directory for *trimgalore* and *cutadapt* output, a "DEDUP" directory for *umi_tools*/*fgumi* (runs before trimming and after mapping) and *picard* (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU" directories will be created which will hold output from counting with FeatureCounts and DE/DEU input and output from *EdgeR* and *DESeq2* respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls.
A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal.
@@ -122,10 +122,10 @@ This exhaustive use case involves multiple input files, two conditions (WT/KO) a
Workflows include:
- FETCH: Download from SRA
- - QC: FASTQC of input and output
+ - QC: FASTQC and RustQC of input and output
- TRIMMING: Adaptor removal with cutadapt/trimgalore
- MAPPING: Read mapping with STAR, hisat2, bwa, segemehl3 and minimap2
- - DEDUP: Read deduplication with umi_tools and picard
+ - DEDUP: Read deduplication with umi_tools, fgumi and picard
- DE: Differential Expression analysis with EdgeR and DESeq2
- DEU: Differential Exon Usage analysis with EdgeR (DEXSeq skipped, runtime)
- DAS: Differential Alternative Splicing analysis with EdgeR and DIEGO
@@ -186,6 +186,6 @@ Starting the run with 12 cores (defining more will be capped by the config file
monsda -j 12 -c ${CONDA_PREFIX}/share/MONSDA/configs/tutorial_exhaustive.json --directory ${PWD}
-Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls.
+Will start the run in the current directory and generate a "FASTQ" sub-directory containing the downloaded sample, a "GENOME/Ecoli/INDICES" directory containing the built indices, including the one built for salmon later on, a "QC" directory containing all FASTQC/RustQC reports and MULTIQC output, a "TRIMMED_FASTQ" directory for trimgalore and cutadapt output, a "DEDUP" directory for umi_tools/fgumi (runs before trimming and after mapping) and picard (runs after mapping) output and a "MAPPING" directory containing the mapped files. Furthermore, "DE/DEU/DAS/DTU" directories will be created which will hold output from counting with FeatureCounts (or salmon for DTU) and DE/DEU/DAS/DTU input and output from EDGER, DESeq2, Diego and DrimSeq respectively. Again, **MONSDA** will create a "LOG" directory containing it's own log, as well as logs of all executed jobs and again a "JOBS" directory for command-line calls.
A successful run will show the message 'Workflow finished, no error'. Be aware that this is indeed an exhaustive workflow and will require a decent amount of disk-space, memory and compute-time, depending on the hardware at your disposal.
\ No newline at end of file
diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst
index cccf07e6..008756bc 100644
--- a/docs/source/workflows.rst
+++ b/docs/source/workflows.rst
@@ -84,6 +84,8 @@ This workflow step can be run as preprocessing step if none of the processing wo
+============================+============================================================+=========+=========+=========================================================================+============+===========+
| FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML |
+----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+
+ | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG |
+ +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+
PROCESSING
@@ -103,6 +105,8 @@ If any of the below listed processing steps is defined in the config.json, quali
+============================+============================================================+=========+=========+=========================================================================+============+===========+
| FASTQC (includes MULTIQC) | A quality control tool for high throughput sequence data. | fastqc | fastqc | `fastqc `_ | FASTQ/BAM | ZIP/HTML |
+----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+
+ | RustQC (includes MULTIQC) | High-performance RNA-seq QC suite with MultiQC-compatible outputs. | rustqc | rustqc | `rustqc `_ | BAM | TEXT/TSV/LOG |
+ +----------------------------+------------------------------------------------------------+---------+---------+-------------------------------------------------------------------------+------------+-----------+
Trimming
@@ -173,6 +177,8 @@ Deduplicate reads by UMI or based on mapping position and CIGAR string
+===============+====================================================================================================================================================+===========+============+====================================================================================================+======================+============+
| UMI-tools | UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes. | umitools | umi_tools | `umitools `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+
+ | fgumi | High-performance tools for UMI-tagged sequencing data including UMI extraction and UMI-aware deduplication. | fgumi | fgumi | `fgumi `_ | FASTQ/TRIMMED_FASTQ | FASTQ/BAM |
+ +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+
| Picard tools | A better duplication marking algorithm that handles all cases including clipped and gapped alignments. | picard | picard | `picard `_ | BAM | BAM |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------+----------------------+------------+
diff --git a/envs/fgumi.yaml b/envs/fgumi.yaml
new file mode 100644
index 00000000..9bffb96d
--- /dev/null
+++ b/envs/fgumi.yaml
@@ -0,0 +1,9 @@
+name: fgumi
+channels:
+ - conda-forge
+ - bioconda
+ - nodefaults
+dependencies:
+ - samtools =1.21
+ - fgumi =0.1.3
+ - dateutils =0.6.12
diff --git a/envs/monsda.yaml b/envs/monsda.yaml
index 39538a1d..697dc095 100644
--- a/envs/monsda.yaml
+++ b/envs/monsda.yaml
@@ -4,29 +4,29 @@ channels:
- bioconda
- nodefaults
dependencies:
- - biopython =1.86
- - fastapi =0.128.5
+ - biopython =1.87
+ - fastapi =0.136.1
- grep >=3.4
- - isort =7.0.0
+ - isort =8.0.1
- monsda =1.4.0
- natsort =8.4.0
- - nextflow =25.10.3
- - numpy =2.4.2
+ - nextflow =26.04.0
+ - numpy =2.4.3
- pandas =2.3.3
- perl =5.32.1
- pip >=25.3
- python =3.12.2
- pyyaml =6.0.3
- - scipy =1.17.0
- - snakemake =9.16.3
- - snakemake-executor-plugin-slurm =2.1.0
+ - scipy =1.17.1
+ - snakemake =9.20.0
+ - snakemake-executor-plugin-slurm =2.6.1
- snakemake-executor-plugin-cluster-generic =1.0.9
- - snakemake-interface-common =1.22.0
- - snakemake-interface-executor-plugins =9.3.9
+ - snakemake-interface-common =1.23.0
+ - snakemake-interface-executor-plugins =9.4.0
- snakemake-interface-report-plugins =1.3.0
- - snakemake-interface-storage-plugins =4.3.2
+ - snakemake-interface-storage-plugins =4.4.1
- snakemake-storage-plugin-s3 =0.3.6
- snakemake-storage-plugin-ftp =0.1.3
- - snakemake-storage-plugin-http =0.3.0
+ - snakemake-storage-plugin-http =0.3.1
- snakemake-storage-plugin-zenodo =0.1.5
- - uvicorn =0.40.0
\ No newline at end of file
+ - uvicorn =0.46.0
\ No newline at end of file
diff --git a/envs/monsda_min.yaml b/envs/monsda_min.yaml
index 5b0f3cdb..b2939fa2 100644
--- a/envs/monsda_min.yaml
+++ b/envs/monsda_min.yaml
@@ -4,28 +4,28 @@ channels:
- bioconda
- nodefaults
dependencies:
- - biopython =1.86
- - fastapi =0.128.5
+ - biopython =1.87
+ - fastapi =0.136.1
- grep >=3.4
- - isort =7.0.0
+ - isort =8.0.1
- natsort =8.4.0
- - nextflow =25.10.3
- - numpy =2.4.2
+ - nextflow =26.04.0
+ - numpy =2.4.3
- pandas =2.3.3
- perl =5.32.1
- pip >=25.3
- python =3.12.2
- pyyaml =6.0.3
- - scipy =1.17.0
- - snakemake =9.16.3
- - snakemake-executor-plugin-slurm =2.1.0
+ - scipy =1.17.1
+ - snakemake =9.20.0
+ - snakemake-executor-plugin-slurm =2.6.1
- snakemake-executor-plugin-cluster-generic =1.0.9
- - snakemake-interface-common =1.22.0
- - snakemake-interface-executor-plugins =9.3.9
+ - snakemake-interface-common =1.23.0
+ - snakemake-interface-executor-plugins =9.4.0
- snakemake-interface-report-plugins =1.3.0
- - snakemake-interface-storage-plugins =4.3.2
+ - snakemake-interface-storage-plugins =4.4.1
- snakemake-storage-plugin-s3 =0.3.6
- snakemake-storage-plugin-ftp =0.1.3
- - snakemake-storage-plugin-http =0.3.0
+ - snakemake-storage-plugin-http =0.3.1
- snakemake-storage-plugin-zenodo =0.1.5
- uvicorn =0.40.0
diff --git a/envs/salmon.yaml b/envs/salmon.yaml
index 071b81f9..cfff7361 100644
--- a/envs/salmon.yaml
+++ b/envs/salmon.yaml
@@ -3,4 +3,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- - salmon =1.10.3
\ No newline at end of file
+ - salmon =1.11.4
\ No newline at end of file
diff --git a/envs/umicollapse.yaml b/envs/umicollapse.yaml
new file mode 100644
index 00000000..06018058
--- /dev/null
+++ b/envs/umicollapse.yaml
@@ -0,0 +1,10 @@
+name: umicollapse
+channels:
+ - conda-forge
+ - bioconda
+ - nodefaults
+dependencies:
+ - samtools =1.21
+ - umicollapse
+ - umi_tools =1.1.2
+ - dateutils =0.6.12
diff --git a/profile_nextflow/nextflow.config b/profile_nextflow/nextflow.config
index aec407b5..6d15e289 100644
--- a/profile_nextflow/nextflow.config
+++ b/profile_nextflow/nextflow.config
@@ -1,27 +1,82 @@
+// Queue-agnostic fallback defaults for ALL processes.
+// These apply regardless of which executor/profile is active.
+// Inline process directives (e.g. memory 16.GB in sortsam) take precedence.
+process {
+ cpus = 1
+ errorStrategy = 'retry'
+ maxRetries = 2
+ memory = { 20.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ time = { 8.h * (1 << ((task.attempt ?: 1) - 1)) }
+}
+
profiles {
- slurm {
- process.executor = 'slurm'
- process.memory = '10 GB'
- process.queue = 'main'
- withName: '_idx|_map' {
- memory = '160GB'
+ slurm {
+ process.executor = 'slurm'
+ process.queue = 'c' // default partition – override per-rule below
+
+ // Non-default overrides only – rules that need more than the global defaults.
+ process {
+ withName: 'bwa_idx|bwa2_idx|bwameth_idx|hisat2_idx|minimap_idx|segemehl_idx|segemehl3_idx|star_idx' {
+ memory = { 80.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'bwa_mapping|bwa2_mapping|bwameth_mapping|hisat2_mapping|minimap_mapping|segemehl_mapping|segemehl3_mapping|star_mapping' {
+ memory = { 80.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'salmon_idx|kallisto_idx' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'mqc' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'run_deseq2|run_edger' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ queue = 'medium'
+ time = { 18.h * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'run_drimseq|run_dexseq' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ queue = 'long'
+ time = { 120.h * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'atacorrect' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'bindetect' {
+ memory = { 60.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ }
+ withName: 'gridss_call|gridss_setupreference' {
+ memory = { 40.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ cpus = 8
+ }
+ }
}
- }
- local {
- process.executor = 'local'
- }
+ local {
+ process.executor = 'local'
+ }
- apptainer {
- conda.enabled = true
- apptainer.enabled = true
- apptainer.autoMounts = true
- docker.enabled = false
- process {
- withName: '_idx|_map' {
- memory = '160GB'
+ apptainer {
+ conda.enabled = true
+ apptainer.enabled = true
+ apptainer.autoMounts = true
+ docker.enabled = false
+ process {
+ beforeScript = '''
+ if [[ -n "${CONDA_PREFIX:-}" && -f "${CONDA_PREFIX}/etc/profile.d/conda.sh" ]]; then
+ source "${CONDA_PREFIX}/etc/profile.d/conda.sh"
+ elif [[ -n "${CONDA_EXE:-}" ]]; then
+ _conda_base="$(dirname "$(dirname "${CONDA_EXE}")")"
+ source "${_conda_base}/etc/profile.d/conda.sh"
+ elif [[ -f "${HOME}/miniconda3/etc/profile.d/conda.sh" ]]; then
+ source "${HOME}/miniconda3/etc/profile.d/conda.sh"
+ elif [[ -f "${HOME}/anaconda3/etc/profile.d/conda.sh" ]]; then
+ source "${HOME}/anaconda3/etc/profile.d/conda.sh"
+ else
+ echo "Could not locate conda.sh" >&2
+ exit 1
+ fi
+ conda activate apptainer
+ '''
}
- beforeScript = 'source ~/anaconda3/etc/profile.d/conda.sh; conda activate apptainer; export PATH=${PATH}:${CONDA_PREFIX}/bin'
}
- }
}
\ No newline at end of file
diff --git a/profile_snakemake/cluster_config.yaml b/profile_snakemake/cluster_config.yaml
index 226538a4..f9f7b122 100644
--- a/profile_snakemake/cluster_config.yaml
+++ b/profile_snakemake/cluster_config.yaml
@@ -1,6 +1,6 @@
__default__:
- account: user # your account name
- partition: main # the partition to use
+ account: ${USER} # your account name
+ partition: c # the partition to use
time: 1500 # default time (minutes)
nodes: 1
output: "/SLURMLOG/{rule}.{wildcards}.out"
@@ -13,3 +13,6 @@ generate_index:
mapping:
mem: 200GB
+
+multiqc:
+ mem: 30GB
diff --git a/profile_snakemake/config.v8+.yaml b/profile_snakemake/config.v8+.yaml
old mode 100644
new mode 100755
index 548731c7..a984a4d2
--- a/profile_snakemake/config.v8+.yaml
+++ b/profile_snakemake/config.v8+.yaml
@@ -1,41 +1,52 @@
executor: slurm
-cluster-generic-submit-cmd:
- mkdir -p LOGS/SLURM/{rule} &&
- sbatch
- --partition={resources.partition}
- --cpus-per-task={threads}
- --mem={resources.mem}
- --account={resources.account}
- --job-name=smk-{rule}-{wildcards}
- --output=LOGS/SLURM/{rule}/{rule}-{wildcards}-%j.out
- --parsable
+
default-resources:
- slurm_account: "user" # your account name
- slurm_partition: "c" # the partition to use
+ slurm_account: "${SLURM_ACCOUNT}" # your account name
+ slurm_partition: "${SLURM_PARTITION}" # the partition to use
nodes: 1
- runtime: "8h"
- mem: "20GB" # default memory
+ mem_mb: "20000 * (2 ** (attempt - 1))"
+ runtime: "480 * (2 ** (attempt - 1))"
set-resources:
generate_index:
- mem: "160GB"
+ mem_mb: "80000 * (2 ** (attempt - 1))"
mapping:
- mem: "160GB"
- sortsam:
- mem: "30GB"
- count_mappers:
- mem: "40GB"
+ mem_mb: "80000 * (2 ** (attempt - 1))"
multiqc:
- mem: "100GB"
+ mem_mb: "40000 * (2 ** (attempt - 1))"
run_deseq2:
- mem: "40GB"
- slurm_extra: "'--qos=medium'"
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ slurm_qos: "medium"
+ runtime: "18h"
+ run_DTU:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ slurm_qos: "long"
+ runtime: "120h"
+ run_edger:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ slurm_qos: "medium"
runtime: "18h"
+ salmon_index:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ sortsam:
+ mem_mb: "16000 * (2 ** (attempt - 1))"
+ slurm_qos: "rapid"
+ runtime: "1h"
+ atacorrect:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ bindetect:
+ mem_mb: "60000 * (2 ** (attempt - 1))"
+ gridss_call:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ threads: 8
+ gridss_setupreference:
+ mem_mb: "40000 * (2 ** (attempt - 1))"
+ threads: 8
jobs: 1000
keep-going: True
rerun-incomplete: True
scheduler: greedy
max-jobs-per-second: 10
max-status-checks-per-second: 10
-restart-times: 3
+restart-times: 1
local-cores: 1
latency-wait: 600
diff --git a/profile_snakemake/config.yaml b/profile_snakemake/config.yaml
new file mode 100644
index 00000000..a4e4edb5
--- /dev/null
+++ b/profile_snakemake/config.yaml
@@ -0,0 +1,12 @@
+restart-times: 3
+jobscript: "slurm-jobscript.sh"
+cluster-generic-submit-cmd: "slurm-submit.py"
+#cluster-status: "slurm-status.py"
+max-jobs-per-second: 1
+max-status-checks-per-second: 3
+local-cores: 1
+latency-wait: 600
+#use-conda: True
+keep-going: True
+rerun-incomplete: True
+#printshellcmds: True
diff --git a/scripts/Analysis/build_count_table.py b/scripts/Analysis/build_count_table.py
index 64401669..5a220a8a 100755
--- a/scripts/Analysis/build_count_table.py
+++ b/scripts/Analysis/build_count_table.py
@@ -383,6 +383,8 @@ def prepare_table(
exc_tb,
)
log.error(logid + "".join(tbe.format()))
+ print("".join(tbe.format()), file=sys.stderr)
+ raise SystemExit(1)
def make_sample_list(group_name):
@@ -433,6 +435,8 @@ def make_sample_list(group_name):
exc_tb,
)
log.error(logid + "".join(tbe.format()))
+ print("".join(tbe.format()), file=sys.stderr)
+ raise SystemExit(1)
#
# build_count_table_simple.py ends here
diff --git a/tests/data/README_fgumi_test_data.md b/tests/data/README_fgumi_test_data.md
new file mode 100644
index 00000000..4902a8c1
--- /dev/null
+++ b/tests/data/README_fgumi_test_data.md
@@ -0,0 +1,27 @@
+# fgumi UMI smoke-test data
+
+This directory contains a tiny synthetic paired-end fixture generator for `fgumi`:
+
+- `make_fgumi_umi_fixture.py` writes
+ - `FASTQ/Test/umi/FGUMI01_R1.fastq.gz`
+ - `FASTQ/Test/umi/FGUMI01_R2.fastq.gz`
+- `config_fgumi_test.json` is a minimal tutorial-style MONSDA config that enables only `DEDUP` with `fgumi`.
+
+## Why synthetic data?
+
+Most publicly referenced UMI tutorial datasets are not truly small enough for quick CI-style smoke tests.
+The synthetic fixture keeps runtime tiny and deterministic while still exercising UMI extraction behavior.
+
+## External UMI datasets (if you want real data)
+
+- Galaxy Training (CEL-Seq2 UMI tutorial data on Zenodo):
+ - https://zenodo.org/record/2573177
+ - Files:
+ - `test_barcodes_celseq2_R1.fastq.gz` (~243.7 MB)
+ - `test_barcodes_celseq2_R2.fastq.gz` (~594.9 MB)
+- Galaxy tutorial page:
+ - https://training.galaxyproject.org/training-material/topics/single-cell/tutorials/scrna-umis/tutorial.html
+
+For nf-core test datasets, use branch discovery/search tooling first:
+- https://github.com/nf-core/test-datasets
+- https://github.com/nf-core/test-datasets/blob/master/docs/USE_EXISTING_DATA.md
diff --git a/tests/data/config_Test.json b/tests/data/config_Test.json
index f002306c..e7eededd 100644
--- a/tests/data/config_Test.json
+++ b/tests/data/config_Test.json
@@ -254,8 +254,8 @@
"Test": {
"drimseq": {
"OPTIONS": {
- "INDEX": "-l A --gencode -d GENOMES/transcripts.fa.gz",
- "QUANT": "--gcBias",
+ "INDEX": "-d GENOMES/transcripts.fa.gz",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
}
diff --git a/tests/data/config_Test_local.json b/tests/data/config_Test_local.json
index 56d924aa..19bc01a5 100644
--- a/tests/data/config_Test_local.json
+++ b/tests/data/config_Test_local.json
@@ -254,8 +254,8 @@
"Test": {
"drimseq": {
"OPTIONS": {
- "INDEX": "-l A --gencode -d GENOMES/transcripts.fa.gz",
- "QUANT": "--gcBias",
+ "INDEX": "-d GENOMES/transcripts.fa.gz",
+ "QUANT": "--validateMappings --seqBias --gcBias",
"DTU": ""
}
}
diff --git a/tests/data/config_fgumi_test.json b/tests/data/config_fgumi_test.json
new file mode 100644
index 00000000..a19d1bc7
--- /dev/null
+++ b/tests/data/config_fgumi_test.json
@@ -0,0 +1,47 @@
+{
+ "WORKFLOWS": "DEDUP",
+ "BINS": "",
+ "MAXTHREADS": "2",
+ "VERSION": "FIXME",
+ "SETTINGS": {
+ "Test": {
+ "umi": {
+ "SAMPLES": [
+ "FGUMI01"
+ ],
+ "SEQUENCING": "paired",
+ "REFERENCE": "GENOME/genome.fa.gz",
+ "ANNOTATION": {
+ "GTF": "GENOME/genomic.gtf.gz",
+ "GFF": "GENOME/genomic.gff.gz"
+ },
+ "GROUPS": [
+ "umi"
+ ],
+ "TYPES": [
+ "fgumi_smoke"
+ ],
+ "BATCHES": [
+ "1"
+ ],
+ "INDEX": "",
+ "PREFIX": ""
+ }
+ }
+ },
+ "DEDUP": {
+ "TOOLS": {
+ "fgumi": "fgumi"
+ },
+ "Test": {
+ "umi": {
+ "fgumi": {
+ "OPTIONS": {
+ "EXTRACT": "--read-structures 6M+T +T",
+ "DEDUP": ""
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/tests/data/make_fgumi_umi_fixture.py b/tests/data/make_fgumi_umi_fixture.py
new file mode 100644
index 00000000..79d9dbf6
--- /dev/null
+++ b/tests/data/make_fgumi_umi_fixture.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+"""Create a tiny paired-end FASTQ fixture with synthetic UMIs for fgumi tests.
+
+This script writes gzipped FASTQ files into:
+ FASTQ/Test/umi/FGUMI01_R1.fastq.gz
+ FASTQ/Test/umi/FGUMI01_R2.fastq.gz
+
+R1 layout is intentionally compatible with a simple fgumi read structure like:
+ --read-structures 6M+T +T
+where the first 6 bases in R1 are a mock UMI.
+"""
+
+from __future__ import annotations
+
+import gzip
+from pathlib import Path
+
+OUTDIR = Path("FASTQ") / "Test" / "umi"
+SAMPLE = "FGUMI01"
+
+# (umi6, r1_suffix, r2_sequence)
+READS = [
+ ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"),
+ ("ACGTAA", "TTTTTTTTTTTTAACCGGTTCCAA", "GATTACAGATTACAGATTACAGATTACA"), # duplicate
+ ("TGCACT", "TTTTTTTTTTTTGGCCAATTGGCC", "CGTACGTACGTACGTACGTACGTACGTAC"),
+ (
+ "TGCACT",
+ "TTTTTTTTTTTTGGCCAATTGGCC",
+ "CGTACGTACGTACGTACGTACGTACGTAC",
+ ), # duplicate
+ ("GGAACC", "TTTTTTTTTTTTCCGGAATTCCGG", "TTGCAATTGCAATTGCAATTGCAATTGCA"),
+ ("CCTTGG", "TTTTTTTTTTTTAATTCCGGAATT", "AACCGGTTAACCGGTTAACCGGTTAACCG"),
+ ("TTAAGG", "TTTTTTTTTTTTGGAATTCCGGAA", "GGCCAATTGGCCAATTGGCCAATTGGCCA"),
+ ("CCGGTT", "TTTTTTTTTTTTAACCTTGGAACC", "ATATCGCGATATCGCGATATCGCGATATC"),
+]
+
+
+def _fq_record(name: str, seq: str, qual_char: str = "I") -> str:
+ return f"@{name}\n{seq}\n+\n{qual_char * len(seq)}\n"
+
+
+def main() -> None:
+ OUTDIR.mkdir(parents=True, exist_ok=True)
+
+ r1_path = OUTDIR / f"{SAMPLE}_R1.fastq.gz"
+ r2_path = OUTDIR / f"{SAMPLE}_R2.fastq.gz"
+
+ with gzip.open(r1_path, "wt") as r1h, gzip.open(r2_path, "wt") as r2h:
+ for i, (umi, r1_suffix, r2_seq) in enumerate(READS, start=1):
+ read_id = f"{SAMPLE}:{i}"
+ r1_seq = umi + r1_suffix
+ r1h.write(_fq_record(read_id + " 1:N:0:TEST", r1_seq))
+ r2h.write(_fq_record(read_id + " 2:N:0:TEST", r2_seq))
+
+ print(f"Wrote {r1_path}")
+ print(f"Wrote {r2_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_fgumi_smoke.sh b/tests/test_fgumi_smoke.sh
new file mode 100644
index 00000000..fabfd17c
--- /dev/null
+++ b/tests/test_fgumi_smoke.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "${SCRIPT_DIR}"
+
+# Mirror existing integration style: expose tests/data as working inputs
+ln -fs data/* .
+
+# Generate tiny gzipped UMI FASTQ inputs
+python data/make_fgumi_umi_fixture.py
+
+# Keep version in sync with installed MONSDA
+VERSION=$(monsda --version 2>&1 | sed 's/MONSDA version //g')
+sed -i "s/\"VERSION\": \"FIXME\"/\"VERSION\": \"${VERSION}\"/g" config_fgumi_test.json
+
+mkdir -p CONDALIB
+
+# --save keeps this as a lightweight workflow generation smoke test
+monsda -j 2 -c config_fgumi_test.json --directory "${PWD}" --use-conda --conda-prefix CONDALIB --save
+
+echo "FGUMI smoke test workflow generation completed."
diff --git a/workflows/bwa.nf b/workflows/bwa.nf
index be07f149..e436f6f0 100644
--- a/workflows/bwa.nf
+++ b/workflows/bwa.nf
@@ -89,7 +89,7 @@ process bwa_mapping{
if (PAIRED == 'paired'){
r1 = reads[1]
r2 = reads[2]
- fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"")
+ fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"")
pf = fn+"_mapped.sam.gz"
uf1 = fn+"_R1_unmapped.fastq.gz"
uf2 = fn+"_R2_unmapped.fastq.gz"
@@ -99,7 +99,7 @@ process bwa_mapping{
"""
}else{
read = reads[1]
- fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"")
+ fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"")
pf = fn+"_mapped.sam.gz"
uf = fn+"_unmapped.fastq.gz"
lf = "bwa_"+fn+".log"
diff --git a/workflows/countreads.nf b/workflows/countreads.nf
index b542cfab..377410ba 100644
--- a/workflows/countreads.nf
+++ b/workflows/countreads.nf
@@ -147,7 +147,7 @@ process count_mappers{
conda "samtools.yaml"
container "oras://jfallmann/monsda:"+"samtools"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -178,7 +178,7 @@ process featurecount{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -252,7 +252,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/deseq2_DE.nf b/workflows/deseq2_DE.nf
index e39a645c..0c446a12 100644
--- a/workflows/deseq2_DE.nf
+++ b/workflows/deseq2_DE.nf
@@ -22,7 +22,7 @@ process featurecount_deseq{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -94,7 +94,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/dexseq_DEU.nf b/workflows/dexseq_DEU.nf
index 8c954541..0fb02c75 100644
--- a/workflows/dexseq_DEU.nf
+++ b/workflows/dexseq_DEU.nf
@@ -21,7 +21,7 @@ process prepare_deu_annotation{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -61,7 +61,7 @@ process featurecount_dexseq{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -133,7 +133,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DEUREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DEUREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/diego_DAS.nf b/workflows/diego_DAS.nf
index a2a43a73..b0a40373 100644
--- a/workflows/diego_DAS.nf
+++ b/workflows/diego_DAS.nf
@@ -24,7 +24,7 @@ process featurecount_diego{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
diff --git a/workflows/edger_DAS.nf b/workflows/edger_DAS.nf
index 9fa62d2b..cedb3a93 100644
--- a/workflows/edger_DAS.nf
+++ b/workflows/edger_DAS.nf
@@ -22,7 +22,7 @@ process featurecount_edger{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -94,7 +94,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DASREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DASREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/edger_DE.nf b/workflows/edger_DE.nf
index 43e0160e..5a55648a 100644
--- a/workflows/edger_DE.nf
+++ b/workflows/edger_DE.nf
@@ -22,7 +22,7 @@ process featurecount_edger{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -94,7 +94,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DEREPS --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DEREPS -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/edger_DEU.nf b/workflows/edger_DEU.nf
index 7891d95d..a0a27db1 100644
--- a/workflows/edger_DEU.nf
+++ b/workflows/edger_DEU.nf
@@ -22,7 +22,7 @@ process featurecount_edger{
conda "$COUNTENV"+".yaml"
container "oras://jfallmann/monsda:"+"$COUNTENV"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -94,7 +94,8 @@ process prepare_count_table{
script:
"""
- ${BINS}/Analysis/build_count_table.py $DEUREPS --ids --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
+ reps_csv=\$(for f in $reps; do basename "\$f"; done | paste -sd, -)
+ ${BINS}/Analysis/build_count_table.py $DEUREPS --ids -r \$reps_csv --table COUNTS.gz --anno ANNOTATION.gz --nextflow 2> log
"""
}
diff --git a/workflows/fastqc.nf b/workflows/fastqc.nf
index 9ef084e1..77bd87d0 100644
--- a/workflows/fastqc.nf
+++ b/workflows/fastqc.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_QC') ?: ''
//QC RAW
diff --git a/workflows/fastqc.smk b/workflows/fastqc.smk
index b25b1787..7694bbaa 100644
--- a/workflows/fastqc.smk
+++ b/workflows/fastqc.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config(config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'PREQC')
if paired == 'paired':
log.info('Running paired mode QC')
diff --git a/workflows/fastqc_dedup.nf b/workflows/fastqc_dedup.nf
index 62cf54a3..85665236 100644
--- a/workflows/fastqc_dedup.nf
+++ b/workflows/fastqc_dedup.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_QC') ?: ''
// RAW QC
diff --git a/workflows/fastqc_dedup.smk b/workflows/fastqc_dedup.smk
index b9f8676e..923115c0 100644
--- a/workflows/fastqc_dedup.smk
+++ b/workflows/fastqc_dedup.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config(config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'PREQC')
#outdir = 'QC/'+str(QCENV)+'/'
#moutdir = 'QC/Multi/'+str(QCENV)+'/'
diff --git a/workflows/fastqc_dedup_trim.nf b/workflows/fastqc_dedup_trim.nf
index d875ba3e..2e53b7f2 100644
--- a/workflows/fastqc_dedup_trim.nf
+++ b/workflows/fastqc_dedup_trim.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_QC') ?: ''
//QC RAW
diff --git a/workflows/fastqc_dedup_trim.smk b/workflows/fastqc_dedup_trim.smk
index c5b12743..08d5f183 100644
--- a/workflows/fastqc_dedup_trim.smk
+++ b/workflows/fastqc_dedup_trim.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config( config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'PREQC')
#outdir = 'QC/'+str(QCENV)+'/'
#moutdir = 'QC/Multi/'+str(QCENV)+'/'
diff --git a/workflows/fastqc_raw.nf b/workflows/fastqc_raw.nf
index d83e612c..70e1bb81 100644
--- a/workflows/fastqc_raw.nf
+++ b/workflows/fastqc_raw.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_QC') ?: ''
process qc_raw{
diff --git a/workflows/fastqc_raw.smk b/workflows/fastqc_raw.smk
index b9dfe662..1fe7ab37 100644
--- a/workflows/fastqc_raw.smk
+++ b/workflows/fastqc_raw.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config(config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'PREQC')
if paired == 'paired':
log.info('Running paired mode QC')
diff --git a/workflows/fastqc_trim.nf b/workflows/fastqc_trim.nf
index 5aa1f262..badcea20 100644
--- a/workflows/fastqc_trim.nf
+++ b/workflows/fastqc_trim.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_QC') ?: ''
// RAW QC
diff --git a/workflows/fastqc_trim.smk b/workflows/fastqc_trim.smk
index c4088665..5446e889 100644
--- a/workflows/fastqc_trim.smk
+++ b/workflows/fastqc_trim.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config( config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'PREQC')
#outdir = 'QC/'+str(QCENV)+'/'
#moutdir = 'QC/Multi/'+str(QCENV)+'/'
diff --git a/workflows/fgumi.nf b/workflows/fgumi.nf
new file mode 100644
index 00000000..9e9547b9
--- /dev/null
+++ b/workflows/fgumi.nf
@@ -0,0 +1,76 @@
+DEDUPENV=get_always('DEDUPENV')
+DEDUPBIN=get_always('DEDUPBIN')
+
+EXTRACTPARAMS = get_always('fgumi_params_EXTRACT') ?: ''
+DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: ''
+
+process extract_fq{
+ conda "$DEDUPENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$DEDUPENV"
+ cpus THREADS
+ cache 'lenient'
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz"
+ else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log"
+ else null
+ }
+
+ input:
+ path samples
+
+ output:
+ path "*_dedup.fastq.gz", emit: extract
+ path "*_fgumi_extract.bam", emit: ubam
+ path "ex.log", emit: logs
+
+ script:
+ if (PAIRED == 'paired'){
+ r1 = samples[0]
+ r2 = samples[1]
+ sn = samples[0].getSimpleName().replace("_R1","")
+ ubam = sn+"_fgumi_extract.bam"
+ outf = samples[0].getSimpleName()+"_dedup.fastq.gz"
+ outf2 = samples[1].getSimpleName()+"_dedup.fastq.gz"
+ """
+ mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 $r2 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n -1 $outf -2 $outf2 -0 /dev/null -s /dev/null $ubam >> ex.log 2>&1
+ """
+ }
+ else{
+ r1 = samples[0]
+ sn = samples[0].getSimpleName().replace(".fastq.gz","")
+ ubam = sn+"_fgumi_extract.bam"
+ outf = samples[0].getSimpleName()+"_dedup.fastq.gz"
+ """
+ mkdir -p tmp && $DEDUPBIN extract $EXTRACTPARAMS --inputs $r1 --sample $sn --library $sn --output $ubam > ex.log 2>&1 && samtools fastq -n $ubam | gzip -c > $outf && echo done >> ex.log
+ """
+ }
+}
+
+workflow DEDUPEXTRACT{
+ take:
+ collection
+
+ main:
+ //SAMPLE CHANNELS
+ if ( PREDEDUP == 'enabled' ){
+ if (PAIRED == 'paired'){
+ extract_fq(samples_ch.collate( 2 ))
+ } else{
+ extract_fq(samples_ch.collate( 1 ))
+ }
+ }else{
+ if (PAIRED == 'paired'){
+ extract_fq(collection.collate( 2 ))
+ } else{
+ extract_fq(collection.collate( 1 ))
+ }
+ }
+
+ emit:
+ extract = extract_fq.out.extract
+ ubam = extract_fq.out.ubam
+ logs = extract_fq.out.logs
+}
+
diff --git a/workflows/fgumi.smk b/workflows/fgumi.smk
new file mode 100644
index 00000000..edeacf1e
--- /dev/null
+++ b/workflows/fgumi.smk
@@ -0,0 +1,88 @@
+DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP')
+
+wildcard_constraints:
+ type = "sorted|sorted_unique"
+
+if paired == 'paired':
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz",
+ o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz",
+ ubam = "TMP/FGEX/{combo}/{file}_extracted.bam",
+ td = temp(directory("TMP/FGEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = DEDUPBIN,
+ sname = lambda wildcards: os.path.basename(wildcards.file)
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} {input.r2} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n -1 {output.o1} -2 {output.o2} -0 /dev/null -s /dev/null {output.ubam} >> {log} 2>&1"
+else:
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz",
+ ubam = "TMP/FGEX/{combo}/{file}_extracted.bam",
+ td = temp(directory("TMP/FGEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: epara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = DEDUPBIN,
+ sname = lambda wildcards: os.path.basename(wildcards.file)
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.epara} --inputs {input.r1} --sample {params.sname} --library {params.sname} --output {output.ubam} > {log} 2>&1 && samtools fastq -n {output.ubam} | gzip -c > {output.o1} && echo done >> {log}"
+
+if paired == 'paired':
+ rule dedupbam:
+ input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam",
+ ubam = "TMP/FGEX/{combo}/{file}_extracted.bam"
+ output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"),
+ bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"),
+ td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}"))
+ log: "LOGS/{combo}/{file}_{type}/dedupbam.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 4
+ priority: 0 # This should be done after all mapping is done
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""),
+ dedup = DEDUPBIN,
+ ref = REFERENCE,
+ ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict"
+ shell: """mkdir -p {output.td}
+[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1
+samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1
+samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1
+samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1
+{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1
+{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1
+{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1
+{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1
+"""
+else:
+ rule dedupbam:
+ input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam",
+ ubam = "TMP/FGEX/{combo}/{file}_extracted.bam"
+ output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"),
+ bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"),
+ td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}"))
+ log: "LOGS/{combo}/{file}_{type}/dedupbam.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 4
+ priority: 0 # This should be done after all mapping is done
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""),
+ dedup = DEDUPBIN,
+ ref = REFERENCE,
+ ref_dict = (REFERENCE[:-3] if REFERENCE.endswith('.gz') else REFERENCE) + ".dict"
+ shell: """mkdir -p {output.td}
+[[ -f "{params.ref_dict}" ]] || samtools dict {params.ref} -o {params.ref_dict} >> {log} 2>&1
+samtools sort -n -@ {threads} -o {output.td}/ubam_qn.bam {input.ubam} >> {log} 2>&1
+samtools sort -n -@ {threads} -o {output.td}/mapped_qn.bam {input.bam} >> {log} 2>&1
+samtools view -h {output.td}/mapped_qn.bam | awk 'BEGIN{{FS=OFS="\t"}} /^@/{{print; next}} {{f=$2+0; if (!and(f,256) && !and(f,2048)) {{k=$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next}} print}}' | samtools view -b -o {output.td}/mapped_qn_primaryuniq.bam - >> {log} 2>&1
+{params.dedup} zipper --unmapped {output.td}/ubam_qn.bam --input {output.td}/mapped_qn_primaryuniq.bam --reference {params.ref} --output {output.td}/zippered.bam --threads {threads} --compression-level 1 >> {log} 2>&1
+{params.dedup} sort --order template-coordinate --input {output.td}/zippered.bam --output {output.td}/sorted.bam --threads {threads} --compression-level 1 >> {log} 2>&1
+{params.dedup} dedup {params.dpara} --input {output.td}/sorted.bam --output {output.td}/dedup.bam >> {log} 2>&1
+{params.dedup} sort --order coordinate --input {output.td}/dedup.bam --output {output.bam} --write-index --threads {threads} --compression-level 1 >> {log} 2>&1
+"""
diff --git a/workflows/fgumi_dedup.nf b/workflows/fgumi_dedup.nf
new file mode 100644
index 00000000..608b288b
--- /dev/null
+++ b/workflows/fgumi_dedup.nf
@@ -0,0 +1,85 @@
+DEDUPENV=get_always('DEDUPENV')
+DEDUPBIN=get_always('DEDUPBIN')
+
+DEDUPPARAMS = get_always('fgumi_params_DEDUP') ?: ''
+
+process dedup_bam{
+ conda "$DEDUPENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$DEDUPENV"
+ cpus 4
+ cache 'lenient'
+ //validExitStatus 0,1
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.endsWith("_dedup.bam")) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}"
+ else if (filename.indexOf("_dedup.bam.bai") > 0) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}"
+ else if (filename.indexOf("dedup.log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/${file(filename).getName()}"
+ else null
+ }
+
+ input:
+ tuple val(sample_id), path(mapped_bam), path(ubam)
+ path ref
+
+ output:
+ path "*_dedup.bam", emit: bam
+ path "*_dedup.bam.bai", emit: bai
+ path "*_dedup.log", emit: logs
+
+ script:
+ bams = mapped_bam
+ outf = bams.getSimpleName()+"_dedup.bam"
+ outl = bams.getSimpleName()+"_dedup.log"
+ """
+ mkdir -p TMP
+ ref_dict=\$(basename $ref .gz).dict
+ if [[ ! -f "\${ref_dict}" ]]; then samtools dict $ref -o \${ref_dict} >> $outl 2>&1; fi
+ samtools sort -n -@ ${task.cpus} -o TMP/ubam_qn.bam $ubam >> $outl 2>&1
+ samtools sort -n -@ ${task.cpus} -o TMP/mapped_qn.bam $bams >> $outl 2>&1
+ samtools view -h TMP/mapped_qn.bam | awk 'BEGIN{FS=OFS="\t"} /^@/{print; next} {f=\$2+0; if (!and(f,256) && !and(f,2048)) {k=\$1":"(and(f,64)?1:0)":"(and(f,128)?1:0); if (seen[k]++) next} print}' | samtools view -b -o TMP/mapped_qn_primaryuniq.bam - >> $outl 2>&1
+ $DEDUPBIN zipper --unmapped TMP/ubam_qn.bam --input TMP/mapped_qn_primaryuniq.bam --reference $ref --output TMP/zippered.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1
+ $DEDUPBIN sort --order template-coordinate --input TMP/zippered.bam --output TMP/sorted.bam --threads ${task.cpus} --compression-level 1 >> $outl 2>&1
+ $DEDUPBIN dedup $DEDUPPARAMS --input TMP/sorted.bam --output TMP/dedup.bam >> $outl 2>&1
+ $DEDUPBIN sort --order coordinate --input TMP/dedup.bam --output $outf --write-index --threads ${task.cpus} --compression-level 1 >> $outl 2>&1
+ """
+}
+
+workflow DEDUPBAM{
+ take:
+ map
+ mapi
+ mapu
+ mapui
+ ubam
+
+ main:
+ mapped_ch = map.concat(mapu).map { b ->
+ def n = file(b).getName()
+ def key = n
+ .replaceFirst(/_mapped_sorted_unique\.bam$/, '')
+ .replaceFirst(/_mapped_sorted\.bam$/, '')
+ .replaceFirst(/_R1_dedup_trimmed$/, '')
+ .replaceFirst(/_dedup_trimmed$/, '')
+ .replaceFirst(/_R1_trimmed$/, '')
+ .replaceFirst(/_trimmed$/, '')
+ tuple(key, b)
+ }
+ ubam_ch = ubam.map { u ->
+ def n = file(u).getName()
+ def key = n
+ .replaceFirst(/_fgumi_extract\.bam$/, '')
+ .replaceFirst(/_extracted\.bam$/, '')
+ .replaceFirst(/_R1$/, '')
+ tuple(key, u)
+ }
+ paired_ch = mapped_ch.combine(ubam_ch, by: 0).map { key, mb, ub -> tuple(key, mb, ub) }
+
+ ref_ch = channel.value(file(REFERENCE))
+ dedup_bam(paired_ch, ref_ch)
+
+ emit:
+ dedup = dedup_bam.out.bam
+ dedupbai = dedup_bam.out.bai
+ deduplog = dedup_bam.out.logs
+}
diff --git a/workflows/fgumi_dedup.smk b/workflows/fgumi_dedup.smk
new file mode 100644
index 00000000..49d54982
--- /dev/null
+++ b/workflows/fgumi_dedup.smk
@@ -0,0 +1,15 @@
+DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP')
+
+rule dedupbam:
+ input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam"
+ output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"),
+ bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"),
+ td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}"))
+ log: "LOGS/{combo}/{file}_{type}/dedupbam.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ priority: 0 # This should be done after all mapping is done
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""),
+ dedup = DEDUPBIN
+ shell: "mkdir -p {output.td} && {params.dedup} dedup {params.dpara} --input {input.bam} --output {output.bam} 2> {log} && samtools index {output.bam} 2>> {log}"
diff --git a/workflows/footer.smk b/workflows/footer.smk
index 04b84b4b..17081eeb 100644
--- a/workflows/footer.smk
+++ b/workflows/footer.smk
@@ -1,3 +1,36 @@
+## Queue-agnostic fallback resources for all rules.
+## These defaults apply when a rule has no explicit resource set.
+## They keep local/non-Slurm runs from lacking baseline resource values.
+_fallback_rule_resources = {
+ "mem_mb": 20000,
+ "runtime": 480,
+ "nodes": 1,
+}
+
+_user_rule_defaults = config.get("DEFAULT_RULE_RESOURCES", {})
+if isinstance(_user_rule_defaults, dict):
+ _fallback_rule_resources.update(_user_rule_defaults)
+
+
+def _has_resource(rule_obj, resource_name):
+ try:
+ return resource_name in rule_obj.resources.keys()
+ except Exception:
+ return hasattr(rule_obj.resources, resource_name)
+
+
+def _set_resource(rule_obj, resource_name, value):
+ try:
+ rule_obj.resources[resource_name] = value
+ except Exception:
+ setattr(rule_obj.resources, resource_name, value)
+
+
+for _rule in workflow.rules:
+ for _res_name, _res_value in _fallback_rule_resources.items():
+ if not _has_resource(_rule, _res_name):
+ _set_resource(_rule, _res_name, _res_value)
+
onsuccess:
print("Workflow finished, no error")
onerror:
diff --git a/workflows/header.nf b/workflows/header.nf
index 576c43d2..7aaa6158 100644
--- a/workflows/header.nf
+++ b/workflows/header.nf
@@ -7,8 +7,8 @@
// ALWAYS COMMENT LINES WITH '//', DO NOT USE MULTI LINE COMMENTS AS THE PARSER WILL NOT IGNORE MIDDLE LINES AND THIS WILL CAUSE CHAOS
//Version Check
-nextflowVersion = '>=20.01.0.5264'
nextflow.enable.dsl=2
+//nextflowVersion = '>=20.01.0.5264'
//define unset Params
def get_always(parameter){
diff --git a/workflows/hisat2.nf b/workflows/hisat2.nf
index 55bcffc4..cf3d3f89 100644
--- a/workflows/hisat2.nf
+++ b/workflows/hisat2.nf
@@ -105,7 +105,7 @@ process hisat2_mapping{
r1 = reads[1]
r2 = reads[2]
- fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"")
+ fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"")
pf = fn+"_mapped.sam"
ufo = fn+"_R1_unmapped.fastq.gz"
uft = fn+"_R2_unmapped.fastq.gz"
@@ -122,7 +122,7 @@ process hisat2_mapping{
stranded = ''
}
read = reads[1]
- fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"")
+ fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"")
pf = fn+"_mapped.sam"
uf = fn+"_unmapped.fastq.gz"
lf = "hisat2_"+fn+".log"
diff --git a/workflows/mapping.nf b/workflows/mapping.nf
index 62b06ffa..b455eba4 100644
--- a/workflows/mapping.nf
+++ b/workflows/mapping.nf
@@ -4,7 +4,7 @@ process sortsam{
conda "samtools.yaml"
container "oras://jfallmann/monsda:"+"samtools"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -34,7 +34,7 @@ process sam2bam{
conda "samtools.yaml"
container "oras://jfallmann/monsda:"+"samtools"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -66,7 +66,7 @@ process uniqsam{
conda "samtools.yaml"
container "oras://jfallmann/monsda:"+"samtools"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
@@ -101,7 +101,7 @@ process sam2bamuniq{
conda "samtools.yaml"
container "oras://jfallmann/monsda:"+"samtools"
cpus THREADS
- memory 16.GB
+ memory { 16.GB * (1 << ((task.attempt ?: 1) - 1)) }
cache 'lenient'
//validExitStatus 0,1
diff --git a/workflows/multiqc.nf b/workflows/multiqc.nf
index e82010ce..f51c1c0d 100644
--- a/workflows/multiqc.nf
+++ b/workflows/multiqc.nf
@@ -1,5 +1,5 @@
-QCENV=get_always('QCENV')
-QCBIN=get_always('QCBIN')
+QCENV=get_always('PREQCENV')
+QCBIN=get_always('PREQCBIN')
QCPARAMS = get_always('fastqc_params_MULTI') ?: ''
process mqc{
diff --git a/workflows/multiqc_rustqc.nf b/workflows/multiqc_rustqc.nf
new file mode 100644
index 00000000..71cfe752
--- /dev/null
+++ b/workflows/multiqc_rustqc.nf
@@ -0,0 +1,65 @@
+QCENV=get_always('POSTQCENV')
+QCBIN=get_always('POSTQCBIN')
+QCPARAMS = get_always('rustqc_params_MULTI') ?: ''
+
+process mqc{
+ conda "$QCENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$QCENV"
+ cpus THREADS
+ cache 'lenient'
+ //validExitStatus 0,1
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.indexOf("zip") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.zip"
+ else if (filename.indexOf("html") > 0) "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.html"
+ else "QC/Multi/${COMBO}/${CONDITION}/${file(filename).getName()}"
+ }
+
+ input:
+ path others, stageAs: 'mqc_input??/*'
+ //path samples
+
+ output:
+ path "*.zip", emit: mqc
+ path "*.html", emit: html
+
+ script:
+ """
+ touch $others
+ OUT=\${PWD}
+ LIST=multiqc_inputs.txt
+ TMP_LIST=multiqc_inputs_unique.txt
+ BASE_QC_DIR="${workflow.workDir}/../QC"
+ COMBO_VAL="${COMBO}"
+ CONDITION_VAL="${CONDITION}"
+
+ for i in $others; do
+ dirname "\$i" >> "\$LIST"
+ done
+
+ # If the corresponding fastqc combo exists, include its output in the MultiQC report.
+ FQ_COMBO="\${COMBO_VAL/rustqc/fastqc}"
+ FQ_DIR="\${BASE_QC_DIR}/\${FQ_COMBO}/\${CONDITION_VAL}"
+ if [[ -d "\$FQ_DIR" ]]; then
+ echo "\$FQ_DIR" >> "\$LIST"
+ fi
+
+ sort -u "\$LIST" > "\$TMP_LIST"
+ export LC_ALL=en_US.utf8
+ export LC_ALL=C.UTF-8
+ multiqc -f --exclude picard --exclude gatk -k json -z -s -o "\$OUT" -l "\$TMP_LIST"
+ """
+}
+
+workflow MULTIQC{
+ take:
+ otherqcs
+
+ main:
+
+ mqc(otherqcs.collect())
+
+ emit:
+ mqcres = mqc.out.mqc
+}
diff --git a/workflows/multiqc_rustqc.smk b/workflows/multiqc_rustqc.smk
index 1ecd2b1f..17022c50 100644
--- a/workflows/multiqc_rustqc.smk
+++ b/workflows/multiqc_rustqc.smk
@@ -1,109 +1,31 @@
-if rundedup:
- if paired == 'paired':
- if prededup:
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"])
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
- else:
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"])
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
-
- else:
- if prededup:
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"])
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
- else:
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_dedupmapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquededup.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.dedupbam.output.bam, file=samplecond(SAMPLES, config), combo=combo, type=["sorted", "sorted_unique"])
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
+if paired == 'paired':
+ rule multiqc:
+ input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo)
+ output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
+ tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
+ lst = "QC/Multi/{combo}/{condition}/qclist.txt"
+ log: "LOGS/{combo}/{condition}_multiqc.log"
+ conda: "rustqc.yaml"
+ container: "oras://jfallmann/monsda:rustqc"
+ threads: 1
+ params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
+ shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
else:
- if paired == 'paired':
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo)
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
-
- else:
- rule multiqc:
- input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
- expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo)
- output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
- tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
- lst = "QC/Multi/{combo}/{condition}/qclist.txt"
- log: "LOGS/{combo}/{condition}_multiqc.log"
- conda: "rustqc.yaml"
- container: "oras://jfallmann/monsda:rustqc"
- threads: 1
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
- shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
+ rule multiqc:
+ input: expand(rules.rustqc_mapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.rustqc_uniquemapped.output.js, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.sam2bam.output.bam, file=samplecond(SAMPLES, config), combo=combo),
+ expand(rules.sam2bamuniq.output.uniqbam, file=samplecond(SAMPLES, config), combo=combo)
+ output: html = report("QC/Multi/{combo}/{condition}/multiqc_report.html", category="QC"),
+ tmp = temp("QC/Multi/{combo}/{condition}/tmp"),
+ lst = "QC/Multi/{combo}/{condition}/qclist.txt"
+ log: "LOGS/{combo}/{condition}_multiqc.log"
+ conda: "rustqc.yaml"
+ container: "oras://jfallmann/monsda:rustqc"
+ threads: 1
+ params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('MULTI', "")
+ shell: "OUT=$(dirname {output.html}); for i in {input}; do echo $(dirname \"${{i}}\") >> {output.tmp}; done; FQ_COMBO=$(echo {wildcards.combo} | sed 's/rustqc/fastqc/g'); FQ_DIR=QC/${{FQ_COMBO}}/{wildcards.condition}; if [ -d \"${{FQ_DIR}}\" ]; then echo ${{FQ_DIR}} >> {output.tmp}; fi; cat {output.tmp} | sort -u > {output.lst}; export LC_ALL=C.UTF-8; multiqc -f {params.qpara} --exclude picard --exclude gatk -k json -z -s -o ${{OUT}} -l {output.lst} 2> {log}"
diff --git a/workflows/rustqc.nf b/workflows/rustqc.nf
index 1505b0a3..9ce3a806 100644
--- a/workflows/rustqc.nf
+++ b/workflows/rustqc.nf
@@ -1,6 +1,6 @@
-QCENV = get_always('QCENV')
-QCBIN = get_always('QCBIN')
-QCPARAMS = get_always('rustqc_params_QC') ?: ''
+RUSTQCENV = get_always('POSTQCENV')
+RUSTQCBIN = get_always('POSTQCBIN')
+RUSTQCPARAMS = get_always('rustqc_params_QC') ?: ''
MAPANNO = get_always('MAPPINGANNO')
@@ -17,8 +17,8 @@ RUSTQC_PAIRED = (PAIRED == 'paired') ? '-p' : ''
//RUSTQC on mapped BAMs
process rustqc_mapped{
- conda "$QCENV"+".yaml"
- container "oras://jfallmann/monsda:"+"$QCENV"
+ conda "$RUSTQCENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$RUSTQCENV"
cpus THREADS
cache 'lenient'
label 'big_mem'
@@ -40,11 +40,11 @@ process rustqc_mapped{
fn = file(bam).getSimpleName()
anno = file("${workflow.workDir}/../${MAPANNO}")
"""
- rustqc rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $QCPARAMS
+ $RUSTQCBIN rna $bam --gtf $anno -t ${task.cpus} $RUSTQC_PAIRED -s $RUSTQC_STRANDED --skip-dup-check -j results/rustqc_summary.json -o results/$fn $RUSTQCPARAMS
"""
}
-workflow QC_MAPPING{
+workflow RUSTQC_MAPPING{
take: collection
main:
diff --git a/workflows/rustqc.smk b/workflows/rustqc.smk
index 67fa3e93..579715c0 100644
--- a/workflows/rustqc.smk
+++ b/workflows/rustqc.smk
@@ -1,4 +1,4 @@
-QCBIN, QCENV = env_bin_from_config(config, 'QC')
+QCBIN, QCENV = env_bin_from_config(config, 'POSTQC')
# Map MONSDA strandedness to RustQC strandedness
def rustqc_stranded(stranded):
@@ -40,33 +40,3 @@ rule rustqc_uniquemapped:
paired = RUSTQC_PAIRED,
stranded = RUSTQC_STRANDED
shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} --skip-dup-check -j {output.js} -o {output.o1} {params.qpara} 2> {log}"
-
-rule rustqc_dedupmapped:
- input: r1 = "MAPPED/{combo}/{file}_mapped_sorted_dedup.bam",
- r2 = "MAPPED/{combo}/{file}_mapped_sorted_dedup.bam.bai"
- output: o1 = directory("QC/{combo}/{file}_mapped_sorted_dedup"),
- js = "QC/{combo}/{file}_mapped_sorted_dedup/rustqc_summary.json"
- log: "LOGS/{combo}/{file}_rustqc_dedupmapped.log"
- conda: ""+QCENV+".yaml"
- container: "oras://jfallmann/monsda:"+QCENV+""
- threads: MAXTHREAD
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('QC', ""),
- anno = ANNOTATION,
- paired = RUSTQC_PAIRED,
- stranded = RUSTQC_STRANDED
- shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} -j {output.js} -o {output.o1} {params.qpara} 2> {log}"
-
-rule rustqc_uniquededup:
- input: r1 = "MAPPED/{combo}/{file}_mapped_sorted_unique_dedup.bam",
- r2 = "MAPPED/{combo}/{file}_mapped_sorted_unique_dedup.bam.bai"
- output: o1 = directory("QC/{combo}/{file}_mapped_sorted_unique_dedup"),
- js = "QC/{combo}/{file}_mapped_sorted_unique_dedup/rustqc_summary.json"
- log: "LOGS/{combo}/{file}_rustqc_uniquededup.log"
- conda: ""+QCENV+".yaml"
- container: "oras://jfallmann/monsda:"+QCENV+""
- threads: MAXTHREAD
- params: qpara = lambda wildcards: tool_params(SAMPLES[0], None, config, 'QC', QCENV)['OPTIONS'].get('QC', ""),
- anno = ANNOTATION,
- paired = RUSTQC_PAIRED,
- stranded = RUSTQC_STRANDED
- shell: "rustqc rna {input.r1} --gtf {params.anno} -t {threads} {params.paired} -s {params.stranded} -j {output.js} -o {output.o1} {params.qpara} 2> {log}"
diff --git a/workflows/star.nf b/workflows/star.nf
index 83681e95..62c3563a 100644
--- a/workflows/star.nf
+++ b/workflows/star.nf
@@ -96,7 +96,7 @@ process star_mapping{
r1 = reads[1]
r2 = reads[2]
a = "Trimming_report.txt"
- fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"")
+ fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"")
of = fn+'.Aligned.out.sam'
gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz")
"""
@@ -106,7 +106,7 @@ process star_mapping{
else{
if (PAIRED != 'singlecell'){
read = reads[1]
- fn = file(reads[1]).getSimpleName().replaceAll(/\Q_trimmed\E/,"")+"."
+ fn = file(reads[1]).getSimpleName().replaceAll(/(_dedup)?_trimmed$/,"")+"."
of = fn+'Aligned.out.sam'
gf = of.replaceAll(/\Q.Aligned.out.sam\E/,"_mapped.sam.gz")
"""
@@ -122,7 +122,7 @@ process star_mapping{
stranded = '--soloStrand Unstranded'
}
r1 = reads[1]
- fn = file(r1).getSimpleName().replaceAll(/\Q_R1_trimmed\E/,"")
+ fn = file(r1).getSimpleName().replaceAll(/_R1(_dedup)?_trimmed$/,"")
r2 = "${workflow.workDir}/../FASTQ/${CONDITION}/"+file(reads[2]).getSimpleName().replaceAll(/\QR2_trimmed\E/,"R2.fastq.gz")
if (MAPPARAMS.contains('--soloBarcodeMate 1')){
t = r2
diff --git a/workflows/umicollapse.nf b/workflows/umicollapse.nf
new file mode 100644
index 00000000..2968c5b2
--- /dev/null
+++ b/workflows/umicollapse.nf
@@ -0,0 +1,135 @@
+DEDUPENV=get_always('DEDUPENV')
+DEDUPBIN=get_always('DEDUPBIN')
+
+WHITELISTPARAMS = get_always('umicollapse_params_WHITELIST') ?: ''
+EXTRACTPARAMS = get_always('umicollapse_params_EXTRACT') ?: ''
+
+// UMI extraction uses umi_tools extract (bundled in umicollapse env);
+// BAM deduplication is handled by umicollapse_dedup.nf.
+
+process whitelist{
+ conda "$DEDUPENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$DEDUPENV"
+ cpus THREADS
+ cache 'lenient'
+ //validExitStatus 0,1
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.indexOf("_whitelist") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}_whitelist"
+ else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_whitelist.log"
+ else null
+ }
+
+ input:
+ path samples
+
+ output:
+ path "*_whitelist", emit: wl
+
+ script:
+ if (WHITELISTPARAMS == ''){
+ outf = samples[0].getSimpleName().replace("_R1","")+"_dummy_whitelist"
+ """
+ touch $outf
+ """
+ } else {
+ if (PAIRED == 'paired'){
+ r1 = samples[0]
+ r2 = samples[1]
+ outf = samples[0].getSimpleName().replace("_R1","")+"_whitelist"
+ """
+ mkdir tmp && umi_tools whitelist $WHITELISTPARAMS --temp-dir tmp --log=wl.log --stdin=$r1 --read2-in=$r2 --stdout=$outf
+ """
+ }
+ else{
+ outf = samples.getSimpleName()+"_whitelist"
+ """
+ mkdir tmp && umi_tools whitelist $WHITELISTPARAMS --temp-dir tmp --log=wl.log --stdin=$samples --stdout=$outf
+ """
+ }
+ }
+}
+
+process extract_fq{
+ conda "$DEDUPENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$DEDUPENV"
+ cpus THREADS
+ cache 'lenient'
+ //validExitStatus 0,1
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.indexOf("_dedup.fastq.gz") > 0) "DEDUP_FASTQ/${COMBO}/${CONDITION}/${file(filename).getSimpleName()}.fastq.gz"
+ else if (filename.indexOf("log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/dedup_extract.log"
+ else null
+ }
+
+ input:
+ path wl
+ path samples
+
+ output:
+ path "*_dedup.fastq.gz", emit: extract
+ path "ex.log", emit: logs
+
+ script:
+ if (PAIRED == 'paired'){
+ r1 = samples[0]
+ r2 = samples[1]
+ outf = samples[0].getSimpleName()+"_dedup.fastq.gz"
+ outf2 = samples[1].getSimpleName()+"_dedup.fastq.gz"
+ if (!!(wl =~ /dummy_whitelist/)){
+ """
+ mkdir tmp && umi_tools extract $EXTRACTPARAMS --temp-dir tmp --log=ex.log --stdin=$r1 --read2-in=$r2 --stdout=$outf --read2-out=$outf2
+ """
+ }
+ else{
+ """
+ mkdir tmp && umi_tools extract $EXTRACTPARAMS --whitelist=$wl --temp-dir tmp --log=ex.log --stdin=$r1 --read2-in=$r2 --stdout=$outf --read2-out=$outf2
+ """
+ }
+ }
+ else{
+ outf = samples.getSimpleName()+"_dedup.fastq.gz"
+ if (!!(wl =~ /dummy_whitelist/)){
+ """
+ mkdir tmp && umi_tools extract $EXTRACTPARAMS --temp-dir tmp --log=ex.log --stdin=$samples --stdout=$outf
+ """
+ }
+ else{
+ """
+ mkdir tmp && umi_tools extract $EXTRACTPARAMS --whitelist=$wl --temp-dir tmp --log=ex.log --stdin=$samples --stdout=$outf
+ """
+ }
+ }
+}
+
+workflow DEDUPEXTRACT{
+ take:
+ collection
+
+ main:
+ //SAMPLE CHANNELS
+ if ( PREDEDUP == 'enabled' ){
+ if (PAIRED == 'paired'){
+ whitelist(samples_ch.collate( 2 ))
+ extract_fq(whitelist.out.wl, samples_ch.collate( 2 ))
+ } else{
+ whitelist(samples_ch.collate( 1 ))
+ extract_fq(whitelist.out.wl, samples_ch.collate( 1 ))
+ }
+ }else{
+ if (PAIRED == 'paired'){
+ whitelist(collection.collate(2))
+ extract_fq(whitelist.out.wl, collection.collate( 2 ))
+ } else{
+ whitelist(collection.collate( 1 ))
+ extract_fq(whitelist.out.wl, collection.collate( 1 ))
+ }
+ }
+
+ emit:
+ extract = extract_fq.out.extract
+ logs = extract_fq.out.logs
+}
diff --git a/workflows/umicollapse.smk b/workflows/umicollapse.smk
new file mode 100644
index 00000000..e7a83c9e
--- /dev/null
+++ b/workflows/umicollapse.smk
@@ -0,0 +1,146 @@
+DEDUPBIN, DEDUPENV = env_bin_from_config(config, 'DEDUP')
+
+wlparams = tool_params(SAMPLES[0], None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST')
+
+wildcard_constraints:
+ type = "sorted|sorted_unique"
+
+# UMI extraction uses umi_tools extract (bundled in umicollapse env);
+# BAM deduplication uses umicollapse bam for significantly faster dedup.
+
+if paired == 'paired':
+ if wlparams:
+ rule whitelist:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: wl = "DEDUP_FASTQ/{combo}/{file}_whitelist",
+ td = temp(directory("TMP/UMIWL/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_whitelist.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} whitelist {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --read2-in={input.r2} --stdout={output.wl}"
+
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ wl = rules.whitelist.output.wl
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz",
+ o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz",
+ td = temp(directory("TMP/UMIEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --error-correct-cell --whitelist={input.wl} --stdin={input.r1} --read2-in={input.r2} --stdout={output.o1} --read2-out={output.o2}"
+ else:
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}_R1.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ r2 = lambda wildcards: "FASTQ/{rawfile}_R2.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_R1_dedup.fastq.gz",
+ o2 = "DEDUP_FASTQ/{combo}/{file}_R2_dedup.fastq.gz",
+ td = temp(directory("TMP/UMIEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --read2-in={input.r2} --stdout={output.o1} --read2-out={output.o2}"
+
+else:
+ if wlparams:
+ rule whitelist:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: wl = "DEDUP_FASTQ/{combo}/{file}_whitelist",
+ td = temp(directory("TMP/UMIWL/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_whitelist.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('WHITELIST', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} whitelist {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --stdout={output.wl}"
+
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0]),
+ wl = rules.whitelist.output.wl
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz",
+ td = temp(directory("TMP/UMIEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --error-correct-cell --whitelist={input.wl} --stdin={input.r1} --stdout={output.o1}"
+
+ else:
+ rule extract:
+ input: r1 = lambda wildcards: "FASTQ/{rawfile}.fastq.gz".format(rawfile=[x for x in SAMPLES if x.split(os.sep)[-1] in wildcards.file][0])
+ output: o1 = "DEDUP_FASTQ/{combo}/{file}_dedup.fastq.gz",
+ td = temp(directory("TMP/UMIEX/{combo}/{file}"))
+ log: "LOGS/{combo}/{file}_dedup_extract.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('EXTRACT', ""),
+ dedup = "umi_tools"
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} extract {params.dpara} --temp-dir {output.td} --log={log} --stdin={input.r1} --stdout={output.o1}"
+
+if paired == 'paired':
+ rule dedupbam:
+ input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam"
+ output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"),
+ bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"),
+ td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}"))
+ log: "LOGS/{combo}/{file}_{type}/dedupbam.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ priority: 0
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""),
+ dedup = DEDUPBIN
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} bam -i {input.bam} -o {output.td}/dedup.unsorted.bam --paired {params.dpara} > {log} 2>&1 && samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.unsorted.bam >> {log} 2>&1 && samtools index {output.bam} >> {log} 2>&1"
+else:
+ rule dedupbam:
+ input: bam = "MAPPED/{combo}/{file}_mapped_{type}.bam"
+ output: bam = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam", category="DEDUP"),
+ bai = report("MAPPED/{combo}/{file}_mapped_{type}_dedup.bam.bai", category="DEDUP"),
+ td = temp(directory("TMP/UMIDD/{combo}/{file}_{type}"))
+ log: "LOGS/{combo}/{file}_{type}/dedupbam.log"
+ conda: ""+DEDUPENV+".yaml"
+ container: "oras://jfallmann/monsda:"+DEDUPENV+""
+ threads: 1
+ priority: 0
+ params: dpara = lambda wildcards: tool_params(wildcards.file, None, config, "DEDUP", DEDUPENV)['OPTIONS'].get('DEDUP', ""),
+ dedup = DEDUPBIN
+ resources:
+ mem_mb = lambda wildcards, attempt: 20000 * (2 ** (attempt - 1)),
+ runtime = lambda wildcards, attempt: 480 * (2 ** (attempt - 1))
+ shell: "mkdir -p {output.td} && {params.dedup} bam -i {input.bam} -o {output.td}/dedup.unsorted.bam {params.dpara} > {log} 2>&1 && samtools sort -@ {threads} -o {output.bam} {output.td}/dedup.unsorted.bam >> {log} 2>&1 && samtools index {output.bam} >> {log} 2>&1"
diff --git a/workflows/umicollapse_dedup.nf b/workflows/umicollapse_dedup.nf
new file mode 100644
index 00000000..8a77bc60
--- /dev/null
+++ b/workflows/umicollapse_dedup.nf
@@ -0,0 +1,64 @@
+DEDUPENV=get_always('DEDUPENV')
+DEDUPBIN=get_always('DEDUPBIN')
+
+DEDUPPARAMS = get_always('umicollapse_params_DEDUP') ?: ''
+
+process dedup_bam{
+ conda "$DEDUPENV"+".yaml"
+ container "oras://jfallmann/monsda:"+"$DEDUPENV"
+ cpus 1
+ cache 'lenient'
+ //validExitStatus 0,1
+
+ publishDir "${workflow.workDir}/../" , mode: 'link',
+ saveAs: {filename ->
+ if (filename.endsWith("_dedup.bam")) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}"
+ else if (filename.indexOf("_dedup.bam.bai") > 0) "MAPPED/${COMBO}/${CONDITION}/${file(filename).getName()}"
+ else if (filename.indexOf("dedup.log") > 0) "LOGS/${COMBO}/${CONDITION}/DEDUP/${file(filename).getName()}"
+ else null
+ }
+
+ input:
+ path todedup
+ path bami
+
+ output:
+ path "*_dedup.bam", emit: bam
+ path "*_dedup.bam.bai", emit: bai
+ path "*_dedup.log", emit: logs
+
+ memory { 20.GB * (1 << ((task.attempt ?: 1) - 1)) }
+ time { 8.h * (1 << ((task.attempt ?: 1) - 1)) }
+
+ script:
+ bams = todedup[0]
+ bais = todedup[1]
+ outf = bams.getSimpleName()+"_dedup.bam"
+ outl = bams.getSimpleName()+"_dedup.log"
+ if (PAIRED == 'paired'){
+ """
+ mkdir -p TMP && $DEDUPBIN bam -i $bams -o TMP/dedup.unsorted.bam --paired $DEDUPPARAMS > $outl 2>&1 && samtools sort -@ ${task.cpus} -o $outf TMP/dedup.unsorted.bam >> $outl 2>&1 && samtools index $outf >> $outl 2>&1
+ """
+ }
+ else{
+ """
+ mkdir -p TMP && $DEDUPBIN bam -i $bams -o TMP/dedup.unsorted.bam $DEDUPPARAMS > $outl 2>&1 && samtools sort -@ ${task.cpus} -o $outf TMP/dedup.unsorted.bam >> $outl 2>&1 && samtools index $outf >> $outl 2>&1
+ """
+ }
+}
+
+workflow DEDUPBAM{
+ take:
+ map
+ mapi
+ mapu
+ mapui
+
+ main:
+ dedup_bam(map.concat(mapu), mapi.concat(mapui))
+
+ emit:
+ dedup = dedup_bam.out.bam
+ dedupbai = dedup_bam.out.bai
+ deduplog = dedup_bam.out.logs
+}