diff --git a/workflow/Snakefile b/workflow/Snakefile index 353629c..c0d5dde 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -37,6 +37,8 @@ normals = [n for n in normals if n] # Remove tumor-onlys, i.e. empty tumor2normal = config['pairs'] # Dict to map a tumor to its normal # List of tumor samples with a paired normal tumorWnormal = [t for t in tumors if tumor2normal[t]] +# List of tumor-only samples +tumorOnly = [t for t in tumors if not tumor2normal[t]] # Use GPU-compute for any supported tools use_gpus = str_bool( config['options']['use_gpus'] @@ -93,8 +95,9 @@ call_gatk_germline = str_bool(config['options']['gatk_germline']) # caller is added and add # conditional running of # each supported caller -somatic_callers = ["octopus", "mutect2"] -tn_somatic_callers = ["muse", "strelka"] +somatic_callers = ["octopus", "mutect2", "sage"] # callers run in both TN and TO mode +tn_somatic_callers = ["muse", "deepsomatic"] # callers run exclusively in TN mode, strelka removed +to_somatic_callers = ["clairs"] # callers run exclusively in TO mode # Mutect2 and MuSE reliably # work with Purple CNV Caller purple_callers = ["mutect2"] @@ -348,6 +351,21 @@ rule all: join(workpath, "octopus", "somatic", "{name}.octopus.vcf"), name=provided(tumors, call_somatic), ), + # Call somatic variants with SAGE, + # only runs if provided --call-somatic, + # @imported from rules/somatic.smk + expand( + join(workpath, "sage", "somatic", "{name}.sage.vcf"), + name=provided(tumors, call_somatic), + ), + # Call somatic variants with ClairS-TO, + # only runs if provided --call-somatic, + # and only runs with tumor-only samples, + # @imported from rules/somatic.smk + expand( + join(workpath, "clairs", "somatic", "{name}.clairs.vcf"), + name=provided(tumorOnly, call_somatic), + ), # Manta, call somatic structural variation (SV), # only runs if provided --call-somatic AND --call-sv # @imported from rules/sv.smk @@ -389,38 +407,47 @@ rule all: join(workpath, "mutect2", "somatic", "{name}.mutect2.vcf"), name=provided(tumors, call_somatic), ), - # MuSE, call somatic variants, + # Call somatic variants with Deepsomatic, # only runs if provided --call-somatic, - # only runs with tumor-normal pairs, + # only runs with tumor-normal pairs, # @imported from rules/somatic.smk expand( - join(workpath, "muse", "somatic", "{name}.muse.vcf"), + join(workpath, "deepsomatic", "somatic", "{name}.deepsomatic.vcf"), name=provided(tumorWnormal, call_somatic), ), - # Strelka, call somatic variants, + # MuSE, call somatic variants, # only runs if provided --call-somatic, - # only runs with tumor-normal pairs, + # only runs with tumor-normal pairs, # @imported from rules/somatic.smk expand( - join(workpath, "strelka", "somatic", "{name}.strelka.vcf"), + join(workpath, "muse", "somatic", "{name}.muse.vcf"), name=provided(tumorWnormal, call_somatic), ), # Post-process somatic VCF, filter and # normalize each somatic callers VCF, - # only runs if provided --call-somatic + # only runs if provided --call-somatic, + # runs TN or Tonly somatic callers, + # Octopus, Mutect2, SAGE # @imported from rules/somatic.smk expand( join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), caller=provided(somatic_callers, call_somatic), name=provided(tumors, call_somatic), ), - # Conditionally add tumor-normal callers, - # MuSE and Strelka + # Conditionally runs the exclusive TN callers, + # MuSE and Deepsomatic (strelka removed) expand( join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), caller=provided(tn_somatic_callers, call_somatic), name=provided(tumorWnormal, call_somatic), ), + # Conditionally runs the exclusive TO callers, + # ClairS-TO + expand( + join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), + caller=provided(to_somatic_callers, call_somatic), + name=provided(tumorOnly, call_somatic), + ), # Post-process somatic VCF, # and merge callsets across callers, # only runs if provided --call-somatic @@ -612,4 +639,4 @@ include: join("rules", "gatk_recal.smk") include: join("rules", "somatic.smk") include: join("rules", "wes.smk") include: join("rules", "gatk_germline.smk") -include: join("rules", "hooks.smk") \ No newline at end of file +include: join("rules", "hooks.smk") diff --git a/workflow/rules/somatic.smk b/workflow/rules/somatic.smk index 6bf9271..6772ecf 100644 --- a/workflow/rules/somatic.smk +++ b/workflow/rules/somatic.smk @@ -52,24 +52,31 @@ def get_normal_pileup_table(wildcards): return [] -def get_somatic_tn_callers(wildcards): - """Returns somatic variants found with tumor-normal variant - callers. For tumor-normal samples, extra somatic callers - (i.e. MuSE and Strelka and DeepSomatic) are run. Tumor-only - samples return an empty list (rule already has reference - in input section). See config['pairs'] for tumor, normal pairs. +def get_somatic_conditional_callers(wildcards): + """Returns conditional somatic variant callers. These are + variant callers that will only run in tumor-normal mode or + tumor-only mode. For example, MuSE and DeepSomatic are only + run in tumor-normal mode, and clairs-to is only run with + tumor-only samples. For more info, please see config['pairs'] + for a list of tumor-normal pairs from the config.json file. """ tumor = wildcards.name normal = tumor2normal[tumor] if normal: - # Callers = MuSE, Strelka + # Callers = MuSE, Deepsomatic (no longer running strelka) + # Strelka removed with the addition of Deepsomatic return [ join(workpath, caller, "somatic", "{0}.{1}.filtered.norm.vcf".format(tumor, caller)) \ for caller in tn_somatic_callers ] else: - # No paired normal, return nothing - return [] + # No paired normal, return clairs + # which is only run on tumor-only + # samples + return [ + join(workpath, caller, "somatic", "{0}.{1}.filtered.norm.vcf".format(tumor, caller)) \ + for caller in to_somatic_callers + ] # Data processing rules for calling somatic variants @@ -643,6 +650,7 @@ rule hmftools_sage: tumor = join(workpath, "BAM", "{name}.sorted.bam"), normal = get_normal_sorted_bam output: + tmp = join(workpath, "sage", "somatic", "{name}.tmp.vcf"), vcf = join(workpath, "sage", "somatic", "{name}.sage.vcf"), params: rname = 'hmfsage', @@ -687,7 +695,19 @@ rule hmftools_sage: -panel_bed {params.panel} \\ -high_confidence_bed {params.high_conf} \\ -ensembl_data_dir {params.ensembl_data} \\ - -output_vcf {output.vcf} + -output_vcf {output.tmp} + + # Remove SG INFO/FORMAT tag from temp + # VCF file. This tag causes issues when + # bcftools is run on the merged VCF file + # produced by CombineVariants. If it is + # not removed, any subsequent bcftools + # commands cause downstream issues. + bcftools annotate \\ + -x "FORMAT/SB,INFO/SB" \\ + {output.tmp} \\ + -Ov \\ + -o {output.vcf} """ @@ -772,13 +792,13 @@ rule deepsomatic_make_examples: inefficently. As so, it is better to run the 1st/3rd step on a compute node and run the 2nd step on a GPU node. @Input: - Duplicate marked, sorted BAM file (scatter) + Duplicate marked, recal BAM file (scatter) @Output: Flag file to indicate success of make_examples """ input: - tumor = join(workpath, "BAM", "{name}.sorted.bam"), - normal = get_normal_sorted_bam + tumor = join(workpath, "BAM", "{name}.recal.bam"), + normal = get_normal_recal_bam output: success = join(workpath, "deepsomatic", "mk_examples", "{name}.make_examples.success"), params: @@ -804,8 +824,8 @@ rule deepsomatic_make_examples: ) if run_wes else '', # Get tumor and normal sample names tumor = '{name}', - # Building option for the paired normal sorted bam - normal_bam_option = lambda w: "--reads_normal {0}.sorted.bam".format( + # Building option for the paired normal recal bam + normal_bam_option = lambda w: "--reads_normal {0}.recal.bam".format( join(workpath, "BAM", tumor2normal[w.name]) ) if tumor2normal[w.name] else "", # Building option for the normal sample name @@ -1069,7 +1089,8 @@ rule deepsomatic_postprocess_variants: input: success = join(workpath, "deepsomatic", "call_variants", "{name}.cv.success"), output: - vcf = join(workpath, "deepsomatic", "somatic", "{name}.deepsomatic.vcf"), + vcfgz = join(workpath, "deepsomatic", "somatic", "{name}.deepsomatic.vcf.gz"), + vcf = join(workpath, "deepsomatic", "somatic", "{name}.deepsomatic.vcf"), params: rname = "ds_postprovars", genome = config['references']['GENOME'], @@ -1092,9 +1113,12 @@ rule deepsomatic_postprocess_variants: time postprocess_variants \\ --ref {params.genome} \\ --infile {params.callvar} \\ - --outfile {output.vcf} \\ + --outfile {output.vcfgz} \\ --process_somatic=true \\ --cpus={threads} + # Deepsomatic outputs a gzipped VCF file + zcat {output.vcfgz} \\ + > {output.vcf} """ @@ -1174,7 +1198,9 @@ rule muse: rule strelka: - """Data-processing step to call somatic mutations with Strelka. This tool is + """ + @DEPRECIATED: The pipeline no longer RUNS Strelka! Deepsomatic replaced it. + Data-processing step to call somatic mutations with Strelka. This tool is optimized for rapid clinical analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs. More information about strelka can be found here: https://github.com/Illumina/strelka @@ -1284,7 +1310,9 @@ rule strelka: rule strelka_format: - """Data-processing step to call somatic mutations with Strelka. This tool is + """ + @DEPRECIATED: The pipeline no longer RUNS Strelka. Deepsomatic replaced it. + Data-processing step to call somatic mutations with Strelka. This tool is optimized for rapid clinical analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs. More information about strelka can be found here: https://github.com/Illumina/strelka @@ -1428,12 +1456,13 @@ rule somatic_merge_tumor: @Input: Somatic variants found in the tumor sample (gather-across-somatic-callers) @Output: - Variants found in at least 2 callers + Variants found in all callers (union callset) """ input: - tn_callers = get_somatic_tn_callers, # i.e muse, strelka, deepsomatic - octopus = join(workpath, "octopus", "somatic", "{name}.octopus.filtered.norm.vcf"), - mutect2 = join(workpath, "mutect2", "somatic", "{name}.mutect2.filtered.norm.vcf"), + cn_callers = get_somatic_conditional_callers, # i.e muse-TN, deepsomatic-TN, clairs-TO + octopus = join(workpath, "octopus", "somatic", "{name}.octopus.filtered.norm.vcf"), + mutect2 = join(workpath, "mutect2", "somatic", "{name}.mutect2.filtered.norm.vcf"), + sage = join(workpath, "sage", "somatic", "{name}.sage.filtered.norm.vcf"), output: merged = join(workpath, "merged", "somatic", "{name}.merged.filtered.norm.vcf.gz"), params: @@ -1451,14 +1480,18 @@ rule somatic_merge_tumor: # Dynamically update the priority list # based on wether a sample is a tumor-only # or a tumor-normal - priority_list = lambda w: "mutect2,octopus,muse,strelka" \ - if tumor2normal[w.name] else "mutect2,octopus", - strelka_option = lambda w: "--variant:strelka {0}.strelka.filtered.norm.vcf".format( - join(workpath, "strelka", "somatic", w.name) - ) if tumor2normal[w.name] else "", + priority_list = lambda w: "mutect2,octopus,muse,deepsomatic,sage" \ + if tumor2normal[w.name] else "mutect2,octopus,clairs,sage", muse_option = lambda w: "--variant:muse {0}.muse.filtered.norm.vcf".format( join(workpath, "muse", "somatic", w.name) ) if tumor2normal[w.name] else "", + deepsomatic_option = lambda w: "--variant:deepsomatic {0}.deepsomatic.filtered.norm.vcf".format( + join(workpath, "deepsomatic", "somatic", w.name) + ) if tumor2normal[w.name] else "", + # Only run clairs with tumor-only samples + clairs_option = lambda w: "" if tumor2normal[w.name] else "--variant:clairs {0}.clairs.filtered.norm.vcf".format( + join(workpath, "clairs", "somatic", w.name) + ), threads: int(allocated("threads", "somatic_merge_tumor", cluster)) container: config['images']['genome-seek_somatic'] @@ -1477,7 +1510,9 @@ rule somatic_merge_tumor: --genotypemergeoption PRIORITIZE \\ --rod_priority_list {params.priority_list} \\ -o {output.merged} \\ - --variant:octopus {input.octopus} --variant:mutect2 {input.mutect2} {params.strelka_option} {params.muse_option} + --variant:octopus {input.octopus} \\ + --variant:mutect2 {input.mutect2} \\ + --variant:sage {input.sage} {params.clairs_option} {params.muse_option} {params.deepsomatic_option} """ rule somatic_sample_maf: