Skip to content

Commit

Permalink
Adding PAVE filtering for SAGE tumor-only samples
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jan 8, 2025
1 parent ea57257 commit d8f1216
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 3 deletions.
2 changes: 2 additions & 0 deletions config/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"HMFTOOLS_REF_VERSION": "38",
"HMFTOOLS_AMBER_JAR": "/data/OpenOmics/references/genome-seek/hmftools/amber-3.5.jar",
"HMFTOOLS_COBALT_JAR": "/data/OpenOmics/references/genome-seek/hmftools/cobalt-1.11.jar",
"HMFTOOLS_PAVE_JAR": "/data/OpenOmics/references/genome-seek/hmftools/pave_v1.6.jar",
"HMFTOOLS_PURPLE_JAR": "/data/OpenOmics/references/genome-seek/hmftools/purple_v3.2.jar",
"HMFTOOLS_SAGE_JAR": "/data/OpenOmics/references/genome-seek/hmftools/sage_v3.4.1.jar",
"HMFTOOLS_SAGE_REF_VERSION": "38",
Expand All @@ -75,6 +76,7 @@
"HMFTOOLS_SAGE_HIGH_CONF": "/data/OpenOmics/references/genome-seek/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz",
"HMFTOOLS_SAGE_ENSEMBL_DATA": "/data/OpenOmics/references/genome-seek/hmftools/v5_34/ref/38/common/ensembl_data/",
"HMFTOOLS_AMBER_LOCI": "/data/OpenOmics/references/genome-seek/hmftools/GermlineHetPon.hg38.vcf.gz",
"HMFTOOLS_COHORT_PON": "/data/OpenOmics/references/genome-seek/hmftools/v5_34/ref/38/variants/SageGermlinePon.98x.38.tsv.gz",
"HMFTOOLS_GC_PROFILE": "/data/OpenOmics/references/genome-seek/hmftools/GC_profile.hg38.1000bp.cnp",
"HMFTOOLS_DIPLOID": "/data/OpenOmics/references/genome-seek/hmftools/DiploidRegions.hg38.bed.gz",
"HMFTOOLS_DRIVER_PANEL": "/data/OpenOmics/references/genome-seek/hmftools/DriverGenePanel.38.feb2021.tsv",
Expand Down
69 changes: 66 additions & 3 deletions workflow/rules/somatic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,19 @@ rule hmftools_sage:
normal_bam = lambda w: "-reference_bam {0}.sorted.bam".format(
join(workpath, "BAM", tumor2normal[w.name])
) if tumor2normal[w.name] else "",
# Options related to running PAVE
# to filter tumor-only samples
# SAGE run mode, either:
# tumor-normal or tumor-only
pave_jar = config['references']['HMFTOOLS_PAVE_JAR'],
sage_run_mode = lambda w: "tumor-normal" if tumor2normal[w.name] else "tumor-only",
pave_input = join(workpath, "sage", "somatic", "{name}.tmp.vcf.gz"),
pave_output = join(workpath, "sage", "somatic", "{name}_pave_tmp", "{name}.tmp.pave.vcf.gz"),
pave_filtered = join(workpath, "sage", "somatic", "{name}_pave_tmp", "{name}.filtered.pave.vcf"),
pave_outdir = join(workpath, "sage", "somatic", "{name}_pave_tmp"),
cohort_pon = config['references']['HMFTOOLS_COHORT_PON'],
bcf_ann_input = lambda w: join(workpath, "sage", "somatic", "{0}.tmp.vcf".format(w.name)) if tumor2normal[w.name] \
else join(workpath, "sage", "somatic", "{0}_pave_tmp".format(w.name), "{0}.filtered.pave.vcf".format(w.name)),
threads:
int(allocated("threads", "hmftools_sage", cluster)),
container: config['images']['genome-seek_cnv']
Expand All @@ -696,7 +709,51 @@ rule hmftools_sage:
-high_confidence_bed {params.high_conf} \\
-ensembl_data_dir {params.ensembl_data} \\
-output_vcf {output.tmp}
# Filter tumor-only samples with PAVE,
# filtering is needed when SAGE is run
# in tumor-only mode to remove the high
# false positives, this step will only
# run if a tumor sample does NOT have
# a paired normal sample.
SAGE_MODE="{params.sage_run_mode}"
if [ $SAGE_MODE == "tumor-only" ]; then
# PAVE needs bgzipp-ed, indexed VCF
bgzip -c {output.tmp} \\
> {params.pave_input}
tabix -p vcf {params.pave_input}
# Run PAVE to annotate VCF with PON
# tags for downstream filtering
java -Xmx{params.memory}g -jar {params.pave_jar} \\
-pon_file {params.cohort_pon} \\
-sample {params.tumor} \\
-vcf_file {params.pave_input} \\
-ensembl_data_dir {params.ensembl_data} \\
-ref_genome {params.genome} \\
-ref_genome_version {params.ref_version} \\
-output_dir {params.pave_outdir}/ \\
-threads {threads}
# Filtering PAVE output based on the
# authors recommendations
bcftools filter \\
-e 'PON_COUNT!="." && INFO/TIER="HOTSPOT" && PON_MAX>=5 && PON_COUNT >= 5' \\
-s PON \\
-m+ {params.pave_output} -Ou \\
| bcftools filter \\
-e 'PON_COUNT!="." && INFO/TIER="PANEL" && PON_MAX>=5 && PON_COUNT >= 2' \\
-s PON \\
-m+ \\
-Ou \\
| bcftools filter \\
-e 'PON_COUNT!="." && INFO/TIER!="HOTSPOT" && INFO/TIER!="PANEL" && PON_COUNT >= 2' \\
-s PON \\
-m+ \\
-Ov \\
-o {params.pave_filtered}
fi
# Remove SG INFO/FORMAT tag from temp
# VCF file. This tag causes issues when
# bcftools is run on the merged VCF file
Expand All @@ -705,7 +762,7 @@ rule hmftools_sage:
# commands cause downstream issues.
bcftools annotate \\
-x "FORMAT/SB,INFO/SB" \\
{output.tmp} \\
{params.bcf_ann_input} \\
-Ov \\
-o {output.vcf}
"""
Expand Down Expand Up @@ -1492,6 +1549,12 @@ rule somatic_merge_tumor:
clairs_option = lambda w: "" if tumor2normal[w.name] else "--variant:clairs {0}.clairs.filtered.norm.vcf".format(
join(workpath, "clairs", "somatic", w.name)
),
# Tumor-only option to require variants
# to be present in the callset of at
# least two callers. Variants found in
# tumor-only samples need to be found
# by at least two callers.
min_n_option = lambda w: "" if tumor2normal[w.name] else "-minN 2"
threads:
int(allocated("threads", "somatic_merge_tumor", cluster))
container: config['images']['genome-seek_somatic']
Expand All @@ -1505,7 +1568,7 @@ rule somatic_merge_tumor:
java -Xmx{params.memory}g -Djava.io.tmpdir=${{tmp}} \\
-XX:ParallelGCThreads={threads} -jar $GATK_JAR -T CombineVariants \\
-R {params.genome} \\
-R {params.genome} {params.min_n_option} \\
--filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \\
--genotypemergeoption PRIORITIZE \\
--rod_priority_list {params.priority_list} \\
Expand Down

0 comments on commit d8f1216

Please sign in to comment.