From 785b5ec1bbdcb6a92ae8b81c19feaefd73beac01 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Wed, 18 Dec 2024 19:27:00 -0500 Subject: [PATCH 01/27] initial wdl --- vardict.wdl | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100755 vardict.wdl diff --git a/vardict.wdl b/vardict.wdl new file mode 100755 index 0000000..ba6a27f --- /dev/null +++ b/vardict.wdl @@ -0,0 +1,108 @@ +version 1.0 + + +workflow vardict { + input { + File tumor_bam + File normal_bam + String tumor_sample_name + String normal_sample_name + } + + parameter_meta { + tumor_bam: "Input fastqR1 file for analysis sample" + normal_bam: "Input fastqR2 file for analysis sample" + tumor_sample_name:"Sample name for the tumor bam" + normal_sample_name: "Sample name for the normal bam" + } + + # run vardict + call runVardict + { + input: + tumor_bam = tumor_bam, + normal_bam = normal_bam, + tumor_sample_name = tumor_sample_name, + normal_sample_name = normal_sample_name + } + + meta { + author: "Gavin Peng" + email: "gpeng@oicr.on.ca" + description: "VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers." + dependencies: [ + { + name: "vardict/1.8.3", + url: "https://github.com/pachterlab/vardict" + }, + { + name: "samtools/1.16.1", + url: "https://www.htslib.org/" + }, + { + name: "java", + url: "https://www.java.com/en/" + } + ] + } + output { + File vcf_out = runVardict.vcf_file + } + +} + +# ========================== +# configure and run vardict +# ========================== +task runVardict { + input { + File tumor_bam + File normal_bam + String tumor_sample_name + String normal_sample_name + String refFasta = "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa" + String AF_THR = 0.01 + String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/5.Somatic_Calls/Part0_Running_Callers/renamed_hg38.bed" + Int timeout = 48 + Int jobMemory = 24 + } + + parameter_meta { + tumor_bam: "Input tumor bam file for analysis sample" + normal_bam: "Input normal bam file for analysis sample" + refFasta: "The reference fasta." + AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" + bed_file: "bed_file for target regions" + jobMemory: "Memory in Gb for this job" + modules: "Names and versions of modules" + timeout: "Timeout in hours, needed to override imposed limits" + } + + command <<< + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} | ~{normal_sample_name} \ + -b ~{tumor_bam} ~{normal_bam} \ + -Q 10 \ + -c 1 -S 2 -E 3 -g 4 \ + -P 0.9 \ + -m 8 -M 0 \ + ~{bed_file} | \ + $VARDICT_ROOT/bin/testsomatic.R | \ + $VARDICT_ROOT/bin/var2vcf_paired.pl > ~{tumor_sample_name}.vardict.vcf + + >>> + + runtime { + memory: "~{jobMemory} GB" + modules: "~{modules}" + timeout: "~{timeout}" + } + + output { + File vcf_file = "~{tumor_sample_name}.vardict.vcf" + + } +} \ No newline at end of file From 350044526afe22927939746a95319849769a3bf4 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Thu, 19 Dec 2024 22:30:47 -0500 Subject: [PATCH 02/27] small fix --- vardict.wdl | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index ba6a27f..99d9223 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -68,22 +68,11 @@ task runVardict { Int jobMemory = 24 } - parameter_meta { - tumor_bam: "Input tumor bam file for analysis sample" - normal_bam: "Input normal bam file for analysis sample" - refFasta: "The reference fasta." - AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" - bed_file: "bed_file for target regions" - jobMemory: "Memory in Gb for this job" - modules: "Names and versions of modules" - timeout: "Timeout in hours, needed to override imposed limits" - } - command <<< $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ - -N ~{tumor_sample_name} | ~{normal_sample_name} \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ -b ~{tumor_bam} ~{normal_bam} \ -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ From a9a64b01e760f06ccf27bbeee14fa595bb322ce3 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Fri, 20 Dec 2024 14:54:39 -0500 Subject: [PATCH 03/27] add perl version --- vardict_perl.wdl | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100755 vardict_perl.wdl diff --git a/vardict_perl.wdl b/vardict_perl.wdl new file mode 100755 index 0000000..6463ec5 --- /dev/null +++ b/vardict_perl.wdl @@ -0,0 +1,96 @@ +version 1.0 + + +workflow vardict { + input { + File tumor_bam = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/1.Fastq_to_Bam/Outputs_2/MoHQ-CM-1-180.DT.filter.deduped.realigned.recalibrated.bam" + File normal_bam = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/1.Fastq_to_Bam/Outputs_2/MoHQ-CM-1-180.DN.filter.deduped.realigned.recalibrated.bam" + String tumor_sample_name = "MoHQ-CM-1-180.DT" + String normal_sample_name = "MoHQ-CM-1-180.DN" + } + + parameter_meta { + tumor_bam: "Input fastqR1 file for analysis sample" + normal_bam: "Input fastqR2 file for analysis sample" + tumor_sample_name:"Sample name for the tumor bam" + normal_sample_name: "Sample name for the normal bam" + } + + # run vardict + call runVardict + { + input: + tumor_bam = tumor_bam, + normal_bam = normal_bam, + tumor_sample_name = tumor_sample_name, + normal_sample_name = normal_sample_name + } + + meta { + author: "Gavin Peng" + email: "gpeng@oicr.on.ca" + description: "VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers." + dependencies: [ + { + name: "vardict/1.8.3", + url: "https://github.com/pachterlab/vardict" + }, + { + name: "samtools/1.16.1", + url: "https://www.htslib.org/" + }, + { + name: "java", + url: "https://www.java.com/en/" + } + ] + } + output { + File vcf_out = runVardict.vcf_file + } + +} + +# ========================== +# configure and run vardict +# ========================== +task runVardict { + input { + File tumor_bam + File normal_bam + String tumor_sample_name + String normal_sample_name + String refFasta = "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa" + String AF_THR = 0.03 + String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/5.Somatic_Calls/Part0_Running_Callers/renamed_hg38.bed" + Int timeout = 48 + Int jobMemory = 24 + } + + command <<< + /.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/Scripting/NeoAntigen/VarDict/vardict.pl \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N "~{tumor_sample_name}" \ + -b ~{tumor_bam} ~{normal_bam} \ + -Q 10 \ + -c 1 -S 2 -E 3 -g 4 \ + -P 0.9 \ + ~{bed_file} | \ + $VARDICT_ROOT/bin/testsomatic.R | \ + $VARDICT_ROOT/bin/var2vcf_paired.pl -N "MoHQ-CM-1-180.DT|MoHQ-CM-1-180.DN" -f 0.03 -P 0.9 -m 4.25 -M > ~{tumor_sample_name}.vardict.vcf + + >>> + + runtime { + memory: "~{jobMemory} GB" + modules: "~{modules}" + timeout: "~{timeout}" + } + + output { + File vcf_file = "~{tumor_sample_name}.vardict.vcf" + + } +} \ No newline at end of file From 0e231d782d27fd0ffaa8862f495eeb85feb1cdfc Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 11:07:44 -0500 Subject: [PATCH 04/27] small syntax change --- vardict.wdl | 16 +++++++--------- vardict_perl.wdl | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index 99d9223..f6865c1 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -60,10 +60,10 @@ task runVardict { File normal_bam String tumor_sample_name String normal_sample_name - String refFasta = "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa" + String refFasta = "$HG19_ROOT/hg19_random.fa" String AF_THR = 0.01 - String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" - String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/5.Somatic_Calls/Part0_Running_Callers/renamed_hg38.bed" + String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg19/p13" + String bed_file = "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed" Int timeout = 48 Int jobMemory = 24 } @@ -72,15 +72,13 @@ task runVardict { $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ - -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -b ~{tumor_bam} ~{normal_bam} \ + -N "~{tumor_sample_name}" \ + -b ~{tumor_bam} | ~{normal_bam} \ -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ - -P 0.9 \ - -m 8 -M 0 \ ~{bed_file} | \ $VARDICT_ROOT/bin/testsomatic.R | \ - $VARDICT_ROOT/bin/var2vcf_paired.pl > ~{tumor_sample_name}.vardict.vcf + $VARDICT_ROOT/bin/var2vcf_paired.pl -N "~{tumor_sample_name}| ~{normal_sample_name}" -f 0.03 > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf >>> @@ -91,7 +89,7 @@ task runVardict { } output { - File vcf_file = "~{tumor_sample_name}.vardict.vcf" + File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf" } } \ No newline at end of file diff --git a/vardict_perl.wdl b/vardict_perl.wdl index 6463ec5..0808301 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -73,13 +73,13 @@ task runVardict { -G ~{refFasta} \ -f ~{AF_THR} \ -N "~{tumor_sample_name}" \ - -b ~{tumor_bam} ~{normal_bam} \ + -b ~{tumor_bam} | ~{normal_bam} \ -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ -P 0.9 \ ~{bed_file} | \ $VARDICT_ROOT/bin/testsomatic.R | \ - $VARDICT_ROOT/bin/var2vcf_paired.pl -N "MoHQ-CM-1-180.DT|MoHQ-CM-1-180.DN" -f 0.03 -P 0.9 -m 4.25 -M > ~{tumor_sample_name}.vardict.vcf + $VARDICT_ROOT/bin/var2vcf_paired.pl -N "MoHQ-CM-1-180.DT|MoHQ-CM-1-180.DN" -f 0.03 > ~{tumor_sample_name}.vardict.vcf >>> From 194ae6f036a93a11aad6eb02b329e020490dce44 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 11:25:20 -0500 Subject: [PATCH 05/27] yet more syntax test --- vardict.wdl | 4 ++-- vardict_perl.wdl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index f6865c1..b34ca28 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -72,8 +72,8 @@ task runVardict { $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ - -N "~{tumor_sample_name}" \ - -b ~{tumor_bam} | ~{normal_bam} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam} | ~{normal_bam}" \ -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ ~{bed_file} | \ diff --git a/vardict_perl.wdl b/vardict_perl.wdl index 0808301..b4b5bc9 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -72,8 +72,8 @@ task runVardict { /.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/Scripting/NeoAntigen/VarDict/vardict.pl \ -G ~{refFasta} \ -f ~{AF_THR} \ - -N "~{tumor_sample_name}" \ - -b ~{tumor_bam} | ~{normal_bam} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam} | ~{normal_bam}" \ -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ -P 0.9 \ From b28fafbf15b33ed9b1915d3a88035bd6456c2b78 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 11:51:11 -0500 Subject: [PATCH 06/27] remove some parameters for testing --- vardict_perl.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/vardict_perl.wdl b/vardict_perl.wdl index b4b5bc9..401045f 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -74,9 +74,7 @@ task runVardict { -f ~{AF_THR} \ -N ~{tumor_sample_name} \ -b "~{tumor_bam} | ~{normal_bam}" \ - -Q 10 \ -c 1 -S 2 -E 3 -g 4 \ - -P 0.9 \ ~{bed_file} | \ $VARDICT_ROOT/bin/testsomatic.R | \ $VARDICT_ROOT/bin/var2vcf_paired.pl -N "MoHQ-CM-1-180.DT|MoHQ-CM-1-180.DN" -f 0.03 > ~{tumor_sample_name}.vardict.vcf From 4c6ea2efa1ff7fb9b56086d7644ac359b3a664d6 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 13:26:14 -0500 Subject: [PATCH 07/27] test simplest case --- vardict.wdl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index b34ca28..7c6d6ab 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -73,12 +73,9 @@ task runVardict { -G ~{refFasta} \ -f ~{AF_THR} \ -N ~{tumor_sample_name} \ - -b "~{tumor_bam} | ~{normal_bam}" \ - -Q 10 \ + -b "~{tumor_bam}" \ -c 1 -S 2 -E 3 -g 4 \ - ~{bed_file} | \ - $VARDICT_ROOT/bin/testsomatic.R | \ - $VARDICT_ROOT/bin/var2vcf_paired.pl -N "~{tumor_sample_name}| ~{normal_sample_name}" -f 0.03 > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf + ~{bed_file} > vardict.txt >>> @@ -89,7 +86,7 @@ task runVardict { } output { - File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf" + File vcf_file = "vardict.txt" } } \ No newline at end of file From a1712f1705d3a4f3eccbec36e81edfec92395699 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 16:22:59 -0500 Subject: [PATCH 08/27] syntax fix, add README --- commands.txt | 23 +++++++++++++++++++++++ vardict.wdl | 51 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 61 insertions(+), 13 deletions(-) create mode 100644 commands.txt diff --git a/commands.txt b/commands.txt new file mode 100644 index 0000000..3418776 --- /dev/null +++ b/commands.txt @@ -0,0 +1,23 @@ +## Commands +This section lists command(s) run by WORKFLOW workflow + +* Running WORKFLOW + +=== Description here ===. + +<<< + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ + -c 1 -S 2 -E 3 -g 4 \ + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf + + >>> diff --git a/vardict.wdl b/vardict.wdl index 7c6d6ab..8914f0d 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -10,8 +10,8 @@ workflow vardict { } parameter_meta { - tumor_bam: "Input fastqR1 file for analysis sample" - normal_bam: "Input fastqR2 file for analysis sample" + tumor_bam: "tumor_bam file for analysis sample" + normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" } @@ -35,18 +35,21 @@ workflow vardict { name: "vardict/1.8.3", url: "https://github.com/pachterlab/vardict" }, - { - name: "samtools/1.16.1", - url: "https://www.htslib.org/" - }, { name: "java", url: "https://www.java.com/en/" } - ] + ], + output_meta: { + vardict_vcf: { + description: "VCF file for variant calling from vardict", + vidarr_label: "vardict_vcf" + } + } } + output { - File vcf_out = runVardict.vcf_file + File vardict_vcf = runVardict.vcf_file } } @@ -60,22 +63,44 @@ task runVardict { File normal_bam String tumor_sample_name String normal_sample_name - String refFasta = "$HG19_ROOT/hg19_random.fa" + String refFasta = "$HG38_ROOT/hg38_random.fa" String AF_THR = 0.01 - String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg19/p13" + String MAP_QUAL = 10 + String READ_POSTION_FILTER = 5 + String modules = "rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" String bed_file = "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed" Int timeout = 48 Int jobMemory = 24 } + parameter_meta { + tumor_bam: "tumor_bam file for analysis sample" + normal_bam: "normal_bam file for analysis sample" + tumor_sample_name:"Sample name for the tumor bam" + normal_sample_name: "Sample name for the normal bam" + refFasta: "The reference fasta" + AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" + MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" + READ_POSTION_FILTER: "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" + bed_file: "BED files for specifying regions of interest" + jobMemory: "Memory in Gb for this job" + modules: "Names and versions of modules" + timeout: "Timeout in hours, needed to override imposed limits" + } command <<< $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ -N ~{tumor_sample_name} \ - -b "~{tumor_bam}" \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ -c 1 -S 2 -E 3 -g 4 \ - ~{bed_file} > vardict.txt + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf >>> @@ -86,7 +111,7 @@ task runVardict { } output { - File vcf_file = "vardict.txt" + File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf" } } \ No newline at end of file From fa18b277f80e7c1e00c238c21fdb1614880805e4 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 23 Dec 2024 16:34:30 -0500 Subject: [PATCH 09/27] perl version --- vardict_perl.wdl | 49 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/vardict_perl.wdl b/vardict_perl.wdl index 401045f..8ebf918 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -3,15 +3,15 @@ version 1.0 workflow vardict { input { - File tumor_bam = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/1.Fastq_to_Bam/Outputs_2/MoHQ-CM-1-180.DT.filter.deduped.realigned.recalibrated.bam" - File normal_bam = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/1.Fastq_to_Bam/Outputs_2/MoHQ-CM-1-180.DN.filter.deduped.realigned.recalibrated.bam" - String tumor_sample_name = "MoHQ-CM-1-180.DT" - String normal_sample_name = "MoHQ-CM-1-180.DN" + File tumor_bam + File normal_bam + String tumor_sample_name + String normal_sample_name } parameter_meta { - tumor_bam: "Input fastqR1 file for analysis sample" - normal_bam: "Input fastqR2 file for analysis sample" + tumor_bam: "tumor_bam file for analysis sample" + normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" } @@ -46,7 +46,7 @@ workflow vardict { ] } output { - File vcf_out = runVardict.vcf_file + File vardict_vcf = runVardict.vcf_file } } @@ -60,24 +60,41 @@ task runVardict { File normal_bam String tumor_sample_name String normal_sample_name - String refFasta = "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa" - String AF_THR = 0.03 - String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" - String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/CHUM_Data/5.Somatic_Calls/Part0_Running_Callers/renamed_hg38.bed" + String refFasta = "$HG38_ROOT/hg38_random.fa" + String AF_THR = 0.01 + String MAP_QUAL = 10 + String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" + String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/renamed_hg38.bed" Int timeout = 48 Int jobMemory = 24 } + parameter_meta { + tumor_bam: "tumor_bam file for analysis sample" + normal_bam: "normal_bam file for analysis sample" + tumor_sample_name:"Sample name for the tumor bam" + normal_sample_name: "Sample name for the normal bam" + refFasta: "The reference fasta" + AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" + MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" + bed_file: "BED files for specifying regions of interest" + jobMemory: "Memory in Gb for this job" + modules: "Names and versions of modules" + timeout: "Timeout in hours, needed to override imposed limits" + } command <<< - /.mounts/labs/gsiprojects/gsi/gsiusers/hdriver/Scripting/NeoAntigen/VarDict/vardict.pl \ + /.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/vardict.pl \ -G ~{refFasta} \ -f ~{AF_THR} \ -N ~{tumor_sample_name} \ - -b "~{tumor_bam} | ~{normal_bam}" \ + -b "~{tumor_bam}|~{normal_bam}" \ -c 1 -S 2 -E 3 -g 4 \ + -Q ~{MAP_QUAL} \ ~{bed_file} | \ - $VARDICT_ROOT/bin/testsomatic.R | \ - $VARDICT_ROOT/bin/var2vcf_paired.pl -N "MoHQ-CM-1-180.DT|MoHQ-CM-1-180.DN" -f 0.03 > ~{tumor_sample_name}.vardict.vcf + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf >>> @@ -88,7 +105,7 @@ task runVardict { } output { - File vcf_file = "~{tumor_sample_name}.vardict.vcf" + File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf" } } \ No newline at end of file From 66819361304e88cf308ca16491b1cc80afbb5eae Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Tue, 24 Dec 2024 11:53:18 -0500 Subject: [PATCH 10/27] add set pattern in bash script --- README.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++- vardict.wdl | 10 ++++-- vardict_perl.wdl | 15 +++++++-- 3 files changed, 103 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 872354c..3ddc021 100644 --- a/README.md +++ b/README.md @@ -1 +1,83 @@ -vardict wdl workflow +# vardict + +VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers. + +## Overview + +## Dependencies + +* [vardict 1.8.3](https://github.com/pachterlab/vardict) +* [java](https://www.java.com/en/) + + +## Usage + +### Cromwell +``` +java -jar cromwell.jar run vardict.wdl --inputs inputs.json +``` + +### Inputs + +#### Required workflow parameters: +Parameter|Value|Description +---|---|--- +`tumor_bam`|File|tumor_bam file for analysis sample +`normal_bam`|File|normal_bam file for analysis sample +`tumor_sample_name`|String|Sample name for the tumor bam +`normal_sample_name`|String|Sample name for the normal bam +`bed_file`|String|BED files for specifying regions of interest + + +#### Optional workflow parameters: +Parameter|Value|Default|Description +---|---|---|--- + + +#### Optional task parameters: +Parameter|Value|Default|Description +---|---|---|--- +`runVardict.refFasta`|String|"$HG38_ROOT/hg38_random.fa"|The reference fasta +`runVardict.AF_THR`|String|0.01|The threshold for allele frequency, default: 0.01 or 1% +`runVardict.MAP_QUAL`|String|10| Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored +`runVardict.READ_POSTION_FILTER`|String|5|The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5 +`runVardict.modules`|String|"rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12"|Names and versions of modules +`runVardict.timeout`|Int|48|Timeout in hours, needed to override imposed limits +`runVardict.jobMemory`|Int|24|Memory in Gb for this job + + +### Outputs + +Output | Type | Description +---|---|--- +`vardict_vcf`|File|{'description': 'VCF file for variant calling from vardict', 'vidarr_label': 'vardict_vcf'} + + +## Commands + This section lists command(s) run by WORKFLOW workflow + + * Running WORKFLOW + + === Description here ===. + + <<< + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ + -c 1 -S 2 -E 3 -g 4 \ + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf + + >>> + ## Support + +For support, please file an issue on the [Github project](https://github.com/oicr-gsi) or send an email to gsi@oicr.on.ca . + +_Generated with generate-markdown-readme (https://github.com/oicr-gsi/gsi-wdl-tools/)_ diff --git a/vardict.wdl b/vardict.wdl index 8914f0d..abd6a94 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -7,6 +7,7 @@ workflow vardict { File normal_bam String tumor_sample_name String normal_sample_name + String bed_file } parameter_meta { @@ -14,6 +15,7 @@ workflow vardict { normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" + bed_file: "BED files for specifying regions of interest" } # run vardict @@ -23,7 +25,8 @@ workflow vardict { tumor_bam = tumor_bam, normal_bam = normal_bam, tumor_sample_name = tumor_sample_name, - normal_sample_name = normal_sample_name + normal_sample_name = normal_sample_name, + bed_file = bed_file } meta { @@ -39,7 +42,7 @@ workflow vardict { name: "java", url: "https://www.java.com/en/" } - ], + ] output_meta: { vardict_vcf: { description: "VCF file for variant calling from vardict", @@ -68,7 +71,7 @@ task runVardict { String MAP_QUAL = 10 String READ_POSTION_FILTER = 5 String modules = "rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" - String bed_file = "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed" + String bed_file Int timeout = 48 Int jobMemory = 24 } @@ -88,6 +91,7 @@ task runVardict { } command <<< + set -euo pipefail $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ diff --git a/vardict_perl.wdl b/vardict_perl.wdl index 8ebf918..60ea370 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -7,6 +7,7 @@ workflow vardict { File normal_bam String tumor_sample_name String normal_sample_name + String bed_file } parameter_meta { @@ -14,6 +15,7 @@ workflow vardict { normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" + bed_file: "BED files for specifying regions of interest" } # run vardict @@ -23,7 +25,8 @@ workflow vardict { tumor_bam = tumor_bam, normal_bam = normal_bam, tumor_sample_name = tumor_sample_name, - normal_sample_name = normal_sample_name + normal_sample_name = normal_sample_name, + bed_file = bed_file } meta { @@ -44,7 +47,14 @@ workflow vardict { url: "https://www.java.com/en/" } ] + output_meta: { + vardict_vcf: { + description: "VCF file for variant calling from vardict", + vidarr_label: "vardict_vcf" + } + } } + output { File vardict_vcf = runVardict.vcf_file } @@ -64,7 +74,7 @@ task runVardict { String AF_THR = 0.01 String MAP_QUAL = 10 String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" - String bed_file = "/.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/renamed_hg38.bed" + String bed_file Int timeout = 48 Int jobMemory = 24 } @@ -83,6 +93,7 @@ task runVardict { } command <<< + set -euo pipefail /.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/vardict.pl \ -G ~{refFasta} \ -f ~{AF_THR} \ From 4dad3efa29df5d1884c76d56036aaf7c81ea6649 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Tue, 24 Dec 2024 14:25:20 -0500 Subject: [PATCH 11/27] add java Xmx option --- README.md | 15 ++++++++------- commands.txt | 43 ++++++++++++++++++++++--------------------- vardict.wdl | 3 ++- 3 files changed, 32 insertions(+), 29 deletions(-) mode change 100644 => 100755 README.md mode change 100644 => 100755 commands.txt diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 3ddc021..105e559 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Parameter|Value|Default|Description `runVardict.READ_POSTION_FILTER`|String|5|The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5 `runVardict.modules`|String|"rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12"|Names and versions of modules `runVardict.timeout`|Int|48|Timeout in hours, needed to override imposed limits -`runVardict.jobMemory`|Int|24|Memory in Gb for this job +`runVardict.jobMemory`|Int|48|Memory in Gb for this job ### Outputs @@ -54,13 +54,14 @@ Output | Type | Description ## Commands - This section lists command(s) run by WORKFLOW workflow + This section lists command(s) run by vardict workflow - * Running WORKFLOW - - === Description here ===. + * Running Vardict + - <<< + ``` + set -euo pipefail + export JAVA_OPTS="-Xmx~{jobMemory - 6}G " $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ @@ -75,7 +76,7 @@ Output | Type | Description -N "~{tumor_sample_name}|~{normal_sample_name}" \ -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - >>> +``` ## Support For support, please file an issue on the [Github project](https://github.com/oicr-gsi) or send an email to gsi@oicr.on.ca . diff --git a/commands.txt b/commands.txt old mode 100644 new mode 100755 index 3418776..6fa9392 --- a/commands.txt +++ b/commands.txt @@ -1,23 +1,24 @@ ## Commands -This section lists command(s) run by WORKFLOW workflow + This section lists command(s) run by vardict workflow + + * Running Vardict -* Running WORKFLOW - -=== Description here ===. - -<<< - $VARDICT_ROOT/bin/VarDict \ - -G ~{refFasta} \ - -f ~{AF_THR} \ - -N ~{tumor_sample_name} \ - -b "~{tumor_bam}|~{normal_bam}" \ - -Q ~{MAP_QUAL} \ - -P ~{READ_POSTION_FILTER} \ - -c 1 -S 2 -E 3 -g 4 \ - ~{bed_file} | \ - $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ - $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ - -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - - >>> + + ``` + set -euo pipefail + export JAVA_OPTS="-Xmx~{jobMemory - 6}G " + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ + -c 1 -S 2 -E 3 -g 4 \ + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf + +``` diff --git a/vardict.wdl b/vardict.wdl index abd6a94..7b2b10f 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -73,7 +73,7 @@ task runVardict { String modules = "rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" String bed_file Int timeout = 48 - Int jobMemory = 24 + Int jobMemory = 48 } parameter_meta { tumor_bam: "tumor_bam file for analysis sample" @@ -92,6 +92,7 @@ task runVardict { command <<< set -euo pipefail + export JAVA_OPTS="-Xmx~{jobMemory - 6}G " $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ From 956c169b6be2541fa8ba487707faea9a9ce7e2d1 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Thu, 2 Jan 2025 14:12:36 -0500 Subject: [PATCH 12/27] add vidarr files --- Jenkinsfile | 24 +++++++++++++ README.md | 46 ++++++++++++------------- commands.txt | 1 + tests/calculate.sh | 8 +++++ tests/compare.sh | 2 ++ vardict.wdl | 32 +++++++++++++++-- vidarrbuild.json | 6 ++++ vidarrtest-regression.json.in | 65 +++++++++++++++++++++++++++++++++++ 8 files changed, 158 insertions(+), 26 deletions(-) create mode 100644 Jenkinsfile create mode 100755 tests/calculate.sh create mode 100755 tests/compare.sh create mode 100755 vidarrbuild.json create mode 100755 vidarrtest-regression.json.in diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..f2d9c90 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,24 @@ + +pipeline { + agent any + stages { + stage('build') { + when { + not { + buildingTag() + } + } + steps { + sh '/.mounts/labs/gsi/vidarr/jenkins-ci-wrapper test -t /.mounts/labs/gsi/vidarr/testing-config.json' + } + } + stage('Deploy') { + when { + buildingTag() + } + steps { + sh '/.mounts/labs/gsi/vidarr/jenkins-ci-wrapper deploy -v $TAG_NAME -t /.mounts/labs/gsi/vidarr/testing-config.json -U /.mounts/labs/gsi/vidarr/deploy-urls' + } + } + } +} diff --git a/README.md b/README.md index 105e559..b18df4e 100755 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Parameter|Value|Description `tumor_sample_name`|String|Sample name for the tumor bam `normal_sample_name`|String|Sample name for the normal bam `bed_file`|String|BED files for specifying regions of interest +`reference`|String|the reference genome for input sample #### Optional workflow parameters: @@ -37,11 +38,9 @@ Parameter|Value|Default|Description #### Optional task parameters: Parameter|Value|Default|Description ---|---|---|--- -`runVardict.refFasta`|String|"$HG38_ROOT/hg38_random.fa"|The reference fasta `runVardict.AF_THR`|String|0.01|The threshold for allele frequency, default: 0.01 or 1% `runVardict.MAP_QUAL`|String|10| Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored `runVardict.READ_POSTION_FILTER`|String|5|The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5 -`runVardict.modules`|String|"rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12"|Names and versions of modules `runVardict.timeout`|Int|48|Timeout in hours, needed to override imposed limits `runVardict.jobMemory`|Int|48|Memory in Gb for this job @@ -54,29 +53,30 @@ Output | Type | Description ## Commands - This section lists command(s) run by vardict workflow - - * Running Vardict - + This section lists command(s) run by vardict workflow + + * Running Vardict + + ``` + set -euo pipefail + cp ~{refFai} . + export JAVA_OPTS="-Xmx~{jobMemory - 6}G " + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ + -c 1 -S 2 -E 3 -g 4 \ + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf + ``` - set -euo pipefail - export JAVA_OPTS="-Xmx~{jobMemory - 6}G " - $VARDICT_ROOT/bin/VarDict \ - -G ~{refFasta} \ - -f ~{AF_THR} \ - -N ~{tumor_sample_name} \ - -b "~{tumor_bam}|~{normal_bam}" \ - -Q ~{MAP_QUAL} \ - -P ~{READ_POSTION_FILTER} \ - -c 1 -S 2 -E 3 -g 4 \ - ~{bed_file} | \ - $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ - $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ - -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - -``` ## Support For support, please file an issue on the [Github project](https://github.com/oicr-gsi) or send an email to gsi@oicr.on.ca . diff --git a/commands.txt b/commands.txt index 6fa9392..9c7e8e8 100755 --- a/commands.txt +++ b/commands.txt @@ -6,6 +6,7 @@ ``` set -euo pipefail + cp ~{refFai} . export JAVA_OPTS="-Xmx~{jobMemory - 6}G " $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ diff --git a/tests/calculate.sh b/tests/calculate.sh new file mode 100755 index 0000000..884fc56 --- /dev/null +++ b/tests/calculate.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -o nounset +set -o errexit +set -o pipefail + +cd $1 + +find . -name '*.vcf' | xargs md5sum \ No newline at end of file diff --git a/tests/compare.sh b/tests/compare.sh new file mode 100755 index 0000000..81ce07e --- /dev/null +++ b/tests/compare.sh @@ -0,0 +1,2 @@ +#!/bin/bash +diff -s <(sort $1) <(sort $2) diff --git a/vardict.wdl b/vardict.wdl index 7b2b10f..7b2f08a 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -1,5 +1,10 @@ version 1.0 +struct GenomeResources { + String refFai + String refFasta + String modules +} workflow vardict { input { @@ -8,6 +13,7 @@ workflow vardict { String tumor_sample_name String normal_sample_name String bed_file + String reference } parameter_meta { @@ -16,6 +22,20 @@ workflow vardict { tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" bed_file: "BED files for specifying regions of interest" + reference: "the reference genome for input sample" + } + + Map[String, GenomeResources] resources = { + "hg19": { + "refFai" : "$HG19_ROOT/hg19_random.fa.fai", + "refFasta" : "$HG19_ROOT/hg19_random.fa", + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + }, + "hg38": { + "refFai" : "$HG38_ROOT/hg38_random.fa.fai", + "refFasta" : "$HG38_ROOT/hg38_random.fa", + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + } } # run vardict @@ -26,7 +46,10 @@ workflow vardict { normal_bam = normal_bam, tumor_sample_name = tumor_sample_name, normal_sample_name = normal_sample_name, - bed_file = bed_file + bed_file = bed_file, + modules = resources [ reference ].modules, + refFai = resources[reference].refFai, + refFasta = resources[reference].refFasta, } meta { @@ -66,11 +89,12 @@ task runVardict { File normal_bam String tumor_sample_name String normal_sample_name - String refFasta = "$HG38_ROOT/hg38_random.fa" + String refFasta + String refFai String AF_THR = 0.01 String MAP_QUAL = 10 String READ_POSTION_FILTER = 5 - String modules = "rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" + String modules String bed_file Int timeout = 48 Int jobMemory = 48 @@ -80,6 +104,7 @@ task runVardict { normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" + refFai: "Reference fasta fai index" refFasta: "The reference fasta" AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" @@ -92,6 +117,7 @@ task runVardict { command <<< set -euo pipefail + cp ~{refFai} . export JAVA_OPTS="-Xmx~{jobMemory - 6}G " $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ diff --git a/vidarrbuild.json b/vidarrbuild.json new file mode 100755 index 0000000..480b296 --- /dev/null +++ b/vidarrbuild.json @@ -0,0 +1,6 @@ +{ + "names": [ + "vardict" + ], + "wdl": "vardict.wdl" +} diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in new file mode 100755 index 0000000..9bf49a2 --- /dev/null +++ b/vidarrtest-regression.json.in @@ -0,0 +1,65 @@ +[ + { + "arguments": { + "vardict.tumor_bam": { + "contents": { + "configuration": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022P.sorted.filter.deduped.realigned.recal.bam", + "externalIds": [ + { + "id": "TEST", + "provider": "TEST" + } + ] + }, + "type": "EXTERNAL" + }, + "vardict.normal_bam": { + "contents": { + "configuration": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022R.val.bam", + "externalIds": [ + { + "id": "TEST", + "provider": "TEST" + } + ] + }, + "type": "EXTERNAL" + }, + "vardict.normal_sample_name": "PCSI0022R", + "vardict.runVardict.MAP_QUAL": null, + "vardict.runVardict.READ_POSTION_FILTER": null, + "vardict.runVardict.timeout": 48, + "vardict.runVardict.modules": null, + "vardict.runVardict.jobMemory": null, + "vardict.bed_file": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed", + "vardict.runVardict.refFasta": null, + "vardict.tumor_sample_name": "PCSI0022P", + "vardict.runVardict.AF_THR": null + }, + "description": "vardict workflow test", + "engineArguments": { + "write_to_cache": false, + "read_from_cache": false + }, + "id": "vardict_test", + "metadata": { + "vardict.vardict_vcf": { + "contents": [ + { + "outputDirectory": "@SCRATCH@/@DATE@_Workflow_vardict_@JENKINSID@" + } + ], + "type": "ALL" + } + }, + + "validators": [ + { + "metrics_calculate": "@CHECKOUT@/tests/calculate.sh", + "metrics_compare": "@CHECKOUT@/tests/compare.sh", + "output_metrics": "/.mounts/labs/gsi/testdata/vardict/output_expectation/1.0.0/vardict_test.metrics", + "type": "script" + } + ] + } +] \ No newline at end of file From 7062e6241b58caa5f8ff81a29bdeaf4ce4ee3604 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Thu, 2 Jan 2025 14:21:19 -0500 Subject: [PATCH 13/27] add missing argument --- vidarrtest-regression.json.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 9bf49a2..2c6e25a 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -34,7 +34,8 @@ "vardict.bed_file": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed", "vardict.runVardict.refFasta": null, "vardict.tumor_sample_name": "PCSI0022P", - "vardict.runVardict.AF_THR": null + "vardict.runVardict.AF_THR": null, + "vardict.reference": "hg38" }, "description": "vardict workflow test", "engineArguments": { From ceb2dfeb5c2f8074f26c8728379671772637150b Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Thu, 2 Jan 2025 19:01:44 -0500 Subject: [PATCH 14/27] add samtools as dependency of perl version --- vardict_perl.wdl | 44 ++++++++++++++++++++++++++++------- vidarrtest-regression.json.in | 2 +- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/vardict_perl.wdl b/vardict_perl.wdl index 60ea370..47ed096 100755 --- a/vardict_perl.wdl +++ b/vardict_perl.wdl @@ -1,5 +1,10 @@ version 1.0 +struct GenomeResources { + String refFai + String refFasta + String modules +} workflow vardict { input { @@ -8,6 +13,7 @@ workflow vardict { String tumor_sample_name String normal_sample_name String bed_file + String reference } parameter_meta { @@ -16,6 +22,20 @@ workflow vardict { tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" bed_file: "BED files for specifying regions of interest" + reference: "the reference genome for input sample" + } + + Map[String, GenomeResources] resources = { + "hg19": { + "refFai" : "$HG19_ROOT/hg19_random.fa.fai", + "refFasta" : "$HG19_ROOT/hg19_random.fa", + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 samtools/1.16.1" + }, + "hg38": { + "refFai" : "$HG38_ROOT/hg38_random.fa.fai", + "refFasta" : "$HG38_ROOT/hg38_random.fa", + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 samtools/1.16.1" + } } # run vardict @@ -26,7 +46,10 @@ workflow vardict { normal_bam = normal_bam, tumor_sample_name = tumor_sample_name, normal_sample_name = normal_sample_name, - bed_file = bed_file + bed_file = bed_file, + modules = resources [ reference ].modules, + refFai = resources[reference].refFai, + refFasta = resources[reference].refFasta } meta { @@ -43,8 +66,8 @@ workflow vardict { url: "https://www.htslib.org/" }, { - name: "java", - url: "https://www.java.com/en/" + name: "perl", + url: "https://www.perl.org/" } ] output_meta: { @@ -70,22 +93,26 @@ task runVardict { File normal_bam String tumor_sample_name String normal_sample_name - String refFasta = "$HG38_ROOT/hg38_random.fa" + String refFasta + String refFai String AF_THR = 0.01 String MAP_QUAL = 10 - String modules = "samtools/1.16.1 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 hg38/p12" + String READ_POSTION_FILTER = 5 + String modules String bed_file Int timeout = 48 - Int jobMemory = 24 + Int jobMemory = 48 } parameter_meta { tumor_bam: "tumor_bam file for analysis sample" normal_bam: "normal_bam file for analysis sample" tumor_sample_name:"Sample name for the tumor bam" normal_sample_name: "Sample name for the normal bam" + refFai: "Reference fasta fai index" refFasta: "The reference fasta" AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" + READ_POSTION_FILTER: "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" bed_file: "BED files for specifying regions of interest" jobMemory: "Memory in Gb for this job" modules: "Names and versions of modules" @@ -94,7 +121,7 @@ task runVardict { command <<< set -euo pipefail - /.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/vardict.pl \ + $PERL_ROOT/bin/perl /.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/vardict.pl \ -G ~{refFasta} \ -f ~{AF_THR} \ -N ~{tumor_sample_name} \ @@ -105,7 +132,8 @@ task runVardict { $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf + -f ~{AF_THR} \ + -P 0.05 > ~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf >>> diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 2c6e25a..39a70b7 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -35,7 +35,7 @@ "vardict.runVardict.refFasta": null, "vardict.tumor_sample_name": "PCSI0022P", "vardict.runVardict.AF_THR": null, - "vardict.reference": "hg38" + "vardict.reference": "hg19" }, "description": "vardict workflow test", "engineArguments": { From 7f5dc642542aeccd645813087d65903d20db63cd Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Wed, 15 Jan 2025 22:47:36 -0500 Subject: [PATCH 15/27] try parallellizing runVardict --- vardict.wdl | 127 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 12 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index 7b2f08a..ce77a70 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -4,6 +4,7 @@ struct GenomeResources { String refFai String refFasta String modules + String splitRegionModule } workflow vardict { @@ -14,6 +15,7 @@ workflow vardict { String normal_sample_name String bed_file String reference + Int baseMemory } parameter_meta { @@ -23,24 +25,34 @@ workflow vardict { normal_sample_name: "Sample name for the normal bam" bed_file: "BED files for specifying regions of interest" reference: "the reference genome for input sample" + intervalsToParallelizeBy: "Comma separated list of intervals to split by (e.g. chr1,chr2,chr3+chr4)" } Map[String, GenomeResources] resources = { "hg19": { "refFai" : "$HG19_ROOT/hg19_random.fa.fai", "refFasta" : "$HG19_ROOT/hg19_random.fa", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "splitRegionModule": "hg19/p13" }, "hg38": { - "refFai" : "$HG38_ROOT/hg38_random.fa.fai", + "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa.fai", "refFasta" : "$HG38_ROOT/hg38_random.fa", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3" + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "splitRegionModule": "hg38/p12" } } + call splitRegion { + input: + refFai = resources[reference].refFai, + baseMemory = baseMemory, + modules = resources [ reference ].splitRegionModule + } + # run vardict - call runVardict - { + scatter (region in splitRegion.regions) { + call runVardict { input: tumor_bam = tumor_bam, normal_bam = normal_bam, @@ -50,7 +62,17 @@ workflow vardict { modules = resources [ reference ].modules, refFai = resources[reference].refFai, refFasta = resources[reference].refFasta, + region = region, + mem_allocate = splitRegion.region_memory_map[region] } + } + Array[File] vardictVcfs = runVardict.vcf_file + + call mergeVCFs { + input: + vcfs = vardictVcfs + } + meta { author: "Gavin Peng" @@ -75,11 +97,46 @@ workflow vardict { } output { - File vardict_vcf = runVardict.vcf_file + File vardictVcf = mergeVCFs.mergedVcf + File vardictVcfIndex = mergeVCFs.mergedVcfIdx } } +task splitRegion { + input { + File refFai + Int baseMemory = 120 # Memory coefficient (GB per Gb) + Int overhead = 4 # Overhead memory (GB) + String modules + Int memory = 1 + Int timeout = 1 + } + + command <<< + set -euo pipefail + grep -E '^chr[0-9XY]{1,2}\s' ~{refFai} > filtered_genome.fasta.fai + + awk -v coef=~{baseMemory} -v over=~{overhead} '{ + size_gb = $2 / 1e9 + memory = int(size_gb * coef + over + 0.5) + region = $1 ":1-" $2 + print region "\t" memory > "region_memory_map.tsv" # Tab-separated key-value map + print region > "regions.txt" # List of regions only + }' filtered_genome.fasta.fai > region_memory_map.json + >>> + + runtime { + memory: "~{memory} GB" + modules: "~{modules}" + timeout: "~{timeout}" + } + output { + Array[String] regions = read_lines("regions.txt") # Array of regions only + Map[String, Int] region_memory_map = read_map("region_memory_map.tsv") # Map of region to memory + } +} + # ========================== # configure and run vardict # ========================== @@ -96,8 +153,9 @@ task runVardict { String READ_POSTION_FILTER = 5 String modules String bed_file - Int timeout = 48 - Int jobMemory = 48 + Int timeout = 96 + String region + Int mem_allocate } parameter_meta { tumor_bam: "tumor_bam file for analysis sample" @@ -110,15 +168,16 @@ task runVardict { MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" READ_POSTION_FILTER: "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" bed_file: "BED files for specifying regions of interest" - jobMemory: "Memory in Gb for this job" modules: "Names and versions of modules" timeout: "Timeout in hours, needed to override imposed limits" - } + } command <<< set -euo pipefail cp ~{refFai} . - export JAVA_OPTS="-Xmx~{jobMemory - 6}G " + + export JAVA_OPTS="-Xmx$(echo "scale=0; ~{mem_allocate} * 0.9 / 1" | bc)G" + $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ @@ -136,13 +195,57 @@ task runVardict { >>> runtime { - memory: "~{jobMemory} GB" modules: "~{modules}" timeout: "~{timeout}" + jobMemory: "~{mem_allocate} GB" } output { File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf" } +} + +task mergeVCFs { + input { + String modules = "gatk" + Array[File] vcfs + Int memory = 4 + Int timeout = 12 + } + + parameter_meta { + modules: "Environment module names and version to load (space separated) before command execution" + vcfs: "Vcf's from scatter to merge together" + memory: "Memory allocated for job" + timeout: "Hours before task timeout" + } + + meta { + output_meta: { + mergedVcf: "Merged vcf, unfiltered.", + mergedVcfIdx: "Merged vcf index, unfiltered." + } + } + + String outputName = basename(vcfs[0]) + + command <<< + set -euo pipefail + + gatk --java-options "-Xmx~{memory-3}g" MergeVcfs \ + -I ~{sep=" -I " vcfs} \ + -O ~{outputName} + >>> + + runtime { + memory: "~{memory} GB" + modules: "~{modules}" + timeout: "~{timeout}" + } + + output { + File mergedVcf = "~{outputName}" + File mergedVcfIdx = "~{outputName}.idx" + } } \ No newline at end of file From 54a7f53300dcfdf2aefadfd31faf51aca9538bc4 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Thu, 16 Jan 2025 22:39:05 -0500 Subject: [PATCH 16/27] add creating dictionary for mergeVcfs and remove perl version --- vardict.wdl | 43 +++++++++----- vardict_perl.wdl | 150 ----------------------------------------------- 2 files changed, 28 insertions(+), 165 deletions(-) delete mode 100755 vardict_perl.wdl diff --git a/vardict.wdl b/vardict.wdl index ce77a70..a811714 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -30,15 +30,15 @@ workflow vardict { Map[String, GenomeResources] resources = { "hg19": { - "refFai" : "$HG19_ROOT/hg19_random.fa.fai", + "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa.fai", "refFasta" : "$HG19_ROOT/hg19_random.fa", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 htslib/1.9", "splitRegionModule": "hg19/p13" }, "hg38": { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa.fai", "refFasta" : "$HG38_ROOT/hg38_random.fa", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 htslib/1.9", "splitRegionModule": "hg38/p12" } } @@ -63,14 +63,17 @@ workflow vardict { refFai = resources[reference].refFai, refFasta = resources[reference].refFasta, region = region, - mem_allocate = splitRegion.region_memory_map[region] + memory = splitRegion.region_memory_map[region] } } Array[File] vardictVcfs = runVardict.vcf_file + Array[File] vardictVcfIndexes = runVardict.vcf_index call mergeVCFs { input: - vcfs = vardictVcfs + vcfs = vardictVcfs, + vcfIndexes = vardictVcfIndexes, + refFasta = resources[reference].refFasta } @@ -106,7 +109,7 @@ workflow vardict { task splitRegion { input { File refFai - Int baseMemory = 120 # Memory coefficient (GB per Gb) + Int baseMemory = 500 # Memory coefficient (GB per Gb) Int overhead = 4 # Overhead memory (GB) String modules Int memory = 1 @@ -120,10 +123,10 @@ task splitRegion { awk -v coef=~{baseMemory} -v over=~{overhead} '{ size_gb = $2 / 1e9 memory = int(size_gb * coef + over + 0.5) - region = $1 ":1-" $2 + region = $1 "_1_" $2 print region "\t" memory > "region_memory_map.tsv" # Tab-separated key-value map print region > "regions.txt" # List of regions only - }' filtered_genome.fasta.fai > region_memory_map.json + }' filtered_genome.fasta.fai >>> runtime { @@ -155,7 +158,7 @@ task runVardict { String bed_file Int timeout = 96 String region - Int mem_allocate + Int memory } parameter_meta { tumor_bam: "tumor_bam file for analysis sample" @@ -175,8 +178,9 @@ task runVardict { command <<< set -euo pipefail cp ~{refFai} . + original_region=$(echo ~{region} | sed 's/_/:/; s/_/-/') - export JAVA_OPTS="-Xmx$(echo "scale=0; ~{mem_allocate} * 0.9 / 1" | bc)G" + export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ @@ -186,30 +190,35 @@ task runVardict { -Q ~{MAP_QUAL} \ -P ~{READ_POSTION_FILTER} \ -c 1 -S 2 -E 3 -g 4 \ + -R ${original_region} \ ~{bed_file} | \ $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - + -f ~{AF_THR} | gzip > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz + + $HTSLIB_ROOT/bin/tabix -p vcf ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz >>> runtime { modules: "~{modules}" timeout: "~{timeout}" - jobMemory: "~{mem_allocate} GB" + memory: "~{memory} GB" } output { - File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf" + File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz" + File vcf_index = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.tbi" } } task mergeVCFs { input { - String modules = "gatk" + String modules = "gatk/4.2.6.1 picard/3.1.0" Array[File] vcfs + Array[File] vcfIndexes + String refFasta Int memory = 4 Int timeout = 12 } @@ -233,6 +242,10 @@ task mergeVCFs { command <<< set -euo pipefail + java -jar $PICARD_ROOT/picard.jar CreateSequenceDictionary \ + R=~{refFasta} \ + O=$(basename(~{refFasta}) fa).dict + gatk --java-options "-Xmx~{memory-3}g" MergeVcfs \ -I ~{sep=" -I " vcfs} \ -O ~{outputName} diff --git a/vardict_perl.wdl b/vardict_perl.wdl deleted file mode 100755 index 47ed096..0000000 --- a/vardict_perl.wdl +++ /dev/null @@ -1,150 +0,0 @@ -version 1.0 - -struct GenomeResources { - String refFai - String refFasta - String modules -} - -workflow vardict { - input { - File tumor_bam - File normal_bam - String tumor_sample_name - String normal_sample_name - String bed_file - String reference - } - - parameter_meta { - tumor_bam: "tumor_bam file for analysis sample" - normal_bam: "normal_bam file for analysis sample" - tumor_sample_name:"Sample name for the tumor bam" - normal_sample_name: "Sample name for the normal bam" - bed_file: "BED files for specifying regions of interest" - reference: "the reference genome for input sample" - } - - Map[String, GenomeResources] resources = { - "hg19": { - "refFai" : "$HG19_ROOT/hg19_random.fa.fai", - "refFasta" : "$HG19_ROOT/hg19_random.fa", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 samtools/1.16.1" - }, - "hg38": { - "refFai" : "$HG38_ROOT/hg38_random.fa.fai", - "refFasta" : "$HG38_ROOT/hg38_random.fa", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 samtools/1.16.1" - } - } - - # run vardict - call runVardict - { - input: - tumor_bam = tumor_bam, - normal_bam = normal_bam, - tumor_sample_name = tumor_sample_name, - normal_sample_name = normal_sample_name, - bed_file = bed_file, - modules = resources [ reference ].modules, - refFai = resources[reference].refFai, - refFasta = resources[reference].refFasta - } - - meta { - author: "Gavin Peng" - email: "gpeng@oicr.on.ca" - description: "VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers." - dependencies: [ - { - name: "vardict/1.8.3", - url: "https://github.com/pachterlab/vardict" - }, - { - name: "samtools/1.16.1", - url: "https://www.htslib.org/" - }, - { - name: "perl", - url: "https://www.perl.org/" - } - ] - output_meta: { - vardict_vcf: { - description: "VCF file for variant calling from vardict", - vidarr_label: "vardict_vcf" - } - } - } - - output { - File vardict_vcf = runVardict.vcf_file - } - -} - -# ========================== -# configure and run vardict -# ========================== -task runVardict { - input { - File tumor_bam - File normal_bam - String tumor_sample_name - String normal_sample_name - String refFasta - String refFai - String AF_THR = 0.01 - String MAP_QUAL = 10 - String READ_POSTION_FILTER = 5 - String modules - String bed_file - Int timeout = 48 - Int jobMemory = 48 - } - parameter_meta { - tumor_bam: "tumor_bam file for analysis sample" - normal_bam: "normal_bam file for analysis sample" - tumor_sample_name:"Sample name for the tumor bam" - normal_sample_name: "Sample name for the normal bam" - refFai: "Reference fasta fai index" - refFasta: "The reference fasta" - AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" - MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" - READ_POSTION_FILTER: "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" - bed_file: "BED files for specifying regions of interest" - jobMemory: "Memory in Gb for this job" - modules: "Names and versions of modules" - timeout: "Timeout in hours, needed to override imposed limits" - } - - command <<< - set -euo pipefail - $PERL_ROOT/bin/perl /.mounts/labs/gsiprojects/gsi/gsiusers/gpeng/workflow/vardict/test/vardict.pl \ - -G ~{refFasta} \ - -f ~{AF_THR} \ - -N ~{tumor_sample_name} \ - -b "~{tumor_bam}|~{normal_bam}" \ - -c 1 -S 2 -E 3 -g 4 \ - -Q ~{MAP_QUAL} \ - ~{bed_file} | \ - $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ - $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ - -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} \ - -P 0.05 > ~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf - - >>> - - runtime { - memory: "~{jobMemory} GB" - modules: "~{modules}" - timeout: "~{timeout}" - } - - output { - File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}_perl.vardict.vcf" - - } -} \ No newline at end of file From e86a04328c429af1ce83b120c6215899a411159b Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Fri, 17 Jan 2025 13:27:22 -0500 Subject: [PATCH 17/27] add parameters like refDict --- vardict.wdl | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index a811714..bb9b99c 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -3,8 +3,10 @@ version 1.0 struct GenomeResources { String refFai String refFasta + String refDict String modules String splitRegionModule + String mergeVcfModules } workflow vardict { @@ -15,7 +17,7 @@ workflow vardict { String normal_sample_name String bed_file String reference - Int baseMemory + Int mem_coefficient } parameter_meta { @@ -31,22 +33,26 @@ workflow vardict { Map[String, GenomeResources] resources = { "hg19": { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa.fai", - "refFasta" : "$HG19_ROOT/hg19_random.fa", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 htslib/1.9", - "splitRegionModule": "hg19/p13" + "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa", + "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.dict", + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "splitRegionModule": "hg19/p13", + "mergeVcfModules": "gatk hg19/p13" }, "hg38": { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa.fai", - "refFasta" : "$HG38_ROOT/hg38_random.fa", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 htslib/1.9", - "splitRegionModule": "hg38/p12" + "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa", + "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.dict", + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", + "splitRegionModule": "hg38/p12", + "mergeVcfModules": "gatk hg38/p12" } } call splitRegion { input: refFai = resources[reference].refFai, - baseMemory = baseMemory, + mem_coefficient = mem_coefficient, modules = resources [ reference ].splitRegionModule } @@ -67,13 +73,13 @@ workflow vardict { } } Array[File] vardictVcfs = runVardict.vcf_file - Array[File] vardictVcfIndexes = runVardict.vcf_index call mergeVCFs { input: vcfs = vardictVcfs, - vcfIndexes = vardictVcfIndexes, - refFasta = resources[reference].refFasta + refDict = resources[reference].refDict, + refFasta = resources[reference].refFasta, + modules = resources[reference].mergeVcfModules } @@ -109,7 +115,7 @@ workflow vardict { task splitRegion { input { File refFai - Int baseMemory = 500 # Memory coefficient (GB per Gb) + Int mem_coefficient = 500 # Memory coefficient (GB per Gb) Int overhead = 4 # Overhead memory (GB) String modules Int memory = 1 @@ -120,7 +126,7 @@ task splitRegion { set -euo pipefail grep -E '^chr[0-9XY]{1,2}\s' ~{refFai} > filtered_genome.fasta.fai - awk -v coef=~{baseMemory} -v over=~{overhead} '{ + awk -v coef=~{mem_coefficient} -v over=~{overhead} '{ size_gb = $2 / 1e9 memory = int(size_gb * coef + over + 0.5) region = $1 "_1_" $2 @@ -196,8 +202,6 @@ task runVardict { $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ -f ~{AF_THR} | gzip > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz - - $HTSLIB_ROOT/bin/tabix -p vcf ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz >>> runtime { @@ -208,16 +212,14 @@ task runVardict { output { File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz" - File vcf_index = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.tbi" - } } task mergeVCFs { input { - String modules = "gatk/4.2.6.1 picard/3.1.0" + String modules Array[File] vcfs - Array[File] vcfIndexes + String refDict String refFasta Int memory = 4 Int timeout = 12 @@ -242,9 +244,7 @@ task mergeVCFs { command <<< set -euo pipefail - java -jar $PICARD_ROOT/picard.jar CreateSequenceDictionary \ - R=~{refFasta} \ - O=$(basename(~{refFasta}) fa).dict + cp ~{refDict} . gatk --java-options "-Xmx~{memory-3}g" MergeVcfs \ -I ~{sep=" -I " vcfs} \ From da7c31aad2e2e0c1e4f77ded896f4012ef4f0f58 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Sun, 19 Jan 2025 00:33:47 -0500 Subject: [PATCH 18/27] change to split bed instead of reference --- vardict.wdl | 95 ++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index bb9b99c..8c3cec7 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -5,7 +5,6 @@ struct GenomeResources { String refFasta String refDict String modules - String splitRegionModule String mergeVcfModules } @@ -17,7 +16,6 @@ workflow vardict { String normal_sample_name String bed_file String reference - Int mem_coefficient } parameter_meta { @@ -27,7 +25,6 @@ workflow vardict { normal_sample_name: "Sample name for the normal bam" bed_file: "BED files for specifying regions of interest" reference: "the reference genome for input sample" - intervalsToParallelizeBy: "Comma separated list of intervals to split by (e.g. chr1,chr2,chr3+chr4)" } Map[String, GenomeResources] resources = { @@ -35,29 +32,25 @@ workflow vardict { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa.fai", "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa", "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.dict", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", - "splitRegionModule": "hg19/p13", - "mergeVcfModules": "gatk hg19/p13" + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9", + "mergeVcfModules": "picard/2.19.2 hg19/p13" }, "hg38": { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa.fai", "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa", "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.dict", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3", - "splitRegionModule": "hg38/p12", - "mergeVcfModules": "gatk hg38/p12" + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9", + "mergeVcfModules": "picard/2.19.2 hg38/p12" } } - call splitRegion { + call splitBedByChromosome { input: - refFai = resources[reference].refFai, - mem_coefficient = mem_coefficient, - modules = resources [ reference ].splitRegionModule + bed_file = bed_file } # run vardict - scatter (region in splitRegion.regions) { + scatter (bed_file in splitBedByChromosome.bed_files) { call runVardict { input: tumor_bam = tumor_bam, @@ -68,21 +61,19 @@ workflow vardict { modules = resources [ reference ].modules, refFai = resources[reference].refFai, refFasta = resources[reference].refFasta, - region = region, - memory = splitRegion.region_memory_map[region] } } Array[File] vardictVcfs = runVardict.vcf_file + Array[File] vardictVcfIndexes = runVardict.vcf_index call mergeVCFs { input: vcfs = vardictVcfs, + vcfIndexes = vardictVcfIndexes, refDict = resources[reference].refDict, - refFasta = resources[reference].refFasta, modules = resources[reference].mergeVcfModules } - meta { author: "Gavin Peng" email: "gpeng@oicr.on.ca" @@ -112,38 +103,40 @@ workflow vardict { } -task splitRegion { +task splitBedByChromosome { input { - File refFai - Int mem_coefficient = 500 # Memory coefficient (GB per Gb) - Int overhead = 4 # Overhead memory (GB) - String modules + File bed_file Int memory = 1 Int timeout = 1 } + parameter_meta { + bed_file: "Input BED file to split" + memory: "Memory allocated for task in GB" + timeout: "Timeout in hours" + } command <<< set -euo pipefail - grep -E '^chr[0-9XY]{1,2}\s' ~{refFai} > filtered_genome.fasta.fai - - awk -v coef=~{mem_coefficient} -v over=~{overhead} '{ - size_gb = $2 / 1e9 - memory = int(size_gb * coef + over + 0.5) - region = $1 "_1_" $2 - print region "\t" memory > "region_memory_map.tsv" # Tab-separated key-value map - print region > "regions.txt" # List of regions only - }' filtered_genome.fasta.fai + + mkdir split_beds + CHROMS=($(seq 1 22) X Y) + + for chr in "${CHROMS[@]}"; do + grep -E "^(chr)?${chr}[[:space:]]" ~{bed_file} > split_beds/chr${chr}.bed || true + if [ -s split_beds/chr${chr}.bed ]; then + echo "split_beds/chr${chr}.bed" >> split_beds.list + fi + done >>> + output { + Array[File] bed_files = read_lines("split_beds.list") + } + runtime { memory: "~{memory} GB" - modules: "~{modules}" timeout: "~{timeout}" } - output { - Array[String] regions = read_lines("regions.txt") # Array of regions only - Map[String, Int] region_memory_map = read_map("region_memory_map.tsv") # Map of region to memory - } } # ========================== @@ -163,7 +156,6 @@ task runVardict { String modules String bed_file Int timeout = 96 - String region Int memory } parameter_meta { @@ -184,7 +176,6 @@ task runVardict { command <<< set -euo pipefail cp ~{refFai} . - original_region=$(echo ~{region} | sed 's/_/:/; s/_/-/') export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" @@ -196,12 +187,20 @@ task runVardict { -Q ~{MAP_QUAL} \ -P ~{READ_POSTION_FILTER} \ -c 1 -S 2 -E 3 -g 4 \ - -R ${original_region} \ ~{bed_file} | \ $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} | gzip > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz + -f ~{AF_THR} | gzip > vardict.vcf.gz + + # the vardict generated vcf header missing contig name, need extract contig lines from refFai + bcftools view -h vardict.vcf.gz > header.txt + while read -r name length rest; do + echo "##contig=" + done < ~{refFai} >> header.txt + + bcftools reheader -h header.txt -o ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz vardict.vcf.gz + bcftools index --tbi ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz >>> runtime { @@ -212,6 +211,7 @@ task runVardict { output { File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz" + File vcf_index = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz.tbi" } } @@ -219,8 +219,8 @@ task mergeVCFs { input { String modules Array[File] vcfs + Array[File] vcfIndexes String refDict - String refFasta Int memory = 4 Int timeout = 12 } @@ -244,11 +244,10 @@ task mergeVCFs { command <<< set -euo pipefail - cp ~{refDict} . - - gatk --java-options "-Xmx~{memory-3}g" MergeVcfs \ - -I ~{sep=" -I " vcfs} \ - -O ~{outputName} + java "-Xmx~{memory-3}g" -jar $PICARD_ROOT/picard.jar MergeVcfs \ + I=~{sep=" I=" vcfs} \ + O=~{outputName} \ + SEQUENCE_DICTIONARY=~{refDict} >>> runtime { @@ -259,6 +258,6 @@ task mergeVCFs { output { File mergedVcf = "~{outputName}" - File mergedVcfIdx = "~{outputName}.idx" + File mergedVcfIdx = "~{outputName}.tbi" } } \ No newline at end of file From 041a944111d6e50e12a01469cabb4e3d01e10951 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 20 Jan 2025 12:24:32 -0500 Subject: [PATCH 19/27] add memory allocating based on bed size --- vardict.wdl | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index 8c3cec7..78f4833 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -50,14 +50,16 @@ workflow vardict { } # run vardict - scatter (bed_file in splitBedByChromosome.bed_files) { + + scatter (i in range(length(splitBedByChromosome.bed_files))) { call runVardict { input: tumor_bam = tumor_bam, normal_bam = normal_bam, tumor_sample_name = tumor_sample_name, normal_sample_name = normal_sample_name, - bed_file = bed_file, + bed_file = splitBedByChromosome.bed_files[i], + memory_coefficient = splitBedByChromosome.memory_coefficients[i], modules = resources [ reference ].modules, refFai = resources[reference].refFai, refFasta = resources[reference].refFasta, @@ -125,12 +127,20 @@ task splitBedByChromosome { grep -E "^(chr)?${chr}[[:space:]]" ~{bed_file} > split_beds/chr${chr}.bed || true if [ -s split_beds/chr${chr}.bed ]; then echo "split_beds/chr${chr}.bed" >> split_beds.list + # Calculate range size for this bed + range_size=$(awk '{sum += ($3 - $2)} END {print sum}' split_beds/chr${chr}.bed) + echo "${range_size}" >> range_sizes.txt fi done + max_size=$(sort -n range_sizes.txt | tail -n1) + while IFS= read -r size; do + awk -v size="$size" -v max="$max_size" 'BEGIN {printf "%.1f\n", size/max}' + done < range_sizes.txt > memory_coefficients.txt >>> output { Array[File] bed_files = read_lines("split_beds.list") + Array[Float] memory_coefficients = read_lines("memory_coefficients.txt") } runtime { @@ -156,7 +166,9 @@ task runVardict { String modules String bed_file Int timeout = 96 - Int memory + Int memory = 32 + Int minMemory = 24 + Float memory_coefficient } parameter_meta { tumor_bam: "tumor_bam file for analysis sample" @@ -172,13 +184,13 @@ task runVardict { modules: "Names and versions of modules" timeout: "Timeout in hours, needed to override imposed limits" } + Int allocatedMemory = if minMemory > round(memory * memory_coefficient) then minMemory else round(memory * memory_coefficient) command <<< set -euo pipefail cp ~{refFai} . export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" - $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ @@ -206,7 +218,7 @@ task runVardict { runtime { modules: "~{modules}" timeout: "~{timeout}" - memory: "~{memory} GB" + memory: "~{allocatedMemory} GB" } output { From 84ceaf9bd3933c868339b96f35a9a75dfc52b971 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 20 Jan 2025 12:45:59 -0500 Subject: [PATCH 20/27] update readme, regression test --- README.md | 97 +++++++++++++++++++++++++---------- commands.txt | 46 +++++++++++++++-- vardict.wdl | 10 +++- vidarrtest-regression.json.in | 15 +++++- 4 files changed, 133 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b18df4e..14dbc9b 100755 --- a/README.md +++ b/README.md @@ -38,45 +38,88 @@ Parameter|Value|Default|Description #### Optional task parameters: Parameter|Value|Default|Description ---|---|---|--- +`splitBedByChromosome.memory`|Int|1|Memory allocated for task in GB +`splitBedByChromosome.timeout`|Int|1|Timeout in hours `runVardict.AF_THR`|String|0.01|The threshold for allele frequency, default: 0.01 or 1% `runVardict.MAP_QUAL`|String|10| Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored `runVardict.READ_POSTION_FILTER`|String|5|The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5 -`runVardict.timeout`|Int|48|Timeout in hours, needed to override imposed limits -`runVardict.jobMemory`|Int|48|Memory in Gb for this job +`runVardict.timeout`|Int|96|Timeout in hours, needed to override imposed limits +`runVardict.memory`|Int|32|base memory for this job +`runVardict.minMemory`|Int|24|The minimum value for allocated memory +`mergeVCFs.memory`|Int|4|Memory allocated for job +`mergeVCFs.timeout`|Int|12|Hours before task timeout ### Outputs -Output | Type | Description ----|---|--- -`vardict_vcf`|File|{'description': 'VCF file for variant calling from vardict', 'vidarr_label': 'vardict_vcf'} +Output | Type | Description | Labels +---|---|---|--- +`vardictVcf`|File|Merged vcf, unfiltered.| +`vardictVcfIndex`|File|VCF index for variant calling from vardict|vidarr_label: vardcit_index ## Commands - This section lists command(s) run by vardict workflow - - * Running Vardict + This section lists command(s) run by vardict workflow + + * Running vardict + + ``` + set -euo pipefail + + mkdir split_beds + CHROMS=($(seq 1 22) X Y) + + for chr in "${CHROMS[@]}"; do + grep -E "^(chr)?${chr}[[:space:]]" ~{bed_file} > split_beds/chr${chr}.bed || true + if [ -s split_beds/chr${chr}.bed ]; then + echo "split_beds/chr${chr}.bed" >> split_beds.list + # Calculate range size for this bed + range_size=$(awk '{sum += ($3 - $2)} END {print sum}' split_beds/chr${chr}.bed) + echo "${range_size}" >> range_sizes.txt + fi + done + max_size=$(sort -n range_sizes.txt | tail -n1) + while IFS= read -r size; do + awk -v size="$size" -v max="$max_size" 'BEGIN {printf "%.1f\n", size/max}' + done < range_sizes.txt > memory_coefficients.txt +``` + ``` + set -euo pipefail + cp ~{refFai} . + + export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" + $VARDICT_ROOT/bin/VarDict \ + -G ~{refFasta} \ + -f ~{AF_THR} \ + -N ~{tumor_sample_name} \ + -b "~{tumor_bam}|~{normal_bam}" \ + -Q ~{MAP_QUAL} \ + -P ~{READ_POSTION_FILTER} \ + -c 1 -S 2 -E 3 -g 4 \ + ~{bed_file} | \ + $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ + $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ + -N "~{tumor_sample_name}|~{normal_sample_name}" \ + -f ~{AF_THR} | gzip > vardict.vcf.gz + + # the vardict generated vcf header missing contig name, need extract contig lines from refFai + bcftools view -h vardict.vcf.gz > header.txt + while read -r name length rest; do + echo "##contig=" + done < ~{refFai} >> header.txt - - ``` - set -euo pipefail - cp ~{refFai} . - export JAVA_OPTS="-Xmx~{jobMemory - 6}G " - $VARDICT_ROOT/bin/VarDict \ - -G ~{refFasta} \ - -f ~{AF_THR} \ - -N ~{tumor_sample_name} \ - -b "~{tumor_bam}|~{normal_bam}" \ - -Q ~{MAP_QUAL} \ - -P ~{READ_POSTION_FILTER} \ - -c 1 -S 2 -E 3 -g 4 \ - ~{bed_file} | \ - $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ - $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ - -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - + bcftools reheader -h header.txt -o ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz vardict.vcf.gz + bcftools index --tbi ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz +``` ``` + set -euo pipefail + + java "-Xmx~{memory-3}g" -jar $PICARD_ROOT/picard.jar MergeVcfs \ + I=~{sep=" I=" vcfs} \ + O=~{outputName} \ + SEQUENCE_DICTIONARY=~{refDict} +``` + ## Support For support, please file an issue on the [Github project](https://github.com/oicr-gsi) or send an email to gsi@oicr.on.ca . diff --git a/commands.txt b/commands.txt index 9c7e8e8..a432d1c 100755 --- a/commands.txt +++ b/commands.txt @@ -1,13 +1,33 @@ ## Commands This section lists command(s) run by vardict workflow - * Running Vardict - + * Running vardict + ``` + set -euo pipefail + + mkdir split_beds + CHROMS=($(seq 1 22) X Y) + + for chr in "${CHROMS[@]}"; do + grep -E "^(chr)?${chr}[[:space:]]" ~{bed_file} > split_beds/chr${chr}.bed || true + if [ -s split_beds/chr${chr}.bed ]; then + echo "split_beds/chr${chr}.bed" >> split_beds.list + # Calculate range size for this bed + range_size=$(awk '{sum += ($3 - $2)} END {print sum}' split_beds/chr${chr}.bed) + echo "${range_size}" >> range_sizes.txt + fi + done + max_size=$(sort -n range_sizes.txt | tail -n1) + while IFS= read -r size; do + awk -v size="$size" -v max="$max_size" 'BEGIN {printf "%.1f\n", size/max}' + done < range_sizes.txt > memory_coefficients.txt +``` ``` set -euo pipefail cp ~{refFai} . - export JAVA_OPTS="-Xmx~{jobMemory - 6}G " + + export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ @@ -20,6 +40,22 @@ $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf - + -f ~{AF_THR} | gzip > vardict.vcf.gz + + # the vardict generated vcf header missing contig name, need extract contig lines from refFai + bcftools view -h vardict.vcf.gz > header.txt + while read -r name length rest; do + echo "##contig=" + done < ~{refFai} >> header.txt + + bcftools reheader -h header.txt -o ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz vardict.vcf.gz + bcftools index --tbi ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz ``` + ``` + set -euo pipefail + + java "-Xmx~{memory-3}g" -jar $PICARD_ROOT/picard.jar MergeVcfs \ + I=~{sep=" I=" vcfs} \ + O=~{outputName} \ + SEQUENCE_DICTIONARY=~{refDict} +``` \ No newline at end of file diff --git a/vardict.wdl b/vardict.wdl index 78f4833..1146141 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -50,7 +50,6 @@ workflow vardict { } # run vardict - scatter (i in range(length(splitBedByChromosome.bed_files))) { call runVardict { input: @@ -94,6 +93,10 @@ workflow vardict { vardict_vcf: { description: "VCF file for variant calling from vardict", vidarr_label: "vardict_vcf" + }, + vardictVcfIndex: { + description: "VCF index for variant calling from vardict", + vidarr_label: "vardcit_index" } } } @@ -183,6 +186,9 @@ task runVardict { bed_file: "BED files for specifying regions of interest" modules: "Names and versions of modules" timeout: "Timeout in hours, needed to override imposed limits" + memory: "base memory for this job" + memory_coefficient: "coefficient for calculating allocated memory" + minMemory: "The minimum value for allocated memory" } Int allocatedMemory = if minMemory > round(memory * memory_coefficient) then minMemory else round(memory * memory_coefficient) @@ -240,8 +246,10 @@ task mergeVCFs { parameter_meta { modules: "Environment module names and version to load (space separated) before command execution" vcfs: "Vcf's from scatter to merge together" + vcfIndexes: "The indices for the input vcfs" memory: "Memory allocated for job" timeout: "Hours before task timeout" + refDict: "reference sequence dictionary" } meta { diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 39a70b7..c723689 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -30,12 +30,23 @@ "vardict.runVardict.READ_POSTION_FILTER": null, "vardict.runVardict.timeout": 48, "vardict.runVardict.modules": null, - "vardict.runVardict.jobMemory": null, + "vardict.runVardict.memory": null, + "vardict.runVardict.memory_coefficient": null, + "vardict.runVardict.minMemory": null, "vardict.bed_file": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed", "vardict.runVardict.refFasta": null, "vardict.tumor_sample_name": "PCSI0022P", "vardict.runVardict.AF_THR": null, - "vardict.reference": "hg19" + "vardict.reference": "hg19", + "vardict.splitBedByChromosome.memory": null, + "vardict.splitBedByChromosome.timeout": null, + "vardict.splitBedByChromosome.bed_file": null, + "vardict.mergVcfs.memory": null, + "vardict.mergVcfs.modules": null, + "vardict.mergVcfs.timeout": null, + "vardict.mergVcfs.refDict": null, + "vardict.mergVcfs.vcfs": null, + "vardict.mergVcfs.vcfIndexes": null }, "description": "vardict workflow test", "engineArguments": { From 1391937916cb4d9f0024e32f18121f0ae8c58616 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 20 Jan 2025 12:55:21 -0500 Subject: [PATCH 21/27] add missing parameters --- vardict.wdl | 8 ++++---- vidarrtest-regression.json.in | 10 +++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index 1146141..e1c5c06 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -67,7 +67,7 @@ workflow vardict { Array[File] vardictVcfs = runVardict.vcf_file Array[File] vardictVcfIndexes = runVardict.vcf_index - call mergeVCFs { + call mergeVcfs { input: vcfs = vardictVcfs, vcfIndexes = vardictVcfIndexes, @@ -102,8 +102,8 @@ workflow vardict { } output { - File vardictVcf = mergeVCFs.mergedVcf - File vardictVcfIndex = mergeVCFs.mergedVcfIdx + File vardictVcf = mergeVcfs.mergedVcf + File vardictVcfIndex = mergeVcfs.mergedVcfIdx } } @@ -233,7 +233,7 @@ task runVardict { } } -task mergeVCFs { +task mergeVcfs { input { String modules Array[File] vcfs diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index c723689..6de8b2a 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -55,7 +55,15 @@ }, "id": "vardict_test", "metadata": { - "vardict.vardict_vcf": { + "vardict.vardictVcf": { + "contents": [ + { + "outputDirectory": "@SCRATCH@/@DATE@_Workflow_vardict_@JENKINSID@" + } + ], + "type": "ALL" + }, + "vardict.vardictVcfIndex": { "contents": [ { "outputDirectory": "@SCRATCH@/@DATE@_Workflow_vardict_@JENKINSID@" From b83eb13e056e243f9c0f98fcfc686535d3821ea7 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 20 Jan 2025 12:58:03 -0500 Subject: [PATCH 22/27] fix regression test parameter name --- vidarrtest-regression.json.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 6de8b2a..14a7052 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -41,12 +41,12 @@ "vardict.splitBedByChromosome.memory": null, "vardict.splitBedByChromosome.timeout": null, "vardict.splitBedByChromosome.bed_file": null, - "vardict.mergVcfs.memory": null, - "vardict.mergVcfs.modules": null, - "vardict.mergVcfs.timeout": null, - "vardict.mergVcfs.refDict": null, - "vardict.mergVcfs.vcfs": null, - "vardict.mergVcfs.vcfIndexes": null + "vardict.mergeVcfs.memory": null, + "vardict.mergeVcfs.modules": null, + "vardict.mergeVcfs.timeout": null, + "vardict.mergeVcfs.refDict": null, + "vardict.mergeVcfs.vcfs": null, + "vardict.mergeVcfs.vcfIndexes": null }, "description": "vardict workflow test", "engineArguments": { From a79004d42db3c6adca0e3e27e0f5ae2a7146b921 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 27 Jan 2025 10:47:04 -0500 Subject: [PATCH 23/27] memory and time parameters adjustment --- vardict.wdl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index e1c5c06..bd0b29e 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -135,10 +135,17 @@ task splitBedByChromosome { echo "${range_size}" >> range_sizes.txt fi done + min_coff=0.8 + p=0.8 max_size=$(sort -n range_sizes.txt | tail -n1) - while IFS= read -r size; do - awk -v size="$size" -v max="$max_size" 'BEGIN {printf "%.1f\n", size/max}' - done < range_sizes.txt > memory_coefficients.txt + + awk -v max="$max_size" -v min_coff="$min_coff" -v p="$p" ' + { + size = $1 + ratio = size / max + coefficient = min_coff + (1 - min_coff) * (ratio ^ p) + printf "%.2f\n", coefficient + }' range_sizes.txt > memory_coefficients.txt >>> output { @@ -168,7 +175,7 @@ task runVardict { String READ_POSTION_FILTER = 5 String modules String bed_file - Int timeout = 96 + Int timeout = 120 Int memory = 32 Int minMemory = 24 Float memory_coefficient @@ -196,7 +203,7 @@ task runVardict { set -euo pipefail cp ~{refFai} . - export JAVA_OPTS="-Xmx$(echo "scale=0; ~{memory} * 0.9 / 1" | bc)G" + export JAVA_OPTS="-Xmx$(echo "scale=0; ~{allocatedMemory} * 0.8 / 1" | bc)G" $VARDICT_ROOT/bin/VarDict \ -G ~{refFasta} \ -f ~{AF_THR} \ From a1354e7784e4b7594747ef88ffe53c90f7e445bf Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 10 Feb 2025 15:25:25 -0500 Subject: [PATCH 24/27] add numThreads, remove vcf header in calculate.sh --- tests/calculate.sh | 2 +- vardict.wdl | 8 +++++--- vidarrtest-regression.json.in | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/calculate.sh b/tests/calculate.sh index 884fc56..1221192 100755 --- a/tests/calculate.sh +++ b/tests/calculate.sh @@ -5,4 +5,4 @@ set -o pipefail cd $1 -find . -name '*.vcf' | xargs md5sum \ No newline at end of file +for f in $(find . -xtype f -name "*.vcf.gz" | sort -V);do zcat $f | grep -v ^# | md5sum \ No newline at end of file diff --git a/vardict.wdl b/vardict.wdl index bd0b29e..faf89a8 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -172,12 +172,13 @@ task runVardict { String refFai String AF_THR = 0.01 String MAP_QUAL = 10 - String READ_POSTION_FILTER = 5 + String READ_POSITION_FILTER = 5 String modules String bed_file Int timeout = 120 Int memory = 32 Int minMemory = 24 + Int numThreads = 8 Float memory_coefficient } parameter_meta { @@ -189,7 +190,7 @@ task runVardict { refFasta: "The reference fasta" AF_THR: "The threshold for allele frequency, default: 0.01 or 1%" MAP_QUAL: " Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored" - READ_POSTION_FILTER: "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" + READ_POSITION_FILTER : "The read position filter. If the mean variants position is less that specified, it is considered false positive. Default: 5" bed_file: "BED files for specifying regions of interest" modules: "Names and versions of modules" timeout: "Timeout in hours, needed to override imposed limits" @@ -205,12 +206,13 @@ task runVardict { export JAVA_OPTS="-Xmx$(echo "scale=0; ~{allocatedMemory} * 0.8 / 1" | bc)G" $VARDICT_ROOT/bin/VarDict \ + -th ~{numThreads} \ -G ~{refFasta} \ -f ~{AF_THR} \ -N ~{tumor_sample_name} \ -b "~{tumor_bam}|~{normal_bam}" \ -Q ~{MAP_QUAL} \ - -P ~{READ_POSTION_FILTER} \ + -P ~{READ_POSITION_FILTER} \ -c 1 -S 2 -E 3 -g 4 \ ~{bed_file} | \ $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 14a7052..2f20c24 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -33,6 +33,7 @@ "vardict.runVardict.memory": null, "vardict.runVardict.memory_coefficient": null, "vardict.runVardict.minMemory": null, + "vardict.runVardict.numThreads": 8, "vardict.bed_file": "/.mounts/labs/gsi/testdata/mutect2/input_data/PCSI0022.val.bed", "vardict.runVardict.refFasta": null, "vardict.tumor_sample_name": "PCSI0022P", From a46d7402ae2de6961abb62c27ac1c8739b633787 Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 10 Feb 2025 15:30:23 -0500 Subject: [PATCH 25/27] fix typo --- vidarrtest-regression.json.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vidarrtest-regression.json.in b/vidarrtest-regression.json.in index 2f20c24..3d45cfe 100755 --- a/vidarrtest-regression.json.in +++ b/vidarrtest-regression.json.in @@ -27,7 +27,7 @@ }, "vardict.normal_sample_name": "PCSI0022R", "vardict.runVardict.MAP_QUAL": null, - "vardict.runVardict.READ_POSTION_FILTER": null, + "vardict.runVardict.READ_POSITION_FILTER": null, "vardict.runVardict.timeout": 48, "vardict.runVardict.modules": null, "vardict.runVardict.memory": null, From c98eed2c40cedff0aed97f3e82c621585b62fc0e Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Mon, 10 Feb 2025 15:51:55 -0500 Subject: [PATCH 26/27] fix syntax --- tests/calculate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/calculate.sh b/tests/calculate.sh index 1221192..d6f409c 100755 --- a/tests/calculate.sh +++ b/tests/calculate.sh @@ -5,4 +5,4 @@ set -o pipefail cd $1 -for f in $(find . -xtype f -name "*.vcf.gz" | sort -V);do zcat $f | grep -v ^# | md5sum \ No newline at end of file +for f in $(find . -xtype f -name "*.vcf.gz" | sort -V);do zcat $f | grep -v ^# | md5sum; done \ No newline at end of file From d1bf884d26dee5e6caca99a826db15f6941770cb Mon Sep 17 00:00:00 2001 From: Gavin Peng Date: Fri, 14 Feb 2025 12:04:14 -0500 Subject: [PATCH 27/27] fix the non-standard notation in vardict generated vcf --- vardict.wdl | 57 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/vardict.wdl b/vardict.wdl index faf89a8..95107f5 100755 --- a/vardict.wdl +++ b/vardict.wdl @@ -32,14 +32,14 @@ workflow vardict { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa.fai", "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.fa", "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg19-p13/hg19_random.dict", - "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9", + "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9 htslib/1.9", "mergeVcfModules": "picard/2.19.2 hg19/p13" }, "hg38": { "refFai" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa.fai", "refFasta" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.fa", "refDict" : "/.mounts/labs/gsi/modulator/sw/data/hg38-p12/hg38_random.dict", - "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9", + "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3 bcftools/1.9 htslib/1.9", "mergeVcfModules": "picard/2.19.2 hg38/p12" } } @@ -218,7 +218,7 @@ task runVardict { $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \ $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \ -N "~{tumor_sample_name}|~{normal_sample_name}" \ - -f ~{AF_THR} | gzip > vardict.vcf.gz + -f ~{AF_THR} | bgzip > vardict.vcf.gz # the vardict generated vcf header missing contig name, need extract contig lines from refFai bcftools view -h vardict.vcf.gz > header.txt @@ -227,7 +227,7 @@ task runVardict { done < ~{refFai} >> header.txt bcftools reheader -h header.txt -o ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz vardict.vcf.gz - bcftools index --tbi ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz + tabix -p vcf ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf.gz >>> runtime { @@ -270,11 +270,56 @@ task mergeVcfs { String outputName = basename(vcfs[0]) - command <<< + command <<< set -euo pipefail + # normalize vcf file from vardict since vardict may generate vcf with non-standard notation + fixed_vcfs="" + + echo '~{sep="\n" vcfs}' > vcf_files.txt + + while IFS= read -r VCF_FILE; do + FIXED_VCF="$(basename ${VCF_FILE%.gz})_fixed.vcf" + + if [[ $VCF_FILE == *.gz ]]; then + INPUT_COMMAND="zcat" + else + INPUT_COMMAND="cat" + fi + + $INPUT_COMMAND "$VCF_FILE" | awk ' + BEGIN { dup_pattern = "" } + # Print all header lines unchanged + /^#/ { print; next } + # Process non-header lines + { + if ($0 ~ dup_pattern) { + # Extract dup size + match($0, //, dup) + svlen = dup[1] + end = $2 + svlen + + # Replace dup-XX with DUP + gsub(//, "", $5) + + # Update INFO field + if ($8 ~ /TYPE=Insertion/) { + sub(/TYPE=Insertion/, "SVTYPE=DUP", $8) + $8 = $8 ";SVLEN=" svlen ";END=" end + } + } + print + }' > "$FIXED_VCF" + + fixed_vcfs="${fixed_vcfs} ${FIXED_VCF}" + done < vcf_files.txt + + input_args="" + for vcf in $fixed_vcfs; do + input_args="$input_args I=$vcf" + done java "-Xmx~{memory-3}g" -jar $PICARD_ROOT/picard.jar MergeVcfs \ - I=~{sep=" I=" vcfs} \ + $input_args \ O=~{outputName} \ SEQUENCE_DICTIONARY=~{refDict} >>>