oicr-gsi · gavin-peng · Dec 19, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 23, 2024
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -0,0 +1,24 @@
+
+// CI for the vardict workflow: branch builds run the wrapper's test command, tag builds run its deploy command.
+def ciWrapper = '/.mounts/labs/gsi/vidarr/jenkins-ci-wrapper'
+def testConfig = '/.mounts/labs/gsi/vidarr/testing-config.json'
+def deployUrls = '/.mounts/labs/gsi/vidarr/deploy-urls'
+pipeline {
+  agent any
+  stages {
+    // Runs for every build that is NOT triggered by a tag.
+    stage('build') {
+      when { not { buildingTag() } }
+      steps {
+        sh "${ciWrapper} test -t ${testConfig}"
+      }
+    }
+    // Runs only for tag builds; \$TAG_NAME is escaped so the shell (not Groovy) expands it from the environment.
+    stage('Deploy') {
+      when { buildingTag() }
+      steps {
+        sh "${ciWrapper} deploy -v \$TAG_NAME -t ${testConfig} -U ${deployUrls}"
+      }
+    }
+  }
+}
diff --git a/README.md b/README.md
@@ -1 +1,84 @@
-vardict wdl workflow
+# vardict
+
+VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers.
+
+## Overview
+
+## Dependencies
+
+* [vardict 1.8.3](https://github.com/AstraZeneca-NGS/VarDictJava)
+* [java](https://www.java.com/en/)
+
+
+## Usage
+
+### Cromwell
+```
+java -jar cromwell.jar run vardict.wdl --inputs inputs.json
+```
+
+### Inputs
+
+#### Required workflow parameters:
+Parameter|Value|Description
+---|---|---
+`tumor_bam`|File|tumor_bam file for analysis sample
+`normal_bam`|File|normal_bam file for analysis sample
+`tumor_sample_name`|String|Sample name for the tumor bam
+`normal_sample_name`|String|Sample name for the normal bam
+`bed_file`|String|BED files for specifying regions of interest
+`reference`|String|the reference genome for input sample
+
+
+#### Optional workflow parameters:
+Parameter|Value|Default|Description
+---|---|---|---
+
+
+#### Optional task parameters:
+Parameter|Value|Default|Description
+---|---|---|---
+`runVardict.AF_THR`|String|0.01|The threshold for allele frequency, default: 0.01 or 1%
+`runVardict.MAP_QUAL`|String|10| Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored
+`runVardict.READ_POSTION_FILTER`|String|5|The read position filter. If the mean variant position is less than the specified value, the variant is considered a false positive. Default: 5
+`runVardict.timeout`|Int|48|Timeout in hours, needed to override imposed limits
+`runVardict.jobMemory`|Int|48|Memory in Gb for this job
+
+
+### Outputs
+
+Output | Type | Description
+---|---|---
+`vardict_vcf`|File|VCF file for variant calling from vardict (vidarr label: `vardict_vcf`)
+
+
+## Commands
+  This section lists command(s) run by vardict workflow
+
+  * Running Vardict
+
+
+  ```
+          set -euo pipefail
+          cp ~{refFai} .
+          export JAVA_OPTS="-Xmx~{jobMemory - 6}G "
+          $VARDICT_ROOT/bin/VarDict \
+              -G ~{refFasta} \
+              -f ~{AF_THR} \
+              -N ~{tumor_sample_name} \
+              -b "~{tumor_bam}|~{normal_bam}" \
+              -Q ~{MAP_QUAL} \
+              -P ~{READ_POSTION_FILTER} \
+              -c 1 -S 2 -E 3 -g 4 \
+              ~{bed_file} | \
+              $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \
+              $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \
+              -N "~{tumor_sample_name}|~{normal_sample_name}" \
+              -f ~{AF_THR}  > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf
+
+ ```
+ ## Support
+
+For support, please file an issue on the [Github project](https://github.com/oicr-gsi) or send an email to gsi@oicr.on.ca.
+
+_Generated with generate-markdown-readme (https://github.com/oicr-gsi/gsi-wdl-tools/)_
diff --git a/commands.txt b/commands.txt
@@ -0,0 +1,25 @@
+## Commands
+ This section lists command(s) run by vardict workflow
+
+ * Running Vardict
+
+
+ ```
+         set -euo pipefail
+         cp ~{refFai} .
+         export JAVA_OPTS="-Xmx~{jobMemory - 6}G "
+         $VARDICT_ROOT/bin/VarDict \
+             -G ~{refFasta} \
+             -f ~{AF_THR} \
+             -N ~{tumor_sample_name} \
+             -b "~{tumor_bam}|~{normal_bam}" \
+             -Q ~{MAP_QUAL} \
+             -P ~{READ_POSTION_FILTER} \
+             -c 1 -S 2 -E 3 -g 4 \
+             ~{bed_file} | \
+             $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \
+             $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \
+             -N "~{tumor_sample_name}|~{normal_sample_name}" \
+             -f ~{AF_THR}  > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf
+
+```
diff --git a/tests/calculate.sh b/tests/calculate.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -o nounset
+set -o errexit
+set -o pipefail
+
+cd $1
+
+find . -name '*.vcf'  | xargs md5sum 
diff --git a/tests/compare.sh b/tests/compare.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+diff -s <(sort "$1") <(sort "$2")  # order-insensitive comparison of the two listings ($1, $2); -s also reports when they are identical
diff --git a/vardict.wdl b/vardict.wdl
@@ -0,0 +1,148 @@
+version 1.0
+
+struct GenomeResources {  # per-reference-build settings, looked up by build name in the workflow
+    String modules   # environment modules string handed to the task's runtime section
+    String refFasta  # reference FASTA passed to VarDict via -G
+    String refFai    # FASTA index; the task copies it into its working directory
+}
+
+# vardict: paired tumor/normal variant calling for one sample pair; exposes the single VCF written by runVardict.
+workflow vardict {
+    input {
+        File tumor_bam
+        File normal_bam
+        String tumor_sample_name
+        String normal_sample_name
+        String bed_file
+        String reference
+    }
+
+    parameter_meta {
+        tumor_bam: "tumor_bam file for analysis sample"
+        normal_bam: "normal_bam file for analysis sample"
+        tumor_sample_name: "Sample name for the tumor bam"
+        normal_sample_name: "Sample name for the normal bam"
+        bed_file: "BED files for specifying regions of interest"
+        reference: "the reference genome for input sample"
+    }
+
+    # Per-build resources; `reference` is used as the lookup key, so it must be one of the keys below.
+    Map[String, GenomeResources] resources = {
+        "hg19": {
+            "refFai" : "$HG19_ROOT/hg19_random.fa.fai",
+            "refFasta" : "$HG19_ROOT/hg19_random.fa",
+            "modules" : "hg19/p13 rstats/4.2 java/9 perl/5.30 vardict/1.8.3"
+        },
+        "hg38": {
+            "refFai" : "$HG38_ROOT/hg38_random.fa.fai",
+            "refFasta" : "$HG38_ROOT/hg38_random.fa",
+            "modules" : "hg38/p12 rstats/4.2 java/9 perl/5.30 vardict/1.8.3"
+        }
+    }
+
+    # run vardict with the resources of the selected reference build (no trailing comma after the last input)
+    call runVardict {
+        input:
+            tumor_bam = tumor_bam,
+            normal_bam = normal_bam,
+            tumor_sample_name = tumor_sample_name,
+            normal_sample_name = normal_sample_name,
+            bed_file = bed_file,
+            modules = resources[reference].modules,
+            refFai = resources[reference].refFai,
+            refFasta = resources[reference].refFasta
+    }
+
+    meta {
+        author: "Gavin Peng"
+        email: "[email protected]"  # NOTE(review): looks like a scrubbed/placeholder address - restore the real one
+        description: "VarDict is an ultra sensitive variant caller for both single and paired sample variant calling from BAM files. VarDict implements several novel features such as amplicon bias aware variant calling from targeted sequencing experiments, rescue of long indels by realigning bwa soft clipped reads and better scalability than many Java based variant callers."
+        dependencies: [
+            {
+                name: "vardict/1.8.3",
+                url: "https://github.com/AstraZeneca-NGS/VarDictJava"
+            },
+            {
+                name: "java",
+                url: "https://www.java.com/en/"
+            }
+        ]
+        output_meta: {
+            vardict_vcf: {
+                description: "VCF file for variant calling from vardict",
+                vidarr_label: "vardict_vcf"
+            }
+        }
+    }
+
+    output {
+        File vardict_vcf = runVardict.vcf_file
+    }
+}
+
+# runVardict: paired tumor/normal calling. VarDict output is piped through testsomatic.R and
+# var2vcf_paired.pl, and the result is written to <tumor_sample_name>_<normal_sample_name>.vardict.vcf.
+# Threshold inputs are Strings because they are only interpolated into the command line.
+task runVardict {
+    input {
+        File tumor_bam
+        File normal_bam
+        String tumor_sample_name
+        String normal_sample_name
+        String refFasta
+        String refFai
+        String AF_THR = "0.01"  # quoted: String defaults must not depend on implicit Float/Int -> String coercion
+        String MAP_QUAL = "10"
+        String READ_POSTION_FILTER = "5"  # (sic) name kept as-is so existing input JSONs keep working
+        String modules
+        String bed_file
+        Int timeout = 48
+        Int jobMemory = 48
+    }
+    parameter_meta {
+        tumor_bam: "tumor_bam file for analysis sample"
+        normal_bam: "normal_bam file for analysis sample"
+        tumor_sample_name: "Sample name for the tumor bam"
+        normal_sample_name: "Sample name for the normal bam"
+        refFai: "Reference fasta fai index"
+        refFasta: "The reference fasta"
+        AF_THR: "The threshold for allele frequency, default: 0.01 or 1%"
+        MAP_QUAL: "Mapping quality. If set, reads with mapping quality less than the number will be filtered and ignored"
+        READ_POSTION_FILTER: "The read position filter. If the mean variant position is less than the specified value, the variant is considered a false positive. Default: 5"
+        bed_file: "BED files for specifying regions of interest"
+        jobMemory: "Memory in Gb for this job"
+        modules: "Names and versions of modules"
+        timeout: "Timeout in hours, needed to override imposed limits"
+    }
+    # JVM heap is jobMemory minus 6 (GB), so jobMemory must stay above 6 for a valid -Xmx value.
+    # -c/-S/-E/-g are the BED column numbers for chromosome, start, end and gene name (per VarDict's usage text).
+    command <<<
+        set -euo pipefail
+        cp ~{refFai} .
+        export JAVA_OPTS="-Xmx~{jobMemory - 6}G "
+        $VARDICT_ROOT/bin/VarDict \
+            -G ~{refFasta} \
+            -f ~{AF_THR} \
+            -N ~{tumor_sample_name} \
+            -b "~{tumor_bam}|~{normal_bam}" \
+            -Q ~{MAP_QUAL} \
+            -P ~{READ_POSTION_FILTER} \
+            -c 1 -S 2 -E 3 -g 4 \
+            ~{bed_file} | \
+            $RSTATS_ROOT/bin/Rscript $VARDICT_ROOT/bin/testsomatic.R | \
+            $PERL_ROOT/bin/perl $VARDICT_ROOT/bin/var2vcf_paired.pl \
+            -N "~{tumor_sample_name}|~{normal_sample_name}" \
+            -f ~{AF_THR}  > ~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf
+
+    >>>
+    # NOTE(review): modules and timeout look like site-specific runtime keys - presumably consumed by the backend config; confirm.
+    runtime {
+        memory:  "~{jobMemory} GB"
+        modules: "~{modules}"
+        timeout: "~{timeout}"
+    }
+    # The file name must match the redirect target at the end of the command above.
+    output {
+        File vcf_file = "~{tumor_sample_name}_~{normal_sample_name}.vardict.vcf"
+    }
+}