weizhu365
diff --git a/‎.editorconfig
Lines changed: 15 additions & 0 deletions b/‎.editorconfig
Lines changed: 15 additions & 0 deletions
diff --git a/‎.gitattributes
Lines changed: 4 additions & 0 deletions b/‎.gitattributes
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/main.yml
Lines changed: 25 additions & 0 deletions b/‎.github/workflows/main.yml
Lines changed: 25 additions & 0 deletions
diff --git a/‎.gitignore
Lines changed: 33 additions & 0 deletions b/‎.gitignore
Lines changed: 33 additions & 0 deletions
diff --git a/‎.gitmodules
Lines changed: 3 additions & 0 deletions b/‎.gitmodules
Lines changed: 3 additions & 0 deletions
diff --git a/‎.test/config.yaml
Lines changed: 46 additions & 0 deletions b/‎.test/config.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎.test/report.html
Lines changed: 1039 additions & 0 deletions b/‎.test/report.html
Lines changed: 1039 additions & 0 deletions
diff --git a/‎.test/samples.tsv
Lines changed: 3 additions & 0 deletions b/‎.test/samples.tsv
Lines changed: 3 additions & 0 deletions
diff --git a/‎.test/units.tsv
Lines changed: 4 additions & 0 deletions b/‎.test/units.tsv
Lines changed: 4 additions & 0 deletions
diff --git a/‎LICENSE
Lines changed: 21 additions & 0 deletions b/‎LICENSE
Lines changed: 21 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 80 additions & 0 deletions b/‎README.md
Lines changed: 80 additions & 0 deletions
diff --git a/‎Snakefile
Lines changed: 20 additions & 0 deletions b/‎Snakefile
Lines changed: 20 additions & 0 deletions
diff --git a/‎config.yaml
Lines changed: 54 additions & 0 deletions b/‎config.yaml
Lines changed: 54 additions & 0 deletions
diff --git a/‎envs/bedops.yaml
Lines changed: 5 additions & 0 deletions b/‎envs/bedops.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/‎envs/rbt.yaml
Lines changed: 6 additions & 0 deletions b/‎envs/rbt.yaml
Lines changed: 6 additions & 0 deletions
diff --git a/‎envs/stats.yaml
Lines changed: 8 additions & 0 deletions b/‎envs/stats.yaml
Lines changed: 8 additions & 0 deletions
diff --git a/‎report/calls.rst
Lines changed: 2 additions & 0 deletions b/‎report/calls.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎report/depths.rst
Lines changed: 1 addition & 0 deletions b/‎report/depths.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎report/freqs.rst
Lines changed: 1 addition & 0 deletions b/‎report/freqs.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎report/multiqc.rst
Lines changed: 1 addition & 0 deletions b/‎report/multiqc.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎report/vcf.rst
Lines changed: 2 additions & 0 deletions b/‎report/vcf.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎report/workflow.rst
Lines changed: 22 additions & 0 deletions b/‎report/workflow.rst
Lines changed: 22 additions & 0 deletions
diff --git a/‎rules/annotation.smk
Lines changed: 13 additions & 0 deletions b/‎rules/annotation.smk
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,15 @@
+# EditorConfig is awesome: http://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[*.{yml,yaml}]
+indent_style = space
+indent_size = 2
@@ -0,0 +1,4 @@
+*.smk linguist-language=Python
+Snakefile linguist-language=Python
+.test/* linguist-vendored=false
+.test/report.html linguist-generated=true
@@ -0,0 +1,25 @@
+name: Tests  # CI: runs the workflow against the test data in .test
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  run-workflow:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v1
+    - name: Checkout submodules  # .test/data is a git submodule (see .gitmodules)
+      uses: textbook/git-checkout-submodule-action@2.0.0  # ref restored from scrape-garbled text; confirm tag
+    - name: snakemake
+      uses: snakemake/snakemake-github-action@master  # NOTE(review): floating ref; consider pinning to a release
+      with:
+        directory: .test
+        snakefile: Snakefile
+        args: "--use-conda --show-failed-logs"
+        stagein: |  # shell commands: create a conda env, then index the test reference (bwa, samtools faidx, GATK dict)
+          conda create -c bioconda -c conda-forge -q -n prep bwa gatk4 samtools
+          source activate prep
+          bwa index .test/data/ref/genome.chr21.fa
+          samtools faidx .test/data/ref/genome.chr21.fa
+          gatk CreateSequenceDictionary -R .test/data/ref/genome.chr21.fa
@@ -0,0 +1,33 @@
+*
+!scripts
+!scripts/*
+!scripts/common
+!scripts/common/*
+scripts/.snakemake*
+!Snakefile
+!config.yaml
+!samples.tsv
+!resources
+!resources/*
+!envs
+!envs/*
+!environment.yaml
+!LICENSE
+!README.md
+!rules
+!rules/*
+!.gitignore
+!.gitattributes
+!.editorconfig
+!.travis.yml
+!.test
+!samples.tsv
+!units.tsv
+!schemas
+!schemas/*
+!.test/data
+!.test/samples.tsv
+!.test/units.tsv
+!.test/config.yaml
+!report
+!report/*.rst
@@ -0,0 +1,3 @@
+[submodule ".test/data"]
+	path = .test/data
+	url = https://github.com/snakemake-workflows/ngs-test-data.git
@@ -0,0 +1,46 @@
+samples: samples.tsv
+units: units.tsv
+
+ref:
+  name: GRCh38.86
+  genome: data/ref/genome.chr21.fa
+  known-variants: data/ref/dbsnp.vcf.gz
+
+filtering:
+  # Set to true in order to apply machine learning based recalibration of
+  # quality scores instead of hard filtering.
+  vqsr: false
+  hard:
+    # hard filtering as outlined in GATK docs
+    # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
+    snvs:
+      "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
+    indels:
+      "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
+
+processing:
+  remove-duplicates: true
+  # restrict-regions: ../raw/captured_regions.bed
+  # region-padding: 100
+
+params:
+  gatk:
+    HaplotypeCaller: ""
+    BaseRecalibrator: ""
+    GenotypeGVCFs: ""
+    VariantRecalibrator: ""
+  picard:
+    MarkDuplicates: "REMOVE_DUPLICATES=true"
+  trimmomatic:
+    pe:
+      trimmer:
+        - "LEADING:3"
+        - "TRAILING:3"
+        - "SLIDINGWINDOW:4:15"
+        - "MINLEN:36"
+    se:
+      trimmer:
+        - "LEADING:3"
+        - "TRAILING:3"
+        - "SLIDINGWINDOW:4:15"
+        - "MINLEN:36"
@@ -0,0 +1,3 @@
+sample
+A
+B
@@ -0,0 +1,4 @@
+sample	unit	platform	fq1	fq2
+A	1	ILLUMINA	data/reads/a.chr21.1.fq	data/reads/a.chr21.2.fq
+B	1	ILLUMINA	data/reads/b.chr21.1.fq	data/reads/b.chr21.2.fq
+B	2	ILLUMINA	data/reads/b.chr21.1.fq	
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018, Johannes Köster
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,80 @@
+# Snakemake workflow: dna-seq-gatk-variant-calling
+
+[![Snakemake](https://img.shields.io/badge/snakemake-≥5.7.1-brightgreen.svg)](https://snakemake.readthedocs.io)
+[![Snakemake-Report](https://img.shields.io/badge/snakemake-report-green.svg)](https://cdn.rawgit.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/.test/report.html)
+
+This Snakemake pipeline implements the [GATK best-practices workflow](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145) for calling small genomic variants.
+
+## Authors
+
+* Johannes Köster (https://koesterlab.github.io)
+
+
+## Usage
+
+In any case, if you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above).
+
+
+#### Step 1: Obtain a copy of this workflow
+
+1. Create a new github repository using this workflow [as a template](https://help.github.com/en/articles/creating-a-repository-from-a-template).
+2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the newly created repository to your local system, into the place where you want to perform the data analysis.
+
+#### Step 2: Configure workflow
+
+Configure the workflow according to your needs via editing the file `config.yaml`.
+
+#### Step 3: Execute workflow
+
+Test your configuration by performing a dry-run via
+
+    snakemake --use-conda -n
+
+Execute the workflow locally via
+
+    snakemake --use-conda --cores $N
+
+using `$N` cores or run it in a cluster environment via
+
+    snakemake --use-conda --cluster qsub --jobs 100
+
+or
+
+    snakemake --use-conda --drmaa --jobs 100
+
+If you not only want to fix the software stack but also the underlying OS, use
+
+    snakemake --use-conda --use-singularity
+
+in combination with any of the modes above.
+See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details.
+
+#### Step 4: Investigate results
+
+After successful execution, you can create a self-contained interactive HTML report with all results via:
+
+    snakemake --report report.html
+
+This report can, e.g., be forwarded to your collaborators.
+An example (using some trivial test data) can be seen [here](https://cdn.rawgit.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/.test/report.html).
+
+#### Step 5: Commit changes
+
+Whenever you change something, don't forget to commit the changes back to your github copy of the repository:
+
+    git commit -a
+    git push
+
+#### Step 6: Contribute back
+
+In case you have also changed or added steps, please consider contributing them back to the original repository:
+
+1. [Fork](https://help.github.com/en/articles/fork-a-repo) the original repo to a personal or lab account.
+2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the fork to your local system, to a different place than where you ran your analysis.
+3. Copy the modified files from your analysis to the clone of your fork, e.g., `cp -r envs rules scripts path/to/fork`. Make sure to **not** accidentally copy config file contents or sample sheets.
+4. Commit and push your changes to your fork.
+5. Create a [pull request](https://help.github.com/en/articles/creating-a-pull-request) against the original repository.
+
+## Testing
+
+Test cases are in the subfolder `.test`. They are automatically executed via continuous integration with GitHub Actions.
@@ -0,0 +1,20 @@
+include: "rules/common.smk"  # included first, ahead of the target rule and the other rule modules
+
+##### Target rules #####
+
+rule all:  # aggregate target: has inputs only, requesting the workflow's final outputs
+    input:
+        "annotated/all.vcf.gz",  # annotated call set (output of rule snpeff in rules/annotation.smk)
+        "qc/multiqc.html",  # presumably produced by rules/qc.smk; confirm
+        "plots/depths.svg",  # presumably produced by rules/stats.smk; confirm
+        "plots/allele-freqs.svg"  # presumably produced by rules/stats.smk; confirm
+
+
+##### Modules #####
+
+include: "rules/mapping.smk"
+include: "rules/calling.smk"
+include: "rules/filtering.smk"
+include: "rules/stats.smk"
+include: "rules/qc.smk"
+include: "rules/annotation.smk"
@@ -0,0 +1,54 @@
+samples: samples.tsv
+units: units.tsv
+
+ref:
+  name: GRCh38.86
+  # Path to the reference genome, ideally as it is provided by the GATK bundle.
+  genome: data/ref/genome.chr21.fa
+  # Path to any database of known variants, ideally as it is provided by the GATK bundle.
+  known-variants: data/ref/dbsnp.vcf.gz
+
+filtering:
+  # Set to true in order to apply machine learning based recalibration of
+  # quality scores instead of hard filtering.
+  vqsr: false
+  hard:
+    # hard filtering as outlined in GATK docs
+    # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
+    snvs:
+      "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
+    indels:
+      "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
+
+processing:
+  remove-duplicates: true
+  # Uncomment and point to a bed file with, e.g., captured regions if necessary,
+  # see https://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals.
+  # restrict-regions: captured_regions.bed
+  # If regions are restricted, uncomment this to enlarge them by the given value in order to include
+  # flanking areas.
+  # region-padding: 100
+
+params:
+  gatk:
+    HaplotypeCaller: ""
+    BaseRecalibrator: ""
+    GenotypeGVCFs: ""
+    VariantRecalibrator: ""
+  picard:
+    MarkDuplicates: "REMOVE_DUPLICATES=true"
+  trimmomatic:
+    pe:
+      trimmer:
+        # See trimmomatic manual for adding additional options, e.g. for adapter trimming.
+        - "LEADING:3"
+        - "TRAILING:3"
+        - "SLIDINGWINDOW:4:15"
+        - "MINLEN:36"
+    se:
+      trimmer:
+        # See trimmomatic manual for adding additional options, e.g. for adapter trimming.
+        - "LEADING:3"
+        - "TRAILING:3"
+        - "SLIDINGWINDOW:4:15"
+        - "MINLEN:36"
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::bedops =2.4
@@ -0,0 +1,6 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - rust-bio-tools =0.6.0
+  - bcftools =1.9
@@ -0,0 +1,8 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - python =3.6
+  - matplotlib =2.2
+  - pandas =0.23
+  - seaborn =0.8
@@ -0,0 +1,2 @@
+Filtered and annotated variant calls as gzipped tab separated table (TSV).
+All variants that do not pass filters have been removed.
@@ -0,0 +1 @@
+Read depth distribution over variant alleles of each sample.
@@ -0,0 +1 @@
+Per variant per sample allele frequency, i.e., :math:`m / n` where :math:`m` is the number of reads supporting the variant allele and :math:`n` is the total number of reads over the variant allele in that sample.
@@ -0,0 +1 @@
+Quality controls aggregated into an interactive report via MultiQC.
@@ -0,0 +1,2 @@
+Filtered and annotated variant calls as gzipped VCF file.
+Variants that do not pass filters are kept, but marked with a value other than ``PASS`` in the ``FILTER`` column.
@@ -0,0 +1,22 @@
+Variants were called following the `GATK best practices workflow`_:
+Reads were mapped onto {{ snakemake.config["ref"]["name"] }} with `BWA mem`_, and both optical and PCR duplicates were removed with Picard_, followed by base recalibration with GATK_.
+The GATK_ HaplotypeCaller was used to call variants per-sample, including summarized evidence for non-variant sites (GVCF_ approach).
+Then, GATK_ genotyping was done in a joint way over GVCF_ files of all samples.
+{% if snakemake.config["filtering"]["vqsr"] %}
+Genotyped variants were filtered with the GATK_ VariantRecalibrator approach.
+{% else %}
+Genotyped variants were filtered using hard thresholds.
+For SNVs, the criterion ``{{ snakemake.config["filtering"]["hard"]["snvs"] }}`` was used, for Indels the criterion ``{{ snakemake.config["filtering"]["hard"]["indels"] }}`` was used.
+{% endif %}
+Finally, SnpEff_ was used to predict and report variant effects.
+In addition, quality control was performed with FastQC_, Samtools_, and Picard_ and aggregated into an interactive report via MultiQC_.
+
+.. _GATK: https://software.broadinstitute.org/gatk/
+.. _BWA mem: http://bio-bwa.sourceforge.net/
+.. _Picard: https://broadinstitute.github.io/picard
+.. _GATK best practices workflow: https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145
+.. _GVCF: https://gatkforums.broadinstitute.org/gatk/discussion/4017/what-is-a-gvcf-and-how-is-it-different-from-a-regular-vcf
+.. _SnpEff: http://snpeff.sourceforge.net
+.. _MultiQC: http://multiqc.info/
+.. _Samtools: http://samtools.sourceforge.net/
+.. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
@@ -0,0 +1,13 @@
+rule snpeff:  # annotates the filtered call set via the bio/snpeff wrapper
+    input:
+        "filtered/all.vcf.gz",
+    output:
+        vcf=report("annotated/all.vcf.gz", caption="../report/vcf.rst", category="Calls"),  # flagged for the report under category "Calls"
+        csvstats="snpeff/all.csv"
+    log:
+        "logs/snpeff.log"
+    params:
+        reference=config["ref"]["name"],  # the `ref: name` entry of the config (GRCh38.86 in the shipped config.yaml)
+        extra="-Xmx6g"  # presumably a JVM max-heap option forwarded to SnpEff; confirm against the wrapper docs
+    wrapper:
+        "0.27.1/bio/snpeff"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule ".test/data"]`
	`2`	`+ path = .test/data`
	`3`	`+ url = https://github.com/snakemake-workflows/ngs-test-data.git`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Filtered and annotated variant calls as gzipped tab separated table (TSV).`
	`2`	`+All variants that do not pass filters have been removed.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Read depth distribution over variant alleles of each sample.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Per variant per sample allele frequency, i.e., :math:`m / n` where :math:`m` is the number of reads supporting the variant allele and :math:`n` is the total number of reads over the variant allele in that sample.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Quality controls aggregated into an interactive report via MultiQC.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Filtered and annotated variant calls as gzipped VCF file.`
	`2`	+Variants that do not pass filters are kept, but marked with a value other than ``PASS`` in the ``FILTER`` column.