Skip to content

Commit b9fe64e

Browse files
committed
Initial commit
0 parents  commit b9fe64e

36 files changed

+1950
-0
lines changed

.editorconfig

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# EditorConfig is awesome: http://EditorConfig.org
2+
3+
# top-most EditorConfig file
4+
root = true
5+
6+
[*]
7+
end_of_line = lf
8+
insert_final_newline = true
9+
charset = utf-8
10+
indent_style = space
11+
indent_size = 4
12+
13+
[*.{yml,yaml}]
14+
indent_style = space
15+
indent_size = 2

.gitattributes

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
*.smk linguist-language=Python
2+
Snakefile linguist-language=Python
3+
.test/* linguist-vendored=false
4+
.test/report.html linguist-generated=true

.github/workflows/main.yml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
name: Tests
2+
3+
on:
4+
- push
5+
- pull_request
6+
7+
jobs:
8+
run-workflow:
9+
runs-on: ubuntu-latest
10+
steps:
11+
- uses: actions/checkout@v1
12+
- name: Checkout submodules
13+
uses: textbook/[email protected]
14+
- name: snakemake
15+
uses: snakemake/snakemake-github-action@master
16+
with:
17+
directory: .test
18+
snakefile: Snakefile
19+
args: "--use-conda --show-failed-logs"
20+
stagein: |
21+
conda create -c bioconda -c conda-forge -q -n prep bwa gatk4 samtools
22+
source activate prep
23+
bwa index .test/data/ref/genome.chr21.fa
24+
samtools faidx .test/data/ref/genome.chr21.fa
25+
gatk CreateSequenceDictionary -R .test/data/ref/genome.chr21.fa

.gitignore

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
*
2+
!scripts
3+
!scripts/*
4+
!scripts/common
5+
!scripts/common/*
6+
scripts/.snakemake*
7+
!Snakefile
8+
!config.yaml
9+
!samples.tsv
10+
!resources
11+
!resources/*
12+
!envs
13+
!envs/*
14+
!environment.yaml
15+
!LICENSE
16+
!README.md
17+
!rules
18+
!rules/*
19+
!.gitignore
20+
!.gitattributes
21+
!.editorconfig
22+
!.travis.yml
23+
!.test
24+
!samples.tsv
25+
!units.tsv
26+
!schemas
27+
!schemas/*
28+
!.test/data
29+
!.test/samples.tsv
30+
!.test/units.tsv
31+
!.test/config.yaml
32+
!report
33+
!report/*.rst

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule ".test/data"]
2+
path = .test/data
3+
url = https://github.com/snakemake-workflows/ngs-test-data.git

.test/config.yaml

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
samples: samples.tsv
2+
units: units.tsv
3+
4+
ref:
5+
name: GRCh38.86
6+
genome: data/ref/genome.chr21.fa
7+
known-variants: data/ref/dbsnp.vcf.gz
8+
9+
filtering:
10+
# Set to true in order to apply machine learning based recalibration of
11+
# quality scores instead of hard filtering.
12+
vqsr: false
13+
hard:
14+
# hard filtering as outlined in GATK docs
15+
# (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
16+
snvs:
17+
"QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
18+
indels:
19+
"QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
20+
21+
processing:
22+
remove-duplicates: true
23+
# restrict-regions: ../raw/captured_regions.bed
24+
# region-padding: 100
25+
26+
params:
27+
gatk:
28+
HaplotypeCaller: ""
29+
BaseRecalibrator: ""
30+
GenotypeGVCFs: ""
31+
VariantRecalibrator: ""
32+
picard:
33+
MarkDuplicates: "REMOVE_DUPLICATES=true"
34+
trimmomatic:
35+
pe:
36+
trimmer:
37+
- "LEADING:3"
38+
- "TRAILING:3"
39+
- "SLIDINGWINDOW:4:15"
40+
- "MINLEN:36"
41+
se:
42+
trimmer:
43+
- "LEADING:3"
44+
- "TRAILING:3"
45+
- "SLIDINGWINDOW:4:15"
46+
- "MINLEN:36"

.test/report.html

+1,039
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.test/samples.tsv

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
sample
2+
A
3+
B

.test/units.tsv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
sample unit platform fq1 fq2
2+
A 1 ILLUMINA data/reads/a.chr21.1.fq data/reads/a.chr21.2.fq
3+
B 1 ILLUMINA data/reads/b.chr21.1.fq data/reads/b.chr21.2.fq
4+
B 2 ILLUMINA data/reads/b.chr21.1.fq

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2018, Johannes Köster
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Snakemake workflow: dna-seq-gatk-variant-calling
2+
3+
[![Snakemake](https://img.shields.io/badge/snakemake-≥5.7.1-brightgreen.svg)](https://snakemake.readthedocs.io)
4+
[![Snakemake-Report](https://img.shields.io/badge/snakemake-report-green.svg)](https://cdn.rawgit.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/.test/report.html)
5+
6+
This Snakemake pipeline implements the [GATK best-practices workflow](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145) for calling small genomic variants.
7+
8+
## Authors
9+
10+
* Johannes Köster (https://koesterlab.github.io)
11+
12+
13+
## Usage
14+
15+
In any case, if you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above).
16+
17+
18+
#### Step 1: Obtain a copy of this workflow
19+
20+
1. Create a new github repository using this workflow [as a template](https://help.github.com/en/articles/creating-a-repository-from-a-template).
21+
2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the newly created repository to your local system, into the place where you want to perform the data analysis.
22+
23+
#### Step 2: Configure workflow
24+
25+
Configure the workflow according to your needs via editing the file `config.yaml`.
26+
27+
#### Step 3: Execute workflow
28+
29+
Test your configuration by performing a dry-run via
30+
31+
snakemake --use-conda -n
32+
33+
Execute the workflow locally via
34+
35+
snakemake --use-conda --cores $N
36+
37+
using `$N` cores or run it in a cluster environment via
38+
39+
snakemake --use-conda --cluster qsub --jobs 100
40+
41+
or
42+
43+
snakemake --use-conda --drmaa --jobs 100
44+
45+
If you not only want to fix the software stack but also the underlying OS, use
46+
47+
snakemake --use-conda --use-singularity
48+
49+
in combination with any of the modes above.
50+
See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details.
51+
52+
#### Step 4: Investigate results
53+
54+
After successful execution, you can create a self-contained interactive HTML report with all results via:
55+
56+
snakemake --report report.html
57+
58+
This report can, e.g., be forwarded to your collaborators.
59+
An example (using some trivial test data) can be seen [here](https://cdn.rawgit.com/snakemake-workflows/dna-seq-gatk-variant-calling/master/.test/report.html).
60+
61+
#### Step 5: Commit changes
62+
63+
Whenever you change something, don't forget to commit the changes back to your github copy of the repository:
64+
65+
git commit -a
66+
git push
67+
68+
#### Step 6: Contribute back
69+
70+
In case you have also changed or added steps, please consider contributing them back to the original repository:
71+
72+
1. [Fork](https://help.github.com/en/articles/fork-a-repo) the original repo to a personal or lab account.
73+
2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the fork to your local system, to a different place than where you ran your analysis.
74+
3. Copy the modified files from your analysis to the clone of your fork, e.g., `cp -r envs rules scripts path/to/fork`. Make sure to **not** accidentally copy config file contents or sample sheets.
75+
4. Commit and push your changes to your fork.
76+
5. Create a [pull request](https://help.github.com/en/articles/creating-a-pull-request) against the original repository.
77+
78+
## Testing
79+
80+
Test cases are in the subfolder `.test`. They are automtically executed via continuous integration with Github actions.

Snakefile

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include: "rules/common.smk"
2+
3+
##### Target rules #####
4+
5+
rule all:
6+
input:
7+
"annotated/all.vcf.gz",
8+
"qc/multiqc.html",
9+
"plots/depths.svg",
10+
"plots/allele-freqs.svg"
11+
12+
13+
##### Modules #####
14+
15+
include: "rules/mapping.smk"
16+
include: "rules/calling.smk"
17+
include: "rules/filtering.smk"
18+
include: "rules/stats.smk"
19+
include: "rules/qc.smk"
20+
include: "rules/annotation.smk"

config.yaml

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
samples: samples.tsv
2+
units: units.tsv
3+
4+
ref:
5+
name: GRCh38.86
6+
# Path to the reference genome, ideally as it is provided by the GATK bundle.
7+
genome: data/ref/genome.chr21.fa
8+
# Path to any database of known variants, ideally as it is provided by the GATK bundle.
9+
known-variants: data/ref/dbsnp.vcf.gz
10+
11+
filtering:
12+
# Set to true in order to apply machine learning based recalibration of
13+
# quality scores instead of hard filtering.
14+
vqsr: false
15+
hard:
16+
# hard filtering as outlined in GATK docs
17+
# (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
18+
snvs:
19+
"QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
20+
indels:
21+
"QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
22+
23+
processing:
24+
remove-duplicates: true
25+
# Uncomment and point to a bed file with, e.g., captured regions if necessary,
26+
# see https://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals.
27+
# restrict-regions: captured_regions.bed
28+
# If regions are restricted, uncomment this to enlarge them by the given value in order to include
29+
# flanking areas.
30+
# region-padding: 100
31+
32+
params:
33+
gatk:
34+
HaplotypeCaller: ""
35+
BaseRecalibrator: ""
36+
GenotypeGVCFs: ""
37+
VariantRecalibrator: ""
38+
picard:
39+
MarkDuplicates: "REMOVE_DUPLICATES=true"
40+
trimmomatic:
41+
pe:
42+
trimmer:
43+
# See trimmomatic manual for adding additional options, e.g. for adapter trimming.
44+
- "LEADING:3"
45+
- "TRAILING:3"
46+
- "SLIDINGWINDOW:4:15"
47+
- "MINLEN:36"
48+
se:
49+
trimmer:
50+
# See trimmomatic manual for adding additional options, e.g. for adapter trimming.
51+
- "LEADING:3"
52+
- "TRAILING:3"
53+
- "SLIDINGWINDOW:4:15"
54+
- "MINLEN:36"

envs/bedops.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- bioconda::bedops =2.4

envs/rbt.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
channels:
2+
- bioconda
3+
- conda-forge
4+
dependencies:
5+
- rust-bio-tools =0.6.0
6+
- bcftools =1.9

envs/stats.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
channels:
2+
- bioconda
3+
- conda-forge
4+
dependencies:
5+
- python =3.6
6+
- matplotlib =2.2
7+
- pandas =0.23
8+
- seaborn =0.8

report/calls.rst

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Filtered and annotated variant calls as gzipped tab separated table (TSV).
2+
All variants that do not pass filters have been removed.

report/depths.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Read depth distribution over variant alleles of each sample.

report/freqs.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Per variant per sample allele frequency, i.e., :math:`m / n` where :math:`m` is the number of reads supporting the variant allele and :math:`n` is the total number of reads over the variant allele in that sample.

report/multiqc.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Quality controls aggregated into an interactive report via MultiQC.

report/vcf.rst

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Filtered and annotated variant calls as gzipped VCF file.
2+
Variants that do not pass filters are kept, but marked with a value other than ``PASS`` in ther ``FILTER`` column.

report/workflow.rst

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Variants where called following the `GATK best practices workflow`_:
2+
Reads were mapped onto {{ snakemake.config["ref"]["name"] }} with `BWA mem`_, and both optical and PCR duplicates were removed with Picard_, followed by base recalibration with GATK_.
3+
The GATK_ HaplotypeCaller was used to call variants per-sample, including summarized evidence for non-variant sites (GVCF_ approach).
4+
Then, GATK_ genotyping was done in a joint way over GVCF_ files of all samples.
5+
{% if snakemake.config["filtering"]["vqsr"] %}
6+
Genotyped variants were filtered with the GATK_ VariantRecalibrator approach.
7+
{% else %}
8+
Genotyped variants were filtered using hard thresholds.
9+
For SNVs, the criterion ``{{ snakemake.config["filtering"]["hard"]["snvs"] }}`` was used, for Indels the criterion ``{{ snakemake.config["filtering"]["hard"]["indels"] }}`` was used.
10+
{% endif %}
11+
Finally, SnpEff_ was used to predict and report variant effects.
12+
In addition, quality control was performed with FastQC_, Samtools_, and Picard_ and aggregated into an interactive report via MultiQC_.
13+
14+
.. _GATK: https://software.broadinstitute.org/gatk/
15+
.. _BWA mem: http://bio-bwa.sourceforge.net/
16+
.. _Picard: https://broadinstitute.github.io/picard
17+
.. _GATK best practices workflow: https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145
18+
.. _GVCF: https://gatkforums.broadinstitute.org/gatk/discussion/4017/what-is-a-gvcf-and-how-is-it-different-from-a-regular-vcf
19+
.. _SnpEff: http://snpeff.sourceforge.net
20+
.. _MultiQC: http://multiqc.info/
21+
.. _Samtools: http://samtools.sourceforge.net/
22+
.. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/

rules/annotation.smk

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
rule snpeff:
2+
input:
3+
"filtered/all.vcf.gz",
4+
output:
5+
vcf=report("annotated/all.vcf.gz", caption="../report/vcf.rst", category="Calls"),
6+
csvstats="snpeff/all.csv"
7+
log:
8+
"logs/snpeff.log"
9+
params:
10+
reference=config["ref"]["name"],
11+
extra="-Xmx6g"
12+
wrapper:
13+
"0.27.1/bio/snpeff"

0 commit comments

Comments
 (0)