-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_filter_vcf.nf
210 lines (181 loc) · 8.42 KB
/
01_filter_vcf.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
nextflow.enable.dsl=2
params.high_vmiss = 0.7
params.high_imiss = 0.7
params.low_vmiss = 0.3
params.low_imiss = 0.3
params.min_maf = 0.01
params.samples_included = '' // TODO: set a whitelist sample list as default
def publish_dir = "${params.outdir ?: launchDir}/01_filt_vcf"
// Filter VCF sites by the variant type and the number of alleles
//
// NOTE:
// The params.samples_included variable allows a manual removal of samples that
// - are duplicated
// - or show low sequencing quality
process BCFTOOLS_VIEW_PASS_BIALLELIC {
tag "$interval"
publishDir "${publish_dir}/01_pass_biallelic", mode: 'symlink'
input: tuple val(interval), path(vcf)
output: tuple val(interval), path("*.bcf")
script:
def S_args = params.samples_included ? "-S ${file(params.samples_included)}" : ""
"""
if [ -f ${vcf}.csi ] ; then bcftools index ${vcf}; fi
bcftools view ${S_args} ${vcf} -Ou | bcftools view -f PASS -Ou | bcftools view -v snps -Ou | bcftools view -m2 -M2 -Ob -o ${interval}.bcf
"""
stub:
"""
touch ${interval}.bcf
"""
}
// The process filters VCF sites by the site-missingness (what percentage of samples have
// missing genotype for a given site) and then calculates per-sample stats.
// Note that, as the input VCFs are only part of the genome (a range), only after full genome
// information (per-sample-stats for all genome-interval VCFs) are gathered can
// we filter the VCF samples by missingness.
process BCFTOOLS_VIEW_F_MISSING1_WITH_PSC_STATS {
tag "$interval"
publishDir "${publish_dir}/02_fmissing1", mode: 'symlink'
input: tuple val(interval), path(vcf)
output: tuple val(interval), path("*_fmiss1.bcf"), path("*_fmiss1_stats.txt")
script: """ bcftools view -i "F_MISSING<${params.high_vmiss}" ${vcf} -Ob -o ${interval}_fmiss1.bcf
bcftools stats -s- ${interval}_fmiss1.bcf > ${interval}_fmiss1_stats.txt
"""
stub:
"""
touch ${interval}_fmiss1.bcf ${interval}_fmiss1_stats.txt
"""
}
// Base on per-sample stats for all interval-VCF files across the whole genome,
// the process generate a list of samples that have the percentage of genotype
// missiningness less than a given threshold.
process PANDAS_IMISS1_SAMPLES_TO_KEEP {
tag "sample_to_keep_imiss1"
publishDir "${publish_dir}/03_imiss1_samples", mode: 'symlink'
input: path(vcf_stats)
output: path("samples_to_keep_imiss1.txt")
script: """ get_samples_with_max_imiss.py --bcftools_psc_stats_files *stats.txt --max_imiss ${params.high_imiss} --out samples_to_keep_imiss1.txt """
stub:
"""
touch samples_to_keep_imiss1.txt
"""
}
// This process does the actual sample filtering (based on missingness).
// After samples are filtering, a step added to ensure
// the remaining sites are segregating.
process BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS1 {
tag "$interval"
publishDir "${publish_dir}/04_imiss1_bcf", mode: 'symlink'
input: tuple val(interval), path(vmiss1_bcf); path(samples_to_keep_imiss1)
output: tuple val(interval), path('*_fmiss1_imiss1.bcf')
script: """ bcftools index ${vmiss1_bcf}; bcftools view -S ${samples_to_keep_imiss1} ${vmiss1_bcf} | bcftools view -i "MAC>0" -Ob -o ${interval}_fmiss1_imiss1.bcf"""
stub:
"""
touch ${interval}_fmiss1_imiss1.bcf
"""
}
// This process is similar to `BCFTOOLS_VIEW_F_MISSING1_WITH_PSC_STATS` but uses
// a stricter threshold.
process BCFTOOLS_VIEW_F_MISSING2_WITH_PSC_STATS {
tag "$interval"
publishDir "${publish_dir}/05_fmissing2", mode: 'symlink'
input: tuple val(interval), path(vcf)
output: tuple val(interval), path("*_fmiss2.bcf"), path("*_fmiss2_stats.txt")
script: """ bcftools view -i "F_MISSING<${params.low_vmiss}" ${vcf} -Ob -o ${interval}_fmiss2.bcf
bcftools stats -s- ${interval}_fmiss2.bcf > ${interval}_fmiss2_stats.txt
"""
stub:
"""
touch ${interval}_fmiss2.bcf ${interval}_fmiss2_stats.txt
"""
}
// This process is similar to `PANDAS_IMISS1_SAMPLES_TO_KEEP` but uses
// a stricter threshold.
process PANDAS_IMISS2_SAMPLES_TO_KEEP {
tag "sample_to_keep_imiss2"
publishDir "${publish_dir}/06_imiss2_samples", mode: 'symlink'
input: path(vcf_stats)
output: path("samples_to_keep_imiss2.txt")
script: """ get_samples_with_max_imiss.py --bcftools_psc_stats_files *stats.txt --max_imiss ${params.low_imiss} --out samples_to_keep_imiss2.txt """
stub:
"""
touch samples_to_keep_imiss2.txt
"""
}
// This process is similar to `BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS1` but uses
// a stricter threshold.
process BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS2 {
tag "$interval"
publishDir "${publish_dir}/07_imiss2_bcf", mode: 'symlink'
input: tuple val(interval), path(vmiss2_bcf); path(samples_to_keep_imiss2)
output: tuple val(interval), path('*_fmiss2_imiss2.bcf')
script: """ bcftools index ${vmiss2_bcf}; bcftools view -S ${samples_to_keep_imiss2} ${vmiss2_bcf} | bcftools view -i "MAC>0" -Ob -o ${interval}_fmiss2_imiss2.bcf"""
stub:
"""
touch ${interval}_fmiss2_imiss2.bcf
"""
}
// This process filter out rare variants
process BCFTOOLS_VIEW_FILTER_SITE_BY_MAF {
tag "$interval"
publishDir "${publish_dir}/08_maf_bcf", mode: 'symlink'
input: tuple val(interval), path(imiss2_bcf)
output: tuple val(interval), path("*_fmiss2_imiss2_maf.vcf.gz")
script: """ bcftools view -q ${params.min_maf}:minor ${imiss2_bcf} -Oz -o ${interval}_fmiss2_imiss2_maf.vcf.gz"""
stub:
"""
touch ${interval}_fmiss2_imiss2_maf.vcf.gz
"""
}
// --------------------------- SUBWORKFLOW --------------------------------------------
// This subworkflow filters the set of VCF files by variant type, number of
// alleles per site, and missingness per site and per sample.
workflow WF_FILTER_VCF {
take: ch_input_vcf // FORMAT: tuple val(interval), path(vcf)
main:
// filter by filter=PASS and -m2 and -M2
BCFTOOLS_VIEW_PASS_BIALLELIC(ch_input_vcf)
// Filter by vmiss (round 1)
ch_pass_biallelic_vcf = BCFTOOLS_VIEW_PASS_BIALLELIC.out
BCFTOOLS_VIEW_F_MISSING1_WITH_PSC_STATS(ch_pass_biallelic_vcf)
// Filter by imiss (round 1)
ch_fmiss1_vcf_and_psc_stats = BCFTOOLS_VIEW_F_MISSING1_WITH_PSC_STATS.out
ch_fmiss1_vcf = ch_fmiss1_vcf_and_psc_stats.map{interval, fmiss1_bcf, fmiss1_bcf_stat->[interval, fmiss1_bcf]}
ch_fmiss1_stats_list = ch_fmiss1_vcf_and_psc_stats.map{interval, fmiss1_bcf, fmiss1_bcf_stat -> fmiss1_bcf_stat}.toList()
PANDAS_IMISS1_SAMPLES_TO_KEEP(ch_fmiss1_stats_list)
ch_imiss1_samples = PANDAS_IMISS1_SAMPLES_TO_KEEP.out
BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS1(ch_fmiss1_vcf, ch_imiss1_samples)
// Filter by vmiss (round 2)
ch_fmiss1_imiss1_vcf = BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS1.out
// Filter by imiss (round 2)
ch_fmiss2_vcf_and_psc_stats = BCFTOOLS_VIEW_F_MISSING2_WITH_PSC_STATS(ch_fmiss1_imiss1_vcf)
ch_fmiss2_vcf = ch_fmiss2_vcf_and_psc_stats.map{interval, fmiss2_bcf, fmiss2_bcf_stat->[interval, fmiss2_bcf]}
ch_fmiss2_stats_list = ch_fmiss2_vcf_and_psc_stats.map{interval, fmiss2_bcf, fmiss2_bcf_stat -> fmiss2_bcf_stat}.toList()
PANDAS_IMISS2_SAMPLES_TO_KEEP(ch_fmiss2_stats_list)
ch_imiss2_samples = PANDAS_IMISS2_SAMPLES_TO_KEEP.out
BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS2(ch_fmiss2_vcf, ch_imiss2_samples)
// Filter by maf
ch_imiss2_vcf = BCFTOOLS_VIEW_FILTER_SAMPLE_BY_IMISS2.out
BCFTOOLS_VIEW_FILTER_SITE_BY_MAF(ch_imiss2_vcf)
ch_filterred_vcf = BCFTOOLS_VIEW_FILTER_SITE_BY_MAF.out
emit:
ch_filterred_vcf // FORMAT: tuple val(interval), path(vcf)
}
// -------------------------- TESTING -----------------------------------------------
// The workflow is designed to efficiently process large datasets
// by breaking them down into smaller VCF files, each containing a specific
// genome interval or chunk for all samples. The workflow is designed to run in
// parallel, allowing for faster processing times, while also gathering related
// information as needed.
workflow {
ch_input_vcf = channel
.fromPath(params.input_vcf_fn, type: 'file', checkIfExists: true )
.splitCsv(header: false, sep: '\t')
.map{
assert it.size() == 2
def interval = it[0]
def vcf = file(it[1], type: 'file', checkIfExists: true)
return [interval, vcf]
}.take(6)
WF_FILTER_VCF(ch_input_vcf)
}