@@ -28,6 +28,7 @@ def usage():
28
28
print "-a/--alt <sample name for alternative allele>"
29
29
print "[-r/--ref] <sample name for reference allele. Default is the same as in the initial file>"
30
30
print "[-f/--filt] <Filtering level. 0 - no filtering. 1 - based on FI information. 2 - based on FILTER information. Default is 2>"
31
+ print "[-x/--exclude] <Exclude potential contaminants from the list of SNPs which are equal to ALT genotype, i.e REF is expected to be contaminated>"
31
32
print "[-v/--verbose] <Verbose>"
32
33
print "[-h/--help] <Help>"
33
34
return
@@ -37,11 +38,12 @@ def get_args():
37
38
try :
38
39
opts , args = getopt .getopt (
39
40
sys .argv [1 :],
40
- "i:r:a:f:vh" ,
41
+ "i:r:a:f:x: vh" ,
41
42
["vcf=" ,
42
43
"alt=" ,
43
44
"ref=" ,
44
45
"filt=" ,
46
+ "excl=" ,
45
47
"verbose" , "help" ])
46
48
except getopt .GetoptError :
47
49
usage ()
@@ -50,7 +52,7 @@ def get_args():
50
52
51
53
## 0/0, 1/1, A, T, 1
52
54
## 1/1, 2/2, A, T, C, 1
53
- def get_filter_snp_gt (gref , galt , ref , alt ):
55
+ def get_filter_snp_gt (gref , galt , ref , alt , conta ):
54
56
55
57
#print gref
56
58
#print galt
@@ -76,14 +78,21 @@ def get_filter_snp_gt(gref, galt, ref, alt):
76
78
## Non informative SNPs
77
79
if ref_snp == alt_snp :
78
80
return - 3
79
- else :
80
- ## check alt and ref alleles
81
- alleles = []
82
- alleles .append (ref )
83
- for a in alt .split (',' ):
84
- alleles .append (a )
81
+ ## Remove SNPs that are different from the reference and therefore likely to be wrongly assigned to the alternative
82
+ elif len (conta )> 0 :
83
+ for i in range (len (conta )):
84
+ conta_geno = re .split ('/|\|' , conta [i ])
85
+ ## if conta = alt remove snps
86
+ if conta_geno [0 ] == alt_geno [0 ] or conta_geno [1 ] == alt_geno [1 ]:
87
+ return - 4
88
+
89
+ ## check alt and ref alleles
90
+ alleles = []
91
+ alleles .append (ref )
92
+ for a in alt .split (',' ):
93
+ alleles .append (a )
85
94
86
- return [alleles [int (ref_snp )], alleles [int (alt_snp )]]
95
+ return [alleles [int (ref_snp )], alleles [int (alt_snp )]]
87
96
88
97
89
98
if __name__ == "__main__" :
@@ -93,6 +102,7 @@ def get_filter_snp_gt(gref, galt, ref, alt):
93
102
vcfFile = None
94
103
refSample = None
95
104
altSample = None
105
+ exclusion = None
96
106
filt_qual = 2
97
107
verbose = False
98
108
@@ -112,6 +122,8 @@ def get_filter_snp_gt(gref, galt, ref, alt):
112
122
altSample = arg
113
123
elif opt in ("-f" , "--filt" ):
114
124
filt_qual = int (arg )
125
+ elif opt in ("-x" , "--exclude" ):
126
+ exclusion = arg
115
127
elif opt in ("-v" , "--verbose" ):
116
128
verbose = True
117
129
else :
@@ -133,16 +145,20 @@ def get_filter_snp_gt(gref, galt, ref, alt):
133
145
samples = []
134
146
altidx = - 1
135
147
refidx = - 1
136
-
148
+ contaidx = []
149
+
137
150
var_counter = 0
138
151
snp_counter = 0
139
152
hetero_counter = 0
140
153
badqual_counter = 0
141
154
undefined_counter = 0
142
155
nonspe_counter = 0
156
+ conta_counter = 0
143
157
144
158
for line in vcf_handle :
145
159
line = line .rstrip ()
160
+ #print >> sys.stderr, line
161
+
146
162
## for now we don't care about the header
147
163
if line .startswith ('##' ):
148
164
if refSample is not None and line .startswith ("##reference=" ):
@@ -158,7 +174,29 @@ def get_filter_snp_gt(gref, galt, ref, alt):
158
174
refidx = i
159
175
elif samples [i ] == altSample :
160
176
altidx = i
161
-
177
+ elif exclusion is not None :
178
+ ## conta idx
179
+ exs = exclusion .split ("," )
180
+ for i in range (len (exs )):
181
+ ct = exs [i ]
182
+ if samples [i ] == ct :
183
+ contaidx .append (i )
184
+ if verbose :
185
+ print >> sys .stderr , "## Potential Contaminant(s) = " + ct
186
+
187
+
188
+ ## Check whether the reference ("REF") is listed among the potential contaminants
189
+ if exclusion is not None :
190
+ exs = exclusion .split ("," )
191
+ for i in range (len (exs )):
192
+ ct = exs [i ]
193
+ if ct == "REF" :
194
+ contaidx .append (- 1 )
195
+ if verbose :
196
+ print >> sys .stderr , "## Potential Contaminant(s) = REF"
197
+
198
+
199
+
162
200
## Check input parameters
163
201
if refSample != None and refidx == - 1 :
164
202
print >> sys .stderr , "Error : REF sample not found"
@@ -185,7 +223,10 @@ def get_filter_snp_gt(gref, galt, ref, alt):
185
223
186
224
fields = line .split ('\t ' ,9 )
187
225
var_counter += 1
188
-
226
+
227
+ ## Initialise the list of contaminant genotypes for this variant
228
+ contg = []
229
+
189
230
## check chromosomes name
190
231
if re .compile ('^chr' ).match (fields [0 ]):
191
232
chrom = fields [0 ]
@@ -214,24 +255,37 @@ def get_filter_snp_gt(gref, galt, ref, alt):
214
255
else :
215
256
refg = ["0/0" ]
216
257
reffi = "1"
258
+
259
+ if len (contaidx ) > 0 :
260
+ for i in range (len (contaidx )):
261
+ if contaidx [i ] == - 1 :
262
+ contg .append ("0/0" )
263
+ else :
264
+ cg = genotypes [contaidx [i ]].split (':' )
265
+ cfi = cg [len (cg )- 1 ]
266
+ if filt_qual != 1 or filt_qual == 1 and cfi == str (1 ):
267
+ contg .append (cg [0 ])
217
268
218
269
## Filter on FI field
219
270
if filt_qual != 1 or (filt_qual == 1 and reffi == str (1 ) and altfi == str (1 )):
220
271
#print "---------"
221
272
#print refg
222
273
#print altg
223
274
#print fields
224
- geno = get_filter_snp_gt (refg [0 ], altg [0 ], fields [3 ], fields [4 ])
275
+ geno = get_filter_snp_gt (refg [0 ], altg [0 ], fields [3 ], fields [4 ], contg )
276
+
225
277
if geno == - 1 :
226
278
undefined_counter += 1
227
279
elif geno == - 2 :
228
280
hetero_counter += 1
229
281
elif geno == - 3 :
230
282
nonspe_counter += 1
283
+ elif geno == - 4 :
284
+ conta_counter += 1
231
285
else :
232
286
snp_counter += 1
233
287
#altg[0]="1/1"
234
- #print chrom + "\t" + fields[1] + "\t" + fields[2] + "\t" + geno[0] + "\t" + geno[1] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + ":".join(altg)
288
+ ## print chrom + "\t" + fields[1] + "\t" + fields[2] + "\t" + geno[0] + "\t" + geno[1] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + ":".join(altg)
235
289
print chrom + "\t " + fields [1 ] + "\t " + fields [2 ] + "\t " + geno [0 ] + "\t " + geno [1 ] + "\t " + fields [5 ] + "\t " + fields [6 ] + "\t " + fields [7 ] + "\t " + "GT" + "\t " + "0/1"
236
290
237
291
else :
@@ -252,6 +306,7 @@ def get_filter_snp_gt(gref, galt, ref, alt):
252
306
print >> sys .stderr , "## Number of heterozygous SNPs =" , hetero_counter
253
307
print >> sys .stderr , "## Number of undefined genotype SNPs =" , undefined_counter
254
308
print >> sys .stderr , "## Number of bad quality SNPs =" , badqual_counter
309
+ print >> sys .stderr , "## Number of potential contaminant SNPs =" , conta_counter
255
310
256
311
257
312
vcf_handle .close ()
0 commit comments