-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBlastTab_filter.py
154 lines (121 loc) · 4.29 KB
/
BlastTab_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
'''Python Default Script.
This script filters tabular blast output for best hit based on bitscore,
as well as user defined percent match length, and read length.
This tool takes the following input parameters:
* tabular blast input file that includes query and subject lengths
in columns 13 and 14 (or 12 & 13 for 0-index)
* percent_match_length to filter for as decimal (ex: 0.9 for 90%)
* read_length to filter for as integer (ex: 70 for 70 base pairs)
This script returns the following files:
* input_file.filtered_best_hits.blst
This script requires the following packages:
* argparse
This file can also be imported as a module and contains the follwing
functions:
* filter_blast - This function coordinates the filtering.
* main - the main function of the script
-------------------------------------------
Author :: Roth Conrad
Email :: [email protected]
GitHub :: https://github.com/rotheconrad
Date Created :: Thursday, July 23th, 2019
License :: GNU GPLv3
Copyright 2019 Roth Conrad
All rights reserved
-------------------------------------------
'''
import argparse
def best_hits(query, bitscore, d, line, dups):
""" Filters the besthit based on bitscore """
if query in d:
dups += 1
old_bitscore = float(d[query].split('\t')[11])
if bitscore > old_bitscore:
d[query] = line
else:
d[query] = line
return d, dups
def filter_blast(infile, pml, rl):
"""Gets and prints the spreadsheet's header columns
Parameters
----------
infile : str
The file location of the input file
pml : float
Percent match length of the query read to filter above. (ex: 0.9)
alignment length / query length
rl : int
Length of the query read to filter above. (ex: 70)
Returns
-------
file
writes out tabular blast lines passing the filter to
a new file infile.filtered_best_hits.blst
"""
d = {} # initialize dictionary for bitscore besthits
dups = 0 # counter for number of duplicate matches for reads
fails = 0 # counter for number of matches failing filters
passes = 0 # counter for number of matches passing filters
total = 0 # counter for total blast entries in file
with open(infile, 'r') as f:
for l in f:
total += 1
X = l.rstrip().split('\t')
query = X[0] # read identifier
bitscore = float(X[11]) # bitscore
pIdent = float(X[2]) # percent sequence identity
aLen = int(X[3]) # read alignment length
qLen = int(X[12]) # full length of read
pMatch = aLen / qLen # percent match length of read length
if pMatch >= pml and qLen >= rl:
d, dups = best_hits(query, bitscore, d, l, dups)
passes += 1
else:
fails += 1
outfile = infile + '.filtered_best_hits.blst'
with open(outfile, 'w') as o:
for k,v in d.items():
o.write(v)
print('Total number of entries in blast file:', total)
print('Number of reads failing the filters:', fails)
print('Number of reads passing the filters:', passes)
print('Number of duplicate blast matches passing filter:', dups)
print('Number of best hit entries written to new file:', len(d))
def main():
# Configure Argument Parser
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
'-i', '--in_file',
help='Please specify the tabular blast input file!',
metavar='',
type=str,
required=True
)
parser.add_argument(
'-pml', '--percent_match_length',
help='Percent match length to filter for (ex: 0.9).',
metavar='',
type=float,
required=True
)
parser.add_argument(
'-rl', '--read_length',
help='Read length to filter for (ex: 70).',
metavar='',
type=float,
required=True
)
args=vars(parser.parse_args())
# Do what you came here to do:
print('Running Script...')
filter_blast(
args['in_file'],
args['percent_match_length'],
args['read_length']
)
if __name__ == "__main__":
main()