forked from COG-UK/datapipe
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_mask.py
executable file
·66 lines (51 loc) · 2.08 KB
/
add_mask.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
from Bio import SeqIO
import argparse
import re
def parse_args():
    """
    Parse the command-line options of the masking script.

    Returns:
        The argparse namespace with the attributes in_alignment, mask and
        out_alignment; all three options are required.
    """
    # (flag, destination attribute, help text) for each required option,
    # in the order they should appear in the usage message.
    option_specs = (
        ('--in-alignment', 'in_alignment', 'Aligned FASTA'),
        ('--mask', 'mask', 'Mask CSV of pos, mask character, regex'),
        ('--out-alignment', 'out_alignment', 'FASTA to write out'),
    )

    cli = argparse.ArgumentParser(
        description="Apply a mask to some bases of alignment",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    for flag, destination, help_text in option_specs:
        cli.add_argument(flag, dest=destination, required=True, help=help_text)

    return cli.parse_args()
def parse_mask_file(file):
    """
    Parse a mask CSV file into a dictionary of mask entries.

    Each non-blank line of the input is in the format:
    start (1-based), mask character, regex-format string to match record.id
    e.g.:
    13402,?,^Belgium/

    Only the first two commas separate fields, so the regex itself may
    contain commas. Blank (or whitespace-only) lines are skipped.

    Args:
        file: path to the mask CSV file.

    Returns:
        A dictionary with the regex strings as keys. Each value is a
        dictionary holding the position ('pos', as an int), the mask
        character ('mask_char') and the compiled regular expression
        ('regex'). Lines that share the same regex string share one key,
        so only the last such line is kept.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            position is not an integer.
        re.error: if the regex field is not a valid regular expression.
    """
    d = {}
    with open(file, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip()
            if not line:
                # Tolerate empty lines, e.g. a trailing newline at the end
                # of the file, instead of failing on the unpack below.
                continue
            # maxsplit=2 keeps any commas inside the regex intact.
            fields = line.split(',', 2)
            if len(fields) != 3:
                raise ValueError(
                    "{}: line {}: expected 'pos,mask_char,regex', got {!r}".format(
                        file, line_number, line))
            pos, mask_char, regex = fields
            d[regex] = {'pos': int(pos),
                        'mask_char': mask_char,
                        'regex': re.compile(regex)}
    return d
def apply_mask(in_fasta, out_fasta, mask):
    """
    Write a copy of a FASTA alignment with selected positions masked.

    For every record in in_fasta, each mask entry whose regex is found in
    record.id replaces the base at that entry's 1-based position with the
    entry's mask character. Each record is written to out_fasta as a
    '>' + record.id header line followed by the whole sequence on one line.

    A mask position outside 1..len(sequence) is skipped with a warning on
    stderr: applying it would append the mask character (or, for
    non-positive positions, duplicate part of the sequence) and change the
    sequence length.

    Args:
        in_fasta: path of the FASTA file to read.
        out_fasta: path of the FASTA file to write.
        mask: path of the mask CSV, read with parse_mask_file.
    """
    import sys  # local import: only needed for the out-of-range warning

    mask_info = parse_mask_file(mask)
    with open(in_fasta, "r") as fasta_in, \
            open(out_fasta, "w") as fasta_out:
        for record in SeqIO.parse(fasta_in, 'fasta'):
            seq_id = record.id
            seq = str(record.seq)
            for entry in mask_info.values():
                # 'regex' is already compiled, so search with it directly.
                if not entry['regex'].search(seq_id):
                    continue
                pos = entry['pos']
                if not 1 <= pos <= len(seq):
                    sys.stderr.write(
                        'Warning: mask position {} is outside sequence {} '
                        '(length {}); skipped\n'.format(pos, seq_id, len(seq)))
                    continue
                # pos is 1-based: swap exactly one character.
                seq = seq[:pos - 1] + entry['mask_char'] + seq[pos:]
            fasta_out.write('>' + seq_id + '\n')
            fasta_out.write(seq + '\n')
def main():
    """Read the command-line options and run the masking step with them."""
    options = parse_args()
    apply_mask(
        in_fasta=options.in_alignment,
        out_fasta=options.out_alignment,
        mask=options.mask,
    )
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()