-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCollect_GraphBin_MAGs_as_fasta.py
101 lines (84 loc) · 2.68 KB
/
Collect_GraphBin_MAGs_as_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
''' Collect MAGs in fasta format from GraphBin output.
-------------------------------------------
Author :: Roth Conrad
Email :: [email protected]
GitHub :: https://github.com/rotheconrad
Date Created :: July 23rd, 2020
License :: GNU GPLv3
Copyright 2020 Roth Conrad
All rights reserved
-------------------------------------------
'''
import argparse
from collections import defaultdict
def read_fasta(fp):
    ''' Generate (header, sequence) tuples from an iterable of fasta lines.

    fp -- an open fasta file handle, or any iterable of text lines.

    The header is yielded exactly as read, including the leading ">",
    with trailing whitespace removed. Sequence lines belonging to one
    record are joined into a single string. Any text that appears
    before the first header line is discarded.
    '''
    header = None
    chunks = []
    for raw in fp:
        text = raw.rstrip()
        if not text.startswith(">"):
            # Sequence (or pre-header) text: buffer it for the current record.
            chunks.append(text)
            continue
        # A new header closes the previous record, if there was one.
        if header:
            yield (header, ''.join(chunks))
        header = text
        chunks = []
    # Emit the final record once the input is exhausted.
    if header:
        yield (header, ''.join(chunks))
def get_graphbin_fasta(Assembly, Graphbin, Prefix):
    ''' Collect contig and bin names from the GraphBin output file and
    write one fasta file per MAG, taking the sequences from the
    assembly fasta file.

    Assembly -- path to the assembly file in fasta format.
    Graphbin -- path to the GraphBin output file. Each non-blank line is
                split on "," with the contig name read from the first
                field and the bin (MAG) name from the second.
    Prefix   -- output prefix; each MAG is written to
                "<Prefix>_<MAG>.fasta".

    Returns None. Contigs whose lookup key is not present in the
    GraphBin file are not written to any output file.
    '''

    # Map each contig name to the bin it was assigned to.
    MAGs = {}
    with open(Graphbin, 'r') as bins:
        for b in bins:
            record = b.rstrip()
            # A blank (or whitespace-only) line has no second field and
            # would otherwise raise IndexError below, so skip it.
            if not record:
                continue
            X = record.split(',')
            MAGs[X[0]] = X[1]

    # Group the fasta records of the assembly by bin.
    data = defaultdict(list)
    with open(Assembly, 'r') as fasta:
        for name, seq in read_fasta(fasta):
            # The lookup key is built from the 2nd and 3rd "_"-separated
            # fields of the header line.
            # NOTE(review): this presumably expects headers shaped like
            # ">prefix_NODE_1_..." so the key becomes "NODE_1" -- confirm
            # against the contig naming used in the assembly.
            node = '_'.join(name.split('_')[1:3])
            if node in MAGs:
                data[MAGs[node]].append(f'{name}\n{seq}\n')

    # Write one fasta file per bin.
    for MAG, sequences in data.items():
        print(f'\nWriting MAG {MAG}...\n')
        outfile = f'{Prefix}_{MAG}.fasta'
        with open(outfile, 'w') as out:
            out.writelines(sequences)
def main():
    ''' Parse the command-line options and pass them to
    get_graphbin_fasta.

    All three options are required: the assembly fasta file, the
    GraphBin output file, and the output file prefix.
    '''
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # (short flag, long flag, help text, metavar) for each required option,
    # registered in this order.
    option_table = (
        (
            '-a', '--assembly_fasta_file',
            'Please specify the assembly file in fasta format!',
            ':',
        ),
        (
            '-g', '--graphbin_output_file',
            'Please specify the graphbin output file!',
            ':',
        ),
        (
            '-o', '--output_file_prefix',
            'What do you want to name the output file?',
            '',
        ),
    )
    for short_flag, long_flag, help_text, placeholder in option_table:
        cli.add_argument(
            short_flag, long_flag,
            help=help_text,
            metavar=placeholder,
            type=str,
            required=True
        )

    opts = cli.parse_args()

    get_graphbin_fasta(
        opts.assembly_fasta_file,
        opts.graphbin_output_file,
        opts.output_file_prefix
    )
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()