 from Bio.Seq import Seq
 from Bio.SeqIO import SeqRecord
 from Bio.Align import MultipleSeqAlignment
-from glob import glob
-import tempfile
+import logging
 from pathlib import Path
 import re
-import subprocess
 from collections import namedtuple, defaultdict, Counter
-import os
-from Bio.Align import substitution_matrices
-from itertools import product, combinations
+import bisect
 import numpy as np
-from Bio.AlignIO.MafIO import MafWriter, MafIterator
-from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator
-from logger import logger
-import time
+from tqdm import tqdm
+from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
+import spoa
 #%%
 
 
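Note: with subprocess and the MAF/Mauve writer imports dropped in favor of `import spoa`, the gap sequences are presumably aligned in-process through the pyspoa bindings. A minimal sketch, assuming the pyspoa package (imported as `spoa`) with default scoring parameters; the sequences are toy values for illustration only:

    import spoa

    seqs = ["ACGTACGT", "ACGTCGT", "ACGACGT"]   # toy input, illustrative only
    consensus, msa = spoa.poa(seqs)             # msa[i] is seqs[i] with gaps inserted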
@@ -46,29 +41,38 @@ def parse_xmfa_header(xmfa_file):
     return index_to_gid, gid_to_index
 
 
-def index_input_sequences(xmfa_file, input_dir):
+def index_input_sequences(xmfa_file, file_list):
+    basename_to_path = {}
+    for f in file_list:
+        basename = str(Path(f).stem)
+        basename_to_path[basename] = f
     gid_to_records = {}
     gid_to_cid_to_index = {}
+    gid_to_index_to_cid = {}
     with open(xmfa_file) as parsnp_fd:
         for line in (line.strip() for line in parsnp_fd):
             if line[:2] == "##":
                 if line.startswith("##SequenceFile"):
-                    p = Path(os.path.join(input_dir + line.split(' ')[1]))
-                    gid_to_records[p.stem] = {record.id: record for record in SeqIO.parse(str(p), "fasta")}
-                    gid_to_cid_to_index[p.stem] = {idx + 1: rec.id for (idx, rec) in enumerate(SeqIO.parse(str(p), "fasta"))}
-    return gid_to_records, gid_to_cid_to_index
+                    basename = Path(line.split(' ')[1]).stem
+                    p = Path(basename_to_path[basename])
+                    gid_to_records[p.stem] = {}
+                    gid_to_cid_to_index[p.stem] = {}
+                    gid_to_index_to_cid[p.stem] = {}
+                    for idx, rec in enumerate(SeqIO.parse(str(p), "fasta")):
+                        gid_to_records[p.stem][rec.id] = rec
+                        gid_to_cid_to_index[p.stem][rec.id] = idx + 1
+                        gid_to_index_to_cid[p.stem][idx + 1] = rec.id
+    return gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid
 
 
-
-def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index):
+def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_index_to_cid):
     seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
     idpair_to_segments = defaultdict(list)
-    idpair_to_tree = defaultdict(IntervalTree)
     cluster_to_named_segments = defaultdict(list)
-    for aln in tqdm(AlignIO.parse(xmfa_file, "mauve")):
+    for aln in AlignIO.parse(xmfa_file, "mauve"):
        for seq in aln:
             # Skip reference for now...
-            aln_len = seq.annotations["end"] - seq.annotations["start"] + 1
+            aln_len = seq.annotations["end"] - seq.annotations["start"]
             cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()]
 
             gid = index_to_gid[seq.name]
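Note: both xmfa_to_covered and extend_clusters lean on the `^cluster(\d+) s(\d+):p(\d+)/.*` pattern above, i.e. Parsnp XMFA sequence ids of the form "cluster<cluster> s<contig index>:p<position>/...". A small illustration with a made-up id (only the three captured groups matter; the trailing "/..." part is discarded):

    import re

    seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
    cluster_idx, contig_idx, startpos = [
        int(x) for x in seqid_parser.match("cluster12 s3:p4567/example").groups()
    ]
    # cluster_idx == 12, contig_idx == 3, startpos == 4567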
@@ -78,29 +82,29 @@ def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index):
             else:
                 endpos = startpos + aln_len
 
-            idp = IdPair(gid, gid_to_cid_to_index[gid][contig_idx])
+            idp = IdPair(gid, gid_to_index_to_cid[gid][contig_idx])
             seg = Segment(idp, startpos, startpos + aln_len, seq.annotations["strand"])
             idpair_to_segments[idp].append(seg)
-            idpair_to_tree[idp].addi(seg.start, seg.stop)
             cluster_to_named_segments[cluster_idx].append(seg)
 
 
     for idp in idpair_to_segments:
         idpair_to_segments[idp] = sorted(idpair_to_segments[idp])
-        idpair_to_tree[idp].merge_overlaps()
-    return idpair_to_segments, idpair_to_tree, cluster_to_named_segments
+    return idpair_to_segments, cluster_to_named_segments
 
 
 def run_msa(downstream_segs_to_align, gid_to_records):
     keep_extending = True
     iteration = 0
-    seq_len_desc = stats.describe([seg.stop - seg.start for seg in downstream_segs_to_align])
-    longest_seq = seq_len_desc.minmax[1]
-    if sum(
-            seq_len_desc.mean * (1 - length_window) <= (seg.stop - seg.start) <= seq_len_desc.mean * (1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align) * window_prop:
-        base_length = int(seq_len_desc.mean * (1 + length_window))
-    else:
-        base_length = BASE_LENGTH
+    seq_lens = [seg.stop - seg.start for seg in downstream_segs_to_align]
+    longest_seq = max(seq_lens)
+    mean_seq_len = np.mean(seq_lens)
+    # if sum(
+    #         mean_seq_len * (1 - length_window) <= (seg.stop - seg.start) <= mean_seq_len * (1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align) * window_prop:
+    #     base_length = int(mean_seq_len * (1 + length_window))
+    # else:
+    #     base_length = BASE_LENGTH
 
+    base_length = BASE_LENGTH
     while keep_extending:
         seqs_to_align = ["A" + (str(
             gid_to_records[seg.idp.gid][seg.idp.cid].seq[seg.start:seg.stop] if seg.strand == 1
@@ -131,11 +135,15 @@ def run_msa(downstream_segs_to_align, gid_to_records):
     return aligned_msa_seqs
 
 
-def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segments, idpair_to_tree, cluster_to_named_segments, gid_to_records):
+def extend_clusters(xmfa_file, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records):
     ret_lcbs = []
     seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*')
 
-    for aln_idx, aln in enumerate(tqdm(AlignIO.parse(xmfa_file, "mauve"), total=len(cluster_to_named_segments))):
+    for aln_idx, aln in enumerate(tqdm(
+            AlignIO.parse(xmfa_file, "mauve"),
+            total=len(cluster_to_named_segments),
+            file=TqdmToLogger(logger, level=logging.INFO),
+            mininterval=MIN_TQDM_INTERVAL)):
         # validate_lcb(aln, gid_to_records, parsnp_header=True)
         seq = aln[0]
         cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()]
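Note: the progress bar is now routed through the project logger via the TqdmToLogger file-like adapter and throttled with MIN_TQDM_INTERVAL, both imported from logger.py (not shown in this diff). A rough sketch of what such an adapter usually looks like, offered purely as an assumption about logger.py's implementation:

    import io
    import logging

    class TqdmToLogger(io.StringIO):
        """File-like object that forwards tqdm's progress lines to a logger."""
        def __init__(self, logger, level=logging.INFO):
            super().__init__()
            self.logger = logger
            self.level = level
            self.buf = ""

        def write(self, buf):
            self.buf = buf.strip("\r\n\t ")
            return len(buf)

        def flush(self):
            if self.buf:
                self.logger.log(self.level, self.buf)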
@@ -167,29 +175,36 @@ def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segm
         new_lcb = MultipleSeqAlignment([])
         # Assumes alignments are always in the same order
         new_bp = []
-        for seq_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)):
+        for seg_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)):
             # Update segment in idpair_to_segments
+            if len(aln_str) < MIN_LEN:
+                continue
             new_bp_covered = len(aln_str) - aln_str.count("-")
             # print(f"Extending {covered_seg} by {new_bp_covered}")
             new_bp.append(new_bp_covered)
             new_seq = aln_str
             if covered_seg.strand == 1:
                 new_seg = Segment(covered_seg.idp, uncovered_seg.start, uncovered_seg.start + new_bp_covered, covered_seg.strand)
+                if new_bp_covered > 0:
+                    segs[seg_idx] = Segment(covered_seg.idp, covered_seg.start, new_seg.stop, covered_seg.strand)
             else:
                 aln_str = Seq(aln_str).reverse_complement()
                 new_seg = Segment(covered_seg.idp, covered_seg.start - new_bp_covered, covered_seg.start, covered_seg.strand)
+                if new_bp_covered > 0:
+                    segs[seg_idx] = Segment(covered_seg.idp, new_seg.start, covered_seg.stop, covered_seg.strand)
 
             new_record = SeqRecord(
                 seq=new_seq,
-                id=f"{covered_seg.idp.gid}#{covered_seg.idp.cid}",
+                id=f"cluster{cluster_idx} s{gid_to_cid_to_index[covered_seg.idp.gid][covered_seg.idp.cid]}:p{new_seg.start if new_seg.strand == 1 else new_seg.stop}",
+                name=gid_to_index[covered_seg.idp.gid],
                 annotations={"start": new_seg.start, "end": new_seg.stop, "strand": new_seg.strand}
             )
+
             # if covered_seg.strand == 1:
             new_lcb.append(new_record)
-            if new_bp_covered > 0:
-                idpair_to_tree[covered_seg.idp].addi(new_seg.start, new_seg.stop)
 
-        ret_lcbs.append(new_lcb)
+        if len(new_lcb) > 0:
+            ret_lcbs.append(new_lcb)
 
     return ret_lcbs
 
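Note: the rebuilt SeqRecord ids follow the same "cluster<N> s<M>:p<P>" convention that seqid_parser expects, which suggests the extended LCBs are meant to be written back out and re-parsed like the original Parsnp clusters. Segment and IdPair are defined earlier in the file (outside this diff); from their usage they behave like namedtuples, which is how they are sketched here for illustration:

    from collections import namedtuple

    IdPair = namedtuple("IdPair", ["gid", "cid"])                        # assumed shape
    Segment = namedtuple("Segment", ["idp", "start", "stop", "strand"])  # assumed shape

    gid_to_cid_to_index = {"genomeA": {"contig_1": 3}}                   # toy lookup
    seg = Segment(IdPair("genomeA", "contig_1"), 100, 250, 1)
    rec_id = (f"cluster7 s{gid_to_cid_to_index[seg.idp.gid][seg.idp.cid]}"
              f":p{seg.start if seg.strand == 1 else seg.stop}")
    # rec_id == "cluster7 s3:p100", parseable by seqid_parser above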