2
2
3
3
import random
4
4
import argparse
5
- from itertools import islice
6
5
7
6
from poplars .common import convert_fasta
8
7
from poplars .mafft import align
8
+ import numpy as np
9
9
10
10
# subset of HIV-1 group M subtype references curated by LANL
11
11
with open ('../poplars/ref_genomes/HIV1_Mgroup.fasta' ) as handle :
@@ -21,13 +21,17 @@ def pdistance(seq1, seq2):
21
21
"""
22
22
denom = 0. # number of valid columns
23
23
ndiff = 0
24
- for i , nt1 in enumerate (seq1 ):
25
- nt2 = seq2 [i ]
26
- if nt1 == '-' or nt2 == '-' :
27
- continue
28
- denom += 1
29
- if nt1 != nt2 :
30
- ndiff += 1
24
+
25
+ seqs = np .char .asarray ([seq1 , seq2 ]) # Convert sequences to numpy arrays
26
+
27
+ # Stack 2-D numpy arrays and find columns not containing '-'
28
+ # Gives an array containing True and False
29
+ denoms = np .where (np .all (np .isin (seqs , '-' , invert = True ), 0 ))[0 ]
30
+ denom = denoms .shape [0 ]
31
+
32
+ # From the valid positions, find where the sequences contain different nucleotides
33
+ ndiff = np .sum (seqs [0 , :][denoms ] != seqs [1 , :][denoms ])
34
+
31
35
return ndiff , denom
32
36
33
37
@@ -37,52 +41,41 @@ def bootstrap(s1, s2, reps=100):
37
41
:param s1: first sequence
38
42
:param s2: second sequence (must be aligned to s1)
39
43
:param reps: number of replicates to generate
40
-
44
+
41
45
:yield: tuples of sequences generated by bootstrap resampling
42
46
"""
43
47
seqlen = len (s1 )
44
48
assert len (s2 ) == seqlen , "s1 and s2 must be of same length in bootstrap()"
45
-
49
+
50
+ # Convert sequences to numpy arrays
51
+ s1_np = np .char .asarray (list (s1 ))
52
+ s2_np = np .char .asarray (list (s2 ))
53
+
46
54
for rep in range (reps ):
47
- bootstrap = [ random .randint (0 , seqlen - 1 ) for _ in range ( seqlen )]
48
- b1 = '' . join ([ s1 [ i ] for i in bootstrap ])
49
- b2 = '' . join ([ s2 [ i ] for i in bootstrap ])
55
+ bootstrap = np . random .randint (0 , seqlen , seqlen )
56
+ b1 = s1_np [ bootstrap ]
57
+ b2 = s2_np [ bootstrap ]
50
58
yield b1 , b2
51
59
52
60
53
61
def update_alignment (seq ):
54
62
# append query sequence to reference alignment
55
63
fasta = align (seq , reference )
56
-
64
+
57
65
# eliminate insertions in query relative to references
58
66
try :
59
67
conseq = dict (fasta )['CON_OF_CONS' ]
60
68
except :
61
69
print ("ERROR: reference alignment in poplars.riplike does not contain CON_OF_CONS entry" )
62
70
raise
63
-
71
+
64
72
skip = [i for i in range (len (conseq )) if conseq [i ] == '-' ]
65
73
fasta2 = []
66
74
for h , s in fasta :
67
75
s2 = [nt for i , nt in enumerate (s ) if i not in skip ]
68
76
fasta2 .append ([h , '' .join (s2 )])
69
-
70
- return fasta2
71
77
72
-
73
- def sliding_window (seq , window = 400 ):
74
- """
75
- Gives a sliding window of of width 'window' over nucleotides from the sequence
76
- :param seq: the reference sequence
77
- :param window: the width of the sliding window in nucleotides
78
- """
79
- window_iter = iter (seq )
80
- seq_part = tuple (islice (window_iter , window ))
81
- if len (seq_part ) == window :
82
- yield seq_part
83
- for elem in window_iter :
84
- seq_part = seq_part [1 :] + (elem ,)
85
- yield seq_part
78
+ return fasta2
86
79
87
80
88
81
def riplike (seq , outfile , window = 400 , step = 5 , nrep = 100 ):
@@ -93,83 +86,70 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
93
86
:param step: step size of sliding window in nucleotides
94
87
:param nrep: number of replicates for nonparametric bootstrap sampling
95
88
"""
96
-
97
89
results = []
98
-
90
+
99
91
fasta = update_alignment (seq )
100
92
query = dict (fasta )['query' ] # aligned query
101
93
seqlen = len (query )
102
94
103
- # b = [a[i:i+3] for i in range(len(a)-2)]
104
- count = 0
105
- for seq_window in sliding_window (seq , window ):
106
- seq_region = '' .join (str (nt ) for nt in seq_window )
107
- print (seq_region )
108
- centre = len (seq_window ) // 2
109
-
95
+ for center in range (window // 2 , seqlen - (window // 2 ), step ):
110
96
best_p , second_p = 1. , 1. # maximum p-distance
111
97
best_ref , second_ref = None , None
112
98
best_seq = ''
113
-
99
+
114
100
# cut slice from query sequence for this window
115
- # q1 = query[centre - (window // 2): (centre + (window // 2))]
116
- q = sliding_window (query , window )
117
- q1 = '' .join (str (nt ) for nt in q )
118
-
101
+ q1 = query [center - (window // 2 ):center + (window // 2 )]
102
+
119
103
# iterate over reference genomes
120
104
for h , s in fasta :
121
105
if h == 'query' or h == 'CON_OF_CONS' :
122
- continue
123
-
124
- # # slice window segment from reference
125
- # s1 = s[centre - (window // 2): (centre + (window // 2) )]
126
-
106
+ continue
107
+
108
+ # slice window segment from reference
109
+ s1 = s [center - (window // 2 ):center + (window // 2 )]
110
+
127
111
# calculate p-distance
128
- # ndiff, denom = pdistance(s1, q1)
129
- ndiff , denom = pdistance (seq_region , q1 )
112
+ ndiff , denom = pdistance (list (s1 ), list (q1 ))
130
113
if denom == 0 :
131
114
# no overlap! TODO: require minimum overlap?
132
115
continue
133
- pd = ndiff / denom
134
-
116
+ pd = ndiff / denom
117
+
135
118
if pd < best_p :
136
119
# query is closer to this reference
137
120
second_p = best_p
138
121
second_ref = best_ref
139
122
best_p = pd
140
123
best_ref = h
141
- # best_seq = s1
142
- best_seq = seq_region
124
+ best_seq = s1
143
125
elif pd < second_p :
144
126
# replace second best
145
- second_p = pd ; second_ref = h
146
-
127
+ second_p = pd
128
+ second_ref = h
129
+
147
130
if best_ref is None :
148
- outfile .write ('{},{},None,,None,,\n ' .format (h , centre ))
131
+ outfile .write ('{},{},None,,None,,\n ' .format (h , center ))
149
132
continue
150
-
151
- result = {'centre ' : centre , 'best_ref' : best_ref , 'best_p' : best_p ,
133
+
134
+ result = {'center ' : center , 'best_ref' : best_ref , 'best_p' : best_p ,
152
135
'second_ref' : second_ref , 'second_p' : None if second_ref is None else second_p }
153
-
136
+
154
137
quant = None
155
138
if second_ref is not None :
156
139
# use nonparametric bootstrap to determine significance
157
140
boot_dist = []
158
141
for bs , bq in bootstrap (best_seq , q1 , reps = nrep ):
159
142
ndiff , denom = pdistance (bs , bq )
160
143
if denom > 0 :
161
- boot_dist .append (ndiff / denom )
162
-
144
+ boot_dist .append (ndiff / denom )
145
+
163
146
# how many are closer than second best?
164
147
quant = list (map (lambda x : x < second_p , boot_dist ))
165
148
quant = sum (quant ) / float (len (quant ))
166
-
149
+
167
150
result .update ({'quant' : quant })
168
151
results .append (result )
169
- count += 1
170
- print (count )
171
152
172
- print (fasta )
173
153
return results
174
154
175
155
@@ -184,27 +164,26 @@ def main():
184
164
help = '<output> file to write CSV results.' )
185
165
parser .add_argument ('-window' , type = int , default = 400 ,
186
166
help = '<optional, int> Window size for p-distances.' )
187
- # FIXME: step size is by default 1
188
-
189
- parser .add_argument ('-step' , type = int , default = 1 ,
167
+ parser .add_argument ('-step' , type = int , default = 5 ,
190
168
help = '<optional, int> Window step size.' )
191
169
parser .add_argument ('-nrep' , type = int , default = 100 ,
192
170
help = '<optional, int> Number of bootstrap replicates.' )
193
171
194
172
args = parser .parse_args ()
195
173
args .outfile .write ('qname,pos,rname,pdist,rname2,pdist2,qboot\n ' )
174
+
196
175
fasta = convert_fasta (args .infile )
197
176
for h , s in fasta :
198
177
print (h ) # crude progress monitoring
199
178
results = riplike (s , args .outfile , window = args .window , step = args .step , nrep = args .nrep )
200
179
for result in results :
201
180
args .outfile .write (
202
- '{},{centre },{best_ref},{best_p},{second_ref},{second_p},{quant}\n '
203
- .format (h , ** result )
181
+ '{},{center },{best_ref},{best_p},{second_ref},{second_p},{quant}\n '
182
+ .format (h , ** result )
204
183
)
205
-
184
+
206
185
args .outfile .close ()
207
-
186
+
208
187
209
188
if __name__ == '__main__' :
210
189
main ()
0 commit comments