Skip to content

Commit 2d12ba5

Browse files
committed
-Updated for loop and slicing indices to address #17
-Some optimization
1 parent f1bf792 commit 2d12ba5

File tree

1 file changed

+54
-75
lines changed

1 file changed

+54
-75
lines changed

poplars/riplike.py

+54-75
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
import random
44
import argparse
5-
from itertools import islice
65

76
from poplars.common import convert_fasta
87
from poplars.mafft import align
8+
import numpy as np
99

1010
# subset of HIV-1 group M subtype references curated by LANL
1111
with open('../poplars/ref_genomes/HIV1_Mgroup.fasta') as handle:
@@ -21,13 +21,17 @@ def pdistance(seq1, seq2):
2121
"""
2222
denom = 0. # number of valid columns
2323
ndiff = 0
24-
for i, nt1 in enumerate(seq1):
25-
nt2 = seq2[i]
26-
if nt1 == '-' or nt2 == '-':
27-
continue
28-
denom += 1
29-
if nt1 != nt2:
30-
ndiff += 1
24+
25+
seqs = np.char.asarray([seq1, seq2]) # Convert sequences to numpy arrays
26+
27+
# Find the alignment columns in which neither sequence has a gap ('-')
28+
# np.all(..., 0) gives a True/False mask per column; np.where(...)[0] turns it into the indices of the valid columns
29+
denoms = np.where(np.all(np.isin(seqs, '-', invert=True), 0))[0]
30+
denom = denoms.shape[0]
31+
32+
# From the valid positions, find where the sequences contain different nucleotides
33+
ndiff = np.sum(seqs[0, :][denoms] != seqs[1, :][denoms])
34+
3135
return ndiff, denom
3236

3337

@@ -37,52 +41,41 @@ def bootstrap(s1, s2, reps=100):
3741
:param s1: first sequence
3842
:param s2: second sequence (must be aligned to s1)
3943
:param reps: number of replicates to generate
40-
44+
4145
:yield: tuples of sequences generated by bootstrap resampling
4246
"""
4347
seqlen = len(s1)
4448
assert len(s2) == seqlen, "s1 and s2 must be of same length in bootstrap()"
45-
49+
50+
# Convert sequences to numpy arrays
51+
s1_np = np.char.asarray(list(s1))
52+
s2_np = np.char.asarray(list(s2))
53+
4654
for rep in range(reps):
47-
bootstrap = [random.randint(0, seqlen - 1) for _ in range(seqlen)]
48-
b1 = ''.join([s1[i] for i in bootstrap])
49-
b2 = ''.join([s2[i] for i in bootstrap])
55+
bootstrap = np.random.randint(0, seqlen, seqlen)
56+
b1 = s1_np[bootstrap]
57+
b2 = s2_np[bootstrap]
5058
yield b1, b2
5159

5260

5361
def update_alignment(seq):
5462
# append query sequence to reference alignment
5563
fasta = align(seq, reference)
56-
64+
5765
# eliminate insertions in query relative to references
5866
try:
5967
conseq = dict(fasta)['CON_OF_CONS']
6068
except:
6169
print("ERROR: reference alignment in poplars.riplike does not contain CON_OF_CONS entry")
6270
raise
63-
71+
6472
skip = [i for i in range(len(conseq)) if conseq[i] == '-']
6573
fasta2 = []
6674
for h, s in fasta:
6775
s2 = [nt for i, nt in enumerate(s) if i not in skip]
6876
fasta2.append([h, ''.join(s2)])
69-
70-
return fasta2
7177

72-
73-
def sliding_window(seq, window=400):
74-
"""
75-
Gives a sliding window of width 'window' over nucleotides from the sequence
76-
:param seq: the reference sequence
77-
:param window: the width of the sliding window in nucleotides
78-
"""
79-
window_iter = iter(seq)
80-
seq_part = tuple(islice(window_iter, window))
81-
if len(seq_part) == window:
82-
yield seq_part
83-
for elem in window_iter:
84-
seq_part = seq_part[1:] + (elem,)
85-
yield seq_part
78+
return fasta2
8679

8780

8881
def riplike(seq, outfile, window=400, step=5, nrep=100):
@@ -93,83 +86,70 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
9386
:param step: step size of sliding window in nucleotides
9487
:param nrep: number of replicates for nonparametric bootstrap sampling
9588
"""
96-
9789
results = []
98-
90+
9991
fasta = update_alignment(seq)
10092
query = dict(fasta)['query'] # aligned query
10193
seqlen = len(query)
10294

103-
# b = [a[i:i+3] for i in range(len(a)-2)]
104-
count = 0
105-
for seq_window in sliding_window(seq, window):
106-
seq_region = ''.join(str(nt) for nt in seq_window)
107-
print(seq_region)
108-
centre = len(seq_window) // 2
109-
95+
for center in range(window//2, seqlen - (window//2), step):
11096
best_p, second_p = 1., 1. # maximum p-distance
11197
best_ref, second_ref = None, None
11298
best_seq = ''
113-
99+
114100
# cut slice from query sequence for this window
115-
# q1 = query[centre - (window // 2): (centre + (window // 2))]
116-
q = sliding_window(query, window)
117-
q1 = ''.join(str(nt) for nt in q)
118-
101+
q1 = query[center-(window//2):center + (window//2)]
102+
119103
# iterate over reference genomes
120104
for h, s in fasta:
121105
if h == 'query' or h == 'CON_OF_CONS':
122-
continue
123-
124-
# # slice window segment from reference
125-
# s1 = s[centre - (window // 2): (centre + (window // 2))]
126-
106+
continue
107+
108+
# slice window segment from reference
109+
s1 = s[center-(window//2):center + (window//2)]
110+
127111
# calculate p-distance
128-
# ndiff, denom = pdistance(s1, q1)
129-
ndiff, denom = pdistance(seq_region, q1)
112+
ndiff, denom = pdistance(list(s1), list(q1))
130113
if denom == 0:
131114
# no overlap! TODO: require minimum overlap?
132115
continue
133-
pd = ndiff/denom
134-
116+
pd = ndiff / denom
117+
135118
if pd < best_p:
136119
# query is closer to this reference
137120
second_p = best_p
138121
second_ref = best_ref
139122
best_p = pd
140123
best_ref = h
141-
# best_seq = s1
142-
best_seq = seq_region
124+
best_seq = s1
143125
elif pd < second_p:
144126
# replace second best
145-
second_p = pd; second_ref = h
146-
127+
second_p = pd
128+
second_ref = h
129+
147130
if best_ref is None:
148-
outfile.write('{},{},None,,None,,\n'.format(h, centre))
131+
outfile.write('{},{},None,,None,,\n'.format(h, center))
149132
continue
150-
151-
result = {'centre': centre, 'best_ref': best_ref, 'best_p': best_p,
133+
134+
result = {'center': center, 'best_ref': best_ref, 'best_p': best_p,
152135
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
153-
136+
154137
quant = None
155138
if second_ref is not None:
156139
# use nonparametric bootstrap to determine significance
157140
boot_dist = []
158141
for bs, bq in bootstrap(best_seq, q1, reps=nrep):
159142
ndiff, denom = pdistance(bs, bq)
160143
if denom > 0:
161-
boot_dist.append(ndiff/denom)
162-
144+
boot_dist.append(ndiff / denom)
145+
163146
# how many are closer than second best?
164147
quant = list(map(lambda x: x < second_p, boot_dist))
165148
quant = sum(quant) / float(len(quant))
166-
149+
167150
result.update({'quant': quant})
168151
results.append(result)
169-
count += 1
170-
print(count)
171152

172-
print(fasta)
173153
return results
174154

175155

@@ -184,27 +164,26 @@ def main():
184164
help='<output> file to write CSV results.')
185165
parser.add_argument('-window', type=int, default=400,
186166
help='<optional, int> Window size for p-distances.')
187-
# FIXME: step size is by default 1
188-
189-
parser.add_argument('-step', type=int, default=1,
167+
parser.add_argument('-step', type=int, default=5,
190168
help='<optional, int> Window step size.')
191169
parser.add_argument('-nrep', type=int, default=100,
192170
help='<optional, int> Number of bootstrap replicates.')
193171

194172
args = parser.parse_args()
195173
args.outfile.write('qname,pos,rname,pdist,rname2,pdist2,qboot\n')
174+
196175
fasta = convert_fasta(args.infile)
197176
for h, s in fasta:
198177
print(h) # crude progress monitoring
199178
results = riplike(s, args.outfile, window=args.window, step=args.step, nrep=args.nrep)
200179
for result in results:
201180
args.outfile.write(
202-
'{},{centre},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
203-
.format(h, **result)
181+
'{},{center},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
182+
.format(h, **result)
204183
)
205-
184+
206185
args.outfile.close()
207-
186+
208187

209188
if __name__ == '__main__':
210189
main()

0 commit comments

Comments
 (0)