Skip to content

Commit 66a078c

Browse files
committed
fixes #12
1 parent 6a1415b commit 66a078c

File tree

1 file changed

+53
-62
lines changed

1 file changed

+53
-62
lines changed

poplars/riplike.py

+53-62
Original file line numberDiff line numberDiff line change
@@ -15,46 +15,41 @@
1515
with open(ref_seq) as handle:
1616
reference = convert_fasta(handle)
1717

18-
def pdistance(seq1, seq2):
19-
"""
20-
Calculate p-distance between two aligned sequences
21-
:param seq1: First sequence
22-
:param seq2: Second sequence
23-
:return: <ndiff> is the number of differences, <denom> is the number of valid positions
18+
19+
def hamming(fasta):
2420
"""
25-
denom = 0. # number of valid columns
26-
ndiff = 0
27-
for i, nt1 in enumerate(seq1):
28-
nt2 = seq2[i]
29-
if nt1 == '-' or nt2 == '-':
30-
continue
31-
denom += 1
32-
if nt1 != nt2:
33-
ndiff += 1
34-
return ndiff, denom
35-
36-
37-
def bootstrap(s1, s2, reps=100):
21+
Convert list of lists into boolean outcomes (difference between query and reference)
22+
:param fasta: object returned by align()
23+
:return: dictionary of boolean lists keyed by reference label
3824
"""
39-
Sample positions at random with replacement.
40-
:param s1: first sequence
41-
:param s2: second sequence (must be aligned to s1)
42-
:param reps: number of replicates to generate
43-
44-
:yield: tuples of sequences generated by bootstrap resampling
45-
"""
46-
seqlen = len(s1)
47-
assert len(s2)==seqlen, "s1 and s2 must be of same length in bootstrap()"
48-
49-
for rep in range(reps):
25+
aln = dict(fasta)
26+
assert "query" in aln, "Argument <fasta> must contain 'query' entry"
27+
query = aln.pop('query')
28+
_ = aln.pop('CON_OF_CONS')
29+
30+
# iterate over remaining sequences as references
31+
results = {}
32+
for h, s in aln.items():
5033
result = []
51-
bootstrap = [random.randint(0, seqlen-1) for _ in range(seqlen)]
52-
b1 = ''.join([s1[i] for i in bootstrap])
53-
b2 = ''.join([s2[i] for i in bootstrap])
54-
yield b1, b2
34+
for i, nt1 in enumerate(query):
35+
nt2 = s[i]
36+
if nt1 == '-' or nt2 == '-':
37+
result.append(None)
38+
continue
39+
result.append(int(nt1 != nt2))
40+
results.update({h: result})
41+
42+
return results
43+
5544

5645

5746
def update_alignment(seq):
47+
"""
48+
Append query sequence <seq> to reference alignment and remove insertions relative to
49+
global consensus sequence.
50+
:param seq:
51+
:return: a list of [header, sequence] lists
52+
"""
5853
# append query sequence to reference alignment
5954
fasta = align(seq, reference)
6055

@@ -81,32 +76,33 @@ def riplike(seq, window=400, step=5, nrep=100):
8176
:param window: width of sliding window in nucleotides
8277
:param step: step size of sliding window in nucleotides
8378
:param nrep: number of replicates for nonparametric bootstrap sampling
79+
:return: list of result dictionaries in order of window position
8480
"""
8581

8682
results = []
87-
83+
8884
fasta = update_alignment(seq)
8985
query = dict(fasta)['query'] # aligned query
9086
seqlen = len(query)
91-
87+
ham = hamming(fasta)
88+
9289
for left in range(0, seqlen-window, step):
9390
best_p, second_p = 1., 1. # maximum p-distance
9491
best_ref, second_ref = None, None
95-
best_seq = ''
96-
97-
# cut slice from query sequence for this window
98-
q1 = query[left:(left+window)]
99-
92+
best_seq = []
93+
10094
# iterate over reference genomes
101-
for h, s in fasta:
102-
if h=='query' or h=='CON_OF_CONS':
95+
for h, s in ham.items():
96+
if h == 'query' or h == 'CON_OF_CONS':
10397
continue
10498

10599
# slice window segment from reference
106100
s1 = s[left:(left+window)]
101+
s2 = [x for x in s1 if x is not None]
107102

108103
# calculate p-distance
109-
ndiff, denom = pdistance(s1, q1)
104+
ndiff = sum(s2)
105+
denom = len(s2)
110106
if denom == 0:
111107
# no overlap! TODO: require minimum overlap?
112108
continue
@@ -115,30 +111,25 @@ def riplike(seq, window=400, step=5, nrep=100):
115111
if pd < best_p:
116112
# query is closer to this reference
117113
second_p = best_p; second_ref = best_ref
118-
best_p = pd; best_ref = h; best_seq = s1
114+
best_p = pd; best_ref = h; best_seq = s2
119115
elif pd < second_p:
120116
# replace second best
121117
second_p = pd; second_ref = h
122118

123-
if best_ref is None:
124-
outfile.write('{},{},None,,None,,\n'.format(header, left))
125-
continue
126-
127-
result = {'left': left, 'best_ref': best_ref, 'best_p': best_p,
128-
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
119+
result = {'left': left, 'best_ref': best_ref, 'best_p': best_p,
120+
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
121+
# print(result)
129122

130123
quant = None
131-
if second_ref is not None:
124+
if second_ref is not None and nrep > 0:
132125
# use nonparametric bootstrap to determine significance
133-
boot_dist = []
134-
for bs, bq in bootstrap(best_seq, q1, reps=nrep):
135-
ndiff, denom = pdistance(bs, bq)
136-
if denom > 0:
137-
boot_dist.append(ndiff/denom)
138-
139-
# how many are closer than second best?
140-
quant = list(map(lambda x: x < second_p, boot_dist))
141-
quant = sum(quant) / float(len(quant))
126+
count = 0.
127+
n = len(best_seq)
128+
for rep in range(nrep):
129+
boot = [best_seq[round(random.random()*(n-1))] for _ in range(n)]
130+
if sum(boot)/len(boot) < second_p:
131+
count += 1
132+
quant = count / nrep
142133

143134
result.update({'quant': quant})
144135
results.append(result)
@@ -172,7 +163,7 @@ def main():
172163
for result in results:
173164
args.outfile.write(
174165
'{},{left},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
175-
.format(header, **result)
166+
.format(h, **result)
176167
)
177168

178169
args.outfile.close()

0 commit comments

Comments
 (0)