Skip to content

Commit 8c805e1

Browse files
committed
-Fixes #17
-Implemented 'encode' (#22) -Added unit test -Fixes #19
1 parent 2d12ba5 commit 8c805e1

File tree

4 files changed

+114
-120
lines changed

4 files changed

+114
-120
lines changed

poplars/riplike.py

+83-76
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,49 @@
11
# TODO: consistent reference coordinates across outputs
22

3+
import sys
4+
import subprocess
35
import random
46
import argparse
7+
import os
58

69
from poplars.common import convert_fasta
710
from poplars.mafft import align
8-
import numpy as np
911

10-
# subset of HIV-1 group M subtype references curated by LANL
11-
with open('../poplars/ref_genomes/HIV1_Mgroup.fasta') as handle:
12-
reference = convert_fasta(handle)
1312

14-
15-
def pdistance(seq1, seq2):
13+
def hamming(fasta):
    """
    Convert an alignment into per-position mismatch outcomes between the
    query and every reference sequence.

    :param fasta: object returned by align(); an iterable of
                  [header, sequence] pairs that must include a 'query' entry
    :return: dictionary keyed by reference label.  Each value is a list with
             one entry per aligned position: 1 if the reference differs from
             the query, 0 if it matches, and None if either sequence has a
             gap ('-') at that position.
    """
    aln = dict(fasta)
    assert "query" in aln, "Argument <fasta> must contain 'query' entry"
    query = aln.pop('query')

    # The consensus is never scored as a reference.  Pop with a default so an
    # alignment without a consensus entry (e.g. a custom background) does not
    # raise KeyError.
    aln.pop('CON_OF_CONS', None)

    # iterate over remaining sequences as references
    results = {}
    for h, s in aln.items():
        # Sequences are expected to be aligned (equal length); zip() pairs
        # positions and ignores any overhang of the longer sequence.
        results[h] = [
            None if nt1 == '-' or nt2 == '-' else int(nt1 != nt2)
            for nt1, nt2 in zip(query, s)
        ]

    return results
3637

3738

38-
def bootstrap(s1, s2, reps=100):
39+
def update_alignment(seq, reference):
3940
"""
40-
Sample positions at random with replacement.
41-
:param s1: first sequence
42-
:param s2: second sequence (must be aligned to s1)
43-
:param reps: number of replicates to generate
44-
45-
:yield: tuples of sequences generated by bootstrap resampling
41+
Append query sequence <seq> to reference alignment and remove insertions relative to
42+
global consensus sequence.
43+
:param seq: the query sequence
44+
:param reference: the reference sequence
45+
:return: a list of [header, sequence] lists
4646
"""
47-
seqlen = len(s1)
48-
assert len(s2) == seqlen, "s1 and s2 must be of same length in bootstrap()"
49-
50-
# Convert sequences to numpy arrays
51-
s1_np = np.char.asarray(list(s1))
52-
s2_np = np.char.asarray(list(s2))
53-
54-
for rep in range(reps):
55-
bootstrap = np.random.randint(0, seqlen, seqlen)
56-
b1 = s1_np[bootstrap]
57-
b2 = s2_np[bootstrap]
58-
yield b1, b2
59-
60-
61-
def update_alignment(seq):
6247
# append query sequence to reference alignment
6348
fasta = align(seq, reference)
6449

@@ -78,38 +63,53 @@ def update_alignment(seq):
7863
return fasta2
7964

8065

81-
def riplike(seq, outfile, window=400, step=5, nrep=100):
66+
def encode(sequence):
    """
    Encode each nucleotide in a sequence as a 4-bit integer code.

    :param sequence: iterable of characters drawn from 'A', 'T', 'C', 'G',
                     ' ' (space) and '-' (gap)
    :return: list of integers, one code per character of <sequence>
    :raises KeyError: if <sequence> contains any other character
    """
    binary_nt = {'A': 0B0001, 'T': 0B0010, 'C': 0B0011, 'G': 0B0100, ' ': 0B0000, '-': 0B1111}
    return [binary_nt[nt] for nt in sequence]
77+
78+
79+
def riplike(seq, reference, window=400, step=5, nrep=100):
8280
"""
8381
:param seq: query sequence
84-
:param outfile: open file stream in write mode for results
82+
:param reference: the alignment background
8583
:param window: width of sliding window in nucleotides
8684
:param step: step size of sliding window in nucleotides
8785
:param nrep: number of replicates for nonparametric bootstrap sampling
86+
:return: list of result dictionaries in order of window position
8887
"""
88+
8989
results = []
9090

91-
fasta = update_alignment(seq)
91+
fasta = update_alignment(seq, reference)
9292
query = dict(fasta)['query'] # aligned query
9393
seqlen = len(query)
94+
ham = hamming(fasta)
9495

95-
for center in range(window//2, seqlen - (window//2), step):
96+
for centre in range(window // 2, seqlen - (window // 2), step):
9697
best_p, second_p = 1., 1. # maximum p-distance
9798
best_ref, second_ref = None, None
98-
best_seq = ''
99-
100-
# cut slice from query sequence for this window
101-
q1 = query[center-(window//2):center + (window//2)]
99+
best_seq = []
102100

103101
# iterate over reference genomes
104-
for h, s in fasta:
102+
for h, s in ham.items():
105103
if h == 'query' or h == 'CON_OF_CONS':
106104
continue
107105

108-
# slice window segment from reference
109-
s1 = s[center-(window//2):center + (window//2)]
106+
# slice window segment from reference
107+
s1 = s[centre - (window // 2): centre + (window // 2)]
108+
s2 = [x for x in s1 if x is not None]
110109

111110
# calculate p-distance
112-
ndiff, denom = pdistance(list(s1), list(q1))
111+
ndiff = sum(s2)
112+
denom = len(s2)
113113
if denom == 0:
114114
# no overlap! TODO: require minimum overlap?
115115
continue
@@ -121,31 +121,25 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
121121
second_ref = best_ref
122122
best_p = pd
123123
best_ref = h
124-
best_seq = s1
124+
best_seq = s2
125125
elif pd < second_p:
126126
# replace second best
127127
second_p = pd
128128
second_ref = h
129129

130-
if best_ref is None:
131-
outfile.write('{},{},None,,None,,\n'.format(h, center))
132-
continue
133-
134-
result = {'center': center, 'best_ref': best_ref, 'best_p': best_p,
130+
result = {'centre': centre, 'best_ref': best_ref, 'best_p': best_p,
135131
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
136132

137133
quant = None
138-
if second_ref is not None:
134+
if second_ref is not None and nrep > 0:
139135
# use nonparametric bootstrap to determine significance
140-
boot_dist = []
141-
for bs, bq in bootstrap(best_seq, q1, reps=nrep):
142-
ndiff, denom = pdistance(bs, bq)
143-
if denom > 0:
144-
boot_dist.append(ndiff / denom)
145-
146-
# how many are closer than second best?
147-
quant = list(map(lambda x: x < second_p, boot_dist))
148-
quant = sum(quant) / float(len(quant))
136+
count = 0.
137+
n = len(best_seq)
138+
for rep in range(nrep):
139+
boot = [best_seq[round(random.random() * (n - 1))] for _ in range(n)]
140+
if sum(boot) / len(boot) < second_p:
141+
count += 1
142+
quant = count / nrep
149143

150144
result.update({'quant': quant})
151145
results.append(result)
@@ -168,18 +162,31 @@ def main():
168162
help='<optional, int> Window step size.')
169163
parser.add_argument('-nrep', type=int, default=100,
170164
help='<optional, int> Number of bootstrap replicates.')
165+
parser.add_argument('-custombg', type=argparse.FileType('r'),
166+
help='<optional> FASTA file to be used as the alignment background')
171167

172168
args = parser.parse_args()
169+
170+
if args.custombg:
171+
ref_seq = args.custombg
172+
else:
173+
# subset of HIV-1 group M subtype references curated by LANL
174+
seq_path = os.path.dirname(os.path.abspath(__file__))
175+
ref_seq = os.path.join(seq_path, 'ref_genomes/HIV1_Mgroup.fasta')
176+
177+
with open(ref_seq) as handle:
178+
reference = convert_fasta(handle)
179+
173180
args.outfile.write('qname,pos,rname,pdist,rname2,pdist2,qboot\n')
174181

175182
fasta = convert_fasta(args.infile)
176183
for h, s in fasta:
177184
print(h) # crude progress monitoring
178-
results = riplike(s, args.outfile, window=args.window, step=args.step, nrep=args.nrep)
185+
results = riplike(s, reference, window=args.window, step=args.step, nrep=args.nrep)
179186
for result in results:
180187
args.outfile.write(
181-
'{},{center},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
182-
.format(h, **result)
188+
'{},{centre},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
189+
.format(h, **result)
183190
)
184191

185192
args.outfile.close()

poplars/sequence_locator.py

-13
Original file line numberDiff line numberDiff line change
@@ -604,19 +604,6 @@ def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end
604604
else:
605605
end = length + 1
606606

607-
# # Handles global and local start coordinates
608-
# if start_offset <= region_start:
609-
# start = region_start
610-
# else:
611-
# start = region_start + (start_offset - region_start)
612-
#
613-
# # Handles global and local end coordinates
614-
# if end_offset == 'end' or end_offset > region_end:
615-
# end = region_end
616-
# else:
617-
# end = region_end + (region_end - end_offset)
618-
619-
# Create a GenomeRegion object for the query
620607
if ref_region.region_name == region:
621608
query_region = GenomeRegion(region)
622609
query_region.set_coords([start, end], base) # Set global coordinates

poplars/tests/test_riplike.py

+27-31
Original file line numberDiff line numberDiff line change
@@ -2,59 +2,55 @@
22
import os
33
from poplars.riplike import *
44

5-
65
TEST_HIV_GENOME = os.path.join(os.path.dirname(__file__), '../ref_genomes/K03455.fasta')
6+
TEST_REFERENCE = os.path.join(os.path.dirname(__file__), '../ref_genomes/HIV1_Mgroup.fasta')
7+
78

89
class testRiplike(unittest.TestCase):
    """Unit tests for the helper functions and entry point of poplars.riplike."""

    def setUp(self):
        # Load the second line of the test genome FASTA (the sequence line
        # following the header) and the reference alignment.
        with open(TEST_HIV_GENOME) as handle, open(TEST_REFERENCE) as ref_handle:
            self.hiv_genome = handle.readlines()[1]
            self.reference = convert_fasta(ref_handle)

    def testHamming(self):
        test = [['query', 'ACGT'], ['CON_OF_CONS', 'ACGT'], ['A', 'ACGG']]
        result = hamming(test)
        expected = {'A': [0, 0, 0, 1]}
        self.assertEqual(expected, result)

    def testUpdateAlignment(self):
        aln = update_alignment(self.hiv_genome, self.reference)
        aln = dict(aln)

        # alignment should have original number plus one
        result = len(aln)
        expected = 13 + 1
        self.assertEqual(expected, result)

        # alignment should contain entry for query sequence
        self.assertIn('query', aln)

        # aligned query should have same number of nucleotides
        result = [aln['query'].count(nt) for nt in 'ACGT']
        expected = [self.hiv_genome.upper().count(nt) for nt in 'ACGT']
        self.assertEqual(expected, result)

        # all sequences should have the same length
        result = len({len(s) for s in aln.values()})
        expected = 1
        self.assertEqual(expected, result)

    def testEncode(self):
        # Two spaces before the final 'T': the expected list holds two
        # consecutive 0B0000 codes, so the input must be 11 characters long.
        s = 'ATGCGC--  T'
        expected = [0B0001, 0B0010, 0B0100, 0B0011, 0B0100, 0B0011, 0B1111, 0B1111, 0B0000, 0B0000, 0B0010]
        result = encode(s)
        self.assertEqual(expected, result)

    def testRiplike(self):
        results = riplike(self.hiv_genome, self.reference)

        # every window result must carry the fields that main() writes out
        expected_keys = {'centre', 'best_ref', 'best_p', 'second_ref', 'second_p', 'quant'}
        for result in results:
            self.assertTrue(expected_keys.issubset(result))
53+
5854

5955

6056

poplars/tests/test_sequence_locator.py

+4
Original file line numberDiff line numberDiff line change
@@ -1139,3 +1139,7 @@ def tearDown(self):
11391139
self.siv_default_prot.close()
11401140
self.siv_test_genome.close()
11411141
self.siv_test_prot.close()
1142+
1143+
1144+
if __name__ == '__main__':
1145+
unittest.main()

0 commit comments

Comments
 (0)