Skip to content

Commit 4b58083

Browse files
committed
-#8, #20 in progress
1 parent c24bfb9 commit 4b58083

File tree

2 files changed

+42
-51
lines changed

2 files changed

+42
-51
lines changed

poplars/riplike.py

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,23 @@
88
from poplars.mafft import align
99

1010

11-
def hamming(fasta):
11+
def hamming(bin_fasta):
1212
"""
1313
Convert list of lists into boolean outcomes (difference between query and reference)
14-
:param fasta: object returned by align()
14+
:param bin_fasta: object returned by align() converted to bit strings
1515
:return: dictionary of boolean lists keyed by reference label
1616
"""
17-
aln = dict(fasta)
18-
assert "query" in aln, "Argument <fasta> must contain 'query' entry"
19-
query = aln.pop('query')
20-
_ = aln.pop('CON_OF_CONS')
17+
query = bin_fasta.pop('query')
2118

22-
# iterate over remaining sequences as references
19+
# Iterate over remaining sequences as references
2320
results = {}
24-
for h, s in aln.items():
21+
for h, s in bin_fasta.items():
2522
result = []
26-
for i, nt1 in enumerate(query):
27-
nt2 = s[i]
28-
if nt1 == '-' or nt2 == '-':
23+
for nt1, nt2 in zip(query, s):
24+
if (nt1 | nt2) & 0B1111:
2925
result.append(None)
3026
continue
31-
result.append(int(nt1 != nt2))
27+
result.append(bin(nt1 ^ nt2))
3228
results.update({h: result})
3329

3430
return results
@@ -61,17 +57,28 @@ def update_alignment(seq, reference):
6157
return fasta2
6258

6359

64-
def encode(sequence):
60+
def encode(fasta):
6561
"""
6662
Encodes each nucleotide in a sequence using 4-bits
67-
:param sequence: the sequence
63+
:param fasta: the result of the alignment
6864
:return: the sequence as a bitstring where each nucleotide is encoded using 4 bits
6965
"""
70-
seq = []
71-
binary_nt = {'A': 0B0001, 'T': 0B0010, 'C': 0B0011, 'G': 0B0100, ' ': 0B0000, '-': 0B1111}
72-
for nt in sequence:
73-
seq.append(binary_nt[nt])
74-
return seq
66+
bin_fasta = dict(fasta)
67+
assert "query" in bin_fasta, "Argument <fasta> must contain 'query' entry"
68+
_ = bin_fasta.pop('CON_OF_CONS')
69+
70+
binary_nt = {' ': 0B00000, 'A': 0B00001, 'T': 0B00010, 'C': 0B00011, 'G': 0B00100,
71+
'N': 0B00101, 'R': 0B00110, 'Y': 0B00111, 'K': 0B01000, 'M': 0B01001,
72+
'S': 0B01010, 'W': 0B01011, 'B': 0B01100, 'D': 0B01101, 'H': 0B01110,
73+
'V': 0B01111, 'X': 0B10000, '-': 0B1111}
74+
75+
for h, s in bin_fasta.items():
76+
seq = []
77+
for nt in s:
78+
seq.append(binary_nt[nt])
79+
bin_fasta[h] = seq
80+
81+
return bin_fasta
7582

7683

7784
def riplike(seq, reference, window=400, step=5, nrep=100):
@@ -89,7 +96,8 @@ def riplike(seq, reference, window=400, step=5, nrep=100):
8996
fasta = update_alignment(seq, reference)
9097
query = dict(fasta)['query'] # aligned query
9198
seqlen = len(query)
92-
ham = hamming(fasta)
99+
bin_fasta = encode(fasta)
100+
ham = hamming(bin_fasta)
93101

94102
for centre in range(window // 2, seqlen - (window // 2), step):
95103
best_p, second_p = 1., 1. # maximum p-distance
@@ -193,5 +201,3 @@ def main():
193201

194202
if __name__ == '__main__':
195203
main()
196-
197-

poplars/sequence_locator.py

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, region_name, ncoords=None, nt_seq=None, pcoords=None, aa_seq=
3333
self.nt_seq = nt_seq
3434
self.pcoords = pcoords
3535
self.aa_seq = aa_seq
36-
self.rel_pos = {'CDS': [], 'gstart': [], 'qstart': [], 'rstart': [], 'pstart': []}
36+
self.rel_pos = {'CDS': [], 'gstart': [], 'qstart': [], 'pstart': []}
3737
self.codon_aln = ''
3838

3939
def get_coords(self, base):
@@ -106,18 +106,20 @@ def set_pos_from_pstart(self, virus):
106106
else:
107107
self.rel_pos['pstart'] = None
108108

109-
def set_pos_from_rstart(self, region):
110-
start_offset = self.ncoords[0] - region.ncoords[0]
111-
end_offset = self.ncoords[1] - region.ncoords[1]
112-
self.rel_pos['rstart'] = [end_offset, start_offset]
113-
114109
def set_pos_from_qstart(self, query, base):
115-
if self.ncoords is None or self.pcoords is None:
116-
self.set_coords(self.get_coords(base), base)
117-
118-
start_offset = self.ncoords[0] - query.ncoords[0]
119-
end_offset = self.ncoords[1] - query.ncoords[1]
120-
self.rel_pos['qstart'] = [end_offset, start_offset]
110+
"""
111+
Gives the position of the sequence relative to the start of the region of interest
112+
:param query: The GenomeRegion objects for the query
113+
:param base: The base of the sequence (nucleotide or protein)
114+
:return: The position relative to the start of the region of interest
115+
"""
116+
r_coords = self.get_coords(base)
117+
r_seq = self.get_sequence(base)
118+
if r_coords is not None and r_seq is not None:
119+
q_coords = query.get_coords(base)
120+
start_offset = r_coords[0] - q_coords[0] + 1
121+
end_offset = start_offset + len(r_seq)
122+
self.rel_pos['qstart'] = [start_offset, end_offset]
121123

122124
def make_codon_aln(self):
123125
"""
@@ -495,7 +497,6 @@ def find_matches(virus, base, ref_regions, match_coordinates):
495497
query_region.set_pos_from_cds(virus)
496498
query_region.set_pos_from_gstart()
497499
query_region.set_pos_from_qstart(ref_region, base)
498-
query_region.set_pos_from_rstart(ref_region)
499500
query_region.set_pos_from_pstart(virus)
500501

501502
if base == 'nucl':
@@ -587,10 +588,6 @@ def output_retrieved_region(region, outfile=None):
587588
print("\t\tPosition relative to query start:\t{} --> {}"
588589
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))
589590

590-
if region.rel_pos['rstart']:
591-
print("\t\tPosition relative to region start:\t{} --> {}\n"
592-
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))
593-
594591
else:
595592
outfile.write("\nRetrieved Region:\t{}".format(region.region_name))
596593
outfile.write("\tNucleotide Sequence:\n")
@@ -624,10 +621,6 @@ def output_retrieved_region(region, outfile=None):
624621
outfile.write("\t\tPosition relative to query start:\t{} --> {}"
625622
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))
626623

627-
if region.rel_pos['rstart']:
628-
outfile.write("\t\tPosition relative to region start:\t{} --> {}\n"
629-
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))
630-
631624

632625
def output_overlap(overlap_regions, outfile=None):
633626
"""
@@ -676,10 +669,6 @@ def output_overlap(overlap_regions, outfile=None):
676669
print("\t\tPosition relative to query start:\t{} --> {}"
677670
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))
678671

679-
if region.rel_pos['rstart']:
680-
print("\t\tPosition relative to region start:\t{} --> {}\n"
681-
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))
682-
683672
else:
684673
for key in overlap_regions:
685674
region = overlap_regions[key]
@@ -718,10 +707,6 @@ def output_overlap(overlap_regions, outfile=None):
718707
outfile.write("\t\tPosition relative to query start:\t{} --> {}"
719708
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))
720709

721-
if region.rel_pos['rstart']:
722-
outfile.write("\t\tPosition relative to region start:\t{} --> {}\n"
723-
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))
724-
725710

726711
def retrieve(virus, base, ref_regions, region, qstart=1, qend='end'):
727712
"""

0 commit comments

Comments
 (0)