Skip to content

Commit f1bf792

Browse files
committed
-Fixed bug in retrieve (#8, #20)
-Updated test case for retrieve
1 parent 80b777a commit f1bf792

File tree

4 files changed

+202
-102
lines changed

4 files changed

+202
-102
lines changed

poplars/riplike.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
#TODO: consistent reference coordinates across outputs
1+
# TODO: consistent reference coordinates across outputs
22

33
import random
44
import argparse
5+
from itertools import islice
56

67
from poplars.common import convert_fasta
78
from poplars.mafft import align
@@ -40,11 +41,10 @@ def bootstrap(s1, s2, reps=100):
4041
:yield: tuples of sequences generated by bootstrap resampling
4142
"""
4243
seqlen = len(s1)
43-
assert len(s2)==seqlen, "s1 and s2 must be of same length in bootstrap()"
44+
assert len(s2) == seqlen, "s1 and s2 must be of same length in bootstrap()"
4445

4546
for rep in range(reps):
46-
result = []
47-
bootstrap = [random.randint(0, seqlen-1) for _ in range(seqlen)]
47+
bootstrap = [random.randint(0, seqlen - 1) for _ in range(seqlen)]
4848
b1 = ''.join([s1[i] for i in bootstrap])
4949
b2 = ''.join([s2[i] for i in bootstrap])
5050
yield b1, b2
@@ -70,6 +70,21 @@ def update_alignment(seq):
7070
return fasta2
7171

7272

73+
def sliding_window(seq, window=400):
74+
"""
75+
Gives a sliding window of of width 'window' over nucleotides from the sequence
76+
:param seq: the reference sequence
77+
:param window: the width of the sliding window in nucleotides
78+
"""
79+
window_iter = iter(seq)
80+
seq_part = tuple(islice(window_iter, window))
81+
if len(seq_part) == window:
82+
yield seq_part
83+
for elem in window_iter:
84+
seq_part = seq_part[1:] + (elem,)
85+
yield seq_part
86+
87+
7388
def riplike(seq, outfile, window=400, step=5, nrep=100):
7489
"""
7590
:param seq: query sequence
@@ -78,49 +93,62 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
7893
:param step: step size of sliding window in nucleotides
7994
:param nrep: number of replicates for nonparametric bootstrap sampling
8095
"""
81-
96+
8297
results = []
8398

8499
fasta = update_alignment(seq)
85100
query = dict(fasta)['query'] # aligned query
86101
seqlen = len(query)
87-
88-
for left in range(0, seqlen-window, step):
102+
103+
# b = [a[i:i+3] for i in range(len(a)-2)]
104+
count = 0
105+
for seq_window in sliding_window(seq, window):
106+
seq_region = ''.join(str(nt) for nt in seq_window)
107+
print(seq_region)
108+
centre = len(seq_window) // 2
109+
89110
best_p, second_p = 1., 1. # maximum p-distance
90111
best_ref, second_ref = None, None
91112
best_seq = ''
92113

93114
# cut slice from query sequence for this window
94-
q1 = query[left:(left+window)]
115+
# q1 = query[centre - (window // 2): (centre + (window // 2))]
116+
q = sliding_window(query, window)
117+
q1 = ''.join(str(nt) for nt in q)
95118

96119
# iterate over reference genomes
97120
for h, s in fasta:
98121
if h == 'query' or h == 'CON_OF_CONS':
99122
continue
100123

101-
# slice window segment from reference
102-
s1 = s[left:(left+window)]
124+
# # slice window segment from reference
125+
# s1 = s[centre - (window // 2): (centre + (window // 2))]
103126

104127
# calculate p-distance
105-
ndiff, denom = pdistance(s1, q1)
128+
# ndiff, denom = pdistance(s1, q1)
129+
ndiff, denom = pdistance(seq_region, q1)
106130
if denom == 0:
107131
# no overlap! TODO: require minimum overlap?
108132
continue
109133
pd = ndiff/denom
110134

111135
if pd < best_p:
112136
# query is closer to this reference
113-
second_p = best_p; second_ref = best_ref
114-
best_p = pd; best_ref = h; best_seq = s1
137+
second_p = best_p
138+
second_ref = best_ref
139+
best_p = pd
140+
best_ref = h
141+
# best_seq = s1
142+
best_seq = seq_region
115143
elif pd < second_p:
116144
# replace second best
117145
second_p = pd; second_ref = h
118146

119147
if best_ref is None:
120-
outfile.write('{},{},None,,None,,\n'.format(h, left))
148+
outfile.write('{},{},None,,None,,\n'.format(h, centre))
121149
continue
122150

123-
result = {'left': left, 'best_ref': best_ref, 'best_p': best_p,
151+
result = {'centre': centre, 'best_ref': best_ref, 'best_p': best_p,
124152
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
125153

126154
quant = None
@@ -138,7 +166,10 @@ def riplike(seq, outfile, window=400, step=5, nrep=100):
138166

139167
result.update({'quant': quant})
140168
results.append(result)
141-
169+
count += 1
170+
print(count)
171+
172+
print(fasta)
142173
return results
143174

144175

@@ -153,21 +184,22 @@ def main():
153184
help='<output> file to write CSV results.')
154185
parser.add_argument('-window', type=int, default=400,
155186
help='<optional, int> Window size for p-distances.')
156-
parser.add_argument('-step', type=int, default=5,
187+
# FIXME: step size is by default 1
188+
189+
parser.add_argument('-step', type=int, default=1,
157190
help='<optional, int> Window step size.')
158191
parser.add_argument('-nrep', type=int, default=100,
159192
help='<optional, int> Number of bootstrap replicates.')
160193

161194
args = parser.parse_args()
162195
args.outfile.write('qname,pos,rname,pdist,rname2,pdist2,qboot\n')
163-
164196
fasta = convert_fasta(args.infile)
165197
for h, s in fasta:
166198
print(h) # crude progress monitoring
167199
results = riplike(s, args.outfile, window=args.window, step=args.step, nrep=args.nrep)
168200
for result in results:
169201
args.outfile.write(
170-
'{},{left},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
202+
'{},{centre},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
171203
.format(h, **result)
172204
)
173205

poplars/sequence_locator.py

Lines changed: 81 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,18 @@
1616
class GenomeRegion:
1717
"""
1818
Represents information about a genomic region
19+
20+
:ivar region_name: The name of the genomic region
21+
:ivar nt_coords: A list containing the start and end indices of the nucleotide region
22+
:ivar nt_seq: The nucleotide sequence
23+
:ivar aa_coords: A list containing the start and end indices of the protein region
24+
:ivar aa_seq: The amino acid sequence
25+
:ivar pos_from_cds: Nucleotide position relative to CDS start
26+
:ivar pos_from_gstart: Nucleotide position relative to the reference genome start
27+
:ivar pos_from_qstart: Nucleotide position relative to the start of the query sequence
28+
:ivar pos_from_rstart: Nucleotide position relative to the start of the region
29+
:ivar pos_from_pstart: Amino acid position relative to protein start
30+
:ivar codon_aln: Amino acid sequence aligned with the nucleotide sequence
1931
"""
2032

2133
def __init__(self, region_name, nt_coords=None, nt_seq=None, aa_coords=None, aa_seq=None):
@@ -28,17 +40,11 @@ def __init__(self, region_name, nt_coords=None, nt_seq=None, aa_coords=None, aa_
2840
:param aa_seq: <option> The amino acid sequence of the genomic region
2941
"""
3042
self.region_name = region_name
31-
self.nt_coords = nt_coords
32-
self.nt_seq = nt_seq
33-
self.aa_coords = aa_coords
34-
self.aa_seq = aa_seq
35-
36-
self.pos_from_cds = [] # Nucleotide position relative to CDS start
37-
self.pos_from_gstart = None # Nucleotide position relative to the reference genome start
38-
self.pos_from_qstart = None # Nucleotide position relative to the start of the query sequence
39-
self.pos_from_rstart = None
40-
self.pos_from_aa_start = None # Amino acid position relative to protein start
41-
self.codon_aln = '' # Amino acid sequence aligned with the nucleotide sequence
43+
self.nt_coords, self.nt_seq = nt_coords, nt_seq
44+
self.aa_coords, self.aa_seq = aa_coords, aa_seq
45+
self.pos_from_cds = []
46+
self.pos_from_gstart, self.pos_from_qstart, self.pos_from_rstart, self.pos_from_pstart = None, None, None, None
47+
self.codon_aln = ''
4248

4349
def get_coords(self, base):
4450
if base == 'nucl':
@@ -86,7 +92,7 @@ def set_pos_from_cds(self, virus):
8692
self.pos_from_cds.append((self.nt_coords[0] + 1 - cds_start))
8793
self.pos_from_cds.append((self.nt_coords[1] + 1 - cds_start))
8894

89-
def set_pos_from_aa_start(self, virus):
95+
def set_pos_from_pstart(self, virus):
9096
"""
9197
Gives the position of the sequence relative to the start of the protein sequence
9298
"""
@@ -98,13 +104,13 @@ def set_pos_from_aa_start(self, virus):
98104
if len(self.pos_from_cds) == 2:
99105
# If the whole protein sequence is encompassed in the CDS range
100106
if len(self.aa_seq) == (((self.pos_from_cds[1] - self.pos_from_cds[0]) // 3) + 1):
101-
self.pos_from_aa_start = [1, (((self.pos_from_cds[1] - self.pos_from_cds[0]) // 3) + 1)]
107+
self.pos_from_pstart = [1, (((self.pos_from_cds[1] - self.pos_from_cds[0]) // 3) + 1)]
102108
else:
103-
self.pos_from_aa_start = [(self.pos_from_cds[0] // 3) + 1, self.pos_from_cds[1] // 3]
109+
self.pos_from_pstart = [(self.pos_from_cds[0] // 3) + 1, self.pos_from_cds[1] // 3]
104110

105111
# If the region is outside the CDS
106112
else:
107-
self.pos_from_aa_start = None
113+
self.pos_from_pstart = None
108114

109115
def make_codon_aln(self):
110116
"""
@@ -188,7 +194,7 @@ def set_regions(virus, nt_reference, nt_coords, aa_reference, aa_coords):
188194
seq_region.set_seq_from_ref(nt_reference, 'nucl')
189195
seq_region.set_pos_from_cds(virus)
190196
seq_region.pos_from_gstart = nucl_coords
191-
seq_region.set_pos_from_aa_start(virus)
197+
seq_region.set_pos_from_pstart(virus)
192198
genome_regions.append(seq_region)
193199

194200
# Parse protein coordinates file
@@ -310,7 +316,7 @@ def get_query(base, query_file, revcomp):
310316
sys.exit(0)
311317

312318
else:
313-
if revcomp == 'y':
319+
if revcomp:
314320
if base == 'prot':
315321
print("Invalid option: reverse complement is not available for proteins.")
316322
else:
@@ -426,7 +432,7 @@ def find_matches(virus, base, ref_regions, match_coordinates):
426432
query_region.set_sequence(ov_seq, base)
427433
query_region.set_pos_from_cds(virus)
428434
query_region.pos_from_gstart = [start_aln, end_aln]
429-
query_region.set_pos_from_aa_start(virus)
435+
query_region.set_pos_from_pstart(virus)
430436

431437
if base == 'nucl':
432438
set_protein_equivalents(query_region, ref_regions)
@@ -507,9 +513,9 @@ def output(query_regions, outfile=None):
507513
print("\tNucleotide position relative to genome start: {} --> {}"
508514
.format(reg.pos_from_gstart[0] + 1, reg.pos_from_gstart[1] + 1))
509515

510-
if reg.pos_from_aa_start is not None:
516+
if reg.pos_from_pstart is not None:
511517
print("\tAmino acid position relative to protein start: {} --> {}"
512-
.format(reg.pos_from_aa_start[0], reg.pos_from_aa_start[1]))
518+
.format(reg.pos_from_pstart[0], reg.pos_from_pstart[1]))
513519

514520
if reg.pos_from_qstart is not None:
515521
print("\tPosition relative to query start: {} --> {}\n"
@@ -565,32 +571,69 @@ def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end
565571
:param ref_regions: A list of GenomeRegion objects
566572
:param region: The genomic region
567573
:param outfile: The file stream of the output file
568-
:param start_offset: <option> The start coordinate
569-
:param end_offset: <option> The end coordinate
574+
:param start_offset: <option> The start coordinate of the query region
575+
:param end_offset: <option> The end coordinate of the query region
570576
:return: The genomic region defined by the starting and ending coordinates
571577
"""
578+
query_region = None
572579
for ref_region in ref_regions:
580+
sequence_range = ref_region.get_coords(base)
581+
region_start, region_end = sequence_range[0], sequence_range[1]
582+
length = region_end - region_start
583+
584+
# Check if offsets are given in global coordinates
585+
if region_start <= start_offset <= region_end:
586+
start = start_offset - region_start + 1
587+
# Or in local coordinates
588+
elif 1 <= start_offset <= length:
589+
start = start_offset
590+
# Or out of range
591+
else:
592+
start = 1
593+
594+
# Check for 'end'
595+
if end_offset == 'end':
596+
end = length + 1
597+
# Check for global coordinates
598+
elif region_start <= end_offset <= region_end:
599+
end = end_offset - region_start + 1
600+
# Or local coordinates
601+
elif 1 <= start_offset <= length:
602+
end = end_offset + 1
603+
# Or out of range
604+
else:
605+
end = length + 1
606+
607+
# # Handles global and local start coordinates
608+
# if start_offset <= region_start:
609+
# start = region_start
610+
# else:
611+
# start = region_start + (start_offset - region_start)
612+
#
613+
# # Handles global and local end coordinates
614+
# if end_offset == 'end' or end_offset > region_end:
615+
# end = region_end
616+
# else:
617+
# end = region_end + (region_end - end_offset)
618+
619+
# Create a GenomeRegion object for the query
573620
if ref_region.region_name == region:
574-
sequence_range = ref_region.get_coords(base)
575-
region_start = sequence_range[0]
576-
region_end = sequence_range[1]
621+
query_region = GenomeRegion(region)
622+
query_region.set_coords([start, end], base) # Set global coordinates
577623

578-
if start_offset <= region_start:
579-
start = region_start
580-
else:
581-
start = region_start + (start_offset - region_start)
624+
# Slice reference sequence with local coordinates
625+
qstart, qend = query_region.global_to_local_index([start, end], base)
626+
query_seq = ref_region.get_sequence(base)[qstart - 1: qend]
627+
query_region.set_sequence(query_seq, base)
582628

583-
# If end_coord is greater than the region's end coordinate, set end_coord to region's end coordinate
584-
if end_offset == 'end' or end_offset > region_end:
585-
end = region_end
586-
else:
587-
end = region_end + (region_end - end_offset)
629+
if query_region is not None:
630+
# TODO: sort retrieved_regions to print query_region first
631+
retrieved_regions = find_matches(virus, base, ref_regions, [query_region.get_coords(base)])
588632

589-
# TODO: sort retrieved_regions to print region first
590-
retrieved_regions = find_matches(virus, base, ref_regions, [[start, end]])
633+
# TODO: remove duplicate query_region
591634
output(retrieved_regions, outfile)
592635

593-
return retrieved_regions
636+
return query_region, retrieved_regions
594637

595638

596639
def handle_args(virus, base, ref_nt, nt_coords, ref_aa, aa_coords):
@@ -677,7 +720,7 @@ def parse_args():
677720
'relative to the HIV or SIV reference genome')
678721
parser_align.add_argument('query', type=argparse.FileType('r'),
679722
help='Path to the file containing the query sequence.')
680-
parser_align.add_argument('-revcomp', default='n', choices=['y', 'n'],
723+
parser_align.add_argument('-revcomp', type=bool, default=False, choices=[True, False],
681724
help='Align the reverse complement of the query sequence with the reference sequence')
682725
parser_align.add_argument('-outfile', type=argparse.FileType('w'),
683726
help='Path to the file where results will be written. '

poplars/tests/test_hypermut.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,3 @@ def test2UndefinedRateRatio(self):
123123
self.assertEqual(expected, result)
124124

125125

126-
class testMakeDataFile(unittest.TestCase):
127-
def test(self):
128-
pass

0 commit comments

Comments
 (0)