Skip to content

Commit a66f019

Browse files
committed
-Passes all test cases for 'align' mode (#8)
1 parent 1e3056c commit a66f019

11 files changed

+85
-138
lines changed

poplars/ref_genomes/K03455-protein.fasta

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,16 @@ IGVTRQRRARNGASRS
6565
>Tat|HIVHXB2CG
6666
MEPVDPRLEPWKHPGSQPKTACTNCYCKKCCFHCQVCFITKALGISYGRKKRRQRRRAHQNSQTHQASLSKQPTSQPRGD
6767
PTGPKESKKKVERETETDPFD
68-
>Tat(exon 1)|HIVHXB2CG
68+
>Tat(exon1)|HIVHXB2CG
6969
MEPVDPRLEPWKHPGSQPKTACTNCYCKKCCFHCQVCFITKALGISYGRKKRRQRRRAHQNSQTHQASLSK
70-
>Tat(exon 2)|HIVHXB2CG
70+
>Tat(exon2)|HIVHXB2CG
7171
PTSQPRGDPTGPKESKKKVERETETDPFD
7272
>Rev|HIVHXB2CG
7373
MAGRSGDSDEELIRTVRLIKLLYQSNPPPNPEGTRQARRNRRRRWRERQRQIHSISERILGTYLGRSAEPVPLQLPPLER
7474
LTLDCNEDCGTSGTQGVGSPQILVESPTVLESGTKE
75-
>Rev(exon 1)|HIVHXB2CG
75+
>Rev(exon1)|HIVHXB2CG
7676
MAGRSGDSDEELIRTVRLIKLLYQS
77-
>Rev(exon 2)|HIVHXB2CG
77+
>Rev(exon2)|HIVHXB2CG
7878
PPPNPEGTRQARRNRRRRWRERQRQIHSISERILGTYLGRSAEPVPLQLPPLERLTLDCNEDCGTSGTQGVGSPQILVES
7979
PTVLESGTKE
8080
>Vpu|HIVHXB2CG

poplars/ref_genomes/K03455_genome_coordinates.csv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,8 @@ RNase,3870,4229
1414
Integrase,4230,5096
1515
Vif,5041,5619
1616
Vpr,5559,5850
17-
Tat,5831,8469
1817
Tat(exon1),5831,6045
1918
Tat(exon2),8379,8469
20-
Rev,5970,8653
2119
Rev(exon1),5970,6045
2220
Rev(exon2),8379,8653
2321
Vpu,6062,6310

poplars/ref_genomes/K03455_protein_coordinates.csv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,8 @@ RNase,1096,1215
1212
Integrase,1216,1503
1313
Vif,1504,1695
1414
Vpr,1696,1791
15-
Tat,1792,1892
1615
Tat(exon1),1792,1862
1716
Tat(exon2),1863,1892
18-
Rev(with intron),1893,2008
1917
Rev(exon1),1893,1918
2018
Rev(exon2),1919,2008
2119
Vpu,2009,2090

poplars/ref_genomes/M33262-protein.fasta

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,20 @@ MHCKKGCRCLGEGHGAGGWRPGPPPPPPPGLA
6565
>Vpr|SIVMM239
6666
MEERPPENEGPQREPWDEWVVEVLEELKEEALKHFDPRLLTALGNHIYNRHGDTLEGAGELIRILQRALFMHFRGGCIHS
6767
RIGQPGGGNPLSAIPPSRSML
68-
>Tat(full protein)|SIVMM239
68+
>Tat|SIVMM239
6969
METPLREQENSLESSNERSSCISEADASTPESANLGEEILSQLYRPLEACYNTCYCKKCCYHCQFCFLKKGLGICYEQSR
7070
KRRRTPKKAKANTSSASNKPISNRTRHCQPEKAKKETVEKAVATAPGLGR
71-
>Tat(exon 1)|SIVMM239
71+
>Tat(exon1)|SIVMM239
7272
METPLREQENSLESSNERSSCISEADASTPESANLGEEILSQLYRPLEACYNTCYCKKCCYHCQFCFLKKGLGICYEQSR
7373
KRRRTPKKAKANTSSASN
74-
>Tat(exon 2)|SIVMM239
74+
>Tat(exon2)|SIVMM239
7575
PISNRTRHCQPEKAKKETVEKAVATAPGLGR
7676
>Rev|SIVMM239
7777
MSNHEREEELRKRLRLIHLLHQTNPYPTGPGTANQRRQRKRRWRRRWQQLLALADRIYSFPDPPTDTPLDLAIQQLQNLA
7878
IESIPDPPTNTPEALCDPTEDSRSPQD
79-
>Rev(exon 1)|SIVMM239
79+
>Rev(exon1)|SIVMM239
8080
MSNHEREEELRKRLRLIHLLHQT
81-
>Rev(exon 2)|SIVMM239
81+
>Rev(exon2)|SIVMM239
8282
PYPTGPGTANQRRQRKRRWRRRWQQLLALADRIYSFPDPPTDTPLDLAIQQLQNLAIESIPDPPTNTPEALCDPTEDSRS
8383
PQD
8484
>Env(gp160)|SIVMM239

poplars/ref_genomes/M33262_genome_coordinates.csv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@ Integrase,4785,5666
1515
Vif,5596,6240
1616
Vpx,6068,6406
1717
Vpr,6407,6712
18-
Tat,6558,9158
1918
Tat(exon1),6558,6853
2019
Tat(exon2),9062,9158
21-
Rev,6784,9315
2220
Rev(exon1),6784,6853
2321
Rev(exon2),9062,9315
2422
Env(gp160),6860,9499

poplars/ref_genomes/M33262_protein_coordinates.csv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,8 @@ Integrase,1237,1529
1313
Vif,1530,1743
1414
Vpx,1744,1855
1515
Vpr,1856,1956
16-
Tat,1957,2086
1716
Tat(exon1),1957,2055
1817
Tat(exon2),2056,2086
19-
Rev,2087,2193
2018
Rev(exon1),2087,2110
2119
Rev(exon2),2111,2193
2220
Env(gp160),2194,3072

poplars/sequence_locator.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,12 @@ def make_codon_aln(self):
128128
if self.nt_seq is not None and self.aa_seq is not None:
129129
codon_aln = []
130130
codons = [''.join(t) for t in zip(*[iter(self.nt_seq)] * 3)]
131-
for i in range(len(codons)):
131+
for aa, codon in zip(self.aa_seq, codons):
132132
# Check if stop codon
133-
if codons[i] == 'TAA' or codons[i] == 'TGA' or codons[i] == 'TAG':
133+
if codon == 'TAA' or codon == 'TGA' or codon == 'TAG':
134134
codon_aln.append('-*-')
135135
else:
136-
codon_aln.append('-{}-'.format(self.aa_seq[i]))
136+
codon_aln.append('-{}-'.format(aa))
137137
self.codon_aln = ''.join(codon_aln)
138138
return self.codon_aln
139139

@@ -194,10 +194,10 @@ def set_regions(virus, base, nt_coords, nt_seq, aa_coords, aa_seq):
194194
:param base: The base of the sequence
195195
:param nt_coords: Path to the csv file containing the global coordinates of the nucleotide region.
196196
The file stream has one genomic entry per line and has the following format: region_name,start,end
197-
:param nt_seq: The file stream containing the reference nucleotide sequence in read mode
197+
:param nt_seq: The nucleotide sequence
198198
:param aa_coords: Path to the csv file containing the global coordinates of the protein region.
199199
The file stream has one genomic entry per line and has the following format: region_name,start,end
200-
:param aa_seqe: The file stream containing the reference nucleotide sequence in read mode
200+
:param aa_seq: A list of lists containing the amino acid sequences
201201
:return: A list of GenomeRegions
202202
"""
203203

@@ -225,26 +225,24 @@ def set_regions(virus, base, nt_coords, nt_seq, aa_coords, aa_seq):
225225
genome_regions.append(seq_region)
226226

227227
# Parse protein coordinates file
228-
prot_names = []
229228
prot_coords = []
230229
for aa_line in aa_coords:
231230
aa_line = aa_line.strip()
232231
aa_line = aa_line.split(',')
233232
prot_coords.append([int(aa_line[1]), int(aa_line[2])])
234-
prot_names.append(aa_line[0])
235233

236-
for i in range(len(prot_names)):
234+
for i, coords in enumerate(prot_coords):
237235
for seq_region in genome_regions:
238-
if prot_names[i] in seq_region.region_name:
236+
if aa_seq[i][0].startswith(seq_region.region_name):
239237
# Set global and local protein coordinates
240-
seq_region.set_coords(prot_coords[i], 'prot')
241-
seq_region.set_seq_from_ref(aa_seq, 'prot')
242-
238+
seq_region.set_coords(coords, 'prot')
239+
seq_region.set_sequence(aa_seq[i][1], 'prot')
243240
seq_region.set_pos_from_pstart(virus)
241+
seq_region.make_codon_aln()
244242

245243
else:
246244
# Parse protein region coordinates file
247-
for aa_line in aa_coords:
245+
for i, aa_line in enumerate(aa_coords):
248246
aa_line = aa_line.strip()
249247
aa_line = aa_line.split(',')
250248
prot_coords = [int(aa_line[1]), int(aa_line[2])]
@@ -253,7 +251,7 @@ def set_regions(virus, base, nt_coords, nt_seq, aa_coords, aa_seq):
253251

254252
# Set global and local protein coordinates
255253
seq_region.set_coords(prot_coords, 'prot')
256-
seq_region.set_seq_from_ref(aa_seq, 'prot')
254+
seq_region.set_sequence(aa_seq[i][1], 'prot')
257255

258256
# Set relative positions
259257
seq_region.set_pos_from_cds(virus)
@@ -268,17 +266,17 @@ def set_regions(virus, base, nt_coords, nt_seq, aa_coords, aa_seq):
268266
for nt_line in nt_coords:
269267
nt_line = nt_line.strip()
270268
nt_line = nt_line.split(',')
271-
nucl_coords.append([int(nt_line[1]), int(nt_line[2])])
272269
nucl_names.append(nt_line[0])
270+
nucl_coords.append([int(nt_line[1]), int(nt_line[2])])
273271

274-
for i in range(len(nucl_names)):
272+
for i, name in enumerate(nucl_names):
275273
for seq_region in genome_regions:
276-
if nucl_names[i] in seq_region.region_name:
274+
if name.startswith(seq_region.region_name):
277275
# Set global and local nucleotide coordinates
278276
seq_region.set_coords(nucl_coords[i], 'nucl')
279277
seq_region.set_seq_from_ref(nt_seq, 'nucl')
280-
281278
seq_region.set_pos_from_pstart(virus)
279+
seq_region.make_codon_aln()
282280

283281
return genome_regions
284282

@@ -557,7 +555,7 @@ def output_retrieved_region(region, outfile=None):
557555
print("\tNucleotide Sequence:")
558556
seq_lines = [region.nt_seq[i:i + 60] for i in range(0, len(region.nt_seq), 60)]
559557
for line in seq_lines:
560-
print('\t\t{}\n'.format(line))
558+
print('\t\t{}'.format(line))
561559

562560
if region.aa_seq:
563561
print("\tProtein Sequence:")
@@ -739,8 +737,8 @@ def retrieve(virus, base, ref_regions, region, qstart=1, qend='end'):
739737
query_region = GenomeRegion(region)
740738

741739
# Set local and global coordinates
742-
query_region.set_coords([qstart, qend], base)
743-
global_coords = query_region.local_to_global_index(ref_region, [qstart, qend], base)
740+
# query_region.set_coords([qstart, qend], base)
741+
global_coords = GenomeRegion.local_to_global_index(ref_region, [qstart, qend], base)
744742
query_region.set_coords(global_coords, base)
745743

746744
# Set protein and nucleotide sequences
@@ -878,7 +876,7 @@ def main():
878876
# Ensure proper configuration files are set
879877
configs = handle_args(args.virus, args.base, args.ref_nt, args.nt_coords, args.ref_aa, args.aa_coords)
880878
ref_nt_seq = configs[0][0][1]
881-
ref_aa_seq = configs[1][0][1]
879+
ref_aa_seq = configs[1]
882880
nt_coords = configs[2]
883881
aa_coords = configs[3]
884882
reference_sequence = configs[4]
Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
>Rev|SIVMM239
2-
MSNHEREEELRKRLRLIHLLHQTNPYPTGPGTANQRRQRKRRWRRRWQQLLALADRIYSFPDPPTDTPLDLAIQQLQNLA
3-
IESIPDPPTNTPEALCDPTEDSRSPQD
4-
>Rev(exon 1)|SIVMM239
1+
>Rev(exon1)|SIVMM239
52
MSNHEREEELRKRLRLIHLLHQT
6-
>Rev(exon 2)|SIVMM239
7-
PYPTGPGTANQRRQRKRRWRRRWQQLLALADRIYSFPDPPTDTPLDLAIQQLQNLAIESIPDPPTNTPEALCDPTEDSRS
8-
PQD
3+
>Rev(exon2)|SIVMM239
4+
PYPTGPGTANQRRQRKRRWRRRWQQLLALADRIYSFPDPPTDTPLDLAIQQLQNLAIESIPDPPTNTPEALCDPTEDSRSPQD
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
Rev(with intron),6784,9315
21
Rev(exon1),6784,6853
32
Rev(exon2),9062,9315
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
Rev,2087,2193
21
Rev(exon1),2087,2110
32
Rev(exon2),2111,2193

0 commit comments

Comments
 (0)