Skip to content

Commit e44dd45

Browse files
committed
-Updated test cases and code cleanup (#20 in progress)
1 parent 4b58083 commit e44dd45

File tree

4 files changed

+180
-174
lines changed

4 files changed

+180
-174
lines changed

poplars/sequence_locator.py

+87-88
Original file line numberDiff line numberDiff line change
@@ -186,18 +186,18 @@ def get_overlap(self, region, coord_pair, base):
186186
return overlap
187187

188188

189-
def set_regions(virus, base, nt_coords, nt_reference, aa_coords, aa_reference):
189+
def set_regions(virus, base, nt_coords, nt_seq, aa_coords, aa_seq):
190190
"""
191191
Reads in the start and end coordinates of the genomic regions and associates the region with its coordinates.
192192
If no coordinate files are specified, set default nucleotide and protein coordinate files.
193193
:param virus: The organism (HIV or SIV)
194194
:param base: The base of the sequence
195195
:param nt_coords: Path to the csv file containing the global coordinates of the nucleotide region.
196196
The file stream has one genomic entry per line and has the following format: region_name,start,end
197-
:param nt_reference: The file stream containing the reference nucleotide sequence in read mode
197+
:param nt_seq: The file stream containing the reference nucleotide sequence in read mode
198198
:param aa_coords: Path to the csv file containing the global coordinates of the protein region.
199199
The file stream has one genomic entry per line and has the following format: region_name,start,end
200-
:param aa_reference: The file stream containing the reference nucleotide sequence in read mode
200+
:param aa_seqe: The file stream containing the reference nucleotide sequence in read mode
201201
:return: A list of GenomeRegions
202202
"""
203203

@@ -206,83 +206,79 @@ def set_regions(virus, base, nt_coords, nt_reference, aa_coords, aa_reference):
206206
if base == 'nucl':
207207

208208
# Parse nucleotide region coordinates file
209-
with open(nt_coords, 'r') as nt_handle:
210-
for nt_line in nt_handle:
211-
nt_line = nt_line.strip()
212-
nt_line = nt_line.split(',')
213-
nucl_coords = [int(nt_line[1]), int(nt_line[2])]
209+
for nt_line in nt_coords:
210+
nt_line = nt_line.strip()
211+
nt_line = nt_line.split(',')
212+
nucl_coords = [int(nt_line[1]), int(nt_line[2])]
214213

215-
seq_region = GenomeRegion(nt_line[0], nucl_coords)
214+
seq_region = GenomeRegion(nt_line[0])
216215

217-
# Set global and local nucleotide coordinates
218-
seq_region.set_coords(nucl_coords, 'nucl')
219-
seq_region.set_seq_from_ref(nt_reference, 'nucl')
216+
# Set global and local nucleotide coordinates
217+
seq_region.set_coords(nucl_coords, 'nucl')
218+
seq_region.set_seq_from_ref(nt_seq, 'nucl')
220219

221-
# Set relative positions
222-
seq_region.set_pos_from_cds(virus)
223-
seq_region.set_pos_from_gstart()
224-
seq_region.set_pos_from_pstart(virus)
220+
# Set relative positions
221+
seq_region.set_pos_from_cds(virus)
222+
seq_region.set_pos_from_gstart()
223+
seq_region.set_pos_from_pstart(virus)
225224

226-
genome_regions.append(seq_region)
225+
genome_regions.append(seq_region)
227226

228227
# Parse protein coordinates file
229-
with open(aa_coords, 'r') as aa_handle:
230-
prot_names = []
231-
prot_coords = []
232-
for aa_line in aa_handle:
233-
aa_line = aa_line.strip()
234-
aa_line = aa_line.split(',')
235-
prot_coords.append([int(aa_line[1]), int(aa_line[2])])
236-
prot_names.append(aa_line[0])
237-
238-
for i in range(len(prot_names)):
239-
for seq_region in genome_regions:
240-
if prot_names[i] in seq_region.region_name:
241-
# Set global and local protein coordinates
242-
seq_region.set_coords(prot_coords[i], 'prot')
243-
seq_region.set_seq_from_ref(aa_reference, 'prot')
244-
245-
seq_region.set_pos_from_pstart(virus)
228+
prot_names = []
229+
prot_coords = []
230+
for aa_line in aa_coords:
231+
aa_line = aa_line.strip()
232+
aa_line = aa_line.split(',')
233+
prot_coords.append([int(aa_line[1]), int(aa_line[2])])
234+
prot_names.append(aa_line[0])
235+
236+
for i in range(len(prot_names)):
237+
for seq_region in genome_regions:
238+
if prot_names[i] in seq_region.region_name:
239+
# Set global and local protein coordinates
240+
seq_region.set_coords(prot_coords[i], 'prot')
241+
seq_region.set_seq_from_ref(aa_seq, 'prot')
242+
243+
seq_region.set_pos_from_pstart(virus)
246244

247245
else:
248246
# Parse protein region coordinates file
249-
with open(aa_coords, 'r') as aa_handle:
250-
for aa_line in aa_handle:
251-
aa_line = aa_line.strip()
252-
aa_line = aa_line.split(',')
253-
prot_coords = [int(aa_line[1]), int(aa_line[2])]
247+
for aa_line in aa_coords:
248+
aa_line = aa_line.strip()
249+
aa_line = aa_line.split(',')
250+
prot_coords = [int(aa_line[1]), int(aa_line[2])]
254251

255-
seq_region = GenomeRegion(aa_line[0])
252+
seq_region = GenomeRegion(aa_line[0])
256253

257-
# Set global and local nucleotide coordinates
258-
seq_region.set_coords(prot_coords, 'prot')
259-
seq_region.set_seq_from_ref(aa_reference, 'prot')
254+
# Set global and local nucleotide coordinates
255+
seq_region.set_coords(prot_coords, 'prot')
256+
seq_region.set_seq_from_ref(aa_seq, 'prot')
260257

261-
# Set relative positions
262-
seq_region.set_pos_from_cds(virus)
263-
seq_region.set_pos_from_gstart()
264-
seq_region.set_pos_from_pstart(virus)
258+
# Set relative positions
259+
seq_region.set_pos_from_cds(virus)
260+
seq_region.set_pos_from_gstart()
261+
seq_region.set_pos_from_pstart(virus)
265262

266-
genome_regions.append(seq_region)
263+
genome_regions.append(seq_region)
267264

268265
# Parse nucleotide coordinates file
269-
with open(nt_coords, 'r') as nt_handle:
270-
nucl_names = []
271-
nucl_coords = []
272-
for nt_line in nt_handle:
273-
nt_line = nt_line.strip()
274-
nt_line = nt_line.split(',')
275-
nucl_coords.append([int(nt_line[1]), int(nt_line[2])])
276-
nucl_names.append(nt_line[0])
277-
278-
for i in range(len(nucl_names)):
279-
for seq_region in genome_regions:
280-
if nucl_names[i] in seq_region.region_name:
281-
# Set global and local protein coordinates
282-
seq_region.set_coords(nucl_coords[i], 'nucl')
283-
seq_region.set_seq_from_ref(nt_reference, 'nucl')
284-
285-
seq_region.set_pos_from_pstart(virus)
266+
nucl_names = []
267+
nucl_coords = []
268+
for nt_line in nt_coords:
269+
nt_line = nt_line.strip()
270+
nt_line = nt_line.split(',')
271+
nucl_coords.append([int(nt_line[1]), int(nt_line[2])])
272+
nucl_names.append(nt_line[0])
273+
274+
for i in range(len(nucl_names)):
275+
for seq_region in genome_regions:
276+
if nucl_names[i] in seq_region.region_name:
277+
# Set global and local protein coordinates
278+
seq_region.set_coords(nucl_coords[i], 'nucl')
279+
seq_region.set_seq_from_ref(nt_seq, 'nucl')
280+
281+
seq_region.set_pos_from_pstart(virus)
286282

287283
return genome_regions
288284

@@ -367,6 +363,7 @@ def get_query(base, query_file, revcomp):
367363
:param revcomp: Option to align the reverse complement of the nucleotide sequence ('y' or 'n')
368364
:return: A list of lists containing the sequence identifiers and the query sequences
369365
"""
366+
370367
line = query_file.readline()
371368
query_file.seek(0) # Reset pointer to beginning
372369

@@ -886,38 +883,40 @@ def main():
886883
aa_coords = configs[3]
887884
reference_sequence = configs[4]
888885

889-
# Create genomic region objects based on configuration files
890-
ref_regions = set_regions(args.virus, args.base, nt_coords, ref_nt_seq, aa_coords, ref_aa_seq)
886+
with open(nt_coords) as nt_coords_handle, open(aa_coords) as aa_coords_handle:
891887

892-
if args.subcommand == "align":
893-
query = get_query(args.base, args.query, args.revcomp)
894-
alignment = sequence_align(query, reference_sequence, args.outfile)
888+
# Create genomic region objects based on configuration files
889+
ref_regions = set_regions(args.virus, args.base, nt_coords_handle, ref_nt_seq, aa_coords_handle, ref_aa_seq)
895890

896-
# Find indices where the query sequence aligns with the reference sequence
897-
match_coords = get_region_coordinates(alignment[-1]) # Query sequence will be the last item in the list
898-
query_regions = find_matches(args.virus, args.base, ref_regions, match_coords)
891+
if args.subcommand == "align":
892+
query = get_query(args.base, args.query, args.revcomp)
893+
alignment = sequence_align(query, reference_sequence, args.outfile)
899894

900-
output_overlap(query_regions, args.outfile)
895+
# Find indices where the query sequence aligns with the reference sequence
896+
match_coords = get_region_coordinates(alignment[-1]) # Query sequence will be the last item in the list
897+
query_regions = find_matches(args.virus, args.base, ref_regions, match_coords)
901898

902-
else:
903-
valid_in = valid_inputs(args.virus, args.start, args.end, args.region)
899+
output_overlap(query_regions, args.outfile)
904900

905-
if not valid_in:
906-
sys.exit(0)
907901
else:
908-
result = retrieve(args.virus, args.base, ref_regions, args.region, args.start, args.end)
909-
query_region = result[0]
910-
overlap_regions = result[1]
902+
valid_in = valid_inputs(args.virus, args.start, args.end, args.region)
911903

912-
# Print retrieved region first
913-
output_retrieved_region(query_region, args.outfile)
914-
915-
if args.outfile is None:
916-
print("\n\nRegions touched by the query sequence:")
904+
if not valid_in:
905+
sys.exit(0)
917906
else:
918-
args.outfile.write("\n\nRegions touched by the query sequence:\n\n")
907+
result = retrieve(args.virus, args.base, ref_regions, args.region, args.start, args.end)
908+
query_region = result[0]
909+
overlap_regions = result[1]
910+
911+
# Print retrieved region first
912+
output_retrieved_region(query_region, args.outfile)
913+
914+
if args.outfile is None:
915+
print("\n\nRegions touched by the query sequence:")
916+
else:
917+
args.outfile.write("\n\nRegions touched by the query sequence:\n\n")
919918

920-
output_overlap(overlap_regions, args.outfile)
919+
output_overlap(overlap_regions, args.outfile)
921920

922921

923922
if __name__ == '__main__':

poplars/tests/fixtures/siv-test-genome.fasta

+1-13
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,4 @@ ACTTGGCTATTCAGCAACTGCAGAACCTTGCTATCGAGAGTATACCAGATCCTCCAACCAATACTCCAGAGGCTCTCTGC
3030
GACCCTACAGAGGATTCGAGAAGTCCTCAGGACTGAACTGACCTACCTACAATATGGGTGGAGCTATTTCCATGAGGCGG
3131
TCCAGGCCGTCTGGAGATCTGCGACAGAGACTCTTGCGGGCGCGTGGGGAGACTTATGGGAGACTCTTAGGAGAGGTGGA
3232
AGATGGATACTCGCAATCCCCAGGAGGATTAGACAAGGGCTTGAGCTCACTCTCTTGTGAGGGACAGAAATACAATCAGG
33-
GACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATTAGCATACAGAAAACAAAATATGGATGAT
34-
ATAGATGAGGAAGATGATGACTTGGTAGGGGTATCAGTGAGGCCAAAAGTTCCCCTAAGAACAATGAGTTACAAATTGGC
35-
AATAGACATGTCTCATTTTATAAAAGAAAAGGGGGGACTGGAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAG
36-
ACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGATTACACCTCAGGACCAGGAATTAGATACCCAAAG
37-
ACATTTGGCTGGCTATGGAAATTAGTCCCTGTAAATGTATCAGATGAGGCACAGGAGGATGAGGAGCATTATTTAATGCA
38-
TCCAGCTCAAACTTCCCAGTGGGATGACCCTTGGGGAGAGGTTCTAGCATGGAAGTTTGATCCAACTCTGGCCTACACTT
39-
ATGAGGCATATGTTAGATACCCAGAAGAGTTTGGAAGCAAGTCAGGCCTGTCAGAGGAAGAGGTTAGAAGAAGGCTAACC
40-
GCAAGAGGCCTTCTTAACATGGCTGACAAGAAGGAAACTCGCTGAAACAGCAGGGACTTTCCACAAGGGGATGTTACGGG
41-
GAGGTACTGGGGAGGAGCCGGTCGGGAACGCCCACTTTCTTGATGTATAAATATCACTGCATTTCGCTCTGTATTCAGTC
42-
GCTCTGCGGAGAGGCTGGCAGATTGAGCCCTGGGAGGTTCTCTCCAGCACTAGCAGGTAGAGCCTGGGTGTTCCCTGCTA
43-
GACTCTCACCAGCACTTGGCCGGTGCTGGGCAGAGTGACTCCACGCTTGCTTGCTTAAAGCCCTCTTCAATAAAGCTGCC
44-
ATTTTAGAAGTAAGCTAGTGTGTGTTCCCATCTCTCCTAGCCGCCGCCTGGTCAACTCGGTACTCAATAATAAGAAGACC
45-
CTGGTCTGTTAGGACCCTTTCTGCTTTGGGAAACCGAAGCAGGAAAATCCCTAGC
33+
GACAGTATATGAATACTCCATGGAGAAACCCAGCTGAAGAGAGAGAAAAATT
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
11
Rev(with intron),6784,9315
22
Rev(exon1),6784,6853
3-
Rev(exon2),9062,9315
4-
Env(gp160),6860,9499
5-
gp120,6860,8434
6-
gp41,8435,9499
7-
Nef,9333,10124
8-
3'LTR,9719,10535
3+
Rev(exon2),9062,9315

0 commit comments

Comments
 (0)