Skip to content

Commit 80b777a

Browse files
committed
-Code clean-up and testing (#8)
1 parent 5b49a61 commit 80b777a

10 files changed

+193
-250
lines changed

poplars/ref_genomes/K03455_genome_coordinates.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
5'LTR,1,634
22
Gag,790,2292
3-
Matrix,790,1185
4-
Capsid,1186,1878
3+
Matrix(p17/p15),790,1185
4+
Capsid(p24/p27),1186,1878
55
p2,1879,1920
6-
Nucleocapsid,1921,2085
6+
Nucleocapsid(p7/p8),1921,2085
77
p1,2086,2133
88
p6,2134,2292
99
Pol,2085,5096
@@ -21,7 +21,7 @@ Rev,5970,8653
2121
Rev(exon1),5970,6045
2222
Rev(exon2),8379,8653
2323
Vpu,6062,6310
24-
Env,6225,8795
24+
Env(gp160),6225,8795
2525
gp120,6225,7757
2626
gp41,7758,8795
2727
Nef,8797,9417

poplars/ref_genomes/K03455_protein_coordinates.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Gag,1,500
2-
p17,1,132
3-
p24,133,363
2+
Matrix(p17/p15),1,132
3+
Capsid(p24/p27),133,363
44
p2,364,377
5-
p7,378,432
5+
Nucleocapsid(p7/p8),378,432
66
p1,433,448
77
p6,449,500
88
Pol,501,1503
@@ -19,7 +19,7 @@ Rev(with intron),1893,2008
1919
Rev(exon1),1893,1918
2020
Rev(exon2),1919,2008
2121
Vpu,2009,2090
22-
gp160,2091,2946
22+
Env(gp160),2091,2946
2323
gp120,2091,2601
2424
gp41,2602,2946
2525
Nef,2947,3152

poplars/ref_genomes/M33262_genome_coordinates.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
5'LTR,257,1074
22
Gag,1309,2842
3-
Matrix,1309,1713
4-
Capsid,1714,2400
3+
Matrix(p17/p15),1309,1713
4+
Capsid(p24/p27),1714,2400
55
p2,2401,2451
6-
Nucleocapsid,2452,2607
6+
Nucleocapsid(p7/p8),2452,2607
77
p1,2608,2649
88
p6,2650,2842
99
Pol,2607,5666
@@ -21,7 +21,7 @@ Tat(exon2),9062,9158
2121
Rev,6784,9315
2222
Rev(exon1),6784,6853
2323
Rev(exon2),9062,9315
24-
Env,6860,9499
24+
Env(gp160),6860,9499
2525
gp120,6860,8434
2626
gp41,8435,9499
2727
Nef,9333,10124

poplars/ref_genomes/M33262_protein_coordinates.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Gag,1,510
2-
p15,1,135
3-
p27,136,364
2+
Matrix(p17/p15),1,135
3+
Capsid(p24/p27),136,364
44
p2,365,381
5-
p8,382,433
5+
Nucleocapsid(p7/p8),382,433
66
p1,434,447
77
p6,448,510
88
Pol,511,1529
@@ -19,7 +19,7 @@ Tat(exon2),2056,2086
1919
Rev,2087,2193
2020
Rev(exon1),2087,2110
2121
Rev(exon2),2111,2193
22-
gp160,2194,3072
22+
Env(gp160),2194,3072
2323
gp120,2194,2718
2424
gp41,2719,3072
2525
Nef,3073,3335

poplars/sequence_locator.py

Lines changed: 31 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ def valid_inputs(virus, start_coord, end_coord, region):
258258
if type(start_coord) == str:
259259
print("Invalid start coordinate type: {}".format(type(start_coord)))
260260

261-
if start_coord <= 0:
261+
if start_coord < 0:
262262
print("Invalid start coordinate: {}".format(start_coord))
263263
return False
264264

@@ -511,6 +511,14 @@ def output(query_regions, outfile=None):
511511
print("\tAmino acid position relative to protein start: {} --> {}"
512512
.format(reg.pos_from_aa_start[0], reg.pos_from_aa_start[1]))
513513

514+
if reg.pos_from_qstart is not None:
515+
print("\tPosition relative to query start: {} --> {}\n"
516+
.format(reg.pos_from_qstart[0], reg.pos_from_qstart[1]))
517+
518+
if reg.pos_from_rstart is not None:
519+
print("\tPosition relative to region start: {} --> {}\n"
520+
.format(reg.pos_from_rstart[0], reg.pos_from_rstart[1]))
521+
514522
else:
515523
outfile.write("\n\nRegions touched by the query sequence:")
516524
for reg in query_regions:
@@ -540,6 +548,14 @@ def output(query_regions, outfile=None):
540548
outfile.write("\tAmino acid position relative to protein start: {} --> {}\n"
541549
.format(reg.pos_from_aa_start[0], reg.pos_from_aa_start[1]))
542550

551+
if reg.pos_from_qstart is not None:
552+
outfile.write("\tPosition relative to query start: {} --> {}\n"
553+
.format(reg.pos_from_qstart[0], reg.pos_from_qstart[1]))
554+
555+
if reg.pos_from_rstart is not None:
556+
outfile.write("\tPosition relative to region start: {} --> {}\n"
557+
.format(reg.pos_from_rstart[0], reg.pos_from_rstart[1]))
558+
543559

544560
def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end_offset='end'):
545561
"""
@@ -554,82 +570,27 @@ def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end
554570
:return: The genomic region defined by the starting and ending coordinates
555571
"""
556572
for ref_region in ref_regions:
557-
sequence_range = ref_region.get_coords(base)
558-
region_start = sequence_range[0]
559-
region_end = sequence_range[1]
560-
561-
if start_offset <= region_start:
562-
start = region_start
563-
else:
564-
start = region_start + (start_offset - region_start)
565-
566-
# If end_coord is greater than the region's end coordinate, set end_coord to region's end coordinate
567-
if end_offset == 'end' or end_offset > region_end:
568-
end = region_end
569-
else:
570-
end = region_end + (region_end - end_offset)
571-
572-
retrieved_region = None
573573
if ref_region.region_name == region:
574-
s = ref_region.get_sequence(base)
575-
region_to_retrieve = s[start - 1: end]
574+
sequence_range = ref_region.get_coords(base)
575+
region_start = sequence_range[0]
576+
region_end = sequence_range[1]
576577

577-
if base == 'nucl':
578-
retrieved_region = GenomeRegion(region, [start, end], region_to_retrieve,
579-
ref_region.aa_coords, ref_region.aa_seq)
580-
retrieved_region.set_sequence(region_to_retrieve, 'nucl')
578+
if start_offset <= region_start:
579+
start = region_start
581580
else:
582-
retrieved_region = GenomeRegion(region, ref_region.nt_coords, ref_region.nt_seq,
583-
[start, end], region_to_retrieve)
584-
retrieved_region.set_sequence(region_to_retrieve, 'prot')
585-
586-
587-
retrieved_region.set_pos_from_cds(virus)
588-
retrieved_region.pos_from_gstart = retrieved_region.local_to_global_index([start, end], base)
589-
retrieved_region.set_pos_from_aa_start(virus)
590-
591-
if retrieved_region:
592-
if outfile is None:
593-
print("\033[1mRetrieved sequence: \033[0m\n")
594-
print("Region:\t{}".format(retrieved_region.region_name))
595-
print(textwrap.fill(retrieved_region.get_sequence(base)))
596-
597-
print("\n\033[1mRelative Positions: \033[0m")
598-
if len(retrieved_region.pos_from_cds) == 2:
599-
print("\tNucleotide position relative to CDS start: {} --> {}"
600-
.format(retrieved_region.pos_from_cds[0], retrieved_region.pos_from_cds[1]))
601-
else:
602-
print("\tNucleotide position relative to CDS start: N/A")
603-
604-
if retrieved_region.pos_from_gstart is not None:
605-
print("\tNucleotide position relative to genome start: {} --> {}"
606-
.format(retrieved_region.pos_from_gstart[0] + 1, retrieved_region.pos_from_gstart[1]))
607-
608-
if retrieved_region.pos_from_aa_start is not None:
609-
print("\tAmino acid position relative to protein start: {} --> {}"
610-
.format(retrieved_region.pos_from_aa_start[0], retrieved_region.pos_from_aa_start[1]))
581+
start = region_start + (start_offset - region_start)
611582

583+
# If end_coord is greater than the region's end coordinate, set end_coord to region's end coordinate
584+
if end_offset == 'end' or end_offset > region_end:
585+
end = region_end
612586
else:
613-
outfile.write("\n\nRetrieved sequence:\n")
614-
outfile.write("\nRegion:\t{}".format(retrieved_region.region_name))
615-
outfile.write("\n" + textwrap.fill(retrieved_region.get_sequence(base)))
616-
617-
outfile.write("\n\nRelative Positions: \n")
618-
if len(retrieved_region.pos_from_cds) == 2:
619-
outfile.write("\tNucleotide position relative to CDS: {} to {}\n"
620-
.format(retrieved_region.pos_from_cds[0], retrieved_region.pos_from_cds[1]))
621-
else:
622-
outfile.write("\tNucleotide position relative to CDS start: N/A\n")
623-
624-
if retrieved_region.pos_from_gstart is not None:
625-
outfile.write("\tNucleotide position relative to start of genome: {} --> {}\n"
626-
.format(retrieved_region.pos_from_gstart[0] + 1, retrieved_region.pos_from_gstart[1]))
587+
end = region_end + (region_end - end_offset)
627588

628-
if retrieved_region.pos_from_aa_start is not None:
629-
outfile.write("\tAmino acid position relative to protein start: {} --> {}\n"
630-
.format(retrieved_region.pos_from_aa_start[0], retrieved_region.pos_from_aa_start[1]))
589+
# TODO: sort retrieved_regions to print region first
590+
retrieved_regions = find_matches(virus, base, ref_regions, [[start, end]])
591+
output(retrieved_regions, outfile)
631592

632-
return retrieved_region
593+
return retrieved_regions
633594

634595

635596
def handle_args(virus, base, ref_nt, nt_coords, ref_aa, aa_coords):
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
5'LTR,1,634
22
Gag,790,2292
3-
Matrix,790,1185
4-
Capsid,1186,1878
3+
Matrix(p17/p15),790,1185
4+
Capsid(p24/p27),1186,1878
55
p2,1879,1920
6-
Nucleocapsid,1921,2085
6+
Nucleocapsid(p7/p8),1921,2085
77
p1,2086,2133
88
p6,2134,2292
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Gag,1,500
2-
p17,1,132
3-
p24,133,363
2+
Matrix(p17/p15),1,132
3+
Capsid(p24/p27),133,363
44
p2,364,377
5-
p7,378,432
5+
Nucleocapsid(p7/p8),378,432
66
p1,433,448
77
p6,449,500

poplars/tests/fixtures/siv_test_nt_coords.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Rev(with intron),6784,9315
22
Rev(exon1),6784,6853
33
Rev(exon2),9062,9315
4-
gp160,6860,9499
4+
Env(gp160),6860,9499
55
gp120,6860,8434
66
gp41,8435,9499
77
Nef,9333,10124
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Rev,2087,2193
22
Rev(exon1),2087,2110
33
Rev(exon2),2111,2193
4-
gp160,2194,3072
4+
Env(gp160),2194,3072
55
gp120,2194,2718
66
gp41,2719,3072
77
Nef,3073,3335

0 commit comments

Comments
 (0)