16
16
class GenomeRegion :
17
17
"""
18
18
Represents information about a genomic region
19
+
20
+ :ivar region_name: The name of the genomic region
21
+ :ivar nt_coords: A list containing the start and end indices of the nucleotide region
22
+ :ivar nt_seq: The nucleotide sequence
23
+ :ivar aa_coords: A list containing the start and end indices of the protein region
24
+ :ivar aa_seq: The amino acid sequence
25
+ :ivar pos_from_cds: Nucleotide position relative to CDS start
26
+ :ivar pos_from_gstart: Nucleotide position relative to the reference genome start
27
+ :ivar pos_from_qstart: Nucleotide position relative to the start of the query sequence
28
+ :ivar pos_from_rstart: Nucleotide position relative to the start of the region
29
+ :ivar pos_from_pstart: Amino acid position relative to protein start
30
+ :ivar codon_aln: Amino acid sequence aligned with the nucleotide sequence
19
31
"""
20
32
21
33
def __init__ (self , region_name , nt_coords = None , nt_seq = None , aa_coords = None , aa_seq = None ):
@@ -28,17 +40,11 @@ def __init__(self, region_name, nt_coords=None, nt_seq=None, aa_coords=None, aa_
28
40
:param aa_seq: <option> The amino acid sequence of the genomic region
29
41
"""
30
42
self .region_name = region_name
31
- self .nt_coords = nt_coords
32
- self .nt_seq = nt_seq
33
- self .aa_coords = aa_coords
34
- self .aa_seq = aa_seq
35
-
36
- self .pos_from_cds = [] # Nucleotide position relative to CDS start
37
- self .pos_from_gstart = None # Nucleotide position relative to the reference genome start
38
- self .pos_from_qstart = None # Nucleotide position relative to the start of the query sequence
39
- self .pos_from_rstart = None
40
- self .pos_from_aa_start = None # Amino acid position relative to protein start
41
- self .codon_aln = '' # Amino acid sequence aligned with the nucleotide sequence
43
+ self .nt_coords , self .nt_seq = nt_coords , nt_seq
44
+ self .aa_coords , self .aa_seq = aa_coords , aa_seq
45
+ self .pos_from_cds = []
46
+ self .pos_from_gstart , self .pos_from_qstart , self .pos_from_rstart , self .pos_from_pstart = None , None , None , None
47
+ self .codon_aln = ''
42
48
43
49
def get_coords (self , base ):
44
50
if base == 'nucl' :
@@ -86,7 +92,7 @@ def set_pos_from_cds(self, virus):
86
92
self .pos_from_cds .append ((self .nt_coords [0 ] + 1 - cds_start ))
87
93
self .pos_from_cds .append ((self .nt_coords [1 ] + 1 - cds_start ))
88
94
89
- def set_pos_from_aa_start (self , virus ):
95
+ def set_pos_from_pstart (self , virus ):
90
96
"""
91
97
Gives the position of the sequence relative to the start of the protein sequence
92
98
"""
@@ -98,13 +104,13 @@ def set_pos_from_aa_start(self, virus):
98
104
if len (self .pos_from_cds ) == 2 :
99
105
# If the whole protein sequence is encompassed in the CDS range
100
106
if len (self .aa_seq ) == (((self .pos_from_cds [1 ] - self .pos_from_cds [0 ]) // 3 ) + 1 ):
101
- self .pos_from_aa_start = [1 , (((self .pos_from_cds [1 ] - self .pos_from_cds [0 ]) // 3 ) + 1 )]
107
+ self .pos_from_pstart = [1 , (((self .pos_from_cds [1 ] - self .pos_from_cds [0 ]) // 3 ) + 1 )]
102
108
else :
103
- self .pos_from_aa_start = [(self .pos_from_cds [0 ] // 3 ) + 1 , self .pos_from_cds [1 ] // 3 ]
109
+ self .pos_from_pstart = [(self .pos_from_cds [0 ] // 3 ) + 1 , self .pos_from_cds [1 ] // 3 ]
104
110
105
111
# If the region is outside the CDS
106
112
else :
107
- self .pos_from_aa_start = None
113
+ self .pos_from_pstart = None
108
114
109
115
def make_codon_aln (self ):
110
116
"""
@@ -188,7 +194,7 @@ def set_regions(virus, nt_reference, nt_coords, aa_reference, aa_coords):
188
194
seq_region .set_seq_from_ref (nt_reference , 'nucl' )
189
195
seq_region .set_pos_from_cds (virus )
190
196
seq_region .pos_from_gstart = nucl_coords
191
- seq_region .set_pos_from_aa_start (virus )
197
+ seq_region .set_pos_from_pstart (virus )
192
198
genome_regions .append (seq_region )
193
199
194
200
# Parse protein coordinates file
@@ -310,7 +316,7 @@ def get_query(base, query_file, revcomp):
310
316
sys .exit (0 )
311
317
312
318
else :
313
- if revcomp == 'y' :
319
+ if revcomp :
314
320
if base == 'prot' :
315
321
print ("Invalid option: reverse complement is not available for proteins." )
316
322
else :
@@ -426,7 +432,7 @@ def find_matches(virus, base, ref_regions, match_coordinates):
426
432
query_region .set_sequence (ov_seq , base )
427
433
query_region .set_pos_from_cds (virus )
428
434
query_region .pos_from_gstart = [start_aln , end_aln ]
429
- query_region .set_pos_from_aa_start (virus )
435
+ query_region .set_pos_from_pstart (virus )
430
436
431
437
if base == 'nucl' :
432
438
set_protein_equivalents (query_region , ref_regions )
@@ -507,9 +513,9 @@ def output(query_regions, outfile=None):
507
513
print ("\t Nucleotide position relative to genome start: {} --> {}"
508
514
.format (reg .pos_from_gstart [0 ] + 1 , reg .pos_from_gstart [1 ] + 1 ))
509
515
510
- if reg .pos_from_aa_start is not None :
516
+ if reg .pos_from_pstart is not None :
511
517
print ("\t Amino acid position relative to protein start: {} --> {}"
512
- .format (reg .pos_from_aa_start [0 ], reg .pos_from_aa_start [1 ]))
518
+ .format (reg .pos_from_pstart [0 ], reg .pos_from_pstart [1 ]))
513
519
514
520
if reg .pos_from_qstart is not None :
515
521
print ("\t Position relative to query start: {} --> {}\n "
@@ -565,32 +571,69 @@ def retrieve(virus, base, ref_regions, region, outfile=None, start_offset=1, end
565
571
:param ref_regions: A list of GenomeRegion objects
566
572
:param region: The genomic region
567
573
:param outfile: The file stream of the output file
568
- :param start_offset: <option> The start coordinate
569
- :param end_offset: <option> The end coordinate
574
+ :param start_offset: <option> The start coordinate of the query region
575
+ :param end_offset: <option> The end coordinate of the query region
570
576
:return: The genomic region defined by the starting and ending coordinates
571
577
"""
578
+ query_region = None
572
579
for ref_region in ref_regions :
580
+ sequence_range = ref_region .get_coords (base )
581
+ region_start , region_end = sequence_range [0 ], sequence_range [1 ]
582
+ length = region_end - region_start
583
+
584
+ # Check if offsets are given in global coordinates
585
+ if region_start <= start_offset <= region_end :
586
+ start = start_offset - region_start + 1
587
+ # Or in local coordinates
588
+ elif 1 <= start_offset <= length :
589
+ start = start_offset
590
+ # Or out of range
591
+ else :
592
+ start = 1
593
+
594
+ # Check for 'end'
595
+ if end_offset == 'end' :
596
+ end = length + 1
597
+ # Check for global coordinates
598
+ elif region_start <= end_offset <= region_end :
599
+ end = end_offset - region_start + 1
600
+ # Or local coordinates
601
+ elif 1 <= start_offset <= length :
602
+ end = end_offset + 1
603
+ # Or out of range
604
+ else :
605
+ end = length + 1
606
+
607
+ # # Handles global and local start coordinates
608
+ # if start_offset <= region_start:
609
+ # start = region_start
610
+ # else:
611
+ # start = region_start + (start_offset - region_start)
612
+ #
613
+ # # Handles global and local end coordinates
614
+ # if end_offset == 'end' or end_offset > region_end:
615
+ # end = region_end
616
+ # else:
617
+ # end = region_end + (region_end - end_offset)
618
+
619
+ # Create a GenomeRegion object for the query
573
620
if ref_region .region_name == region :
574
- sequence_range = ref_region .get_coords (base )
575
- region_start = sequence_range [0 ]
576
- region_end = sequence_range [1 ]
621
+ query_region = GenomeRegion (region )
622
+ query_region .set_coords ([start , end ], base ) # Set global coordinates
577
623
578
- if start_offset <= region_start :
579
- start = region_start
580
- else :
581
- start = region_start + ( start_offset - region_start )
624
+ # Slice reference sequence with local coordinates
625
+ qstart , qend = query_region . global_to_local_index ([ start , end ], base )
626
+ query_seq = ref_region . get_sequence ( base )[ qstart - 1 : qend ]
627
+ query_region . set_sequence ( query_seq , base )
582
628
583
- # If end_coord is greater than the region's end coordinate, set end_coord to region's end coordinate
584
- if end_offset == 'end' or end_offset > region_end :
585
- end = region_end
586
- else :
587
- end = region_end + (region_end - end_offset )
629
+ if query_region is not None :
630
+ # TODO: sort retrieved_regions to print query_region first
631
+ retrieved_regions = find_matches (virus , base , ref_regions , [query_region .get_coords (base )])
588
632
589
- # TODO: sort retrieved_regions to print region first
590
- retrieved_regions = find_matches (virus , base , ref_regions , [[start , end ]])
633
+ # TODO: remove duplicate query_region
591
634
output (retrieved_regions , outfile )
592
635
593
- return retrieved_regions
636
+ return query_region , retrieved_regions
594
637
595
638
596
639
def handle_args (virus , base , ref_nt , nt_coords , ref_aa , aa_coords ):
@@ -677,7 +720,7 @@ def parse_args():
677
720
'relative to the HIV or SIV reference genome' )
678
721
parser_align .add_argument ('query' , type = argparse .FileType ('r' ),
679
722
help = 'Path to the file containing the query sequence.' )
680
- parser_align .add_argument ('-revcomp' , default = 'n' , choices = ['y' , 'n' ],
723
+ parser_align .add_argument ('-revcomp' , type = bool , default = False , choices = [True , False ],
681
724
help = 'Align the reverse complement of the query sequence with the reference sequence' )
682
725
parser_align .add_argument ('-outfile' , type = argparse .FileType ('w' ),
683
726
help = 'Path to the file where results will be written. '
0 commit comments