@@ -186,18 +186,18 @@ def get_overlap(self, region, coord_pair, base):
186186 return overlap
187187
188188
189- def set_regions (virus , base , nt_coords , nt_reference , aa_coords , aa_reference ):
189+ def set_regions (virus , base , nt_coords , nt_seq , aa_coords , aa_seq ):
190190 """
191191 Reads in the start and end coordinates of the genomic regions and associates the region with its coordinates.
192192 If no coordinate files are specified, set default nucleotide and protein coordinate files.
193193 :param virus: The organism (HIV or SIV)
194194 :param base: The base of the sequence
195195 :param nt_coords: Path to the csv file containing the global coordinates of the nucleotide region.
196196 The file stream has one genomic entry per line and has the following format: region_name,start,end
197- :param nt_reference : The file stream containing the reference nucleotide sequence in read mode
197+ :param nt_seq : The file stream containing the reference nucleotide sequence in read mode
198198 :param aa_coords: Path to the csv file containing the global coordinates of the protein region.
199199 The file stream has one genomic entry per line and has the following format: region_name,start,end
200- :param aa_reference : The file stream containing the reference nucleotide sequence in read mode
200+ :param aa_seqe : The file stream containing the reference nucleotide sequence in read mode
201201 :return: A list of GenomeRegions
202202 """
203203
@@ -206,83 +206,79 @@ def set_regions(virus, base, nt_coords, nt_reference, aa_coords, aa_reference):
206206 if base == 'nucl' :
207207
208208 # Parse nucleotide region coordinates file
209- with open (nt_coords , 'r' ) as nt_handle :
210- for nt_line in nt_handle :
211- nt_line = nt_line .strip ()
212- nt_line = nt_line .split (',' )
213- nucl_coords = [int (nt_line [1 ]), int (nt_line [2 ])]
209+ for nt_line in nt_coords :
210+ nt_line = nt_line .strip ()
211+ nt_line = nt_line .split (',' )
212+ nucl_coords = [int (nt_line [1 ]), int (nt_line [2 ])]
214213
215- seq_region = GenomeRegion (nt_line [0 ], nucl_coords )
214+ seq_region = GenomeRegion (nt_line [0 ])
216215
217- # Set global and local nucleotide coordinates
218- seq_region .set_coords (nucl_coords , 'nucl' )
219- seq_region .set_seq_from_ref (nt_reference , 'nucl' )
216+ # Set global and local nucleotide coordinates
217+ seq_region .set_coords (nucl_coords , 'nucl' )
218+ seq_region .set_seq_from_ref (nt_seq , 'nucl' )
220219
221- # Set relative positions
222- seq_region .set_pos_from_cds (virus )
223- seq_region .set_pos_from_gstart ()
224- seq_region .set_pos_from_pstart (virus )
220+ # Set relative positions
221+ seq_region .set_pos_from_cds (virus )
222+ seq_region .set_pos_from_gstart ()
223+ seq_region .set_pos_from_pstart (virus )
225224
226- genome_regions .append (seq_region )
225+ genome_regions .append (seq_region )
227226
228227 # Parse protein coordinates file
229- with open (aa_coords , 'r' ) as aa_handle :
230- prot_names = []
231- prot_coords = []
232- for aa_line in aa_handle :
233- aa_line = aa_line .strip ()
234- aa_line = aa_line .split (',' )
235- prot_coords .append ([int (aa_line [1 ]), int (aa_line [2 ])])
236- prot_names .append (aa_line [0 ])
237-
238- for i in range (len (prot_names )):
239- for seq_region in genome_regions :
240- if prot_names [i ] in seq_region .region_name :
241- # Set global and local protein coordinates
242- seq_region .set_coords (prot_coords [i ], 'prot' )
243- seq_region .set_seq_from_ref (aa_reference , 'prot' )
244-
245- seq_region .set_pos_from_pstart (virus )
228+ prot_names = []
229+ prot_coords = []
230+ for aa_line in aa_coords :
231+ aa_line = aa_line .strip ()
232+ aa_line = aa_line .split (',' )
233+ prot_coords .append ([int (aa_line [1 ]), int (aa_line [2 ])])
234+ prot_names .append (aa_line [0 ])
235+
236+ for i in range (len (prot_names )):
237+ for seq_region in genome_regions :
238+ if prot_names [i ] in seq_region .region_name :
239+ # Set global and local protein coordinates
240+ seq_region .set_coords (prot_coords [i ], 'prot' )
241+ seq_region .set_seq_from_ref (aa_seq , 'prot' )
242+
243+ seq_region .set_pos_from_pstart (virus )
246244
247245 else :
248246 # Parse protein region coordinates file
249- with open (aa_coords , 'r' ) as aa_handle :
250- for aa_line in aa_handle :
251- aa_line = aa_line .strip ()
252- aa_line = aa_line .split (',' )
253- prot_coords = [int (aa_line [1 ]), int (aa_line [2 ])]
247+ for aa_line in aa_coords :
248+ aa_line = aa_line .strip ()
249+ aa_line = aa_line .split (',' )
250+ prot_coords = [int (aa_line [1 ]), int (aa_line [2 ])]
254251
255- seq_region = GenomeRegion (aa_line [0 ])
252+ seq_region = GenomeRegion (aa_line [0 ])
256253
257- # Set global and local nucleotide coordinates
258- seq_region .set_coords (prot_coords , 'prot' )
259- seq_region .set_seq_from_ref (aa_reference , 'prot' )
254+ # Set global and local nucleotide coordinates
255+ seq_region .set_coords (prot_coords , 'prot' )
256+ seq_region .set_seq_from_ref (aa_seq , 'prot' )
260257
261- # Set relative positions
262- seq_region .set_pos_from_cds (virus )
263- seq_region .set_pos_from_gstart ()
264- seq_region .set_pos_from_pstart (virus )
258+ # Set relative positions
259+ seq_region .set_pos_from_cds (virus )
260+ seq_region .set_pos_from_gstart ()
261+ seq_region .set_pos_from_pstart (virus )
265262
266- genome_regions .append (seq_region )
263+ genome_regions .append (seq_region )
267264
268265 # Parse nucleotide coordinates file
269- with open (nt_coords , 'r' ) as nt_handle :
270- nucl_names = []
271- nucl_coords = []
272- for nt_line in nt_handle :
273- nt_line = nt_line .strip ()
274- nt_line = nt_line .split (',' )
275- nucl_coords .append ([int (nt_line [1 ]), int (nt_line [2 ])])
276- nucl_names .append (nt_line [0 ])
277-
278- for i in range (len (nucl_names )):
279- for seq_region in genome_regions :
280- if nucl_names [i ] in seq_region .region_name :
281- # Set global and local protein coordinates
282- seq_region .set_coords (nucl_coords [i ], 'nucl' )
283- seq_region .set_seq_from_ref (nt_reference , 'nucl' )
284-
285- seq_region .set_pos_from_pstart (virus )
266+ nucl_names = []
267+ nucl_coords = []
268+ for nt_line in nt_coords :
269+ nt_line = nt_line .strip ()
270+ nt_line = nt_line .split (',' )
271+ nucl_coords .append ([int (nt_line [1 ]), int (nt_line [2 ])])
272+ nucl_names .append (nt_line [0 ])
273+
274+ for i in range (len (nucl_names )):
275+ for seq_region in genome_regions :
276+ if nucl_names [i ] in seq_region .region_name :
277+ # Set global and local protein coordinates
278+ seq_region .set_coords (nucl_coords [i ], 'nucl' )
279+ seq_region .set_seq_from_ref (nt_seq , 'nucl' )
280+
281+ seq_region .set_pos_from_pstart (virus )
286282
287283 return genome_regions
288284
@@ -367,6 +363,7 @@ def get_query(base, query_file, revcomp):
367363 :param revcomp: Option to align the reverse complement of the nucleotide sequence ('y' or 'n')
368364 :return: A list of lists containing the sequence identifiers and the query sequences
369365 """
366+
370367 line = query_file .readline ()
371368 query_file .seek (0 ) # Reset pointer to beginning
372369
@@ -886,38 +883,40 @@ def main():
886883 aa_coords = configs [3 ]
887884 reference_sequence = configs [4 ]
888885
889- # Create genomic region objects based on configuration files
890- ref_regions = set_regions (args .virus , args .base , nt_coords , ref_nt_seq , aa_coords , ref_aa_seq )
886+ with open (nt_coords ) as nt_coords_handle , open (aa_coords ) as aa_coords_handle :
891887
892- if args .subcommand == "align" :
893- query = get_query (args .base , args .query , args .revcomp )
894- alignment = sequence_align (query , reference_sequence , args .outfile )
888+ # Create genomic region objects based on configuration files
889+ ref_regions = set_regions (args .virus , args .base , nt_coords_handle , ref_nt_seq , aa_coords_handle , ref_aa_seq )
895890
896- # Find indices where the query sequence aligns with the reference sequence
897- match_coords = get_region_coordinates ( alignment [ - 1 ]) # Query sequence will be the last item in the list
898- query_regions = find_matches ( args . virus , args . base , ref_regions , match_coords )
891+ if args . subcommand == "align" :
892+ query = get_query ( args . base , args . query , args . revcomp )
893+ alignment = sequence_align ( query , reference_sequence , args . outfile )
899894
900- output_overlap (query_regions , args .outfile )
895+ # Find indices where the query sequence aligns with the reference sequence
896+ match_coords = get_region_coordinates (alignment [- 1 ]) # Query sequence will be the last item in the list
897+ query_regions = find_matches (args .virus , args .base , ref_regions , match_coords )
901898
902- else :
903- valid_in = valid_inputs (args .virus , args .start , args .end , args .region )
899+ output_overlap (query_regions , args .outfile )
904900
905- if not valid_in :
906- sys .exit (0 )
907901 else :
908- result = retrieve (args .virus , args .base , ref_regions , args .region , args .start , args .end )
909- query_region = result [0 ]
910- overlap_regions = result [1 ]
902+ valid_in = valid_inputs (args .virus , args .start , args .end , args .region )
911903
912- # Print retrieved region first
913- output_retrieved_region (query_region , args .outfile )
914-
915- if args .outfile is None :
916- print ("\n \n Regions touched by the query sequence:" )
904+ if not valid_in :
905+ sys .exit (0 )
917906 else :
918- args .outfile .write ("\n \n Regions touched by the query sequence:\n \n " )
907+ result = retrieve (args .virus , args .base , ref_regions , args .region , args .start , args .end )
908+ query_region = result [0 ]
909+ overlap_regions = result [1 ]
910+
911+ # Print retrieved region first
912+ output_retrieved_region (query_region , args .outfile )
913+
914+ if args .outfile is None :
915+ print ("\n \n Regions touched by the query sequence:" )
916+ else :
917+ args .outfile .write ("\n \n Regions touched by the query sequence:\n \n " )
919918
920- output_overlap (overlap_regions , args .outfile )
919+ output_overlap (overlap_regions , args .outfile )
921920
922921
923922if __name__ == '__main__' :
0 commit comments