@@ -186,18 +186,18 @@ def get_overlap(self, region, coord_pair, base):
186
186
return overlap
187
187
188
188
189
- def set_regions (virus , base , nt_coords , nt_reference , aa_coords , aa_reference ):
189
+ def set_regions (virus , base , nt_coords , nt_seq , aa_coords , aa_seq ):
190
190
"""
191
191
Reads in the start and end coordinates of the genomic regions and associates the region with its coordinates.
192
192
If no coordinate files are specified, set default nucleotide and protein coordinate files.
193
193
:param virus: The organism (HIV or SIV)
194
194
:param base: The base of the sequence
195
195
:param nt_coords: Path to the csv file containing the global coordinates of the nucleotide region.
196
196
The file stream has one genomic entry per line and has the following format: region_name,start,end
197
- :param nt_reference : The file stream containing the reference nucleotide sequence in read mode
197
+ :param nt_seq : The file stream containing the reference nucleotide sequence in read mode
198
198
:param aa_coords: Path to the csv file containing the global coordinates of the protein region.
199
199
The file stream has one genomic entry per line and has the following format: region_name,start,end
200
- :param aa_reference : The file stream containing the reference nucleotide sequence in read mode
200
+ :param aa_seqe : The file stream containing the reference nucleotide sequence in read mode
201
201
:return: A list of GenomeRegions
202
202
"""
203
203
@@ -206,83 +206,79 @@ def set_regions(virus, base, nt_coords, nt_reference, aa_coords, aa_reference):
206
206
if base == 'nucl' :
207
207
208
208
# Parse nucleotide region coordinates file
209
- with open (nt_coords , 'r' ) as nt_handle :
210
- for nt_line in nt_handle :
211
- nt_line = nt_line .strip ()
212
- nt_line = nt_line .split (',' )
213
- nucl_coords = [int (nt_line [1 ]), int (nt_line [2 ])]
209
+ for nt_line in nt_coords :
210
+ nt_line = nt_line .strip ()
211
+ nt_line = nt_line .split (',' )
212
+ nucl_coords = [int (nt_line [1 ]), int (nt_line [2 ])]
214
213
215
- seq_region = GenomeRegion (nt_line [0 ], nucl_coords )
214
+ seq_region = GenomeRegion (nt_line [0 ])
216
215
217
- # Set global and local nucleotide coordinates
218
- seq_region .set_coords (nucl_coords , 'nucl' )
219
- seq_region .set_seq_from_ref (nt_reference , 'nucl' )
216
+ # Set global and local nucleotide coordinates
217
+ seq_region .set_coords (nucl_coords , 'nucl' )
218
+ seq_region .set_seq_from_ref (nt_seq , 'nucl' )
220
219
221
- # Set relative positions
222
- seq_region .set_pos_from_cds (virus )
223
- seq_region .set_pos_from_gstart ()
224
- seq_region .set_pos_from_pstart (virus )
220
+ # Set relative positions
221
+ seq_region .set_pos_from_cds (virus )
222
+ seq_region .set_pos_from_gstart ()
223
+ seq_region .set_pos_from_pstart (virus )
225
224
226
- genome_regions .append (seq_region )
225
+ genome_regions .append (seq_region )
227
226
228
227
# Parse protein coordinates file
229
- with open (aa_coords , 'r' ) as aa_handle :
230
- prot_names = []
231
- prot_coords = []
232
- for aa_line in aa_handle :
233
- aa_line = aa_line .strip ()
234
- aa_line = aa_line .split (',' )
235
- prot_coords .append ([int (aa_line [1 ]), int (aa_line [2 ])])
236
- prot_names .append (aa_line [0 ])
237
-
238
- for i in range (len (prot_names )):
239
- for seq_region in genome_regions :
240
- if prot_names [i ] in seq_region .region_name :
241
- # Set global and local protein coordinates
242
- seq_region .set_coords (prot_coords [i ], 'prot' )
243
- seq_region .set_seq_from_ref (aa_reference , 'prot' )
244
-
245
- seq_region .set_pos_from_pstart (virus )
228
+ prot_names = []
229
+ prot_coords = []
230
+ for aa_line in aa_coords :
231
+ aa_line = aa_line .strip ()
232
+ aa_line = aa_line .split (',' )
233
+ prot_coords .append ([int (aa_line [1 ]), int (aa_line [2 ])])
234
+ prot_names .append (aa_line [0 ])
235
+
236
+ for i in range (len (prot_names )):
237
+ for seq_region in genome_regions :
238
+ if prot_names [i ] in seq_region .region_name :
239
+ # Set global and local protein coordinates
240
+ seq_region .set_coords (prot_coords [i ], 'prot' )
241
+ seq_region .set_seq_from_ref (aa_seq , 'prot' )
242
+
243
+ seq_region .set_pos_from_pstart (virus )
246
244
247
245
else :
248
246
# Parse protein region coordinates file
249
- with open (aa_coords , 'r' ) as aa_handle :
250
- for aa_line in aa_handle :
251
- aa_line = aa_line .strip ()
252
- aa_line = aa_line .split (',' )
253
- prot_coords = [int (aa_line [1 ]), int (aa_line [2 ])]
247
+ for aa_line in aa_coords :
248
+ aa_line = aa_line .strip ()
249
+ aa_line = aa_line .split (',' )
250
+ prot_coords = [int (aa_line [1 ]), int (aa_line [2 ])]
254
251
255
- seq_region = GenomeRegion (aa_line [0 ])
252
+ seq_region = GenomeRegion (aa_line [0 ])
256
253
257
- # Set global and local nucleotide coordinates
258
- seq_region .set_coords (prot_coords , 'prot' )
259
- seq_region .set_seq_from_ref (aa_reference , 'prot' )
254
+ # Set global and local nucleotide coordinates
255
+ seq_region .set_coords (prot_coords , 'prot' )
256
+ seq_region .set_seq_from_ref (aa_seq , 'prot' )
260
257
261
- # Set relative positions
262
- seq_region .set_pos_from_cds (virus )
263
- seq_region .set_pos_from_gstart ()
264
- seq_region .set_pos_from_pstart (virus )
258
+ # Set relative positions
259
+ seq_region .set_pos_from_cds (virus )
260
+ seq_region .set_pos_from_gstart ()
261
+ seq_region .set_pos_from_pstart (virus )
265
262
266
- genome_regions .append (seq_region )
263
+ genome_regions .append (seq_region )
267
264
268
265
# Parse nucleotide coordinates file
269
- with open (nt_coords , 'r' ) as nt_handle :
270
- nucl_names = []
271
- nucl_coords = []
272
- for nt_line in nt_handle :
273
- nt_line = nt_line .strip ()
274
- nt_line = nt_line .split (',' )
275
- nucl_coords .append ([int (nt_line [1 ]), int (nt_line [2 ])])
276
- nucl_names .append (nt_line [0 ])
277
-
278
- for i in range (len (nucl_names )):
279
- for seq_region in genome_regions :
280
- if nucl_names [i ] in seq_region .region_name :
281
- # Set global and local protein coordinates
282
- seq_region .set_coords (nucl_coords [i ], 'nucl' )
283
- seq_region .set_seq_from_ref (nt_reference , 'nucl' )
284
-
285
- seq_region .set_pos_from_pstart (virus )
266
+ nucl_names = []
267
+ nucl_coords = []
268
+ for nt_line in nt_coords :
269
+ nt_line = nt_line .strip ()
270
+ nt_line = nt_line .split (',' )
271
+ nucl_coords .append ([int (nt_line [1 ]), int (nt_line [2 ])])
272
+ nucl_names .append (nt_line [0 ])
273
+
274
+ for i in range (len (nucl_names )):
275
+ for seq_region in genome_regions :
276
+ if nucl_names [i ] in seq_region .region_name :
277
+ # Set global and local protein coordinates
278
+ seq_region .set_coords (nucl_coords [i ], 'nucl' )
279
+ seq_region .set_seq_from_ref (nt_seq , 'nucl' )
280
+
281
+ seq_region .set_pos_from_pstart (virus )
286
282
287
283
return genome_regions
288
284
@@ -367,6 +363,7 @@ def get_query(base, query_file, revcomp):
367
363
:param revcomp: Option to align the reverse complement of the nucleotide sequence ('y' or 'n')
368
364
:return: A list of lists containing the sequence identifiers and the query sequences
369
365
"""
366
+
370
367
line = query_file .readline ()
371
368
query_file .seek (0 ) # Reset pointer to beginning
372
369
@@ -886,38 +883,40 @@ def main():
886
883
aa_coords = configs [3 ]
887
884
reference_sequence = configs [4 ]
888
885
889
- # Create genomic region objects based on configuration files
890
- ref_regions = set_regions (args .virus , args .base , nt_coords , ref_nt_seq , aa_coords , ref_aa_seq )
886
+ with open (nt_coords ) as nt_coords_handle , open (aa_coords ) as aa_coords_handle :
891
887
892
- if args .subcommand == "align" :
893
- query = get_query (args .base , args .query , args .revcomp )
894
- alignment = sequence_align (query , reference_sequence , args .outfile )
888
+ # Create genomic region objects based on configuration files
889
+ ref_regions = set_regions (args .virus , args .base , nt_coords_handle , ref_nt_seq , aa_coords_handle , ref_aa_seq )
895
890
896
- # Find indices where the query sequence aligns with the reference sequence
897
- match_coords = get_region_coordinates ( alignment [ - 1 ]) # Query sequence will be the last item in the list
898
- query_regions = find_matches ( args . virus , args . base , ref_regions , match_coords )
891
+ if args . subcommand == "align" :
892
+ query = get_query ( args . base , args . query , args . revcomp )
893
+ alignment = sequence_align ( query , reference_sequence , args . outfile )
899
894
900
- output_overlap (query_regions , args .outfile )
895
+ # Find indices where the query sequence aligns with the reference sequence
896
+ match_coords = get_region_coordinates (alignment [- 1 ]) # Query sequence will be the last item in the list
897
+ query_regions = find_matches (args .virus , args .base , ref_regions , match_coords )
901
898
902
- else :
903
- valid_in = valid_inputs (args .virus , args .start , args .end , args .region )
899
+ output_overlap (query_regions , args .outfile )
904
900
905
- if not valid_in :
906
- sys .exit (0 )
907
901
else :
908
- result = retrieve (args .virus , args .base , ref_regions , args .region , args .start , args .end )
909
- query_region = result [0 ]
910
- overlap_regions = result [1 ]
902
+ valid_in = valid_inputs (args .virus , args .start , args .end , args .region )
911
903
912
- # Print retrieved region first
913
- output_retrieved_region (query_region , args .outfile )
914
-
915
- if args .outfile is None :
916
- print ("\n \n Regions touched by the query sequence:" )
904
+ if not valid_in :
905
+ sys .exit (0 )
917
906
else :
918
- args .outfile .write ("\n \n Regions touched by the query sequence:\n \n " )
907
+ result = retrieve (args .virus , args .base , ref_regions , args .region , args .start , args .end )
908
+ query_region = result [0 ]
909
+ overlap_regions = result [1 ]
910
+
911
+ # Print retrieved region first
912
+ output_retrieved_region (query_region , args .outfile )
913
+
914
+ if args .outfile is None :
915
+ print ("\n \n Regions touched by the query sequence:" )
916
+ else :
917
+ args .outfile .write ("\n \n Regions touched by the query sequence:\n \n " )
919
918
920
- output_overlap (overlap_regions , args .outfile )
919
+ output_overlap (overlap_regions , args .outfile )
921
920
922
921
923
922
if __name__ == '__main__' :
0 commit comments