13
13
import re
14
14
import sys
15
15
16
- from poplars .common import convert_fasta
16
+ from poplars .common import convert_fasta , resolve_mixtures
17
17
from poplars .mafft import align
18
18
19
19
NON_CODING = ["5'LTR" , "TAR" , "3'LTR" ]
@@ -706,8 +706,9 @@ def output_retrieved_region(base, region, outfile=None):
706
706
def valid_sequence (base , sequence ):
707
707
"""
708
708
Verifies that input sequence is valid
709
- :param base: The base of the sequence (nucl or prot )
709
+ :param base: The base of the sequence (NA or AA )
710
710
:param sequence: A list of lists containing header and sequence pairs
711
+ Note: this function takes no verbatim flag; the caller get_query() decides whether nucleotide sequences with mixtures are rejected or resolved
711
712
:raises ValueError: If the sequence is empty or if it contains invalid characters
712
713
:return: <True> If the input sequence uses the correct alphabet, <False> otherwise
713
714
"""
@@ -726,11 +727,13 @@ def valid_sequence(base, sequence):
726
727
if not all (pos in dna_alphabet for pos in s ):
727
728
print ("Invalid nucleotide sequence:\n {}\n {}\n " .format (h , s ))
728
729
return False
729
- else :
730
+ elif base == 'AA' :
730
731
if not all (pos in aa_alphabet for pos in s ):
731
732
print ("Invalid amino acid sequence:\n {}\n {}\n " .format (h , s ))
732
733
return False
733
-
734
+ else :
735
+ print ("Unexpected base argument {} in valid_sequence()" .format (base ))
736
+ sys .exit ()
734
737
return True
735
738
736
739
@@ -782,12 +785,13 @@ def reverse_comp(query_sequence):
782
785
return rev_comp
783
786
784
787
785
- def get_query (base , query , rev_comp ):
788
+ def get_query (base , query , rev_comp , verbatim = False ):
786
789
"""
787
790
Gets the query sequence and checks that it is valid
788
791
:param base: The base of the sequence ('NA' for nucleotide or 'AA' for protein)
789
792
:param query: The query sequence as a string or the file path to the query sequence
790
793
:param rev_comp: Reverse complement flag (False by default)
794
+ :param verbatim: if True, reject any nucleotide sequences with mixtures
791
795
:return: A list of lists containing the sequence identifiers and the query sequences
792
796
"""
793
797
@@ -825,7 +829,18 @@ def get_query(base, query, rev_comp):
825
829
count += 1
826
830
827
831
if not valid_sequence (base , query_seq ):
828
- sys .exit (0 )
832
+ if base == 'NA' and not verbatim :
833
+ # attempt to salvage sequence with mixtures
834
+ resolved = []
835
+ for h , s in query_seq :
836
+ rs = resolve_mixtures (s )
837
+ if rs is None :
838
+ print ("Failed to resolve mixtures in {}" .format (h ))
839
+ sys .exit ()
840
+ resolved .append ([h , rs ])
841
+ query_seq = resolved
842
+ else :
843
+ sys .exit (0 )
829
844
830
845
# At this point, the sequence is valid
831
846
if base == 'NA' and rev_comp :
@@ -903,20 +918,23 @@ def parse_args():
903
918
)
904
919
subparsers = parser .add_subparsers (title = 'sub-commands' , dest = 'subcommand' )
905
920
906
- # Create sub-parser for 'align ' mode
921
+ # Create sub-parser for 'locate ' mode
907
922
parser_locate = subparsers .add_parser ('locate' ,
908
923
help = 'find the location of a sequence' )
909
924
parser_locate .add_argument ('virus' , metavar = 'virus' , choices = ['hiv' , 'siv' ],
910
925
help = 'the reference virus (choices: hiv, siv)' )
911
926
parser_locate .add_argument ('base' , metavar = 'base' , choices = ['NA' , 'AA' ],
912
- help = 'sequence base type (choices: \' nucl \' and \' prot \' )' )
927
+ help = 'sequence base type (choices: \' NA \' and \' AA \' )' )
913
928
parser_locate .add_argument ('query' , metavar = 'query' , nargs = '+' ,
914
929
help = 'the query sequence as a string or a FASTA file' )
915
930
parser_locate .add_argument ('-o' , '--out' , metavar = 'DIRECTORY' ,
916
931
help = 'directs the output to the specified directory (default: stdout) '
917
932
'and creates an alignment and an output file for each query' )
918
933
parser_locate .add_argument ('-rc' , '--revcomp' , action = 'store_true' ,
919
934
help = 'aligns the reverse complement of the query with the reference genome' )
935
+ parser_locate .add_argument ('--verbatim' , action = 'store_true' ,
936
+ help = 'no tolerance for ambiguous base calls, i.e., mixtures (R=A/G), '
937
+ 'exits gracefully' )
920
938
921
939
# Create sub-parser for 'retrieve' mode
922
940
parser_retrieve = subparsers .add_parser ('retrieve' ,
@@ -952,12 +970,6 @@ def parse_args():
952
970
if len (sys .argv ) == 1 or len (sys .argv ) == 2 :
953
971
print ("\033 [1mSequence Locator\033 [0m" )
954
972
parser .print_help ()
955
- print ("\n {}" .format ("-" * 80 ))
956
- print ("\n \033 [1m'locate' sub-command:\033 [0m" )
957
- parser_locate .print_help ()
958
- print ("\n {}" .format ("-" * 80 ))
959
- print ("\n \033 [1m'retrieve' sub-command:\033 [0m" )
960
- parser_retrieve .print_help ()
961
973
sys .exit (2 )
962
974
963
975
return parser .parse_args ()
0 commit comments