Skip to content

Commit bef6b2e

Browse files
authored
potential fix for #38 (#39)
1 parent 7e31178 commit bef6b2e

File tree

2 files changed

+48
-14
lines changed

2 files changed

+48
-14
lines changed

poplars/common.py

+22
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,25 @@ def convert_clustal(handle):
149149
result['aln'] += ln[offset:].strip('\n\r')
150150

151151
return result
152+
153+
154+
def resolve_mixtures(seq, replaceN=False):
155+
"""
156+
Randomly resolve mixtures (ambiguous base calls) to nucleotides
157+
:param seq: str, nucleotide sequence
158+
:param replaceN: if True, replace N with [ACGT]
159+
:return: str, sequence with all mixtures [WRKYSMBDHVN] replaced with
160+
nucleotides [ACGT]; or None if sequence contains illegal characters
161+
"""
162+
newseq = ''
163+
for nt in seq.upper():
164+
if nt == 'N':
165+
newseq += random.sample('ACGT', 1)[0] if replaceN else 'N'
166+
elif nt in 'ACGT-':
167+
newseq += nt # unambiguous base call or gap
168+
elif nt in mixture_dict:
169+
newseq += random.sample(mixture_dict[nt], 1)[0]
170+
else:
171+
print("Unexpected character {} in resolve_mixtures()".format(nt))
172+
return None
173+
return newseq

poplars/sequence_locator.py

+26-14
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import re
1414
import sys
1515

16-
from poplars.common import convert_fasta
16+
from poplars.common import convert_fasta, resolve_mixtures
1717
from poplars.mafft import align
1818

1919
NON_CODING = ["5'LTR", "TAR", "3'LTR"]
@@ -706,8 +706,9 @@ def output_retrieved_region(base, region, outfile=None):
706706
def valid_sequence(base, sequence):
707707
"""
708708
Verifies that input sequence is valid
709-
:param base: The base of the sequence (nucl or prot)
709+
:param base: The base of the sequence (NA or AA)
710710
:param sequence: A list of lists containing header and sequence pairs
711+
:param verbatim: if True, reject any nucleotide sequence with mixtures
711712
:raises ValueError: If the sequence is empty or if it contains invalid characters
712713
:return: <True> If the input sequence uses the correct alphabet, <False> otherwise
713714
"""
@@ -726,11 +727,13 @@ def valid_sequence(base, sequence):
726727
if not all(pos in dna_alphabet for pos in s):
727728
print("Invalid nucleotide sequence:\n{}\n{}\n".format(h, s))
728729
return False
729-
else:
730+
elif base == 'AA':
730731
if not all(pos in aa_alphabet for pos in s):
731732
print("Invalid amino acid sequence:\n{}\n{}\n".format(h, s))
732733
return False
733-
734+
else:
735+
print("Unexpected base argument {} in valid_sequence()".format(base))
736+
sys.exit()
734737
return True
735738

736739

@@ -782,12 +785,13 @@ def reverse_comp(query_sequence):
782785
return rev_comp
783786

784787

785-
def get_query(base, query, rev_comp):
788+
def get_query(base, query, rev_comp, verbatim=False):
786789
"""
787790
Gets the query sequence and checks that it is valid
788791
:param base: The base (nucleotide or protein)
789792
:param query: The query sequence as a string or the file path to the query sequence
790793
:param rev_comp: Reverse complement flag (False by default)
794+
:param verbatim: if True, reject any nucleotide sequences with mixtures
791795
:return: A list of lists containing the sequence identifiers and the query sequences
792796
"""
793797

@@ -825,7 +829,18 @@ def get_query(base, query, rev_comp):
825829
count += 1
826830

827831
if not valid_sequence(base, query_seq):
828-
sys.exit(0)
832+
if base == 'NA' and not verbatim:
833+
# attempt to salvage sequence with mixtures
834+
resolved = []
835+
for h, s in query_seq:
836+
rs = resolve_mixtures(s)
837+
if rs is None:
838+
print("Failed to resolve mixtures in {}".format(h))
839+
sys.exit()
840+
resolved.append([h, rs])
841+
query_seq = resolved
842+
else:
843+
sys.exit(0)
829844

830845
# At this point, the sequence is valid
831846
if base == 'NA' and rev_comp:
@@ -903,20 +918,23 @@ def parse_args():
903918
)
904919
subparsers = parser.add_subparsers(title='sub-commands', dest='subcommand')
905920

906-
# Create sub-parser for 'align' mode
921+
# Create sub-parser for 'locate' mode
907922
parser_locate = subparsers.add_parser('locate',
908923
help='find the location of a sequence')
909924
parser_locate.add_argument('virus', metavar='virus', choices=['hiv', 'siv'],
910925
help='the reference virus (choices: hiv, siv)')
911926
parser_locate.add_argument('base', metavar='base', choices=['NA', 'AA'],
912-
help='sequence base type (choices: \'nucl\' and \'prot\')')
927+
help='sequence base type (choices: \'NA\' and \'AA\')')
913928
parser_locate.add_argument('query', metavar='query', nargs='+',
914929
help='the query sequence as a string or a FASTA file')
915930
parser_locate.add_argument('-o', '--out', metavar='DIRECTORY',
916931
help='directs the output to the specified directory (default: stdout) '
917932
'and creates an alignment and an output file for each query')
918933
parser_locate.add_argument('-rc', '--revcomp', action='store_true',
919934
help='aligns the reverse complement of the query with the reference genome')
935+
parser_locate.add_argument('--verbatim', action='store_true',
936+
help='no tolerance for ambiguous base calls, i.e., mixtures (R=A/G), '
937+
'exits gracefully')
920938

921939
# Create sub-parser for 'retrieve' mode
922940
parser_retrieve = subparsers.add_parser('retrieve',
@@ -952,12 +970,6 @@ def parse_args():
952970
if len(sys.argv) == 1 or len(sys.argv) == 2:
953971
print("\033[1mSequence Locator\033[0m")
954972
parser.print_help()
955-
print("\n{}".format("-" * 80))
956-
print("\n\033[1m'locate' sub-command:\033[0m")
957-
parser_locate.print_help()
958-
print("\n{}".format("-" * 80))
959-
print("\n\033[1m'retrieve' sub-command:\033[0m")
960-
parser_retrieve.print_help()
961973
sys.exit(2)
962974

963975
return parser.parse_args()

0 commit comments

Comments
 (0)