diff --git a/CHANGES.md b/CHANGES.md index 571fe95ea..02ad665f3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,10 +8,11 @@ * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield) * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. * `--vcf-reference` and `--root-sequence` are now mutually exclusive. - +* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield) ### Bug Fixes +* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield) * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. * separate VCF-only arguments into their own group diff --git a/augur/translate.py b/augur/translate.py index fbde1ab8c..8c7253c39 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -20,6 +20,7 @@ from .utils import read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf from augur.errors import AugurError +from textwrap import dedent class MissingNodeError(Exception): pass @@ -27,6 +28,9 @@ class MissingNodeError(Exception): class MismatchNodeError(Exception): pass +class NoVariationError(Exception): + pass + def safe_translate(sequence, report_exceptions=False): """Returns an amino acid translation of the given nucleotide sequence accounting for gaps in the given sequence. @@ -125,7 +129,7 @@ def translate_feature(aln, feature): return translations -def translate_vcf_feature(sequences, ref, feature): +def translate_vcf_feature(sequences, ref, feature, feature_name): '''Translates a subsequence of input nucleotide sequences. Parameters @@ -145,6 +149,9 @@ def translate_vcf_feature(sequences, ref, feature): translated reference gene, positions of AA differences, and AA differences indexed by node name + Raises + ------ + NoVariationError : if no variable sites within this feature (across all sequences) ''' def str_reverse_comp(str_seq): #gets reverse-compliment of a string and returns it as a string @@ -161,7 +168,7 @@ def str_reverse_comp(str_seq): # Need to get ref translation to store. check if multiple of 3 for sanity. # will be padded in safe_translate if not if len(refNuc)%3: - print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr) + print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr) ref_aa_seq = safe_translate(refNuc) prot['reference'] = ref_aa_seq @@ -205,11 +212,10 @@ def str_reverse_comp(str_seq): prot['positions'].sort() - #if no variable sites, exclude this gene + # raise an error if no variable sites observed if len(prot['positions']) == 0: - return None - else: - return prot + raise NoVariationError() + return prot def construct_mut(start, pos, end): return str(start) + str(pos) + str(end) @@ -334,7 +340,7 @@ def sequences_json(node_data_json, tree): Extract the full nuc sequence for each node in the provided node-data JSON. Returns a dict, keys are node names and values are a string of the genome sequence (nuc) """ - node_data = read_node_data(node_data_json, tree) + node_data = read_node_data(node_data_json) if node_data is None: raise AugurError("could not read node data (incl sequences)") # extract sequences from node meta data @@ -342,6 +348,14 @@ def sequences_json(node_data_json, tree): for k,v in node_data['nodes'].items(): if 'sequence' in v: sequences[k] = v['sequence'] + tree_nodes = {c.name for c in tree.find_clades()} + tree_nodes_missing_sequences = tree_nodes - set(sequences.keys()) + if len(tree_nodes_missing_sequences): + raise AugurError(dedent(f"""\ + {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON. + These must be present under 'nodes' → → 'sequence'. + This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here. + """)) return sequences def register_parser(parent_subparsers): @@ -378,6 +392,8 @@ def check_arg_combinations(args, is_vcf): def run(args): ## read tree and data, if reading data fails, return with error code tree = Phylo.read(args.tree, 'newick') + is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) + check_arg_combinations(args, is_vcf) # If genes is a file, read in the genes to translate if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]): @@ -385,16 +401,6 @@ def run(args): else: genes = args.genes - ## check file format and read in sequences - is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) - check_arg_combinations(args, is_vcf) - - if is_vcf: - (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) - else: - sequences = sequences_json(args.ancestral_sequences, args.tree) - - ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) if features is None: @@ -402,22 +408,24 @@ def run(args): return 1 print("Read in {} features from reference sequence file".format(len(features))) - ### translate every feature - but not 'nuc'! + ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature + ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank. translations = {} - deleted = [] - for fname, feat in features.items(): - if is_vcf: - trans = translate_vcf_feature(sequences, ref, feat) - if trans: - translations[fname] = trans - else: - deleted.append(fname) - else: - if feat.type != 'source': - translations[fname] = translate_feature(sequences, feat) - - if len(deleted) != 0: - print("{} genes had no mutations and so have been be excluded.".format(len(deleted))) + if is_vcf: + (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) + features_without_variation = [] + for fname, feat in features.items(): + if feat.type=='source': + continue + try: + translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) + except NoVariationError: + features_without_variation.append(fname) + if len(features_without_variation): + print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) + else: + sequences = sequences_json(args.ancestral_sequences, tree) + translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} ## glob the annotations for later auspice export # diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t index aac2e8748..3a5b866bd 100644 --- a/tests/functional/translate/cram/genes.t +++ b/tests/functional/translate/cram/genes.t @@ -15,9 +15,9 @@ as a feature ('nuc' in this case) > --reference-sequence "$DATA/reference.source.gff" \ > --genes gene2 gene3 \ > --output-node-data "aa_muts.genes-args.json" - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 2 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$SCRIPTS/diff_jsons.py" \ @@ -37,9 +37,9 @@ Using a text file rather than command line arguments > --genes "genes.txt" \ > --output-node-data "aa_muts.genes-txt.json" Read in 2 specified genes to translate. - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 2 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$SCRIPTS/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index 9115b6f31..b68b134f9 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -12,8 +12,8 @@ Translate amino acids for genes using a GenBank file. > --reference-sequence "$DATA/zika/zika_outgroup.gb" \ > --genes CA PRO \ > --output-node-data aa_muts.json - Validating schema of '.+nt_muts.json'... (re) Read in 3 features from reference sequence file + Validating schema of '.+nt_muts.json'... (re) amino acid mutations written to .* (re) $ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \ diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 62e6acf38..9cb273bc5 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence "genemap.gff" \ > --output-node-data aa_muts.json - Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) Other than the sequence ids which will include a temporary path, the JSONs diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index ca32639ed..2c7d1d016 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence genemap.gff \ > --output-node-data aa_muts.json - Validating schema of '.+/nt_muts.json'... (re) Read in 2 features from reference sequence file + Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) $ python3 "${SCRIPTS}/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index 8f7ede91c..e58ea8979 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --output-node-data aa_muts.json \ > --alignment-output translations.vcf \ > --vcf-reference-output translations_reference.fasta - Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N + Gene length of 'rrs' is not a multiple of 3. will pad with N Read in 187 specified genes to translate. Read in 187 features from reference sequence file 162 genes had no mutations and so have been be excluded.