Skip to content

Commit 6839588

Browse files
Fix test, rename argument to --use-nextclade-gff-parsing
1 parent 61af409 commit 6839588

File tree

3 files changed

+17
-19
lines changed

3 files changed

+17
-19
lines changed

augur/ancestral.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -320,7 +320,7 @@ def register_parser(parent_subparsers):
320320
"template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
321321
"by the gene name.")
322322
amino_acid_options_group.add_argument(
323-
'--nextclade-compatible', action="store_true", default=False,
323+
'--use-nextclade-gff-parsing', action="store_true", default=False,
324324
help="Read GFF annotations the way Nextclade does, using CDSes (including compound) and same qualifiers for gene names."
325325
)
326326

@@ -350,7 +350,7 @@ def validate_arguments(args, is_vcf):
350350
invalid combinations up-front we can exit quickly.
351351
"""
352352
mandatory_aa_arguments = (args.annotation, args.genes, args.translations)
353-
all_aa_arguments = (*mandatory_aa_arguments, args.nextclade_compatible)
353+
all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing)
354354
if any(all_aa_arguments) and not all(mandatory_aa_arguments):
355355
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")
356356

@@ -436,7 +436,7 @@ def run(args):
436436

437437
from .utils import load_features
438438
## load features; only requested features if genes given
439-
features = load_features(args.annotation, genes, args.nextclade_compatible)
439+
features = load_features(args.annotation, genes, args.use_nextclade_gff_parsing)
440440
# Ensure the already-created nuc annotation coordinates match those parsed from the reference file
441441
if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
442442
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):

augur/utils.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
import numpy as np
66
import os, json, sys
77
import pandas as pd
8-
from Bio.SeqFeature import CompoundLocation
8+
from Bio.SeqFeature import SimpleLocation, CompoundLocation, SeqFeature, FeatureLocation
99
from collections import defaultdict, OrderedDict
1010
from io import RawIOBase
1111
from textwrap import dedent
12+
from typing import Dict
1213
from .__version__ import __version__
1314

1415
from augur.data import as_file
@@ -168,7 +169,7 @@ def default(self, obj):
168169
return super().default(obj)
169170

170171

171-
def load_features(reference, feature_names=None, nextclade_compatible=False):
172+
def load_features(reference, feature_names=None, use_nextclade_gff_parsing=False):
172173
"""
173174
Parse a GFF/GenBank reference file. See the docstrings for _read_gff and
174175
_read_genbank for details.
@@ -179,7 +180,7 @@ def load_features(reference, feature_names=None, nextclade_compatible=False):
179180
File path to GFF or GenBank (.gb) reference
180181
feature_names : None or set or list (optional)
181182
Restrict the genes we read to those in the set/list
182-
nextclade_compatible : bool (optional)
183+
use_nextclade_gff_parsing : bool (optional)
183184
If True, parse GFF file the way Nextclade does
184185
185186
Returns
@@ -198,13 +199,13 @@ def load_features(reference, feature_names=None, nextclade_compatible=False):
198199
raise AugurError(f"reference sequence file {reference!r} not found")
199200

200201
if '.gff' in reference.lower():
201-
if nextclade_compatible:
202-
return _read_gff_nextclade_compatible(reference, feature_names)
202+
if use_nextclade_gff_parsing:
203+
return _read_gff_like_nextclade(reference, feature_names)
203204
return _read_gff(reference, feature_names)
204205
else:
205206
return _read_genbank(reference, feature_names)
206207

207-
def _read_nuc_annotation_from_gff(record, reference):
208+
def _read_nuc_annotation_from_gff(record: Bio.SeqRecord.SeqRecord, reference) -> SeqFeature:
208209
"""
209210
Looks for the ##sequence-region pragma as well as 'region' & 'source' GFF
210211
types. Note that 'source' isn't really a GFF feature type, but is used
@@ -234,7 +235,6 @@ def _read_nuc_annotation_from_gff(record, reference):
234235
if len(sequence_regions)>1:
235236
raise AugurError(f"Reference {reference!r} contains multiple ##sequence-region pragma lines. Augur can only handle GFF files with a single one.")
236237
elif sequence_regions:
237-
from Bio.SeqFeature import SeqFeature, FeatureLocation
238238
(name, start, stop) = sequence_regions[0]
239239
nuc['pragma'] = SeqFeature(
240240
FeatureLocation(start, stop, strand=1),
@@ -289,15 +289,15 @@ def _load_gff_record(reference, valid_types=None) -> Bio.SeqRecord.SeqRecord:
289289
warnings.simplefilter("default", BiopythonDeprecationWarning)
290290

291291
if len(gff_entries) == 0:
292-
msg = f"Reference {reference!r} contains no valid data rows"
292+
msg = f"Reference {reference!r} contains no valid data rows."
293293
if valid_types:
294-
msg += f"Valid GFF types (3rd column) are {', '.join(valid_types)}."
294+
msg += f" Valid GFF types (3rd column) are {', '.join(valid_types)}."
295295
raise AugurError(msg)
296296
if len(gff_entries) > 1:
297297
raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
298298
return gff_entries[0]
299299

300-
def _lookup_feature_name_nextclade_compatible(feature):
300+
def _lookup_feature_name_like_nextclade(feature: SeqFeature) -> str:
301301
# Matching Nextclade conventions (NAME_ATTRS_CDS)
302302
# https://github.com/nextstrain/nextclade/blob/59e757fd9c9f8d8edd16cf2063d77a859d4d3b96/packages/nextclade/src/io/gff3.rs#L35-L54
303303
QUALIFIER_PRIORITY = [ "Name", "name", "Alias", "alias", "standard_name", "old-name", "Gene", "gene", "gene_name",
@@ -308,7 +308,7 @@ def _lookup_feature_name_nextclade_compatible(feature):
308308
return feature.qualifiers[qualifier][0]
309309
raise AugurError(f"No valid feature name found for feature for feature {feature.id}")
310310

311-
def _read_gff_nextclade_compatible(reference, feature_names):
311+
def _read_gff_like_nextclade(reference, feature_names) -> Dict[str, SeqFeature]:
312312
"""
313313
Read a GFF file the way Nextclade does. That means:
314314
- We only look at CDS features.
@@ -359,7 +359,7 @@ def _flatten(feature):
359359
_flatten(feature)
360360

361361
for feature in cds_features.values():
362-
feature_name = _lookup_feature_name_nextclade_compatible(feature)
362+
feature_name = _lookup_feature_name_like_nextclade(feature)
363363
if feature_name == 'nuc':
364364
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")
365365
if feature_name in feature_names or feature_names is None:
@@ -373,7 +373,7 @@ def _flatten(feature):
373373
return features
374374

375375

376-
def _read_gff(reference, feature_names):
376+
def _read_gff(reference, feature_names) -> Dict[str, SeqFeature]:
377377
"""
378378
Read a GFF file. We only read GFF IDs 'gene' or 'source' (the latter may not technically
379379
be a valid GFF field, but is used widely within the Nextstrain ecosystem).
@@ -948,8 +948,6 @@ def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nu
948948
See schema-annotations.json for the schema this conforms to
949949
950950
"""
951-
from Bio.SeqFeature import SimpleLocation, CompoundLocation
952-
953951
if assert_nuc and 'nuc' not in features:
954952
raise AugurError("Genome features must include a feature for 'nuc'")
955953

tests/functional/ancestral/cram/infer-amino-acid-sequences-nextclade-gff.t

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotati
99
> --alignment $TESTDIR/../data/ebola/masked.fasta \
1010
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
1111
> --genes GP L NP sGP ssGP VP24 VP30 VP35 VP40 \
12-
> --nextclade-compatible \
12+
> --use-nextclade-gff-parsing \
1313
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
1414
> --infer-ambiguous \
1515
> --inference joint \

0 commit comments

Comments
 (0)