55import numpy as np
66import os , json , sys
77import pandas as pd
8- from Bio .SeqFeature import CompoundLocation
8+ from Bio .SeqFeature import SimpleLocation , CompoundLocation , SeqFeature , FeatureLocation
99from collections import defaultdict , OrderedDict
1010from io import RawIOBase
1111from textwrap import dedent
12+ from typing import Dict
1213from .__version__ import __version__
1314
1415from augur .data import as_file
@@ -168,7 +169,7 @@ def default(self, obj):
168169 return super ().default (obj )
169170
170171
171- def load_features (reference , feature_names = None , nextclade_compatible = False ):
172+ def load_features (reference , feature_names = None , use_nextclade_gff_parsing = False ):
172173 """
173174 Parse a GFF/GenBank reference file. See the docstrings for _read_gff and
174175 _read_genbank for details.
@@ -179,7 +180,7 @@ def load_features(reference, feature_names=None, nextclade_compatible=False):
179180 File path to GFF or GenBank (.gb) reference
180181 feature_names : None or set or list (optional)
181182 Restrict the genes we read to those in the set/list
182- nextclade_compatible : bool (optional)
183+ use_nextclade_gff_parsing : bool (optional)
183184 If True, parse GFF file the way Nextclade does
184185
185186 Returns
@@ -198,13 +199,13 @@ def load_features(reference, feature_names=None, nextclade_compatible=False):
198199 raise AugurError (f"reference sequence file { reference !r} not found" )
199200
200201 if '.gff' in reference .lower ():
201- if nextclade_compatible :
202- return _read_gff_nextclade_compatible (reference , feature_names )
202+ if use_nextclade_gff_parsing :
203+ return _read_gff_like_nextclade (reference , feature_names )
203204 return _read_gff (reference , feature_names )
204205 else :
205206 return _read_genbank (reference , feature_names )
206207
207- def _read_nuc_annotation_from_gff (record , reference ):
208+ def _read_nuc_annotation_from_gff (record : Bio . SeqRecord . SeqRecord , reference ) -> SeqFeature :
208209 """
209210 Looks for the ##sequence-region pragma as well as 'region' & 'source' GFF
210211 types. Note that 'source' isn't really a GFF feature type, but is used
@@ -234,7 +235,6 @@ def _read_nuc_annotation_from_gff(record, reference):
234235 if len (sequence_regions )> 1 :
235236 raise AugurError (f"Reference { reference !r} contains multiple ##sequence-region pragma lines. Augur can only handle GFF files with a single one." )
236237 elif sequence_regions :
237- from Bio .SeqFeature import SeqFeature , FeatureLocation
238238 (name , start , stop ) = sequence_regions [0 ]
239239 nuc ['pragma' ] = SeqFeature (
240240 FeatureLocation (start , stop , strand = 1 ),
@@ -289,15 +289,15 @@ def _load_gff_record(reference, valid_types=None) -> Bio.SeqRecord.SeqRecord:
289289 warnings .simplefilter ("default" , BiopythonDeprecationWarning )
290290
291291 if len (gff_entries ) == 0 :
292- msg = f"Reference { reference !r} contains no valid data rows"
292+ msg = f"Reference { reference !r} contains no valid data rows. "
293293 if valid_types :
294- msg += f"Valid GFF types (3rd column) are { ', ' .join (valid_types )} ."
294+ msg += f" Valid GFF types (3rd column) are { ', ' .join (valid_types )} ."
295295 raise AugurError (msg )
296296 if len (gff_entries ) > 1 :
297297 raise AugurError (f"Reference { reference !r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid." )
298298 return gff_entries [0 ]
299299
300- def _lookup_feature_name_nextclade_compatible (feature ) :
300+ def _lookup_feature_name_like_nextclade (feature : SeqFeature ) -> str :
301301 # Matching Nextclade conventions (NAME_ATTRS_CDS)
302302 # https://github.com/nextstrain/nextclade/blob/59e757fd9c9f8d8edd16cf2063d77a859d4d3b96/packages/nextclade/src/io/gff3.rs#L35-L54
303303 QUALIFIER_PRIORITY = [ "Name" , "name" , "Alias" , "alias" , "standard_name" , "old-name" , "Gene" , "gene" , "gene_name" ,
@@ -308,7 +308,7 @@ def _lookup_feature_name_nextclade_compatible(feature):
308308 return feature .qualifiers [qualifier ][0 ]
309309 raise AugurError (f"No valid feature name found for feature for feature { feature .id } " )
310310
311- def _read_gff_nextclade_compatible (reference , feature_names ):
311+ def _read_gff_like_nextclade (reference , feature_names ) -> Dict [ str , SeqFeature ] :
312312 """
313313 Read a GFF file the way Nextclade does. That means:
314314 - We only look at CDS features.
@@ -359,7 +359,7 @@ def _flatten(feature):
359359 _flatten (feature )
360360
361361 for feature in cds_features .values ():
362- feature_name = _lookup_feature_name_nextclade_compatible (feature )
362+ feature_name = _lookup_feature_name_like_nextclade (feature )
363363 if feature_name == 'nuc' :
364364 raise AugurError (f"Reference { reference !r} contains a gene with the name 'nuc'. This is not allowed." )
365365 if feature_name in feature_names or feature_names is None :
@@ -373,7 +373,7 @@ def _flatten(feature):
373373 return features
374374
375375
376- def _read_gff (reference , feature_names ):
376+ def _read_gff (reference , feature_names ) -> Dict [ str , SeqFeature ] :
377377 """
378378 Read a GFF file. We only read GFF types (3rd column) 'gene' or 'source' (the latter may not technically
379379 be a valid GFF type, but is used widely within the Nextstrain ecosystem).
@@ -948,8 +948,6 @@ def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nu
948948 See schema-annotations.json for the schema this conforms to
949949
950950 """
951- from Bio .SeqFeature import SimpleLocation , CompoundLocation
952-
953951 if assert_nuc and 'nuc' not in features :
954952 raise AugurError ("Genome features must include a feature for 'nuc'" )
955953
0 commit comments