Skip to content

Commit 146084c

Browse files
committed
🚧 add sequence support
1 parent afd6fc7 commit 146084c

File tree

3 files changed

+240
-5
lines changed

3 files changed

+240
-5
lines changed

augur/merge.py

Lines changed: 163 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,13 @@
4646
from shutil import which
4747
from tempfile import mkstemp
4848
from textwrap import dedent
49-
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, TypeVar
49+
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, TypeVar, Union
5050

5151
from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
5252
from augur.errors import AugurError
5353
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata
5454
from augur.io.print import print_err, print_debug, _n
55+
from augur.io.sequences import read_sequences
5556
from augur.utils import first_line
5657

5758

@@ -74,6 +75,9 @@
7475
augur = f"augur"
7576

7677

78+
SEQUENCE_ID_COLUMN = 'id'
79+
80+
7781
class Database:
7882
fd: int
7983
"""Database file descriptor"""
@@ -105,13 +109,41 @@ def cleanup(self):
105109
os.unlink(self.path)
106110

107111

112+
class UnnamedFile:
    """A data file given without a user-assigned name."""

    table_name: str
    """Generated SQLite table name for this file, based on *path*."""

    path: str


class NamedFile:
    """A data file given with a user-assigned name (NAME=FILE)."""

    name: str
    """User-provided descriptive name for this file."""

    table_name: str
    """Generated SQLite table name for this file, based on *name*."""

    path: str


class NamedSequenceFile(NamedFile):
    """A sequence file with a user-provided name."""

    def __init__(self, name: str, path: str):
        self.name = name
        self.path = path
        self.table_name = f"sequences_{self.name}"

    def __repr__(self):
        return f"<NamedSequenceFile {self.name}={self.path}>"


class UnnamedSequenceFile(UnnamedFile):
    """A sequence file without a user-provided name."""

    def __init__(self, path: str):
        self.path = path
        # Derive the table name from the file's basename, replacing any
        # character that is unsafe in a SQLite identifier with '_'.
        self.table_name = f"sequences_{re.sub(r'[^a-zA-Z0-9]', '_', os.path.basename(self.path))}"

    def __repr__(self):
        # Fixed: previously printed "NamedSequenceFile" and referenced the
        # nonexistent attribute self.name, raising AttributeError.
        return f"<UnnamedSequenceFile {self.path}>"
115147

116148
class NamedMetadata(Metadata, NamedFile):
117149
def __init__(self, name: str, *args, **kwargs):
@@ -128,6 +160,7 @@ def register_parser(parent_subparsers):
128160

129161
input_group = parser.add_argument_group("inputs", "options related to input")
130162
input_group.add_argument("--metadata", nargs="+", action="extend", metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
163+
input_group.add_argument("--sequences", nargs="+", action="extend", metavar="[NAME=]FILE", help="Sequence files, optionally named for validation with named metadata. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
131164

132165
input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables by default. Table-specific column names may be given using the same names assigned in --metadata. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
133166
input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables by default. Table-specific delimiters may be given using the same names assigned in --metadata. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
@@ -136,23 +169,26 @@ def register_parser(parent_subparsers):
136169
output_group.add_argument('--output-metadata', metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
137170
output_group.add_argument('--source-columns', metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)
138171
output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data. This is the default behaviour, but it may be made explicit or used to override a previous --source-columns." + SKIP_AUTO_DEFAULT_IN_HELP)
172+
output_group.add_argument('--output-sequences', metavar="FILE", help="Required. Merged sequences as FASTA. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
139173
output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)
140174

141175
return parser
142176

143177

144178
def validate_arguments(args):
    """Check that the combination of command-line *args* is usable.

    Raises AugurError on the first violated requirement.
    """
    if not any((args.metadata, args.sequences)):
        raise AugurError("At least one input must be specified.")
    if not any((args.output_metadata, args.output_sequences)):
        raise AugurError("At least one output must be specified.")

    # A single metadata table would have nothing to merge with.
    if args.metadata and len(args.metadata) < 2:
        raise AugurError("At least two metadata inputs are required for merging.")

    # Each output requires its corresponding input.
    if args.output_metadata and not args.metadata:
        raise AugurError("--output-metadata requires --metadata.")
    if args.output_sequences and not args.sequences:
        raise AugurError("--output-sequences requires --sequences.")
156192

157193

158194
def run(args: argparse.Namespace):
@@ -166,15 +202,65 @@ def run(args: argparse.Namespace):
166202

167203
db = Database()
168204

205+
metadata: Optional[List[NamedMetadata]] = None
206+
sequences: Optional[List[Union[NamedSequenceFile, UnnamedSequenceFile]]] = None
169207
if args.metadata:
170208
metadata = get_metadata(args.metadata, args.metadata_id_columns, args.metadata_delimiters)
171-
output_source_column = get_output_source_column(args.source_columns, metadata)
172-
output_columns = get_output_columns(metadata, args.source_columns)
209+
210+
if args.sequences:
211+
sequences = list(get_sequences(args.sequences))
212+
213+
214+
# Perform checks on file names.
215+
216+
named_sequences = [s for s in sequences if isinstance(s, NamedSequenceFile)]
217+
218+
if unnamed_sequences := [s for s in sequences if isinstance(s, UnnamedSequenceFile)]:
219+
for x in unnamed_sequences:
220+
print_info(f"WARNING: Sequence file {x.path!r} is unnamed. Skipping validation with metadata.")
221+
222+
if metadata and named_sequences:
223+
metadata_order = [m.name for m in metadata]
224+
sequences_order = [s.name for s in named_sequences]
225+
226+
if metadata_order != sequences_order:
227+
raise AugurError(f"Order of inputs differs between named metadata {metadata_order!r} and named sequences {sequences_order!r}.")
228+
229+
# FIXME: easy win that requires a few more conditions:
230+
# ERROR: Sequence file 'c=c.fasta' does not have a corresponding metadata file.
231+
232+
233+
# Load data.
234+
235+
if metadata:
173236
load_metadata(db, metadata)
237+
if sequences:
238+
load_sequences(db, sequences)
239+
240+
241+
# Perform checks on file contents.
242+
243+
if metadata and named_sequences:
244+
metadata_by_name = {m.name: m for m in metadata}
245+
sequences_by_name = {s.name: s for s in named_sequences}
246+
247+
for name in metadata_by_name.keys() & sequences_by_name.keys():
248+
# FIXME: check database for matching entries
249+
# WARNING: Sequence 'XXX' in a.tsv is missing from a.fasta. It will not be present in any output.
250+
# WARNING: Sequence 'YYY' in b.fasta is missing from b.csv. It will not be present in any output.
251+
...
252+
253+
254+
# Handle outputs.
174255

175256
if args.output_metadata:
257+
output_source_column = get_output_source_column(args.source_columns, metadata)
258+
output_columns = get_output_columns(metadata, args.source_columns)
176259
merge_metadata(db, metadata, output_columns, args.output_metadata, output_source_column)
177260

261+
if args.output_sequences:
262+
merge_sequences(db, sequences, args.output_sequences)
263+
178264

179265
def get_metadata(
180266
input_metadata: Sequence[str],
@@ -387,6 +473,78 @@ def merge_metadata(
387473
db.cleanup()
388474

389475

476+
def get_sequences(input_sequences: List[str]):
    """Yield a sequence file object for each ``[NAME=]FILE`` input.

    Inputs parsed with an empty name yield UnnamedSequenceFile; all
    others yield NamedSequenceFile.
    """
    for name, path in parse_inputs(input_sequences):
        if name != "":
            yield NamedSequenceFile(name, path)
        else:
            yield UnnamedSequenceFile(path)
484+
485+
486+
def load_sequences(db: Database, sequences: List[Union[NamedSequenceFile, UnnamedSequenceFile]]):
    """Load the sequence IDs of each file into its own single-column table.

    Raises AugurError if any file contains duplicated sequence IDs.
    """
    for s in sequences:
        ids = [seq.id for seq in read_sequences(s.path)]

        if duplicates := [item for item, count in count_unique(ids) if count > 1]:
            raise AugurError(f"The following entries are duplicated in {s.path!r}:\n" + "\n".join(duplicates))

        db.run(f"create table {sqlite_quote_id(s.table_name)} ({sqlite_quote_id(SEQUENCE_ID_COLUMN)} text);")

        # Escape embedded single quotes by doubling them (standard SQL):
        # the previous bare interpolation produced malformed statements —
        # and allowed SQL injection — for IDs containing a quote.
        if ids:  # an empty VALUES list is a SQL syntax error
            values = ", ".join("('" + id.replace("'", "''") + "')" for id in ids)
            db.run(f"insert into {sqlite_quote_id(s.table_name)} ({sqlite_quote_id(SEQUENCE_ID_COLUMN)}) values {values};")

        # Unique index to speed up later joins on the ID column.
        db.run(f'create unique index {sqlite_quote_id(f"{s.table_name}_id")} on {sqlite_quote_id(s.table_name)}({sqlite_quote_id(SEQUENCE_ID_COLUMN)});')
498+
499+
500+
# FIXME: return a list of arguments and don't use shell
def cat(filepath: str):
    """Return a shell command string that writes *filepath* to stdout,
    decompressing it first if its extension indicates compression.
    """
    # Local import keeps this helper self-contained.
    from shlex import quote

    # Map compression extensions to the matching decompression tool.
    decompressors = {
        ".gz": "gzcat",
        ".xz": "xzcat",
        ".zst": "zstdcat",
    }
    program = "cat"
    for extension, tool in decompressors.items():
        if filepath.endswith(extension):
            program = tool

    # Quote the path so names containing spaces or shell metacharacters
    # survive interpolation into a larger shell command.
    return f"{program} {quote(filepath)}"
512+
513+
514+
def merge_sequences(
    db: Database,
    sequences: List[Union[NamedSequenceFile, UnnamedSequenceFile]],
    output_sequences: str,
):
    """Merge *sequences* into a single FASTA written to *output_sequences*.

    When the same sequence ID appears in multiple files, the entry from
    the last-listed file wins. Raises AugurError if seqkit is missing or
    the merge pipeline fails.
    """
    # Confirm that seqkit is installed.
    if which("seqkit") is None:
        raise AugurError("'seqkit' is not installed! This is required to merge sequences.")

    # Reversed because seqkit rmdup keeps the first entry but this command
    # should keep the last entry.
    # FIXME: don't use shell. just using it to get a sense of feasibility.
    # FIXME: is seqkit overkill here? compare to ncov's drop_duplicate_sequences which is plain Python.
    # https://github.com/nextstrain/ncov/blob/0769ac0429df8456ce70be2f74dc885d7b7fab12/scripts/sanitize_sequences.py#L127
    cat_processes = (f"<({cat(filepath)})" for filepath in reversed([sequence_input.path for sequence_input in sequences]))
    shell_cmd = f"cat {' '.join(cat_processes)} | seqkit rmdup"
    print_debug(f"running shell command {shell_cmd!r}")
    process = subprocess.Popen(shell_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE)

    # FIXME: output only the sequences that are also present in metadata
    # 1. before calling this function, create an index table mapping ID -> output (boolean)
    # 2. create a file with list of IDs to output
    # 3. use `seqkit grep` to filter the output of rmdup

    # FIXME: handle `-` better
    output = process.communicate()[0]

    # Fail loudly rather than silently writing partial output when the
    # pipeline errors (previously the exit status was never checked).
    if process.returncode != 0:
        raise AugurError(f"Sequence merge pipeline exited with status {process.returncode}.")

    if output_sequences == "-":
        sys.stdout.write(output.decode())
    else:
        with open(output_sequences, "w") as f:
            f.write(output.decode())
546+
547+
# FIXME: do this for seqkit too
390548
def sqlite3(*args, **kwargs):
391549
"""
392550
Internal helper for invoking ``sqlite3``, the SQLite CLI program.
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
SETUP
2+
3+
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
4+
5+
Merge sequences and metadata
6+
7+
$ cat >x.fasta <<~~
8+
> >seq1
9+
> ATCG
10+
> >seq2
11+
> GCTA
12+
> >seq3
13+
> TCGA
14+
> ~~
15+
16+
$ cat >y.fasta <<~~
17+
> >seq3
18+
> ATCG
19+
> >seq4
20+
> GCTA
21+
> ~~
22+
23+
$ cat >x.tsv <<~~
24+
> strain a b c
25+
> one X1a X1b X1c
26+
> two X2a X2b X2c
27+
> ~~
28+
29+
$ cat >y.tsv <<~~
30+
> strain b c f e d
31+
> two Y2c Y2f Y2e Y2d
32+
> three Y3f Y3e Y3d
33+
> ~~
34+
35+
$ ${AUGUR} merge \
36+
> --metadata X=x.tsv Y=y.tsv \
37+
> --sequences X=x.fasta Y=y.fasta \
38+
> --output-metadata merged.tsv \
39+
> --output-sequences merged.fasta \
40+
> --quiet
41+
[INFO]\x1b[0m 1 duplicated records removed (esc)
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
SETUP
2+
3+
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
4+
5+
Merge sequences without metadata
6+
7+
$ cat >x.fasta <<~~
8+
> >seq1
9+
> ATCG
10+
> >seq2
11+
> GCTA
12+
> >seq3
13+
> TCGA
14+
> ~~
15+
16+
$ cat >y.fasta <<~~
17+
> >seq3
18+
> ATCG
19+
> >seq4
20+
> GCTA
21+
> ~~
22+
23+
$ ${AUGUR} merge \
24+
> --sequences x=x.fasta y=y.fasta \
25+
> --output-sequences - > merged.fasta
26+
[INFO]\x1b[0m 1 duplicated records removed (esc)
27+
28+
$ cat merged.fasta
29+
>seq3
30+
ATCG
31+
>seq4
32+
GCTA
33+
>seq1
34+
ATCG
35+
>seq2
36+
GCTA

0 commit comments

Comments
 (0)