Skip to content

Commit 52eb007

Browse files
authored
Merge pull request #1807 from nextstrain/empty-index
filter: Error on empty indexes
2 parents 071b811 + 511ad88 commit 52eb007

File tree

5 files changed

+43
-8
lines changed

5 files changed

+43
-8
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Major Changes
66

77
* `augur mask --mask`, `augur tree --exclude-sites`: BED files with inconsistent CHROM values (i.e., values in the first column of data lines) now cause an error. Augur implicitly expects to be working on a single piece of DNA (chromosome, segment, etc.), and multiple CHROM values in a BED file violate this expectation. This is a breaking change. [#945][] (@genehack)
8+
* filter: Empty values in the metadata id column now result in an error. To resolve it, either edit the metadata file to remove or fill in the rows with empty ids, or specify a different id column with `--metadata-id-columns`. [#1807][] (@joverlee521)
89

910
### Bug fixes
1011

@@ -20,6 +21,7 @@
2021
[#1791]: https://github.com/nextstrain/augur/issues/1791
2122
[#1801]: https://github.com/nextstrain/augur/pull/1801
2223
[#1804]: https://github.com/nextstrain/augur/pull/1804
24+
[#1807]: https://github.com/nextstrain/augur/pull/1807
2325

2426
## 30.0.1 (28 April 2025)
2527

augur/filter/_run.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,18 @@ def run(args):
173173
dtype="string",
174174
)
175175
for metadata in metadata_reader:
176+
if len(metadata.loc[metadata.index == '']):
177+
cleanup_outputs(args)
178+
raise AugurError(f"Found rows with empty values in id column {metadata.index.name!r} in {args.metadata!r}\n" + \
179+
"Please remove the rows with empty ids or use a different id column via --metadata-id-columns.")
180+
176181
duplicate_strains = (
177182
set(metadata.index[metadata.index.duplicated()]) |
178183
(set(metadata.index) & metadata_strains)
179184
)
180185
if len(duplicate_strains) > 0:
181186
cleanup_outputs(args)
182-
raise AugurError(f"The following strains are duplicated in '{args.metadata}':\n" + "\n".join(sorted(duplicate_strains)))
187+
raise AugurError(f"The following strains are duplicated in '{args.metadata}':\n" + "\n".join(repr(x) for x in sorted(duplicate_strains)))
183188

184189
# Maintain list of all strains seen.
185190
metadata_strains.update(set(metadata.index.values))
@@ -396,7 +401,7 @@ def run(args):
396401

397402
if duplicates:
398403
cleanup_outputs(args)
399-
raise AugurError(f"The following strains are duplicated in '{args.sequences}':\n" + "\n".join(sorted(duplicates)))
404+
raise AugurError(f"The following strains are duplicated in '{args.sequences}':\n" + "\n".join(repr(x) for x in sorted(duplicates)))
400405

401406
if sequence_strains != observed_sequence_strains:
402407
# Warn the user if the expected strains from the sequence index are

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ def prose_list(items):
164164
r'https://www\.gnu\.org/software/bash/manual/bash\.html#ANSI_002dC-Quoting',
165165
r'https://stackoverflow\.com/',
166166
r'https://github\.com/',
167+
r'https://www.merriam-webster\.com/',
167168
]
168169
linkcheck_anchors_ignore_for_url = [
169170
# Github uses anchor-looking links for highlighting lines but

tests/functional/filter/cram/filter-duplicates-error.t

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Error on duplicates in metadata within same chunk.
2020
> --metadata-chunk-size 10 \
2121
> --output-metadata metadata-filtered.tsv > /dev/null
2222
ERROR: The following strains are duplicated in .* (re)
23-
a
23+
'a'
2424
[2]
2525
$ cat metadata-filtered.tsv
2626
cat: .*: No such file or directory (re)
@@ -36,7 +36,7 @@ Error on duplicates in metadata in separate chunks.
3636
> --metadata-chunk-size 1 \
3737
> --output-metadata metadata-filtered.tsv > /dev/null
3838
ERROR: The following strains are duplicated in .* (re)
39-
a
39+
'a'
4040
[2]
4141
$ cat metadata-filtered.tsv
4242
cat: .*: No such file or directory (re)
@@ -68,8 +68,8 @@ Error on duplicates in sequences.
6868
> --sequences sequences.fasta \
6969
> --output-sequences sequences-filtered.fasta
7070
ERROR: The following strains are duplicated in 'sequences.fasta':
71-
a
72-
c
71+
'a'
72+
'c'
7373
[2]
7474

7575
Error even if the corresponding output is not used.
@@ -79,6 +79,6 @@ Error even if the corresponding output is not used.
7979
> --sequences sequences.fasta \
8080
> --output-strains filtered.txt
8181
ERROR: The following strains are duplicated in 'sequences.fasta':
82-
a
83-
c
82+
'a'
83+
'c'
8484
[2]
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Setup
2+
3+
$ source "$TESTDIR"/_setup.sh
4+
5+
Error on empty indexes in metadata.
6+
7+
$ cat >metadata-empty-indexes.tsv <<~~
8+
> strain date
9+
> 2010-10-10
10+
> 2010-10-10
11+
> b 2010-10-10
12+
> c 2010-10-10
13+
> d 2010-10-10
14+
> ~~
15+
$ ${AUGUR} filter \
16+
> --metadata metadata-empty-indexes.tsv \
17+
> --group-by year \
18+
> --sequences-per-group 2 \
19+
> --subsample-seed 0 \
20+
> --metadata-chunk-size 10 \
21+
> --output-metadata metadata-filtered.tsv > /dev/null
22+
ERROR: Found rows with empty values in id column 'strain' in .* (re)
23+
Please remove the rows with empty ids or use a different id column via --metadata-id-columns.
24+
[2]
25+
$ cat metadata-filtered.tsv
26+
cat: .*: No such file or directory (re)
27+
[1]

0 commit comments

Comments
 (0)