Merge pull request #1807 from nextstrain/empty-index

joverlee521 · web-flow · commit 52eb0074636a · 2025-05-19T10:56:09.000-07:00
filter: Error on empty indexes
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,6 +5,7 @@
 ### Major Changes
 
 * `augur mask --mask`, `augur tree --exclude-sites`: BED files with inconsistent CHROM values (i.e., values in the first column of data lines) will throw an error, as Augur (implicitly) expects to be working on a single piece of DNA (chromosome, segment, etc), and multiple CHROM values in a BED file indicate a violation of this expectation. This is a breaking change. [#945][] (@genehack)
+* `augur filter`: Empty values in the metadata id column now result in an error. To resolve it, either remove the rows with empty ids from the metadata file or specify a different id column with `--metadata-id-columns`. [#1807][] (@joverlee521)
 
 ### Bug fixes
 
@@ -20,6 +21,7 @@
 [#1791]: https://github.com/nextstrain/augur/issues/1791
 [#1801]: https://github.com/nextstrain/augur/pull/1801
 [#1804]: https://github.com/nextstrain/augur/pull/1804
+[#1807]: https://github.com/nextstrain/augur/pull/1807
 
 ## 30.0.1 (28 April 2025)
 
diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -173,13 +173,18 @@ def run(args):
         dtype="string",
     )
     for metadata in metadata_reader:
+        if len(metadata.loc[metadata.index == '']):
+            cleanup_outputs(args)
+            raise AugurError(f"Found rows with empty values in id column {metadata.index.name!r} in {args.metadata!r}\n" + \
+                             "Please remove the rows with empty ids or use a different id column via --metadata-id-columns.")
+
         duplicate_strains = (
             set(metadata.index[metadata.index.duplicated()]) |
             (set(metadata.index) & metadata_strains)
         )
         if len(duplicate_strains) > 0:
             cleanup_outputs(args)
-            raise AugurError(f"The following strains are duplicated in '{args.metadata}':\n" + "\n".join(sorted(duplicate_strains)))
+            raise AugurError(f"The following strains are duplicated in '{args.metadata}':\n" + "\n".join(repr(x) for x in sorted(duplicate_strains)))
 
         # Maintain list of all strains seen.
         metadata_strains.update(set(metadata.index.values))
@@ -396,7 +401,7 @@ def run(args):
 
         if duplicates:
             cleanup_outputs(args)
-            raise AugurError(f"The following strains are duplicated in '{args.sequences}':\n" + "\n".join(sorted(duplicates)))
+            raise AugurError(f"The following strains are duplicated in '{args.sequences}':\n" + "\n".join(repr(x) for x in sorted(duplicates)))
 
         if sequence_strains != observed_sequence_strains:
             # Warn the user if the expected strains from the sequence index are
diff --git a/docs/conf.py b/docs/conf.py
@@ -164,6 +164,7 @@ def prose_list(items):
      r'https://www\.gnu\.org/software/bash/manual/bash\.html#ANSI_002dC-Quoting',
      r'https://stackoverflow\.com/',
      r'https://github\.com/',
+     r'https://www.merriam-webster\.com/',
 ]
 linkcheck_anchors_ignore_for_url = [
      # Github uses anchor-looking links for highlighting lines but
diff --git a/tests/functional/filter/cram/filter-duplicates-error.t b/tests/functional/filter/cram/filter-duplicates-error.t
@@ -20,7 +20,7 @@ Error on duplicates in metadata within same chunk.
   >   --metadata-chunk-size 10 \
   >   --output-metadata metadata-filtered.tsv > /dev/null
   ERROR: The following strains are duplicated in .* (re)
-  a
+  'a'
   [2]
   $ cat metadata-filtered.tsv
   cat: .*: No such file or directory (re)
@@ -36,7 +36,7 @@ Error on duplicates in metadata in separate chunks.
   >   --metadata-chunk-size 1 \
   >   --output-metadata metadata-filtered.tsv > /dev/null
   ERROR: The following strains are duplicated in .* (re)
-  a
+  'a'
   [2]
   $ cat metadata-filtered.tsv
   cat: .*: No such file or directory (re)
@@ -68,8 +68,8 @@ Error on duplicates in sequences.
   >   --sequences sequences.fasta \
   >   --output-sequences sequences-filtered.fasta
   ERROR: The following strains are duplicated in 'sequences.fasta':
-  a
-  c
+  'a'
+  'c'
   [2]
 
 Error even if the corresponding output is not used.
@@ -79,6 +79,6 @@ Error even if the corresponding output is not used.
   >   --sequences sequences.fasta \
   >   --output-strains filtered.txt
   ERROR: The following strains are duplicated in 'sequences.fasta':
-  a
-  c
+  'a'
+  'c'
   [2]
diff --git a/tests/functional/filter/cram/filter-empty-index-error.t b/tests/functional/filter/cram/filter-empty-index-error.t
@@ -0,0 +1,27 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Error on empty indexes in metadata.
+
+  $ cat >metadata-empty-indexes.tsv <<~~
+  > strain	date
+  > 	2010-10-10
+  > 	2010-10-10
+  > b	2010-10-10
+  > c	2010-10-10
+  > d	2010-10-10
+  > ~~
+  $ ${AUGUR} filter \
+  >   --metadata metadata-empty-indexes.tsv \
+  >   --group-by year \
+  >   --sequences-per-group 2 \
+  >   --subsample-seed 0 \
+  >   --metadata-chunk-size 10 \
+  >   --output-metadata metadata-filtered.tsv > /dev/null
+  ERROR: Found rows with empty values in id column 'strain' in .* (re)
+  Please remove the rows with empty ids or use a different id column via --metadata-id-columns.
+  [2]
+  $ cat metadata-filtered.tsv
+  cat: .*: No such file or directory (re)
+  [1]

Original file line number	Diff line number	Diff line change
`@@ -164,6 +164,7 @@ def prose_list(items):`
`164`	`164`	`r'https://www\.gnu\.org/software/bash/manual/bash\.html#ANSI_002dC-Quoting',`
`165`	`165`	`r'https://stackoverflow\.com/',`
`166`	`166`	`r'https://github\.com/',`
	`167`	`+ r'https://www.merriam-webster\.com/',`
`167`	`168`	`]`
`168`	`169`	`linkcheck_anchors_ignore_for_url = [`
`169`	`170`	`# Github uses anchor-looking links for highlighting lines but`