Skip to content

Commit

Permalink
Merge pull request #740: filter: Support relative dates for `--min-date` and `--max-date`
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Apr 11, 2022
2 parents 64ef713 + 29a5a65 commit b464692
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 4 deletions.
45 changes: 41 additions & 4 deletions augur/filter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""
Filter and subsample a sequence set.
"""
import argparse
from Bio import SeqIO
from collections import defaultdict
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand All @@ -17,6 +19,7 @@
import sys
from tempfile import NamedTemporaryFile
import treetime.utils
from textwrap import dedent
from typing import Collection

from .index import index_sequences, index_vcf
Expand All @@ -30,6 +33,12 @@
"non_nucleotide",
)

# Human-readable list of the date formats accepted by `--min-date` and
# `--max-date`. Interpolated into both the argparse help strings and the
# error raised when a date cannot be parsed, so it ends with a newline.
SUPPORTED_DATE_HELP_TEXT = (
    "1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or\n"
    "2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n"
    "3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')\n"
)


class FilterException(Exception):
"""Representation of an error that occurred during filtering.
Expand Down Expand Up @@ -1109,8 +1118,10 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--min-date', type=numeric_date,
help=f"""minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--max-date', type=numeric_date,
help=f"""maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}""")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:

    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    Raises :py:class:`argparse.ArgumentTypeError` when *date* matches none of
    the formats above, so it can be used directly as an argparse ``type=``.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    try:
        return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2))))
    except (ValueError, TypeError):
        # ValueError: a field is not an integer or is out of range.
        # TypeError: fewer than three fields were given (e.g. "2020-06"), so
        # datetime.date() is missing arguments; fall through so the caller
        # still gets the supported-formats message below.
        pass

    # date is a duration treated as a backwards-looking relative date
    try:
        duration_str = str(date)
        # The leading "P" required by ISO 8601 is optional on the command line.
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, OverflowError, isodate.ISO8601Error):
        # OverflowError: the duration is too large for timedelta/date
        # arithmetic (e.g. "999999999D"); treat it like any other bad value.
        pass

    raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")


def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
install_requires = [
"bcbio-gff >=0.6.0, ==0.6.*",
"biopython >=1.67, !=1.77, !=1.78",
"isodate ==0.6.*",
"jsonschema >=3.0.0, ==3.*",
"networkx >= 2.5, ==2.*",
"packaging >=19.2",
Expand Down
152 changes: 152 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from textwrap import dedent
import numpy as np
import random
import shlex
Expand All @@ -9,6 +10,8 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from freezegun import freeze_time

import augur.filter
from augur.utils import read_metadata

Expand Down Expand Up @@ -265,3 +268,152 @@ def test_filter_date_formats(self, tmpdir, fasta_fn, argparser):
augur.filter.run(args)
output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
assert list(output.keys()) == ["SEQ_1", "SEQ_2", "SEQ_3"]

@freeze_time("2020-03-25")
@pytest.mark.parametrize(
    "argparse_params, metadata_rows, output_sorted_expected",
    [
        # Each case: CLI arguments, (strain, date) metadata rows, strains
        # expected in the output. "Today" is frozen at 2020-03-25.
        # Days.
        ("--min-date 1D",
         (("SEQ_1", "2020-03-23"), ("SEQ_2", "2020-03-24"), ("SEQ_3", "2020-03-25")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date 1D",
         (("SEQ_1", "2020-03-23"), ("SEQ_2", "2020-03-24"), ("SEQ_3", "2020-03-25")),
         ["SEQ_1", "SEQ_2"]),
        # Weeks.
        ("--min-date 4W",
         (("SEQ_1", "2020-02-25"), ("SEQ_2", "2020-02-26"), ("SEQ_3", "2020-03-25")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date 4W",
         (("SEQ_1", "2020-02-25"), ("SEQ_2", "2020-02-26"), ("SEQ_3", "2020-03-25")),
         ["SEQ_1", "SEQ_2"]),
        # Months, without the ISO 8601 "P" prefix.
        ("--min-date 1M",
         (("SEQ_1", "2020-01-25"), ("SEQ_2", "2020-02-25"), ("SEQ_3", "2020-03-25")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date 1M",
         (("SEQ_1", "2020-01-25"), ("SEQ_2", "2020-02-25"), ("SEQ_3", "2020-03-25")),
         ["SEQ_1", "SEQ_2"]),
        # Months, with the "P" prefix.
        ("--min-date P1M",
         (("SEQ_1", "2020-01-25"), ("SEQ_2", "2020-02-25"), ("SEQ_3", "2020-03-25")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date P1M",
         (("SEQ_1", "2020-01-25"), ("SEQ_2", "2020-02-25"), ("SEQ_3", "2020-03-25")),
         ["SEQ_1", "SEQ_2"]),
        # Years.
        ("--min-date 2Y",
         (("SEQ_1", "2017-03-25"), ("SEQ_2", "2018-03-25"), ("SEQ_3", "2019-03-25")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date 2Y",
         (("SEQ_1", "2017-03-25"), ("SEQ_2", "2018-03-25"), ("SEQ_3", "2019-03-25")),
         ["SEQ_1", "SEQ_2"]),
        # Compound duration combining years, weeks and days.
        ("--min-date 1Y2W5D",
         (("SEQ_1", "2019-03-05"), ("SEQ_2", "2019-03-06"), ("SEQ_3", "2019-03-07")),
         ["SEQ_2", "SEQ_3"]),
        ("--max-date 1Y2W5D",
         (("SEQ_1", "2019-03-05"), ("SEQ_2", "2019-03-06"), ("SEQ_3", "2019-03-07")),
         ["SEQ_1", "SEQ_2"]),
    ],
)
def test_filter_relative_dates(self, tmpdir, argparser, argparse_params, metadata_rows, output_sorted_expected):
    """Run the filter with a relative --min-date/--max-date and compare the kept strains."""
    strains_path = str(tmpdir / "filtered.txt")
    header_row = ("strain", "date")
    metadata_path = write_metadata(tmpdir, (header_row, *metadata_rows))
    args = argparser(f'--metadata {metadata_path} --output-strains {strains_path} {argparse_params}')
    augur.filter.run(args)
    # Output order is not asserted, so compare the sorted strain names.
    with open(strains_path) as handle:
        kept_strains = sorted(map(str.rstrip, handle))
    assert kept_strains == output_sorted_expected

@freeze_time("2020-03-25")
@pytest.mark.parametrize(
    "argparse_flag, argparse_value",
    [
        (flag, value)
        for value in ("3000Y", "invalid")
        for flag in ("--min-date", "--max-date")
    ],
)
def test_filter_relative_dates_error(self, tmpdir, argparser, argparse_flag, argparse_value):
    """Check that a bad date value makes argument parsing exit with the supported-formats message."""
    strains_path = str(tmpdir / "filtered.txt")
    metadata_path = write_metadata(tmpdir, (("strain", "date"), ("SEQ_1", "2020-03-23")))
    expected_message = dedent(f"""\
        Unable to determine date from '{argparse_value}'. Ensure it is in one of the supported formats:
        1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
        2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
        3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
        """)
    with pytest.raises(SystemExit) as e_info:
        argparser(f'--metadata {metadata_path} --output-strains {strains_path} {argparse_flag} {argparse_value}')
    # The error is read from the exception chained onto SystemExit as its context.
    assert e_info.value.__context__.message == expected_message

0 comments on commit b464692

Please sign in to comment.