Skip to content

Commit

Permalink
Move date logic from filter to utils
Browse files Browse the repository at this point in the history
With #740, filter's numeric_date() provides support for 3 date formats.
However, supporting various date formats is not specific to filter (e.g. frequencies has a separate numeric_date() which is now out-of-sync with this filter's numeric_date()).

This commit:

1. Moves numeric_date() to a new submodule augur.util_support.date_parsing (imported in augur/utils.py, so it is reachable as augur.utils.date_parsing)
2. Moves the related SUPPORTED_DATE_HELP_TEXT to augur.util_support.date_parsing as well
3. Updates numeric_date() to raise a ValueError rather than argparse.ArgumentTypeError so it can be generalized to non-argparse usage
4. Adds a new function numeric_date_type() which wraps numeric_date() and raises an argparse.ArgumentTypeError per #740 (comment)
  • Loading branch information
victorlin committed Apr 12, 2022
1 parent 24b96e6 commit 1ae4696
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 64 deletions.
75 changes: 11 additions & 64 deletions augur/filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
Filter and subsample a sequence set.
"""
import argparse
from Bio import SeqIO
from collections import defaultdict
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand All @@ -18,12 +15,12 @@
import re
import sys
from tempfile import NamedTemporaryFile
import treetime.utils
from textwrap import dedent
from typing import Collection


from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences
from .utils import date_parsing
from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous

comment_char = '#'
Expand All @@ -33,12 +30,6 @@
"non_nucleotide",
)

# Human-readable list of the date formats accepted by numeric_date().
# Interpolated into the --min-date/--max-date help text and into the error
# message raised by numeric_date() when parsing fails.
SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")


class FilterException(Exception):
"""Representation of an error that occurred during filtering.
Expand Down Expand Up @@ -309,20 +300,20 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
Strains that pass the filter
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"])
>>> filter_by_date(metadata, min_date=numeric_date("2020-01-02"))
>>> filter_by_date(metadata, min_date=date_parsing.numeric_date("2020-01-02"))
{'strain2'}
>>> filter_by_date(metadata, max_date=numeric_date("2020-01-01"))
>>> filter_by_date(metadata, max_date=date_parsing.numeric_date("2020-01-01"))
{'strain1'}
>>> filter_by_date(metadata, min_date=numeric_date("2020-01-03"), max_date=numeric_date("2020-01-10"))
>>> filter_by_date(metadata, min_date=date_parsing.numeric_date("2020-01-03"), max_date=date_parsing.numeric_date("2020-01-10"))
set()
>>> sorted(filter_by_date(metadata, min_date=numeric_date("2019-12-30"), max_date=numeric_date("2020-01-10")))
>>> sorted(filter_by_date(metadata, min_date=date_parsing.numeric_date("2019-12-30"), max_date=date_parsing.numeric_date("2020-01-10")))
['strain1', 'strain2']
>>> sorted(filter_by_date(metadata))
['strain1', 'strain2']
If the requested date column does not exist, we quietly skip this filter.
>>> sorted(filter_by_date(metadata, date_column="missing_column", min_date=numeric_date("2020-01-02")))
>>> sorted(filter_by_date(metadata, date_column="missing_column", min_date=date_parsing.numeric_date("2020-01-02")))
['strain1', 'strain2']
"""
Expand Down Expand Up @@ -678,7 +669,7 @@ def filter_kwargs_to_str(kwargs):
>>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})]
>>> filter_kwargs_to_str(exclude_by[0][1])
'[["min_length", 27000]]'
>>> exclude_by = [(filter_by_date, {"max_date": numeric_date("2020-04-01"), "min_date": numeric_date("2020-03-01")})]
>>> exclude_by = [(filter_by_date, {"max_date": date_parsing.numeric_date("2020-04-01"), "min_date": date_parsing.numeric_date("2020-03-01")})]
>>> filter_kwargs_to_str(exclude_by[0][1])
'[["max_date", 2020.25], ["min_date", 2020.17]]'
Expand Down Expand Up @@ -732,7 +723,7 @@ def apply_filters(metadata, exclude_by, include_by):
from Africa.
>>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"])
>>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})]
>>> exclude_by = [(filter_by_date, {"min_date": date_parsing.numeric_date("2020-04-01")})]
>>> include_by = [(include_by_include_where, {"include_where": "region=Africa"})]
>>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by)
>>> strains_to_keep
Expand Down Expand Up @@ -1118,8 +1109,8 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--min-date', type=date_parsing.numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {date_parsing.SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=date_parsing.numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {date_parsing.SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
Expand Down Expand Up @@ -1694,50 +1685,6 @@ def _filename_gz(filename):
return filename.lower().endswith(".gz")


def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:

    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    The formats are tried in the order listed above; the first one that parses wins.

    Raises :py:class:`argparse.ArgumentTypeError` when *date* matches none of
    the supported formats.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    fields = date.split("-", 2)
    # Require all three fields up front: datetime.date() raises TypeError (not
    # ValueError) when called with fewer than three arguments, so an input such
    # as "2020-06" would otherwise escape this function with an unhelpful error
    # instead of falling through to the remaining formats.
    if len(fields) == 3:
        try:
            return treetime.utils.numeric_date(datetime.date(*map(int, fields)))
        except ValueError:
            pass

    # date is a duration treated as a backwards-looking relative date
    try:
        duration_str = str(date)
        # The leading 'P' of an ISO 8601 duration is optional for callers.
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, isodate.ISO8601Error):
        pass

    raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")


def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True):
"""Calculate the number of sequences per group for a given maximum number of
sequences to be returned and the number of sequences in each requested
Expand Down
65 changes: 65 additions & 0 deletions augur/util_support/date_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import datetime
from textwrap import dedent
import isodate
import treetime.utils

# Human-readable list of the date formats accepted by numeric_date().
# Interpolated into numeric_date()'s error message and reused by callers
# (e.g. augur filter's --min-date/--max-date help text).
SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
""")

def numeric_date(date):
    """
    Converts the given *date* string to a :py:class:`float`.

    *date* may be given as:

    1. A string or float (number) with year as the integer part
    2. A string in the YYYY-MM-DD (ISO 8601) syntax
    3. A string representing a relative date (duration before datetime.date.today())

    The formats are tried in the order listed above; the first one that parses wins.

    Raises :py:class:`ValueError` when *date* matches none of the supported formats.

    >>> numeric_date("2020.42")
    2020.42
    >>> numeric_date("2020-06-04")
    2020.42486...
    >>> import datetime, isodate, treetime
    >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W"))
    True
    """
    # date is numeric
    try:
        return float(date)
    except ValueError:
        pass

    # date is in YYYY-MM-DD form
    fields = date.split("-", 2)
    # Require all three fields up front: datetime.date() raises TypeError (not
    # ValueError) when called with fewer than three arguments, so an input such
    # as "2020-06" would otherwise escape this function as a TypeError instead
    # of falling through to the remaining formats and the ValueError below.
    if len(fields) == 3:
        try:
            return treetime.utils.numeric_date(datetime.date(*map(int, fields)))
        except ValueError:
            pass

    # date is a duration treated as a backwards-looking relative date
    try:
        duration_str = str(date)
        # The leading 'P' of an ISO 8601 duration is optional for callers.
        if not duration_str.startswith('P'):
            duration_str = 'P' + duration_str
        return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str))
    except (ValueError, isodate.ISO8601Error):
        pass

    raise ValueError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""")

def numeric_date_type(date):
    """Argparse-compatible ``type=`` callable built on numeric_date().

    A ValueError raised by numeric_date() is re-raised as an
    argparse.ArgumentTypeError carrying the same message; otherwise the custom
    exception message won't be shown in console output due to:
    https://github.com/python/cpython/blob/5c4d1f6e0e192653560ae2941a6677fbf4fbd1f2/Lib/argparse.py#L2503-L2513
    """
    try:
        parsed = numeric_date(date)
    except ValueError as error:
        raise argparse.ArgumentTypeError(str(error)) from error
    else:
        return parsed
1 change: 1 addition & 0 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from augur.io import open_file

from augur.util_support import date_parsing
from augur.util_support.color_parser import ColorParser
from augur.util_support.date_disambiguator import DateDisambiguator
from augur.util_support.metadata_file import MetadataFile
Expand Down

0 comments on commit 1ae4696

Please sign in to comment.