From aebf81625d4c60d727251d4dfcbbe7598c8ee080 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 12 Apr 2022 10:40:30 -0700 Subject: [PATCH] Move date logic from filter to utils With #740, filter's numeric_date() provides support for 3 date formats. However, supporting various date formats is not specific to filter (e.g. frequencies has a separate numeric_date() which is now out-of-sync with this filter's numeric_date()). This commit: 1. Moves numeric_date() to a new submodule augur.util_support.date_parsing (imported into augur.utils, so it is reachable as augur.utils.date_parsing) 2. Moves the related SUPPORTED_DATE_HELP_TEXT to augur.util_support.date_parsing 3. Updates numeric_date() to raise a ValueError rather than argparse.ArgumentTypeError so it can be generalized to non-argparse usage 4. Adds a new function numeric_date_type() which wraps numeric_date() and raises an argparse.ArgumentTypeError per https://github.com/nextstrain/augur/pull/740#discussion_r844505081 --- augur/filter.py | 74 ++++-------------------------- augur/util_support/date_parsing.py | 65 ++++++++++++++++++++++++++ augur/utils.py | 1 + 3 files changed, 76 insertions(+), 64 deletions(-) create mode 100644 augur/util_support/date_parsing.py diff --git a/augur/filter.py b/augur/filter.py index 0e60c79fc..0cfb2c4e5 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1,13 +1,10 @@ """ Filter and subsample a sequence set. 
""" -import argparse from Bio import SeqIO from collections import defaultdict import csv -import datetime import heapq -import isodate import itertools import json import numpy as np @@ -18,12 +15,11 @@ import re import sys from tempfile import NamedTemporaryFile -import treetime.utils -from textwrap import dedent from typing import Collection from .index import index_sequences, index_vcf from .io import open_file, read_metadata, read_sequences, write_sequences +from .utils import date_parsing from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous comment_char = '#' @@ -33,12 +29,6 @@ "non_nucleotide", ) -SUPPORTED_DATE_HELP_TEXT = dedent("""\ - 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or - 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or - 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W') -""") - class FilterException(Exception): """Representation of an error that occurred during filtering. 
@@ -309,20 +299,20 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): Strains that pass the filter >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) - >>> filter_by_date(metadata, min_date=numeric_date("2020-01-02")) + >>> filter_by_date(metadata, min_date=date_parsing.numeric_date("2020-01-02")) {'strain2'} - >>> filter_by_date(metadata, max_date=numeric_date("2020-01-01")) + >>> filter_by_date(metadata, max_date=date_parsing.numeric_date("2020-01-01")) {'strain1'} - >>> filter_by_date(metadata, min_date=numeric_date("2020-01-03"), max_date=numeric_date("2020-01-10")) + >>> filter_by_date(metadata, min_date=date_parsing.numeric_date("2020-01-03"), max_date=date_parsing.numeric_date("2020-01-10")) set() - >>> sorted(filter_by_date(metadata, min_date=numeric_date("2019-12-30"), max_date=numeric_date("2020-01-10"))) + >>> sorted(filter_by_date(metadata, min_date=date_parsing.numeric_date("2019-12-30"), max_date=date_parsing.numeric_date("2020-01-10"))) ['strain1', 'strain2'] >>> sorted(filter_by_date(metadata)) ['strain1', 'strain2'] If the requested date column does not exist, we quietly skip this filter. 
- >>> sorted(filter_by_date(metadata, date_column="missing_column", min_date=numeric_date("2020-01-02"))) + >>> sorted(filter_by_date(metadata, date_column="missing_column", min_date=date_parsing.numeric_date("2020-01-02"))) ['strain1', 'strain2'] """ @@ -678,7 +668,7 @@ def filter_kwargs_to_str(kwargs): >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})] >>> filter_kwargs_to_str(exclude_by[0][1]) '[["min_length", 27000]]' - >>> exclude_by = [(filter_by_date, {"max_date": numeric_date("2020-04-01"), "min_date": numeric_date("2020-03-01")})] + >>> exclude_by = [(filter_by_date, {"max_date": date_parsing.numeric_date("2020-04-01"), "min_date": date_parsing.numeric_date("2020-03-01")})] >>> filter_kwargs_to_str(exclude_by[0][1]) '[["max_date", 2020.25], ["min_date", 2020.17]]' @@ -732,7 +722,7 @@ def apply_filters(metadata, exclude_by, include_by): from Africa. >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"]) - >>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})] + >>> exclude_by = [(filter_by_date, {"min_date": date_parsing.numeric_date("2020-04-01")})] >>> include_by = [(include_by_include_where, {"include_where": "region=Africa"})] >>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by) >>> strains_to_keep @@ -1118,8 +1108,8 @@ def register_arguments(parser): Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax. 
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")""" ) - metadata_filter_group.add_argument('--min-date', type=numeric_date, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}") - metadata_filter_group.add_argument('--max-date', type=numeric_date, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}") + metadata_filter_group.add_argument('--min-date', type=date_parsing.numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {date_parsing.SUPPORTED_DATE_HELP_TEXT}") + metadata_filter_group.add_argument('--max-date', type=date_parsing.numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {date_parsing.SUPPORTED_DATE_HELP_TEXT}") metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") @@ -1694,50 +1684,6 @@ def _filename_gz(filename): return filename.lower().endswith(".gz") -def numeric_date(date): - """ - Converts the given *date* string to a :py:class:`float`. - - *date* may be given as: - 1. A string or float (number) with year as the integer part - 2. A string in the YYYY-MM-DD (ISO 8601) syntax - 3. A string representing a relative date (duration before datetime.date.today()) - - >>> numeric_date("2020.42") - 2020.42 - >>> numeric_date("2020-06-04") - 2020.42486... 
- >>> import datetime, isodate, treetime - >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W")) - True - """ - # date is numeric - try: - return float(date) - except ValueError: - pass - - # date is in YYYY-MM-DD form - try: - return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2)))) - except ValueError: - pass - - # date is a duration treated as a backwards-looking relative date - try: - # make a copy of date for this block - duration_str = str(date) - if duration_str.startswith('P'): - duration_str = duration_str - else: - duration_str = 'P'+duration_str - return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str)) - except (ValueError, isodate.ISO8601Error): - pass - - raise argparse.ArgumentTypeError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""") - - def calculate_sequences_per_group(target_max_value, counts_per_group, allow_probabilistic=True): """Calculate the number of sequences per group for a given maximum number of sequences to be returned and the number of sequences in each requested diff --git a/augur/util_support/date_parsing.py b/augur/util_support/date_parsing.py new file mode 100644 index 000000000..8cac041c7 --- /dev/null +++ b/augur/util_support/date_parsing.py @@ -0,0 +1,65 @@ +import argparse +import datetime +from textwrap import dedent +import isodate +import treetime.utils + +SUPPORTED_DATE_HELP_TEXT = dedent("""\ + 1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or + 2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or + 3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W') +""") + +def numeric_date(date): + """ + Converts the given *date* string to a :py:class:`float`. + + *date* may be given as: + 1. 
A string or float (number) with year as the integer part + 2. A string in the YYYY-MM-DD (ISO 8601) syntax + 3. A string representing a relative date (duration before datetime.date.today()) + + >>> numeric_date("2020.42") + 2020.42 + >>> numeric_date("2020-06-04") + 2020.42486... + >>> import datetime, isodate, treetime + >>> numeric_date("1W") == treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration("P1W")) + True + """ + # date is numeric + try: + return float(date) + except ValueError: + pass + + # date is in YYYY-MM-DD form + try: + return treetime.utils.numeric_date(datetime.date(*map(int, date.split("-", 2)))) + except ValueError: + pass + + # date is a duration treated as a backwards-looking relative date + try: + # make a copy of date for this block + duration_str = str(date) + if duration_str.startswith('P'): + duration_str = duration_str + else: + duration_str = 'P'+duration_str + return treetime.utils.numeric_date(datetime.date.today() - isodate.parse_duration(duration_str)) + except (ValueError, isodate.ISO8601Error): + pass + + raise ValueError(f"""Unable to determine date from '{date}'. Ensure it is in one of the supported formats:\n{SUPPORTED_DATE_HELP_TEXT}""") + +def numeric_date_type(date): + """Wraps numeric_date() for argparse usage. 
+ + This raises an ArgumentTypeError, otherwise the custom exception message won't be shown in console output due to: + https://github.com/python/cpython/blob/5c4d1f6e0e192653560ae2941a6677fbf4fbd1f2/Lib/argparse.py#L2503-L2513 + """ + try: + return numeric_date(date) + except ValueError as e: + raise argparse.ArgumentTypeError(str(e)) from e diff --git a/augur/utils.py b/augur/utils.py index b2207befe..e6555df51 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -17,6 +17,7 @@ from augur.io import open_file +from augur.util_support import date_parsing from augur.util_support.color_parser import ColorParser from augur.util_support.date_disambiguator import DateDisambiguator from augur.util_support.metadata_file import MetadataFile