From e303c254c80ffc9ba3c0390b58731905110fe4b1 Mon Sep 17 00:00:00 2001 From: Benjamin Otter Date: Thu, 24 Jun 2021 12:52:40 +0200 Subject: [PATCH] filter: Add support for --min/max-date-offset The date offsets are specified by the new arguments `--min-date-offset` and `--max-date-offset` and are parsed into `isodate.Duration` or `datetime.timedelta` objects. The offset values are ignored if a `--min-date`/`--max-date` counterpart is also specified. Offsets are given as positive values in ISO 8601 duration syntax (the leading `P` designator is optional and is added automatically when missing), e.g. `--min-date-offset 1Y2W5D` for 1 year, 2 weeks and 5 days ago or `--max-date-offset 1D` for yesterday. This also adds the package dependency `isodate` to parse the duration strings. --- augur/filter.py | 33 +++++++++++++++++++++++++++++---- setup.py | 3 ++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index 58850b2ef..2585ce162 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -6,6 +6,7 @@ import csv import datetime import heapq +import isodate import itertools import json import numpy as np @@ -280,8 +281,10 @@ def filter_by_ambiguous_date(metadata, date_column="date", ambiguity="any"): return filtered -def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): - """Filter metadata by minimum or maximum date. +def filter_by_date(metadata, date_column="date", min_date=None, max_date=None, min_date_offset=None, max_date_offset=None): + """Filter metadata by minimum/maximum dates as an absolute date or relative (offset) date. + + Absolute dates specified by min_date or max_date take precedence over offset counterparts. 
Parameters ---------- @@ -293,6 +296,10 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): Minimum date max_date : float Maximum date + min_date_offset : str + Minimum date offset + max_date_offset : str + Maximum date offset Returns ------- @@ -328,9 +335,23 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): if min_date: filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s]) >= min_date} + elif min_date_offset: + if min_date_offset.startswith('P'): + min_date_offset = min_date_offset + else: + min_date_offset = 'P'+min_date_offset + min_date = numeric_date((datetime.date.today() - isodate.parse_duration(min_date_offset)).strftime('%Y-%m-%d')) + filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s]) >= min_date} if max_date: filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s]) <= max_date} + elif max_date_offset: + if max_date_offset.startswith('P'): + max_date_offset = max_date_offset + else: + max_date_offset = 'P'+max_date_offset + max_date = numeric_date((datetime.date.today() - isodate.parse_duration(max_date_offset)).strftime('%Y-%m-%d')) + filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s]) <= max_date} return filtered @@ -604,14 +625,16 @@ def construct_filters(args, sequence_index): } )) - # Filter by date. - if args.min_date or args.max_date: + # Filter by date or date offset. 
+ if any((args.min_date, args.max_date, args.min_date_offset, args.max_date_offset)): exclude_by.append(( filter_by_date, { "date_column": "date", "min_date": args.min_date, "max_date": args.max_date, + "min_date_offset": args.min_date_offset, + "max_date_offset": args.max_date_offset, } )) @@ -1110,6 +1133,8 @@ def register_arguments(parser): ) metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD") + metadata_filter_group.add_argument('--min-date-offset', type=str, help="date offset for minimal cutoff of date following the ISO-8601 syntax for durations (e.g. \"[n]Y[n]W[n]D\"), cutoff is ignored when --min-date is specified)") + metadata_filter_group.add_argument('--max-date-offset', type=str, help="date offset for maximal cutoff of date following the ISO-8601 syntax for durations (e.g. \"[n]Y[n]W[n]D\"), cutoff is ignored when --max-date is specified)") metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). 
Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").') metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude") diff --git a/setup.py b/setup.py index 907168ae7..923963662 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,8 @@ "packaging >=19.2", "pandas >=1.0.0, ==1.*", "phylo-treetime ==0.8.*", - "xopen >=1.0.1, ==1.*" + "xopen >=1.0.1, ==1.*", + "isodate ==0.6.*" ], extras_require = { 'full': [