Skip to content

Commit

Permalink
filter: Add support for --min/max-date-offset
Browse files Browse the repository at this point in the history
The date offsets are specified by the new arguments `--min-date-offset` and `--max-date-offset` and are parsed into `isodate.Duration` or `datetime.timedelta` objects. The offset values are ignored if a `--min-date`/`--max-date` counterpart is also specified.

Offsets are given as positive values in ISO 8601 duration syntax (the leading `P` designator is optional and is added automatically when omitted),
e.g. `--min-date-offset 1Y2W5D` for 1 year, 2 weeks and 5 days ago, or `--max-date-offset 1D` for yesterday.

This also adds `isodate` as a package dependency, which is used to parse the duration strings.
  • Loading branch information
benjaminotter authored and victorlin committed Mar 29, 2022
1 parent 5130fbd commit e303c25
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 5 deletions.
33 changes: 29 additions & 4 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import csv
import datetime
import heapq
import isodate
import itertools
import json
import numpy as np
Expand Down Expand Up @@ -280,8 +281,10 @@ def filter_by_ambiguous_date(metadata, date_column="date", ambiguity="any"):
return filtered


def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
"""Filter metadata by minimum or maximum date.
def filter_by_date(metadata, date_column="date", min_date=None, max_date=None, min_date_offset=None, max_date_offset=None):
"""Filter metadata by minimum/maximum dates as an absolute date or relative (offset) date.
Absolute dates specified by min_date or max_date take precedence over offset counterparts.
Parameters
----------
Expand All @@ -293,6 +296,10 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
Minimum date
max_date : float
Maximum date
min_date_offset : str
Minimum date offset
max_date_offset : str
Maximum date offset
Returns
-------
Expand Down Expand Up @@ -328,9 +335,23 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):

if min_date:
filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s]) >= min_date}
elif min_date_offset:
if min_date_offset.startswith('P'):
min_date_offset = min_date_offset
else:
min_date_offset = 'P'+min_date_offset
min_date = numeric_date((datetime.date.today() - isodate.parse_duration(min_date_offset)).strftime('%Y-%m-%d'))
filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s]) >= min_date}

if max_date:
filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s]) <= max_date}
elif max_date_offset:
if max_date_offset.startswith('P'):
max_date_offset = max_date_offset
else:
max_date_offset = 'P'+max_date_offset
max_date = numeric_date((datetime.date.today() - isodate.parse_duration(max_date_offset)).strftime('%Y-%m-%d'))
filtered = {s for s in filtered if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s]) <= max_date}

return filtered

Expand Down Expand Up @@ -604,14 +625,16 @@ def construct_filters(args, sequence_index):
}
))

# Filter by date.
if args.min_date or args.max_date:
# Filter by date or date offset.
if any((args.min_date, args.max_date, args.min_date_offset, args.max_date_offset)):
exclude_by.append((
filter_by_date,
{
"date_column": "date",
"min_date": args.min_date,
"max_date": args.max_date,
"min_date_offset": args.min_date_offset,
"max_date_offset": args.max_date_offset,
}
))

Expand Down Expand Up @@ -1110,6 +1133,8 @@ def register_arguments(parser):
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--min-date-offset', type=str, help="date offset for minimal cutoff of date following the ISO-8601 syntax for durations (e.g. \"[n]Y[n]W[n]D\"), cutoff is ignored when --min-date is specified)")
metadata_filter_group.add_argument('--max-date-offset', type=str, help="date offset for maximal cutoff of date following the ISO-8601 syntax for durations (e.g. \"[n]Y[n]W[n]D\"), cutoff is ignored when --max-date is specified)")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
"packaging >=19.2",
"pandas >=1.0.0, ==1.*",
"phylo-treetime ==0.8.*",
"xopen >=1.0.1, ==1.*"
"xopen >=1.0.1, ==1.*",
"isodate ==0.6.*"
],
extras_require = {
'full': [
Expand Down

0 comments on commit e303c25

Please sign in to comment.