Skip to content

Commit b14080e

Browse files
kmuehlbauer, dcherian, spencerkclark, headtr1ck, pre-commit-ci[bot]
authored
Enhance and move ISO-8601 parser to coding.times (pydata#9899)
Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: Spencer Clark <[email protected]> Co-authored-by: Michael Niklas <[email protected]> Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Michael Niklas <[email protected]>
1 parent 33bf5e8 commit b14080e

File tree

7 files changed

+161
-87
lines changed

7 files changed

+161
-87
lines changed

doc/whats-new.rst

+3
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ Internal Changes
7575
within ``as_compatible_data``. This is consistent with how lists of these objects
7676
will be converted (:pull:`9900`).
7777
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
78+
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), and add the capability to parse negative and/or five-digit years (:pull:`9899`).
79+
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
80+
7881

7982
.. _whats-new.2024.11.0:
8083

properties/test_encode_decode.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,22 @@
55
66
"""
77

8+
import warnings
9+
810
import pytest
911

1012
pytest.importorskip("hypothesis")
1113
# isort: split
1214

1315
import hypothesis.extra.numpy as npst
16+
import hypothesis.strategies as st
1417
import numpy as np
1518
from hypothesis import given
1619

1720
import xarray as xr
18-
from xarray.testing.strategies import variables
21+
from xarray.coding.times import _parse_iso8601
22+
from xarray.testing.strategies import CFTimeStrategyISO8601, variables
23+
from xarray.tests import requires_cftime
1924

2025

2126
@pytest.mark.slow
@@ -43,3 +48,13 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
4348
coder = xr.coding.variables.CFScaleOffsetCoder()
4449
roundtripped = coder.decode(coder.encode(original))
4550
xr.testing.assert_identical(original, roundtripped)
51+
52+
53+
@requires_cftime
54+
@given(dt=st.datetimes() | CFTimeStrategyISO8601())
55+
def test_iso8601_decode(dt):
56+
iso = dt.isoformat()
57+
with warnings.catch_warnings():
58+
warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
59+
parsed, _ = _parse_iso8601(type(dt), iso)
60+
assert dt == parsed

xarray/coding/cftime_offsets.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,10 @@
5353
import pandas as pd
5454
from packaging.version import Version
5555

56-
from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
56+
from xarray.coding.cftimeindex import CFTimeIndex
5757
from xarray.coding.times import (
5858
_is_standard_calendar,
59+
_parse_iso8601,
5960
_should_cftime_be_used,
6061
convert_time_or_go_back,
6162
format_cftime_datetime,
@@ -843,7 +844,7 @@ def to_cftime_datetime(date_str_or_date, calendar=None):
843844
"If converting a string to a cftime.datetime object, "
844845
"a calendar type must be provided"
845846
)
846-
date, _ = _parse_iso8601_with_reso(get_date_type(calendar), date_str_or_date)
847+
date, _ = _parse_iso8601(get_date_type(calendar), date_str_or_date)
847848
return date
848849
elif isinstance(date_str_or_date, cftime.datetime):
849850
return date_str_or_date

xarray/coding/cftimeindex.py

+6-76
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
from __future__ import annotations
4343

4444
import math
45-
import re
4645
import warnings
4746
from datetime import timedelta
4847
from typing import TYPE_CHECKING, Any
@@ -53,6 +52,7 @@
5352

5453
from xarray.coding.times import (
5554
_STANDARD_CALENDARS,
55+
_parse_iso8601,
5656
cftime_to_nptime,
5757
infer_calendar_name,
5858
)
@@ -78,71 +78,6 @@
7878
OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)
7979

8080

81-
def named(name, pattern):
82-
return "(?P<" + name + ">" + pattern + ")"
83-
84-
85-
def optional(x):
86-
return "(?:" + x + ")?"
87-
88-
89-
def trailing_optional(xs):
90-
if not xs:
91-
return ""
92-
return xs[0] + optional(trailing_optional(xs[1:]))
93-
94-
95-
def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"."):
96-
pieces = [
97-
(None, "year", r"\d{4}"),
98-
(date_sep, "month", r"\d{2}"),
99-
(date_sep, "day", r"\d{2}"),
100-
(datetime_sep, "hour", r"\d{2}"),
101-
(time_sep, "minute", r"\d{2}"),
102-
(time_sep, "second", r"\d{2}"),
103-
(micro_sep, "microsecond", r"\d{1,6}"),
104-
]
105-
pattern_list = []
106-
for sep, name, sub_pattern in pieces:
107-
pattern_list.append((sep if sep else "") + named(name, sub_pattern))
108-
# TODO: allow timezone offsets?
109-
return "^" + trailing_optional(pattern_list) + "$"
110-
111-
112-
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
113-
_EXTENDED_PATTERN = build_pattern()
114-
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
115-
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]
116-
117-
118-
def parse_iso8601_like(datetime_string):
119-
for pattern in _PATTERNS:
120-
match = re.match(pattern, datetime_string)
121-
if match:
122-
return match.groupdict()
123-
raise ValueError(
124-
f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
125-
)
126-
127-
128-
def _parse_iso8601_with_reso(date_type, timestr):
129-
_ = attempt_import("cftime")
130-
131-
default = date_type(1, 1, 1)
132-
result = parse_iso8601_like(timestr)
133-
replace = {}
134-
135-
for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]:
136-
value = result.get(attr, None)
137-
if value is not None:
138-
if attr == "microsecond":
139-
# convert match string into valid microsecond value
140-
value = 10 ** (6 - len(value)) * int(value)
141-
replace[attr] = int(value)
142-
resolution = attr
143-
return default.replace(**replace), resolution
144-
145-
14681
def _parsed_string_to_bounds(date_type, resolution, parsed):
14782
"""Generalization of
14883
pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
@@ -436,7 +371,7 @@ def _partial_date_slice(self, resolution, parsed):
436371

437372
def _get_string_slice(self, key):
438373
"""Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice"""
439-
parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
374+
parsed, resolution = _parse_iso8601(self.date_type, key)
440375
try:
441376
loc = self._partial_date_slice(resolution, parsed)
442377
except KeyError as err:
@@ -483,7 +418,7 @@ def _maybe_cast_slice_bound(self, label, side):
483418
if not isinstance(label, str):
484419
return label
485420

486-
parsed, resolution = _parse_iso8601_with_reso(self.date_type, label)
421+
parsed, resolution = _parse_iso8601(self.date_type, label)
487422
start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed)
488423
if self.is_monotonic_decreasing and len(self) > 1:
489424
return end if side == "left" else start
@@ -811,11 +746,6 @@ def is_leap_year(self):
811746
return func(self.year, calendar=self.calendar)
812747

813748

814-
def _parse_iso8601_without_reso(date_type, datetime_str):
815-
date, _ = _parse_iso8601_with_reso(date_type, datetime_str)
816-
return date
817-
818-
819749
def _parse_array_of_cftime_strings(strings, date_type):
820750
"""Create a numpy array from an array of strings.
821751
@@ -833,9 +763,9 @@ def _parse_array_of_cftime_strings(strings, date_type):
833763
-------
834764
np.array
835765
"""
836-
return np.array(
837-
[_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
838-
).reshape(strings.shape)
766+
return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape(
767+
strings.shape
768+
)
839769

840770

841771
def _contains_datetime_timedeltas(array):

xarray/coding/times.py

+70
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
189189
return delta_units, ref_date
190190

191191

192+
def named(name: str, pattern: str) -> str:
193+
return "(?P<" + name + ">" + pattern + ")"
194+
195+
196+
def optional(x: str) -> str:
197+
return "(?:" + x + ")?"
198+
199+
200+
def trailing_optional(xs: list[str]) -> str:
201+
if not xs:
202+
return ""
203+
return xs[0] + optional(trailing_optional(xs[1:]))
204+
205+
206+
def build_pattern(
207+
date_sep: str = r"\-",
208+
datetime_sep: str = r"T",
209+
time_sep: str = r"\:",
210+
micro_sep: str = r".",
211+
) -> str:
212+
pieces = [
213+
(None, "year", r"[+-]?\d{4,5}"),
214+
(date_sep, "month", r"\d{2}"),
215+
(date_sep, "day", r"\d{2}"),
216+
(datetime_sep, "hour", r"\d{2}"),
217+
(time_sep, "minute", r"\d{2}"),
218+
(time_sep, "second", r"\d{2}"),
219+
(micro_sep, "microsecond", r"\d{1,6}"),
220+
]
221+
pattern_list = []
222+
for sep, name, sub_pattern in pieces:
223+
pattern_list.append((sep if sep else "") + named(name, sub_pattern))
224+
# TODO: allow timezone offsets?
225+
return "^" + trailing_optional(pattern_list) + "$"
226+
227+
228+
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
229+
_EXTENDED_PATTERN = build_pattern()
230+
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
231+
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]
232+
233+
234+
def parse_iso8601_like(datetime_string: str) -> dict[str, str | None]:
235+
for pattern in _PATTERNS:
236+
match = re.match(pattern, datetime_string)
237+
if match:
238+
return match.groupdict()
239+
raise ValueError(
240+
f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
241+
)
242+
243+
244+
def _parse_iso8601(date_type, timestr):
245+
default = date_type(1, 1, 1)
246+
result = parse_iso8601_like(timestr)
247+
replace = {}
248+
249+
for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]:
250+
value = result.get(attr, None)
251+
if value is not None:
252+
resolution = attr
253+
if attr == "microsecond":
254+
if len(value) <= 3:
255+
resolution = "millisecond"
256+
# convert match string into valid microsecond value
257+
value = 10 ** (6 - len(value)) * int(value)
258+
replace[attr] = int(value)
259+
return default.replace(**replace), resolution
260+
261+
192262
def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
193263
# same as _unpack_netcdf_time_units but finalizes ref_date for
194264
# processing in encode_cf_datetime

xarray/testing/strategies.py

+35
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import datetime
2+
import warnings
13
from collections.abc import Hashable, Iterable, Mapping, Sequence
24
from typing import TYPE_CHECKING, Any, Protocol, overload
35

@@ -473,3 +475,36 @@ def unique_subset_of(
473475
return (
474476
{k: objs[k] for k in subset_keys} if isinstance(objs, Mapping) else subset_keys
475477
)
478+
479+
480+
class CFTimeStategy(st.SearchStrategy):
481+
def __init__(self, min_value, max_value):
482+
self.min_value = min_value
483+
self.max_value = max_value
484+
485+
def do_draw(self, data):
486+
unit_microsecond = datetime.timedelta(microseconds=1)
487+
timespan_microseconds = (self.max_value - self.min_value) // unit_microsecond
488+
result = data.draw_integer(0, timespan_microseconds)
489+
with warnings.catch_warnings():
490+
warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
491+
return self.min_value + datetime.timedelta(microseconds=result)
492+
493+
494+
class CFTimeStrategyISO8601(st.SearchStrategy):
495+
def __init__(self):
496+
from xarray.tests.test_coding_times import _all_cftime_date_types
497+
498+
self.date_types = _all_cftime_date_types()
499+
self.calendars = list(self.date_types)
500+
501+
def do_draw(self, data):
502+
calendar = data.draw(st.sampled_from(self.calendars))
503+
date_type = self.date_types[calendar]
504+
with warnings.catch_warnings():
505+
warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
506+
daysinmonth = date_type(99999, 12, 1).daysinmonth
507+
min_value = date_type(-99999, 1, 1)
508+
max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999)
509+
strategy = CFTimeStategy(min_value, max_value)
510+
return strategy.do_draw(data)

xarray/tests/test_cftimeindex.py

+28-8
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
from xarray.coding.cftimeindex import (
1313
CFTimeIndex,
1414
_parse_array_of_cftime_strings,
15-
_parse_iso8601_with_reso,
1615
_parsed_string_to_bounds,
1716
assert_all_valid_date_type,
17+
)
18+
from xarray.coding.times import (
19+
_parse_iso8601,
1820
parse_iso8601_like,
1921
)
2022
from xarray.tests import (
@@ -132,16 +134,34 @@ def date_dict(
132134
list(ISO8601_LIKE_STRING_TESTS.values()),
133135
ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
134136
)
135-
def test_parse_iso8601_like(string, expected):
136-
result = parse_iso8601_like(string)
137+
@pytest.mark.parametrize(
138+
"five_digit_year", [False, True], ids=["four-digit-year", "five-digit-year"]
139+
)
140+
@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
141+
def test_parse_iso8601_like(
142+
five_digit_year: bool, sign: str, string: str, expected: dict
143+
) -> None:
144+
pre = "1" if five_digit_year else ""
145+
datestring = sign + pre + string
146+
result = parse_iso8601_like(datestring)
147+
expected = expected.copy()
148+
expected.update(year=sign + pre + expected["year"])
137149
assert result == expected
138150

139-
if result["microsecond"] is None:
151+
# check malformed single digit addendum
152+
# this check is only performed when we have at least "hour" given
153+
# like "1999010101", where a single added digit should raise
154+
# for "1999" (year), "199901" (month) and "19990101" (day)
155+
# with a single added digit the string would just be interpreted
156+
# as having a 5-digit year.
157+
if result["microsecond"] is None and result["hour"] is not None:
140158
with pytest.raises(ValueError):
141-
parse_iso8601_like(string + "3")
142-
if result["second"] is None:
159+
parse_iso8601_like(datestring + "3")
160+
161+
# check malformed floating point addendum
162+
if result["second"] is None or result["microsecond"] is not None:
143163
with pytest.raises(ValueError):
144-
parse_iso8601_like(string + ".3")
164+
parse_iso8601_like(datestring + ".3")
145165

146166

147167
_CFTIME_CALENDARS = [
@@ -348,7 +368,7 @@ def test_cftimeindex_days_in_month_accessor(index):
348368
def test_parse_iso8601_with_reso(date_type, string, date_args, reso):
349369
expected_date = date_type(*date_args)
350370
expected_reso = reso
351-
result_date, result_reso = _parse_iso8601_with_reso(date_type, string)
371+
result_date, result_reso = _parse_iso8601(date_type, string)
352372
assert result_date == expected_date
353373
assert result_reso == expected_reso
354374

0 commit comments

Comments
 (0)