
Commit 3fd79ac

time coding refactor (#9906)

Authored by kmuehlbauer, spencerkclark, and pre-commit-ci[bot]
Co-authored-by: Spencer Clark <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

1 parent dc03b80, commit 3fd79ac

File tree

4 files changed: +109, -42 lines changed


doc/whats-new.rst (+2)

@@ -77,6 +77,8 @@ Internal Changes
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 - Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


 .. _whats-new.2024.11.0:

xarray/coding/times.py (+89, -40)
@@ -24,7 +24,7 @@
 from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
 from xarray.core.duck_array_ops import asarray, ravel, reshape
 from xarray.core.formatting import first_n_items, format_timestamp, last_item
-from xarray.core.pdcompat import nanosecond_precision_timestamp
+from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit
 from xarray.core.utils import attempt_import, emit_user_level_warning
 from xarray.core.variable import Variable
 from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
@@ -36,7 +36,11 @@
 except ImportError:
     cftime = None

-from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
+from xarray.core.types import (
+    CFCalendar,
+    NPDatetimeUnitOptions,
+    T_DuckArray,
+)

 T_Name = Union[Hashable, None]

@@ -259,18 +263,26 @@ def _parse_iso8601(date_type, timestr):
     return default.replace(**replace), resolution


-def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
+def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
+    # If the ref_date Timestamp is timezone-aware, convert to UTC and
+    # make it timezone-naive (GH 2649).
+    if date.tz is not None:
+        return date.tz_convert("UTC").tz_convert(None)
+    return date
+
+
+def _unpack_time_unit_and_ref_date(
+    units: str,
+) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
     # same as _unpack_netcdf_time_units but finalizes ref_date for
     # processing in encode_cf_datetime
-    time_units, _ref_date = _unpack_netcdf_time_units(units)
+    time_unit, _ref_date = _unpack_netcdf_time_units(units)
+    time_unit = _netcdf_to_numpy_timeunit(time_unit)
     # TODO: the strict enforcement of nanosecond precision Timestamps can be
     # relaxed when addressing GitHub issue #7493.
     ref_date = nanosecond_precision_timestamp(_ref_date)
-    # If the ref_date Timestamp is timezone-aware, convert to UTC and
-    # make it timezone-naive (GH 2649).
-    if ref_date.tz is not None:
-        ref_date = ref_date.tz_convert(None)
-    return time_units, ref_date
+    ref_date = _maybe_strip_tz_from_timestamp(ref_date)
+    return time_unit, ref_date


 def _decode_cf_datetime_dtype(
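
For example, a timezone-aware reference date in a CF units string is normalized to a UTC-based, timezone-naive Timestamp. A minimal standalone sketch of that normalization (illustrative only; the refactored code applies it to the parsed reference date inside _unpack_time_unit_and_ref_date):

import pandas as pd

# e.g. from units "seconds since 2000-01-01T00:00:00+02:00"
ref_date = pd.Timestamp("2000-01-01 00:00:00+02:00")
if ref_date.tz is not None:
    # convert to UTC, then drop the timezone info (GH 2649)
    ref_date = ref_date.tz_convert("UTC").tz_convert(None)
print(ref_date)  # 1999-12-31 22:00:00, timezone-naive
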
@@ -317,6 +329,30 @@ def _decode_datetime_with_cftime(
         return np.array([], dtype=object)


+def _check_date_for_units_since_refdate(
+    date, unit: str, ref_date: pd.Timestamp
+) -> pd.Timestamp:
+    # check for out-of-bounds floats and raise
+    if date > np.iinfo("int64").max or date < np.iinfo("int64").min:
+        raise OutOfBoundsTimedelta(
+            f"Value {date} can't be represented as Datetime/Timedelta."
+        )
+    delta = date * np.timedelta64(1, unit)
+    if not np.isnan(delta):
+        # this will raise on dtype overflow for integer dtypes
+        if date.dtype.kind in "u" and not np.int64(delta) == date:
+            raise OutOfBoundsTimedelta(
+                "DType overflow in Datetime/Timedelta calculation."
+            )
+        # this will raise on overflow if ref_date + delta
+        # can't be represented in the current ref_date resolution
+        return timestamp_as_unit(ref_date + delta, ref_date.unit)
+    else:
+        # if date is exactly NaT (np.iinfo("int64").min) return NaT
+        # to make follow-up checks work
+        return pd.Timestamp("NaT")
+
+
 def _decode_datetime_with_pandas(
     flat_num_dates: np.ndarray, units: str, calendar: str
 ) -> np.ndarray:
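
To see what the new bounds check guards against, here is a small standalone sketch (not part of the commit; the 95795-day value mirrors the updated test at the bottom of this diff, and Timestamp.as_unit assumes a pandas version that provides it):

import numpy as np
import pandas as pd

ref_date = pd.Timestamp("2000-01-01").as_unit("ns")  # nanosecond-resolution reference date
delta = np.int64(95795) * np.timedelta64(1, "D")     # ordinal for "days since 2000-01-01"
try:
    # 2000-01-01 + 95795 days lands just past the datetime64[ns] upper bound (2262-04-11)
    print(ref_date + delta)
except (OverflowError, pd.errors.OutOfBoundsDatetime, pd.errors.OutOfBoundsTimedelta) as err:
    print(f"not representable at nanosecond resolution: {err}")
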
@@ -335,12 +371,8 @@ def _decode_datetime_with_pandas(
     elif flat_num_dates.dtype.kind == "u":
         flat_num_dates = flat_num_dates.astype(np.uint64)

-    time_units, ref_date_str = _unpack_netcdf_time_units(units)
-    time_units = _netcdf_to_numpy_timeunit(time_units)
     try:
-        # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        ref_date = nanosecond_precision_timestamp(ref_date_str)
+        time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
     except ValueError as err:
         # ValueError is raised by pd.Timestamp for non-ISO timestamp
         # strings, in which case we fall back to using cftime
@@ -350,8 +382,12 @@
         warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
         if flat_num_dates.size > 0:
             # avoid size 0 datetimes GH1329
-            pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
-            pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
+            _check_date_for_units_since_refdate(
+                flat_num_dates.min(), time_unit, ref_date
+            )
+            _check_date_for_units_since_refdate(
+                flat_num_dates.max(), time_unit, ref_date
+            )

     # To avoid integer overflow when converting to nanosecond units for integer
     # dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -364,20 +400,24 @@
     elif flat_num_dates.dtype.kind in "f":
         flat_num_dates = flat_num_dates.astype(np.float64)

-    # Cast input ordinals to integers of nanoseconds because pd.to_timedelta
-    # works much faster when dealing with integers (GH 1399).
-    # properly handle NaN/NaT to prevent casting NaN to int
+    # keep NaT/nan mask
     nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
-    flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
-    flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
-    flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
-    flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
+    # in case we need to change the unit, we fix the numbers here
+    # this should be safe, as errors would have been raised above
+    ns_time_unit = _NS_PER_TIME_DELTA[time_unit]
+    ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit]
+    if ns_time_unit > ns_ref_date_unit:
+        flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit)
+        time_unit = ref_date.unit

-    # Use pd.to_timedelta to safely cast integer values to timedeltas,
-    # and add those to a Timestamp to safely produce a DatetimeIndex. This
-    # ensures that we do not encounter integer overflow at any point in the
-    # process without raising OutOfBoundsDatetime.
-    return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
+    # Cast input ordinals to integers and properly handle NaN/NaT
+    # to prevent casting NaN to int
+    flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
+    flat_num_dates_int[nan] = np.iinfo(np.int64).min
+    flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)
+
+    # cast to timedelta64[time_unit] and add to ref_date
+    return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]")


 def decode_cf_datetime(
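
The decode path now adds the integer ordinals, cast to the units' own timedelta64 resolution, directly to the reference Timestamp instead of first converting everything to nanoseconds. A rough illustration of the idea with hypothetical values (assumes a pandas version where Timestamp.as_unit is available and non-nanosecond units are supported):

import numpy as np
import pandas as pd

ref_date = pd.Timestamp("2000-01-01").as_unit("s")           # second-resolution reference date
num_dates = np.array([0, 60, 3600, np.iinfo(np.int64).min])  # ordinals; the last value is the NaT sentinel
# e.g. "seconds since 2000-01-01": stay at second resolution rather than blowing up to nanoseconds
print(ref_date + num_dates.astype("timedelta64[s]"))
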
@@ -409,11 +449,15 @@
         dates = _decode_datetime_with_cftime(
             flat_num_dates.astype(float), units, calendar
         )
-
-        if (
-            dates[np.nanargmin(num_dates)].year < 1678
-            or dates[np.nanargmax(num_dates)].year >= 2262
-        ):
+        # retrieve cftype
+        dates_min = dates[np.nanargmin(num_dates)]
+        cftype = type(dates_min)
+        # "ns" borders
+        # between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807']
+        lower = cftype(1677, 9, 21, 0, 12, 43, 145224)
+        upper = cftype(2262, 4, 11, 23, 47, 16, 854775)
+
+        if dates_min < lower or dates[np.nanargmax(num_dates)] > upper:
             if _is_standard_calendar(calendar):
                 warnings.warn(
                     "Unable to decode time axis into full "
@@ -833,8 +877,8 @@ def _eagerly_encode_cf_datetime(
             raise OutOfBoundsDatetime
         assert dates.dtype == "datetime64[ns]"

-        time_units, ref_date = _unpack_time_units_and_ref_date(units)
-        time_delta = _time_units_to_timedelta64(time_units)
+        time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
+        time_delta = np.timedelta64(1, time_unit)

         # Wrap the dates in a DatetimeIndex to do the subtraction to ensure
         # an OverflowError is raised if the ref_date is too far away from
@@ -843,16 +887,17 @@
         time_deltas = dates_as_index - ref_date

         # retrieve needed units to faithfully encode to int64
-        needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
+        needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
+        needed_units = _numpy_to_netcdf_timeunit(needed_unit)
         if data_units != units:
             # this accounts for differences in the reference times
             ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
-            data_delta = _time_units_to_timedelta64(needed_units)
+            data_delta = np.timedelta64(1, needed_unit)
             if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
                 needed_units = _infer_time_units_from_diff(ref_delta)

         # needed time delta to encode faithfully to int64
-        needed_time_delta = _time_units_to_timedelta64(needed_units)
+        needed_time_delta = _unit_timedelta_numpy(needed_units)

         floor_division = np.issubdtype(dtype, np.integer) or dtype is None
         if time_delta > needed_time_delta:
@@ -865,6 +910,7 @@ def _eagerly_encode_cf_datetime(
                 f"Set encoding['dtype'] to floating point dtype to silence this warning."
             )
         elif np.issubdtype(dtype, np.integer) and allow_units_modification:
+            floor_division = True
             new_units = f"{needed_units} since {format_timestamp(ref_date)}"
             emit_user_level_warning(
                 f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
@@ -874,9 +920,12 @@
             )
             units = new_units
             time_delta = needed_time_delta
-            floor_division = True

-        num = _division(time_deltas, time_delta, floor_division)
+        # get resolution of TimedeltaIndex and align time_delta
+        # todo: check, if this works in any case
+        num = _division(
+            time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
+        )
         num = reshape(num.values, dates.shape)

     except (OutOfBoundsDatetime, OverflowError, ValueError):
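
On the encoding side, the division step now aligns the unit of the scalar time_delta with the resolution of the TimedeltaIndex before dividing. A standalone sketch of that alignment (hypothetical values, assuming a pandas version where TimedeltaIndex exposes .unit and .as_unit):

import numpy as np
import pandas as pd

time_deltas = pd.to_timedelta(["1 days", "2 days"]).as_unit("s")  # second-resolution offsets from ref_date
time_delta = np.timedelta64(1, "h")                               # step implied by e.g. "hours since ..."
aligned = time_delta.astype(f"=m8[{time_deltas.unit}]")           # numpy.timedelta64(3600, 's')
print(time_deltas // aligned)                                     # Index([24, 48], dtype='int64')
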

xarray/core/pdcompat.py (+16)
@@ -41,6 +41,8 @@
 import pandas as pd
 from packaging.version import Version

+from xarray.core.types import PDDatetimeUnitOptions
+

 def count_not_none(*args) -> int:
     """Compute the number of non-None arguments.
@@ -73,6 +75,20 @@ def __repr__(self) -> str:
 NoDefault = Literal[_NoDefault.no_default]  # For typing following pandas


+def timestamp_as_unit(date: pd.Timestamp, unit: PDDatetimeUnitOptions) -> pd.Timestamp:
+    """Convert the underlying int64 representation to the given unit.
+
+    Compatibility function for pandas issue where "as_unit" is not defined
+    for pandas.Timestamp in pandas versions < 2.2. Can be removed once the
+    minimum pandas version is >= 2.2.
+    """
+    if hasattr(date, "as_unit"):
+        date = date.as_unit(unit)
+    elif hasattr(date, "_as_unit"):
+        date = date._as_unit(unit)
+    return date
+
+
 def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
     """Return a nanosecond-precision Timestamp object.

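
A short usage sketch of the new compat helper (assumes an xarray checkout containing this commit; on recent pandas it simply forwards to Timestamp.as_unit):

import pandas as pd
from xarray.core.pdcompat import timestamp_as_unit

ts = pd.Timestamp("2000-01-01 12:34:56.789")  # recent pandas may store this at a coarser unit, e.g. "ms"
print(ts.unit)
ts_ns = timestamp_as_unit(ts, "ns")           # only the underlying int64 resolution changes
print(ts_ns.unit, ts_ns == ts)                # "ns" True
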

xarray/tests/test_coding_times.py (+2, -2)

@@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None:
     units = "days since 2000-01-01 00:00:00"

     # date after 2262 and before 1678
-    days = (-117608, 95795)
-    expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
+    days = (-117710, 95795)
+    expected = (datetime(1677, 9, 20), datetime(2262, 4, 12))

     for i, day in enumerate(days):
         with warnings.catch_warnings():
