Skip to content

Commit d4ad1e9

Browse files
committed (author metadata not captured in this extraction)
Use resolution-dependent default units for lazy time encoding
1 parent c252152 commit d4ad1e9

File tree

3 files changed

+29
-10
lines changed

3 files changed

+29
-10
lines changed

doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ v2025.02.0 (unreleased)
2121

2222
New Features
2323
~~~~~~~~~~~~
24+
- Default to resolution-dependent optimal integer encoding units when saving
25+
chunked non-nanosecond :py:class:`numpy.datetime64` or
26+
:py:class:`numpy.timedelta64` arrays to disk. Previously units of
27+
"nanoseconds" were chosen by default, which are optimal for
28+
nanosecond-resolution times, but not for times with coarser resolution. By
29+
`Spencer Clark <https://github.com/spencerkclark>`_ (:pull:`10017`).
2430

2531

2632
Breaking changes

xarray/coding/times.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,11 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str:
153153
}[units]
154154

155155

156+
def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str:
157+
unit, _ = np.datetime_data(dtype)
158+
return _numpy_to_netcdf_timeunit(unit)
159+
160+
156161
def _ensure_padded_year(ref_date: str) -> str:
157162
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
158163
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
@@ -1143,7 +1148,8 @@ def _lazily_encode_cf_datetime(
11431148
units = "microseconds since 1970-01-01"
11441149
dtype = np.dtype("int64")
11451150
else:
1146-
units = "nanoseconds since 1970-01-01"
1151+
netcdf_unit = _numpy_dtype_to_netcdf_timeunit(dates.dtype)
1152+
units = f"{netcdf_unit} since 1970-01-01"
11471153
dtype = np.dtype("int64")
11481154

11491155
if units is None or dtype is None:
@@ -1249,7 +1255,7 @@ def _lazily_encode_cf_timedelta(
12491255
timedeltas: T_ChunkedArray, units: str | None = None, dtype: np.dtype | None = None
12501256
) -> tuple[T_ChunkedArray, str]:
12511257
if units is None and dtype is None:
1252-
units = "nanoseconds"
1258+
units = _numpy_dtype_to_netcdf_timeunit(timedeltas.dtype)
12531259
dtype = np.dtype("int64")
12541260

12551261
if units is None or dtype is None:

xarray/tests/test_coding_times.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,10 +1620,10 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None
16201620
_ENCODE_DATETIME64_VIA_DASK_TESTS.values(),
16211621
ids=_ENCODE_DATETIME64_VIA_DASK_TESTS.keys(),
16221622
)
1623-
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
1623+
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype, time_unit) -> None:
16241624
import dask.array
16251625

1626-
times_pd = pd.date_range(start="1700", freq=freq, periods=3)
1626+
times_pd = pd.date_range(start="1700", freq=freq, periods=3, unit=time_unit)
16271627
times = dask.array.from_array(times_pd, chunks=1)
16281628
encoded_times, encoding_units, encoding_calendar = encode_cf_datetime(
16291629
times, units, None, dtype
@@ -1636,13 +1636,17 @@ def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
16361636
assert encoding_units == units
16371637
assert encoded_times.dtype == dtype
16381638
else:
1639-
assert encoding_units == "nanoseconds since 1970-01-01"
1639+
expected_netcdf_time_unit = _numpy_to_netcdf_timeunit(time_unit)
1640+
assert encoding_units == f"{expected_netcdf_time_unit} since 1970-01-01"
16401641
assert encoded_times.dtype == np.dtype("int64")
16411642

16421643
assert encoding_calendar == "proleptic_gregorian"
16431644

1644-
decoded_times = decode_cf_datetime(encoded_times, encoding_units, encoding_calendar)
1645+
decoded_times = decode_cf_datetime(
1646+
encoded_times, encoding_units, encoding_calendar, time_unit=time_unit
1647+
)
16451648
np.testing.assert_equal(decoded_times, times)
1649+
assert decoded_times.dtype == times.dtype
16461650

16471651

16481652
@requires_dask
@@ -1749,11 +1753,11 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype)
17491753
("units", "dtype"), [("days", np.dtype("int32")), (None, None)]
17501754
)
17511755
def test_encode_cf_timedelta_via_dask(
1752-
units: str | None, dtype: np.dtype | None
1756+
units: str | None, dtype: np.dtype | None, time_unit: PDDatetimeUnitOptions
17531757
) -> None:
17541758
import dask.array
17551759

1756-
times_pd = pd.timedelta_range(start="0D", freq="D", periods=3)
1760+
times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit)
17571761
times = dask.array.from_array(times_pd, chunks=1)
17581762
encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype)
17591763

@@ -1764,11 +1768,14 @@ def test_encode_cf_timedelta_via_dask(
17641768
assert encoding_units == units
17651769
assert encoded_times.dtype == dtype
17661770
else:
1767-
assert encoding_units == "nanoseconds"
1771+
assert encoding_units == _numpy_to_netcdf_timeunit(time_unit)
17681772
assert encoded_times.dtype == np.dtype("int64")
17691773

1770-
decoded_times = decode_cf_timedelta(encoded_times, encoding_units)
1774+
decoded_times = decode_cf_timedelta(
1775+
encoded_times, encoding_units, time_unit=time_unit
1776+
)
17711777
np.testing.assert_equal(decoded_times, times)
1778+
assert decoded_times.dtype == times.dtype
17721779

17731780

17741781
@pytest.mark.parametrize("use_dask", [False, pytest.param(True, marks=requires_dask)])

0 commit comments

Comments (0)