From 02161127c89b9278a27c3cac98987ebc76332f2f Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 1 Feb 2025 10:02:18 -0500 Subject: [PATCH 1/2] Use resolution-dependent default units for lazy time encoding --- doc/whats-new.rst | 6 ++++++ xarray/coding/times.py | 10 ++++++++-- xarray/tests/test_coding_times.py | 23 +++++++++++++++-------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 56d9a3d9bed..e4024835409 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,12 @@ Deprecations Bug fixes ~~~~~~~~~ +- Default to resolution-dependent optimal integer encoding units when saving + chunked non-nanosecond :py:class:`numpy.datetime64` or + :py:class:`numpy.timedelta64` arrays to disk. Previously units of + "nanoseconds" were chosen by default, which are optimal for + nanosecond-resolution times, but not for times with coarser resolution. By + `Spencer Clark <https://github.com/spencerkclark>`_ (:pull:`10017`). Documentation diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ad5e8653e2a..fb665f1df32 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -153,6 +153,11 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str: }[units] +def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str: + unit, _ = np.datetime_data(dtype) + return _numpy_to_netcdf_timeunit(unit) + + def _ensure_padded_year(ref_date: str) -> str: # Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4) # are ambiguous (is it YMD or DMY?). 
This can lead to some very odd @@ -1143,7 +1148,8 @@ def _lazily_encode_cf_datetime( units = "microseconds since 1970-01-01" dtype = np.dtype("int64") else: - units = "nanoseconds since 1970-01-01" + netcdf_unit = _numpy_dtype_to_netcdf_timeunit(dates.dtype) + units = f"{netcdf_unit} since 1970-01-01" dtype = np.dtype("int64") if units is None or dtype is None: @@ -1249,7 +1255,7 @@ def _lazily_encode_cf_timedelta( timedeltas: T_ChunkedArray, units: str | None = None, dtype: np.dtype | None = None ) -> tuple[T_ChunkedArray, str]: if units is None and dtype is None: - units = "nanoseconds" + units = _numpy_dtype_to_netcdf_timeunit(timedeltas.dtype) dtype = np.dtype("int64") if units is None or dtype is None: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 7b34a01a680..9bfe44d44d0 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1620,10 +1620,10 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None _ENCODE_DATETIME64_VIA_DASK_TESTS.values(), ids=_ENCODE_DATETIME64_VIA_DASK_TESTS.keys(), ) -def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None: +def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype, time_unit) -> None: import dask.array - times_pd = pd.date_range(start="1700", freq=freq, periods=3) + times_pd = pd.date_range(start="1700", freq=freq, periods=3, unit=time_unit) times = dask.array.from_array(times_pd, chunks=1) encoded_times, encoding_units, encoding_calendar = encode_cf_datetime( times, units, None, dtype @@ -1636,13 +1636,17 @@ def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None: assert encoding_units == units assert encoded_times.dtype == dtype else: - assert encoding_units == "nanoseconds since 1970-01-01" + expected_netcdf_time_unit = _numpy_to_netcdf_timeunit(time_unit) + assert encoding_units == f"{expected_netcdf_time_unit} since 1970-01-01" assert encoded_times.dtype == 
np.dtype("int64") assert encoding_calendar == "proleptic_gregorian" - decoded_times = decode_cf_datetime(encoded_times, encoding_units, encoding_calendar) + decoded_times = decode_cf_datetime( + encoded_times, encoding_units, encoding_calendar, time_unit=time_unit + ) np.testing.assert_equal(decoded_times, times) + assert decoded_times.dtype == times.dtype @requires_dask @@ -1749,11 +1753,11 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype) ("units", "dtype"), [("days", np.dtype("int32")), (None, None)] ) def test_encode_cf_timedelta_via_dask( - units: str | None, dtype: np.dtype | None + units: str | None, dtype: np.dtype | None, time_unit: PDDatetimeUnitOptions ) -> None: import dask.array - times_pd = pd.timedelta_range(start="0D", freq="D", periods=3) + times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit) times = dask.array.from_array(times_pd, chunks=1) encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype) @@ -1764,11 +1768,14 @@ def test_encode_cf_timedelta_via_dask( assert encoding_units == units assert encoded_times.dtype == dtype else: - assert encoding_units == "nanoseconds" + assert encoding_units == _numpy_to_netcdf_timeunit(time_unit) assert encoded_times.dtype == np.dtype("int64") - decoded_times = decode_cf_timedelta(encoded_times, encoding_units) + decoded_times = decode_cf_timedelta( + encoded_times, encoding_units, time_unit=time_unit + ) np.testing.assert_equal(decoded_times, times) + assert decoded_times.dtype == times.dtype @pytest.mark.parametrize("use_dask", [False, pytest.param(True, marks=requires_dask)]) From 55da8e83e3664337d530c6eff5ca2307d720975d Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 1 Feb 2025 19:48:18 -0500 Subject: [PATCH 2/2] Fix typing --- xarray/coding/times.py | 1 + xarray/tests/test_coding_times.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 
fb665f1df32..162d2980211 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -155,6 +155,7 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str: def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str: unit, _ = np.datetime_data(dtype) + unit = cast(NPDatetimeUnitOptions, unit) return _numpy_to_netcdf_timeunit(unit) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 9bfe44d44d0..2e61e5d853e 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1757,7 +1757,7 @@ def test_encode_cf_timedelta_via_dask( ) -> None: import dask.array - times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit) + times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit) # type: ignore[call-arg] times = dask.array.from_array(times_pd, chunks=1) encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype)