Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use resolution-dependent default units for lazy time encoding #10017

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ Deprecations

Bug fixes
~~~~~~~~~
- Default to resolution-dependent optimal integer encoding units when saving
chunked non-nanosecond :py:class:`numpy.datetime64` or
:py:class:`numpy.timedelta64` arrays to disk. Previously units of
"nanoseconds" were chosen by default, which are optimal for
nanosecond-resolution times, but not for times with coarser resolution. By
`Spencer Clark <https://github.com/spencerkclark>`_ (:pull:`10017`).


Documentation
Expand Down
11 changes: 9 additions & 2 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str:
}[units]


def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str:
    """Return the netCDF/CF time-unit string for a datetime64/timedelta64 dtype.

    Extracts the resolution (e.g. "s", "ms", "us", "ns") embedded in *dtype*
    via :py:func:`numpy.datetime_data` and maps it to the corresponding CF
    unit name through ``_numpy_to_netcdf_timeunit``.
    """
    # np.datetime_data returns (unit, count); only the unit name is needed.
    np_unit, _count = np.datetime_data(dtype)
    return _numpy_to_netcdf_timeunit(cast(NPDatetimeUnitOptions, np_unit))


def _ensure_padded_year(ref_date: str) -> str:
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
Expand Down Expand Up @@ -1143,7 +1149,8 @@ def _lazily_encode_cf_datetime(
units = "microseconds since 1970-01-01"
dtype = np.dtype("int64")
else:
units = "nanoseconds since 1970-01-01"
netcdf_unit = _numpy_dtype_to_netcdf_timeunit(dates.dtype)
units = f"{netcdf_unit} since 1970-01-01"
dtype = np.dtype("int64")

if units is None or dtype is None:
Expand Down Expand Up @@ -1249,7 +1256,7 @@ def _lazily_encode_cf_timedelta(
timedeltas: T_ChunkedArray, units: str | None = None, dtype: np.dtype | None = None
) -> tuple[T_ChunkedArray, str]:
if units is None and dtype is None:
units = "nanoseconds"
units = _numpy_dtype_to_netcdf_timeunit(timedeltas.dtype)
dtype = np.dtype("int64")

if units is None or dtype is None:
Expand Down
23 changes: 15 additions & 8 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -1620,10 +1620,10 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None
_ENCODE_DATETIME64_VIA_DASK_TESTS.values(),
ids=_ENCODE_DATETIME64_VIA_DASK_TESTS.keys(),
)
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype, time_unit) -> None:
import dask.array

times_pd = pd.date_range(start="1700", freq=freq, periods=3)
times_pd = pd.date_range(start="1700", freq=freq, periods=3, unit=time_unit)
times = dask.array.from_array(times_pd, chunks=1)
encoded_times, encoding_units, encoding_calendar = encode_cf_datetime(
times, units, None, dtype
Expand All @@ -1636,13 +1636,17 @@ def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
assert encoding_units == units
assert encoded_times.dtype == dtype
else:
assert encoding_units == "nanoseconds since 1970-01-01"
expected_netcdf_time_unit = _numpy_to_netcdf_timeunit(time_unit)
assert encoding_units == f"{expected_netcdf_time_unit} since 1970-01-01"
assert encoded_times.dtype == np.dtype("int64")

assert encoding_calendar == "proleptic_gregorian"

decoded_times = decode_cf_datetime(encoded_times, encoding_units, encoding_calendar)
decoded_times = decode_cf_datetime(
encoded_times, encoding_units, encoding_calendar, time_unit=time_unit
)
np.testing.assert_equal(decoded_times, times)
assert decoded_times.dtype == times.dtype


@requires_dask
Expand Down Expand Up @@ -1749,11 +1753,11 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype)
("units", "dtype"), [("days", np.dtype("int32")), (None, None)]
)
def test_encode_cf_timedelta_via_dask(
units: str | None, dtype: np.dtype | None
units: str | None, dtype: np.dtype | None, time_unit: PDDatetimeUnitOptions
) -> None:
import dask.array

times_pd = pd.timedelta_range(start="0D", freq="D", periods=3)
times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit) # type: ignore[call-arg]
times = dask.array.from_array(times_pd, chunks=1)
encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype)

Expand All @@ -1764,11 +1768,14 @@ def test_encode_cf_timedelta_via_dask(
assert encoding_units == units
assert encoded_times.dtype == dtype
else:
assert encoding_units == "nanoseconds"
assert encoding_units == _numpy_to_netcdf_timeunit(time_unit)
assert encoded_times.dtype == np.dtype("int64")

decoded_times = decode_cf_timedelta(encoded_times, encoding_units)
decoded_times = decode_cf_timedelta(
encoded_times, encoding_units, time_unit=time_unit
)
np.testing.assert_equal(decoded_times, times)
assert decoded_times.dtype == times.dtype


@pytest.mark.parametrize("use_dask", [False, pytest.param(True, marks=requires_dask)])
Expand Down
Loading