Improve handling of dtype and NaT when encoding/decoding masked and packed datetimes and timedeltas #10050

Merged: 18 commits, Mar 7, 2025

3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -71,6 +71,9 @@ Bug fixes
By `Benoit Bovy <https://github.com/benbovy>`_.
- Fix dask tokenization when opening each node in :py:func:`xarray.open_datatree`
(:issue:`10098`, :pull:`10100`). By `Sam Levang <https://github.com/slevang>`_.
- Improve handling of dtype and NaT when encoding/decoding masked and packed
datetimes and timedeltas (:issue:`8957`, :pull:`10050`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
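
The entry above concerns variables that carry both time encoding (``units``,
``calendar``) and mask/packing encoding (``_FillValue``, ``scale_factor``,
``add_offset``, ``dtype``). A minimal round-trip sketch of that case
(hypothetical file name and values; requires a netCDF backend such as netCDF4):

    import numpy as np
    import pandas as pd
    import xarray as xr

    # datetimes including a NaT, to be stored masked and packed on disk
    times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT", "2000-01-04"])
    ds = xr.Dataset({"time": ("x", times)})
    ds["time"].encoding = {
        "units": "days since 2000-01-01",
        "dtype": np.dtype("int16"),      # requested packed on-disk dtype
        "scale_factor": 0.1,
        "add_offset": 0.0,
        "_FillValue": np.int16(-9999),
    }
    ds.to_netcdf("packed_times.nc")                 # encoded via float, then packed
    roundtrip = xr.open_dataset("packed_times.nc")  # datetimes and NaT restored
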
136 changes: 136 additions & 0 deletions xarray/coding/common.py
@@ -0,0 +1,136 @@
from __future__ import annotations

from collections.abc import Callable, Hashable, MutableMapping
from typing import TYPE_CHECKING, Any, Union

import numpy as np

from xarray.core import indexing
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import get_chunked_array_type
from xarray.namedarray.pycompat import is_chunked_array

if TYPE_CHECKING:
T_VarTuple = tuple[tuple[Hashable, ...], Any, dict, dict]
T_Name = Union[Hashable, None]


class SerializationWarning(RuntimeWarning):
"""Warnings about encoding/decoding issues in serialization."""


class VariableCoder:
"""Base class for encoding and decoding transformations on variables.

We use coders for transforming variables between xarray's data model and
a format suitable for serialization. For example, coders apply CF
conventions for how data should be represented in netCDF files.

Subclasses should implement encode() and decode(), which should satisfy
the identity ``coder.decode(coder.encode(variable)) == variable``. If any
options are necessary, they should be implemented as arguments to the
__init__ method.

The optional name argument to encode() and decode() exists solely for the
sake of better error messages, and should correspond to the name of
variables in the underlying store.
"""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
"""Convert an encoded variable to a decoded variable"""
raise NotImplementedError()

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
"""Convert a decoded variable to an encoded variable"""
raise NotImplementedError()
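
For illustration only (hypothetical, not part of this module): a toy coder
satisfying the round-trip identity described above might look like this sketch:

    class NegateCoder(VariableCoder):
        """Toy coder that stores values negated; decode(encode(v)) == v."""

        def encode(self, variable: Variable, name: T_Name = None) -> Variable:
            return variable.copy(data=-variable.data)

        def decode(self, variable: Variable, name: T_Name = None) -> Variable:
            return variable.copy(data=-variable.data)
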


class _ElementwiseFunctionArray(indexing.ExplicitlyIndexedNDArrayMixin):
"""Lazily computed array holding values of elemwise-function.

Do not construct this object directly: call lazy_elemwise_func instead.

Values are computed upon indexing or coercion to a NumPy array.
"""

def __init__(self, array, func: Callable, dtype: np.typing.DTypeLike):
assert not is_chunked_array(array)
self.array = indexing.as_indexable(array)
self.func = func
self._dtype = dtype

@property
def dtype(self) -> np.dtype:
return np.dtype(self._dtype)

def _oindex_get(self, key):
return type(self)(self.array.oindex[key], self.func, self.dtype)

def _vindex_get(self, key):
return type(self)(self.array.vindex[key], self.func, self.dtype)

def __getitem__(self, key):
return type(self)(self.array[key], self.func, self.dtype)

def get_duck_array(self):
return self.func(self.array.get_duck_array())

def __repr__(self) -> str:
return f"{type(self).__name__}({self.array!r}, func={self.func!r}, dtype={self.dtype!r})"


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
----------
array : any valid value of Variable._data
func : callable
Function to apply to indexed slices of an array. For use with dask,
this should be a pickle-able object.
dtype : coercible to np.dtype
Dtype for the result of this function.

Returns
-------
Either a dask.array.Array or _ElementwiseFunctionArray.
"""
if is_chunked_array(array):
chunkmanager = get_chunked_array_type(array)

return chunkmanager.map_blocks(func, array, dtype=dtype) # type: ignore[arg-type]
else:
return _ElementwiseFunctionArray(array, func, dtype)
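
A quick sketch of the deferred evaluation this provides (hypothetical inputs,
in-memory NumPy case):

    arr = np.arange(4)
    lazy = lazy_elemwise_func(arr, np.sqrt, dtype="float64")
    print(lazy.dtype)        # float64, reported without computing anything
    print(np.asarray(lazy))  # np.sqrt is applied only now, on coercion
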


def safe_setitem(dest, key: Hashable, value, name: T_Name = None):
if key in dest:
var_str = f" on variable {name!r}" if name else ""
raise ValueError(
f"failed to prevent overwriting existing key {key} in attrs{var_str}. "
"This is probably an encoding field used by xarray to describe "
"how a variable is serialized. To proceed, remove this key from "
"the variable's attributes manually."
)
dest[key] = value


def pop_to(
source: MutableMapping, dest: MutableMapping, key: Hashable, name: T_Name = None
) -> Any:
"""
A convenience function that pops a key from source into dest.
None values are not passed on. If the key already exists in dest,
an error is raised.
"""
value = source.pop(key, None)
if value is not None:
safe_setitem(dest, key, value, name=name)
return value
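
For example, moving "units" from a variable's attributes into its encoding
(hypothetical dictionaries):

    attrs = {"units": "days since 2000-01-01", "long_name": "time"}
    encoding = {}
    pop_to(attrs, encoding, "units", name="time")
    # attrs    -> {"long_name": "time"}
    # encoding -> {"units": "days since 2000-01-01"}
    # If encoding had already contained "units", safe_setitem would raise
    # instead of silently overwriting it.
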


def unpack_for_encoding(var: Variable) -> T_VarTuple:
return var.dims, var.data, var.attrs.copy(), var.encoding.copy()


def unpack_for_decoding(var: Variable) -> T_VarTuple:
return var.dims, var._data, var.attrs.copy(), var.encoding.copy()
34 changes: 29 additions & 5 deletions xarray/coding/times.py
@@ -11,7 +11,7 @@
import pandas as pd
from pandas.errors import OutOfBoundsDatetime, OutOfBoundsTimedelta

from xarray.coding.variables import (
from xarray.coding.common import (
SerializationWarning,
VariableCoder,
lazy_elemwise_func,
@@ -1328,9 +1328,20 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:

units = encoding.pop("units", None)
calendar = encoding.pop("calendar", None)
dtype = encoding.get("dtype", None)
dtype = encoding.pop("dtype", None)

# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
set_dtype_encoding = None
if "add_offset" in encoding or "scale_factor" in encoding:
set_dtype_encoding = dtype
dtype = data.dtype if data.dtype.kind == "f" else "float64"
(data, units, calendar) = encode_cf_datetime(data, units, calendar, dtype)

# retain dtype for packed data
if set_dtype_encoding is not None:
safe_setitem(encoding, "dtype", set_dtype_encoding, name=name)
safe_setitem(attrs, "units", units, name=name)
safe_setitem(attrs, "calendar", calendar, name=name)

@@ -1382,9 +1393,22 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)

data, units = encode_cf_timedelta(
data, encoding.pop("units", None), encoding.get("dtype", None)
)
dtype = encoding.pop("dtype", None)

# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
set_dtype_encoding = None
if "add_offset" in encoding or "scale_factor" in encoding:
set_dtype_encoding = dtype
dtype = data.dtype if data.dtype.kind == "f" else "float64"

data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype)

# retain dtype for packed data
if set_dtype_encoding is not None:
safe_setitem(encoding, "dtype", set_dtype_encoding, name=name)

safe_setitem(attrs, "units", units, name=name)

return Variable(dims, data, attrs, encoding, fastpath=True)
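
As a sketch of the new encode path at the coder level (hypothetical values; the
subsequent CFScaleOffsetCoder/CFMaskCoder steps of the conventions pipeline are
not shown):

    import numpy as np
    import pandas as pd
    import xarray as xr
    from xarray.coding.times import CFTimedeltaCoder

    var = xr.Variable(
        ("x",),
        pd.to_timedelta(["1s", "2s", "NaT"]),
        encoding={
            "units": "seconds",
            "dtype": np.dtype("int16"),   # requested packed on-disk dtype
            "scale_factor": 0.1,
            "_FillValue": np.int16(-1),
        },
    )
    encoded = CFTimedeltaCoder().encode(var)
    # With packing keys present, the data is encoded as float first and the
    # requested int16 dtype is kept in encoding for the later packing/masking step.
    print(encoded.dtype)              # float64
    print(encoded.encoding["dtype"])  # int16
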