Merge branch 'main' into topk

dcherian · dcherian · commit 16b0bac27591 · 2025-01-13T16:09:07.000-06:00
* main: More support for datetime, timedelta (#412)
diff --git a/flox/aggregate_numbagg.py b/flox/aggregate_numbagg.py
@@ -30,6 +30,8 @@
     "nanmean": {np.int_: np.float64},
     "nanvar": {np.int_: np.float64},
     "nanstd": {np.int_: np.float64},
+    "nanfirst": {np.datetime64: np.int64, np.timedelta64: np.int64},
+    "nanlast": {np.datetime64: np.int64, np.timedelta64: np.int64},
 }
 
 
@@ -51,7 +53,7 @@ def _numbagg_wrapper(
     if cast_to:
         for from_, to_ in cast_to.items():
             if np.issubdtype(array.dtype, from_):
-                array = array.astype(to_)
+                array = array.astype(to_, copy=False)
 
     func_ = getattr(numbagg.grouped, f"group_{func}")
 
diff --git a/flox/core.py b/flox/core.py
@@ -46,6 +46,9 @@
 )
 from .cache import memoize
 from .xrutils import (
+    _contains_cftime_datetimes,
+    _to_pytimedelta,
+    datetime_to_numeric,
     is_chunked_array,
     is_duck_array,
     is_duck_cubed_array,
@@ -172,6 +175,17 @@ def _is_first_last_reduction(func: T_Agg) -> bool:
     return func in ["nanfirst", "nanlast", "first", "last"]
 
 
+def _is_bool_supported_reduction(func: T_Agg) -> bool:
+    if isinstance(func, Aggregation):
+        func = func.name
+    return (
+        func in ["all", "any"]
+        # TODO: enable in npg, then uncomment the two conditions below
+        # or _is_first_last_reduction(func)
+        # or _is_minmax_reduction(func)
+    )
+
+
 def _get_expected_groups(by: T_By, sort: bool) -> T_ExpectIndex:
     if is_duck_dask_array(by):
         raise ValueError("Please provide expected_groups if not grouping by a numpy array.")
@@ -2432,7 +2446,7 @@ def groupby_reduce(
         array.dtype,
     )
 
-    is_bool_array = np.issubdtype(array.dtype, bool)
+    is_bool_array = np.issubdtype(array.dtype, bool) and not _is_bool_supported_reduction(func)
     array = array.astype(np.int_) if is_bool_array else array
 
     isbins = _atleast_1d(isbin, nby)
@@ -2482,7 +2496,8 @@ def groupby_reduce(
     has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_)
     has_cubed = is_duck_cubed_array(array) or is_duck_cubed_array(by_)
 
-    if _is_first_last_reduction(func):
+    is_first_last = _is_first_last_reduction(func)
+    if is_first_last:
         if has_dask and nax != 1:
             raise ValueError(
                 "For dask arrays: first, last, nanfirst, nanlast reductions are "
@@ -2495,6 +2510,22 @@ def groupby_reduce(
                 "along a single axis or when reducing across all dimensions of `by`."
             )
 
+    is_npdatetime = array.dtype.kind in "Mm"
+    is_cftime = _contains_cftime_datetimes(array)
+    requires_numeric = (
+        (func not in ["count", "any", "all"] and not is_first_last)
+        # Flox's count works with non-numeric data, and it's faster than converting.
+        or (func == "count" and engine != "flox")
+        or (is_first_last and is_cftime)
+    )
+    if requires_numeric:
+        if is_npdatetime:
+            datetime_dtype = array.dtype
+            array = array.view(np.int64)
+        elif is_cftime:
+            offset = array.min()
+            array = datetime_to_numeric(array, offset, datetime_unit="us")
+
     if nax == 1 and by_.ndim > 1 and expected_ is None:
         # When we reduce along all axes, we are guaranteed to see all
         # groups in the final combine stage, so everything works.
@@ -2680,6 +2711,14 @@ def groupby_reduce(
 
     if is_bool_array and (_is_minmax_reduction(func) or _is_first_last_reduction(func)):
         result = result.astype(bool)
+
+    # Output of count has an int dtype.
+    if requires_numeric and func != "count":
+        if is_npdatetime:
+            result = result.astype(datetime_dtype)
+        elif is_cftime:
+            result = _to_pytimedelta(result, unit="us") + offset
+
     return (result, *groups)
 
 
@@ -2820,6 +2859,12 @@ def groupby_scan(
     (by_,) = bys
     has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_)
 
+    if array.dtype.kind in "Mm":
+        cast_to = array.dtype
+        array = array.view(np.int64)
+    else:
+        cast_to = None
+
     # TODO: move to aggregate_npg.py
     if agg.name in ["cumsum", "nancumsum"] and array.dtype.kind in ["i", "u"]:
         # https://numpy.org/doc/stable/reference/generated/numpy.cumsum.html
@@ -2835,7 +2880,10 @@ def groupby_scan(
     (single_axis,) = axis_  # type: ignore[misc]
     # avoid some roundoff error when we can.
     if by_.shape[-1] == 1 or by_.shape == grp_shape:
-        return array.astype(agg.dtype)
+        array = array.astype(agg.dtype)
+        if cast_to is not None:
+            array = array.astype(cast_to)
+        return array
 
     # Made a design choice here to have `preprocess` handle both array and group_idx
     # Example: for reversing, we need to reverse the whole array, not just reverse
@@ -2854,6 +2902,9 @@ def groupby_scan(
     out = AlignedArrays(array=result, group_idx=by_)
     if agg.finalize:
         out = agg.finalize(out)
+
+    if cast_to is not None:
+        return out.array.astype(cast_to)
     return out.array
 
 
diff --git a/flox/xarray.py b/flox/xarray.py
@@ -7,7 +7,6 @@
 import pandas as pd
 import xarray as xr
 from packaging.version import Version
-from xarray.core.duck_array_ops import _datetime_nanmin
 
 from .aggregations import (
     Aggregation,
@@ -24,7 +23,6 @@
 )
 from .core import rechunk_for_blockwise as rechunk_array_for_blockwise
 from .core import rechunk_for_cohorts as rechunk_array_for_cohorts
-from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric
 
 if TYPE_CHECKING:
     from xarray.core.types import T_DataArray, T_Dataset
@@ -372,22 +370,6 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs):
             if "nan" not in func and func not in ["all", "any", "count"]:
                 func = f"nan{func}"
 
-        # Flox's count works with non-numeric and its faster than converting.
-        requires_numeric = func not in ["count", "any", "all"] or (
-            func == "count" and kwargs["engine"] != "flox"
-        )
-        if requires_numeric:
-            is_npdatetime = array.dtype.kind in "Mm"
-            is_cftime = _contains_cftime_datetimes(array)
-            if is_npdatetime:
-                offset = _datetime_nanmin(array)
-                # xarray always uses np.datetime64[ns] for np.datetime64 data
-                dtype = "timedelta64[ns]"
-                array = datetime_to_numeric(array, offset)
-            elif is_cftime:
-                offset = array.min()
-                array = datetime_to_numeric(array, offset, datetime_unit="us")
-
         result, *groups = groupby_reduce(array, *by, func=func, **kwargs)
 
         # Transpose the new quantile or topk dimension to the end. This is ugly.
@@ -404,12 +386,6 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs):
             # This transpose is simply makes it easy to specify output_core_dims
             # output dim order: (*broadcast_dims, *group_dims, quantile_dim)
             result = np.moveaxis(result, 0, -1)
-        # Output of count has an int dtype.
-        if requires_numeric and func != "count":
-            if is_npdatetime:
-                return result.astype(dtype) + offset
-            elif is_cftime:
-                return _to_pytimedelta(result, unit="us") + offset
 
         return result
 
diff --git a/flox/xrutils.py b/flox/xrutils.py
@@ -213,8 +213,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
     """
     # TODO: make this function dask-compatible?
     # Set offset to minimum if not given
-    from xarray.core.duck_array_ops import _datetime_nanmin
-
     if offset is None:
         if array.dtype.kind in "Mm":
             offset = _datetime_nanmin(array)
@@ -345,6 +343,28 @@ def _contains_cftime_datetimes(array) -> bool:
             return False
 
 
+def _datetime_nanmin(array):
+    """Return the minimum of a datetime-like array, skipping NaT.
+
+    Caveats that this function deals with:
+
+    - In numpy < 1.18, min() on datetime64 incorrectly ignores NaT
+    - numpy nanmin() does not work on datetime64 (all versions at the moment of writing)
+    - dask min() does not work on datetime64 (all versions at the moment of writing)
+    """
+    from .xrdtypes import is_datetime_like
+
+    dtype = array.dtype
+    assert is_datetime_like(dtype)
+    # (NaT).astype(float) does not produce NaN...
+    array = np.where(pd.isnull(array), np.nan, array.astype(float))
+    array = np.nanmin(array)
+    if isinstance(array, float):
+        array = np.array(array)
+    # ...but (NaN).astype("M8") does produce NaT
+    return array.astype(dtype)
+
+
 def _select_along_axis(values, idx, axis):
     other_ind = np.ix_(*[np.arange(s) for s in idx.shape])
     sl = other_ind[:axis] + (idx,) + other_ind[axis:]
diff --git a/tests/strategies.py b/tests/strategies.py
@@ -13,44 +13,6 @@
 
 Chunks = tuple[tuple[int, ...], ...]
 
-
-def supported_dtypes() -> st.SearchStrategy[np.dtype]:
-    return (
-        npst.integer_dtypes(endianness="=")
-        | npst.unsigned_integer_dtypes(endianness="=")
-        | npst.floating_dtypes(endianness="=", sizes=(32, 64))
-        | npst.complex_number_dtypes(endianness="=")
-        | npst.datetime64_dtypes(endianness="=")
-        | npst.timedelta64_dtypes(endianness="=")
-        | npst.unicode_string_dtypes(endianness="=")
-    )
-
-
-# TODO: stop excluding everything but U
-array_dtypes = supported_dtypes().filter(lambda x: x.kind not in "cmMU")
-by_dtype_st = supported_dtypes()
-
-NON_NUMPY_FUNCS = [
-    "first",
-    "last",
-    "nanfirst",
-    "nanlast",
-    "count",
-    "any",
-    "all",
-] + list(SCIPY_STATS_FUNCS)
-SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"]
-
-func_st = st.sampled_from([f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS])
-numeric_arrays = npst.arrays(
-    elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtypes
-)
-all_arrays = npst.arrays(
-    elements={"allow_subnormal": False},
-    shape=npst.array_shapes(),
-    dtype=supported_dtypes(),
-)
-
 calendars = st.sampled_from(
     [
         "standard",
@@ -89,7 +51,7 @@ def units(draw, *, calendar: str) -> str:
 def cftime_arrays(
     draw: st.DrawFn,
     *,
-    shape: tuple[int, ...],
+    shape: st.SearchStrategy[tuple[int, ...]] = npst.array_shapes(),
     calendars: st.SearchStrategy[str] = calendars,
     elements: dict[str, Any] | None = None,
 ) -> np.ndarray[Any, Any]:
@@ -103,8 +65,55 @@ def cftime_arrays(
     return cftime.num2date(values, units=unit, calendar=cal)
 
 
+numeric_dtypes = (
+    npst.integer_dtypes(endianness="=")
+    | npst.unsigned_integer_dtypes(endianness="=")
+    | npst.floating_dtypes(endianness="=", sizes=(32, 64))
+    # TODO: add complex here instead of in supported_dtypes
+)
+numeric_like_dtypes = (
+    npst.boolean_dtypes()
+    | numeric_dtypes
+    | npst.datetime64_dtypes(endianness="=")
+    | npst.timedelta64_dtypes(endianness="=")
+)
+supported_dtypes = (
+    numeric_like_dtypes
+    | npst.unicode_string_dtypes(endianness="=")
+    | npst.complex_number_dtypes(endianness="=")
+)
+by_dtype_st = supported_dtypes
+
+NON_NUMPY_FUNCS = [
+    "first",
+    "last",
+    "nanfirst",
+    "nanlast",
+    "count",
+    "any",
+    "all",
+] + list(SCIPY_STATS_FUNCS)
+SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"]
+
+func_st = st.sampled_from([f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS])
+numeric_arrays = npst.arrays(
+    elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=numeric_dtypes
+)
+numeric_like_arrays = npst.arrays(
+    elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=numeric_like_dtypes
+)
+all_arrays = (
+    npst.arrays(
+        elements={"allow_subnormal": False},
+        shape=npst.array_shapes(),
+        dtype=numeric_like_dtypes,
+    )
+    | cftime_arrays()
+)
+
+
 def by_arrays(
-    shape: tuple[int, ...], *, elements: dict[str, Any] | None = None
+    shape: st.SearchStrategy[tuple[int, ...]], *, elements: dict[str, Any] | None = None
 ) -> st.SearchStrategy[np.ndarray[Any, Any]]:
     if elements is None:
         elements = {}
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -2007,3 +2007,32 @@ def test_blockwise_avoid_rechunk():
     actual, groups = groupby_reduce(array, by, func="first")
     assert_equal(groups, ["", "0", "1"])
     assert_equal(actual, np.array([0, 0, 0], dtype=np.int64))
+
+
+def test_datetime_minmax(engine):
+    # Regression test for GH403: grouped nanmin/nanmax on a datetime64 array.
+    array = np.array([np.datetime64("2000-01-01"), np.datetime64("2000-01-02"), np.datetime64("2000-01-03")])
+    by = np.array([0, 0, 1])
+    actual, _ = flox.groupby_reduce(array, by, func="nanmin", engine=engine)
+    expected = array[[0, 2]]
+    assert_equal(expected, actual)
+
+    expected = array[[1, 2]]
+    actual, _ = flox.groupby_reduce(array, by, func="nanmax", engine=engine)
+    assert_equal(expected, actual)
+
+
+@pytest.mark.parametrize("func", ["first", "last", "nanfirst", "nanlast"])
+def test_datetime_timedelta_first_last(engine, func):
+    import flox
+
+    idx = 0 if "first" in func else -1
+    # `by` is all ones, so every element falls in one group and one value is expected.
+    dt = pd.date_range("2001-01-01", freq="d", periods=5).values
+    by = np.ones(dt.shape, dtype=int)
+    actual, _ = flox.groupby_reduce(dt, by, func=func, engine=engine)
+    assert_equal(actual, dt[[idx]])
+    # Subtracting the first timestamp gives a timedelta64 array; repeat the check on it.
+    dt = dt - dt[0]
+    actual, _ = flox.groupby_reduce(dt, by, func=func, engine=engine)
+    assert_equal(actual, dt[[idx]])
diff --git a/tests/test_properties.py b/tests/test_properties.py
diff --git a/tests/test_xarray.py b/tests/test_xarray.py