Skip to content

Commit 92284c5

Browse files
authored
BUG: GroupBy.cummin altering nullable arrays inplace (#46220)
1 parent 9303302 commit 92284c5

File tree

4 files changed

+44
-31
lines changed

4 files changed

+44
-31
lines changed

doc/source/whatsnew/v1.5.0.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,9 @@ Groupby/resample/rolling
430430
- Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
431431
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
432432
- Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
433-
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`??`)
434-
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`??`)
433+
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`)
434+
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`)
435+
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`)
435436
-
436437

437438
Reshaping

pandas/_libs/groupby.pyx

+28-20
Original file line numberDiff line numberDiff line change
@@ -1477,7 +1477,8 @@ def group_min(
14771477
cdef group_cummin_max(
14781478
iu_64_floating_t[:, ::1] out,
14791479
ndarray[iu_64_floating_t, ndim=2] values,
1480-
uint8_t[:, ::1] mask,
1480+
const uint8_t[:, ::1] mask,
1481+
uint8_t[:, ::1] result_mask,
14811482
const intp_t[::1] labels,
14821483
int ngroups,
14831484
bint is_datetimelike,
@@ -1496,6 +1497,9 @@ cdef group_cummin_max(
14961497
mask : np.ndarray[bool] or None
14971498
If not None, indices represent missing values,
14981499
otherwise the mask will not be used
1500+
result_mask : ndarray[bool, ndim=2], optional
1501+
If not None, these specify locations in the output that are NA.
1502+
Modified in-place.
14991503
labels : np.ndarray[np.intp]
15001504
Labels to group by.
15011505
ngroups : int
@@ -1558,7 +1562,7 @@ cdef group_cummin_max(
15581562

15591563
if not skipna and na_possible and seen_na[lab, j]:
15601564
if uses_mask:
1561-
mask[i, j] = 1 # FIXME: shouldn't alter inplace
1565+
result_mask[i, j] = 1
15621566
# Set to 0 ensures that we are deterministic and can
15631567
# downcast if appropriate
15641568
out[i, j] = 0
@@ -1595,19 +1599,21 @@ def group_cummin(
15951599
const intp_t[::1] labels,
15961600
int ngroups,
15971601
bint is_datetimelike,
1598-
uint8_t[:, ::1] mask=None,
1602+
const uint8_t[:, ::1] mask=None,
1603+
uint8_t[:, ::1] result_mask=None,
15991604
bint skipna=True,
16001605
) -> None:
16011606
"""See group_cummin_max.__doc__"""
16021607
group_cummin_max(
1603-
out,
1604-
values,
1605-
mask,
1606-
labels,
1607-
ngroups,
1608-
is_datetimelike,
1609-
skipna,
1610-
compute_max=False
1608+
out=out,
1609+
values=values,
1610+
mask=mask,
1611+
result_mask=result_mask,
1612+
labels=labels,
1613+
ngroups=ngroups,
1614+
is_datetimelike=is_datetimelike,
1615+
skipna=skipna,
1616+
compute_max=False,
16111617
)
16121618

16131619

@@ -1619,17 +1625,19 @@ def group_cummax(
16191625
const intp_t[::1] labels,
16201626
int ngroups,
16211627
bint is_datetimelike,
1622-
uint8_t[:, ::1] mask=None,
1628+
const uint8_t[:, ::1] mask=None,
1629+
uint8_t[:, ::1] result_mask=None,
16231630
bint skipna=True,
16241631
) -> None:
16251632
"""See group_cummin_max.__doc__"""
16261633
group_cummin_max(
1627-
out,
1628-
values,
1629-
mask,
1630-
labels,
1631-
ngroups,
1632-
is_datetimelike,
1633-
skipna,
1634-
compute_max=True
1634+
out=out,
1635+
values=values,
1636+
mask=mask,
1637+
result_mask=result_mask,
1638+
labels=labels,
1639+
ngroups=ngroups,
1640+
is_datetimelike=is_datetimelike,
1641+
skipna=skipna,
1642+
compute_max=True,
16351643
)

pandas/core/groupby/ops.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -406,9 +406,13 @@ def _masked_ea_wrap_cython_operation(
406406
"""
407407
orig_values = values
408408

409-
# Copy to ensure input and result masks don't end up shared
410-
mask = values._mask.copy()
411-
result_mask = np.zeros(ngroups, dtype=bool)
409+
# libgroupby functions are responsible for NOT altering mask
410+
mask = values._mask
411+
if self.kind != "aggregate":
412+
result_mask = mask.copy()
413+
else:
414+
result_mask = np.zeros(ngroups, dtype=bool)
415+
412416
arr = values._data
413417

414418
res_values = self._cython_op_ndim_compat(
@@ -427,12 +431,7 @@ def _masked_ea_wrap_cython_operation(
427431
# dtype; last attempt ran into trouble on 32bit linux build
428432
res_values = res_values.astype(dtype.type, copy=False)
429433

430-
if self.kind != "aggregate":
431-
out_mask = mask
432-
else:
433-
out_mask = result_mask
434-
435-
return orig_values._maybe_mask_result(res_values, out_mask)
434+
return orig_values._maybe_mask_result(res_values, result_mask)
436435

437436
@final
438437
def _cython_op_ndim_compat(
@@ -568,6 +567,7 @@ def _call_cython_op(
568567
ngroups=ngroups,
569568
is_datetimelike=is_datetimelike,
570569
mask=mask,
570+
result_mask=result_mask,
571571
**kwargs,
572572
)
573573
else:

pandas/tests/groupby/test_function.py

+4
Original file line numberDiff line numberDiff line change
@@ -843,11 +843,15 @@ def test_cummax(dtypes_for_minmax):
843843
def test_cummin_max_skipna(method, dtype, groups, expected_data):
844844
# GH-34047
845845
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
846+
orig = df.copy()
846847
gb = df.groupby(groups)["a"]
847848

848849
result = getattr(gb, method)(skipna=False)
849850
expected = Series(expected_data, dtype=dtype, name="a")
850851

852+
# check we didn't accidentally alter df
853+
tm.assert_frame_equal(df, orig)
854+
851855
tm.assert_series_equal(result, expected)
852856

853857

0 commit comments

Comments
 (0)