Skip to content

BUG: Groupby min/max with nullable dtypes #42567

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Sep 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6f7c961
BUG: Groupby.min/max with Int64
jbrockmendel Jul 13, 2021
8f98501
flesh out test cases
jbrockmendel Jul 15, 2021
936df47
Merge branch 'master' into nullable-41743
jbrockmendel Jul 16, 2021
b325cc0
const, rename test
jbrockmendel Jul 16, 2021
a021e58
Merge branch 'master' into nullable-41743
jbrockmendel Jul 21, 2021
2807b25
Merge branch 'master' into nullable-41743
jbrockmendel Jul 23, 2021
1988294
Merge branch 'master' into nullable-41743
jbrockmendel Jul 24, 2021
25307f6
Merge branch 'master' into nullable-41743
jbrockmendel Jul 26, 2021
921ad33
whatsnew
jbrockmendel Jul 26, 2021
98f8782
Merge branch 'master' into nullable-41743
jbrockmendel Jul 29, 2021
13bd9f3
Merge branch 'master' into nullable-41743
jbrockmendel Jul 30, 2021
f44e77b
Merge branch 'master' into nullable-41743
jbrockmendel Jul 30, 2021
cc95817
Merge branch 'master' into nullable-41743
jbrockmendel Jul 30, 2021
362eed5
Merge branch 'master' into nullable-41743
jbrockmendel Jul 31, 2021
5f4ea99
mask -> mask_in
jbrockmendel Jul 31, 2021
3eb06f5
Merge branch 'master' into nullable-41743
jbrockmendel Aug 8, 2021
359c171
Merge branch 'master' into nullable-41743
jbrockmendel Aug 9, 2021
35def86
mask_out -> result_mask
jbrockmendel Aug 9, 2021
edb1beb
Merge branch 'master' into nullable-41743
jbrockmendel Aug 13, 2021
e1a447b
Merge branch 'master' into nullable-41743
jbrockmendel Aug 18, 2021
f556ab8
Merge branch 'master' into nullable-41743
jbrockmendel Aug 24, 2021
92b4617
Merge branch 'master' into nullable-41743
jbrockmendel Aug 29, 2021
11d7f1d
Merge branch 'master' into nullable-41743
jbrockmendel Aug 31, 2021
6043105
mask_in->mask
jbrockmendel Aug 31, 2021
961073d
Merge branch 'master' into nullable-41743
jbrockmendel Sep 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`)
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
- Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,17 @@ def group_max(
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
Expand Down
36 changes: 31 additions & 5 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True):
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.

Expand All @@ -1203,6 +1205,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min
mask : ndarray[bool, ndim=2], optional
If not None, indices represent missing values,
otherwise the mask will not be used
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.

Notes
-----
Expand All @@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] group_min_or_max
bint runtime_error = False
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
Expand Down Expand Up @@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
for j in range(K):
val = values[i, j]

if not _treat_as_na(val, is_datetimelike):
if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
Expand All @@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out,
runtime_error = True
break
else:
out[i, j] = nan_val
if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = nan_val
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is out typically initialized? (np.empty?) I am wondering if it would be good practice to still set out anyway (also if uses_mask=True, to not have "uninitialized" values)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm i could go either way on this

else:
out[i, j] = group_min_or_max[i, j]

Expand All @@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None) -> None:
"""See group_min_max.__doc__"""
group_min_max(
out,
Expand All @@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=True,
mask=mask,
result_mask=result_mask,
)


Expand All @@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None) -> None:
"""See group_min_max.__doc__"""
group_min_max(
out,
Expand All @@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=False,
mask=mask,
result_mask=result_mask,
)


Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
raise ValueError("values must be a 1D array")
if mask.ndim != 1:
raise ValueError("mask must be a 1D array")
if values.shape != mask.shape:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this code path hit in tests?

raise ValueError("values and mask must have same shape")

if copy:
values = values.copy()
Expand Down
22 changes: 19 additions & 3 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str):
},
}

_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"}
_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"}

_cython_arity = {"ohlc": 4} # OHLC

Expand Down Expand Up @@ -404,6 +404,7 @@ def _masked_ea_wrap_cython_operation(

# Copy to ensure input and result masks don't end up shared
mask = values._mask.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For an aggregation op like this, I guess we can avoid the mask copy since the mask is not being modified inplace. (but not something needs to be done in this pr, just a note)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though I suppose there's the tradeoff that then we'd lose mask contiguity guarantee in the algo?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haven't looked at the contiguity but that makes sense

result_mask = np.zeros(ngroups, dtype=bool)
arr = values._data

res_values = self._cython_op_ndim_compat(
Expand All @@ -412,13 +413,18 @@ def _masked_ea_wrap_cython_operation(
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)

dtype = self._get_result_dtype(orig_values.dtype)
assert isinstance(dtype, BaseMaskedDtype)
cls = dtype.construct_array_type()

return cls(res_values.astype(dtype.type, copy=False), mask)
if self.kind != "aggregate":
return cls(res_values.astype(dtype.type, copy=False), mask)
else:
return cls(res_values.astype(dtype.type, copy=False), result_mask)

@final
def _cython_op_ndim_compat(
Expand All @@ -428,20 +434,24 @@ def _cython_op_ndim_compat(
min_count: int,
ngroups: int,
comp_ids: np.ndarray,
mask: np.ndarray | None,
mask: np.ndarray | None = None,
result_mask: np.ndarray | None = None,
**kwargs,
) -> np.ndarray:
if values.ndim == 1:
# expand to 2d, dispatch, then squeeze if appropriate
values2d = values[None, :]
if mask is not None:
mask = mask[None, :]
if result_mask is not None:
result_mask = result_mask[None, :]
res = self._call_cython_op(
values2d,
min_count=min_count,
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)
if res.shape[0] == 1:
Expand All @@ -456,6 +466,7 @@ def _cython_op_ndim_compat(
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)

Expand All @@ -468,6 +479,7 @@ def _call_cython_op(
ngroups: int,
comp_ids: np.ndarray,
mask: np.ndarray | None,
result_mask: np.ndarray | None,
**kwargs,
) -> np.ndarray: # np.ndarray[ndim=2]
orig_values = values
Expand All @@ -493,6 +505,8 @@ def _call_cython_op(
values = values.T
if mask is not None:
mask = mask.T
if result_mask is not None:
result_mask = result_mask.T

out_shape = self._get_output_shape(ngroups, values)
func, values = self.get_cython_func_and_vals(values, is_numeric)
Expand All @@ -508,6 +522,8 @@ def _call_cython_op(
values,
comp_ids,
min_count,
mask=mask,
result_mask=result_mask,
is_datetimelike=is_datetimelike,
)
else:
Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/groupby/test_min_max.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,51 @@ def test_aggregate_categorical_lost_index(func: str):
expected["B"] = expected["B"].astype(ds.dtype)

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
def test_groupby_min_max_nullable(dtype):
    """GH#41743: groupby min/max on nullable dtypes must not lose precision
    and must yield pd.NA when ``min_count`` is not satisfied."""
    if dtype == "Int64":
        # large enough that a round-trip through float64 would lose precision
        base = 1618556707013635762
    elif dtype == "boolean":
        base = 0
    else:
        base = 4.0

    df = DataFrame({"id": [2, 2], "ts": [base, base + 1]})
    df["ts"] = df["ts"].astype(dtype)
    grouped = df.groupby("id")

    expected_min = df.iloc[:1].set_index("id")
    expected_max = df.iloc[1:].set_index("id")
    # all-NA frame expected whenever min_count exceeds the group size
    expected_na = DataFrame({"ts": [pd.NA]}, index=expected_min.index, dtype=dtype)

    tm.assert_frame_equal(grouped.min(), expected_min)
    tm.assert_frame_equal(grouped.max(), expected_max)

    tm.assert_frame_equal(grouped.min(min_count=3), expected_na)
    tm.assert_frame_equal(grouped.max(min_count=3), expected_na)

    # Same expectations when an NA value is interleaved in the column
    df_with_na = DataFrame({"id": [2, 2, 2], "ts": [base, pd.NA, base + 1]})
    df_with_na["ts"] = df_with_na["ts"].astype(dtype)
    grouped_na = df_with_na.groupby("id")

    tm.assert_frame_equal(grouped_na.min(), expected_min)
    tm.assert_frame_equal(grouped_na.max(), expected_max)

    tm.assert_frame_equal(grouped_na.min(min_count=100), expected_na)
    tm.assert_frame_equal(grouped_na.max(min_count=100), expected_na)