From df70d2db4becdcf65dfb51cc09ddd562724ff3dc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Feb 2021 15:25:26 +0100 Subject: [PATCH 1/9] [ArrayManager] GroupBy cython aggregations (no fallback) --- .github/workflows/ci.yml | 1 + pandas/core/groupby/generic.py | 132 ++++++++++++++---- pandas/core/internals/array_manager.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 7 + pandas/tests/groupby/aggregate/test_cython.py | 12 +- pandas/tests/groupby/aggregate/test_other.py | 3 + 6 files changed, 123 insertions(+), 34 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aec19e27a33e2..41ec1c9f36be4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,7 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/groupby/aggregate/ --array-manager # indexing iset related (temporary since other tests don't pass yet) pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_multi_index --array-manager diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9369dc61ca5f6..1e8837db22078 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Mapping, Optional, Sequence, + Tuple, Type, TypeVar, Union, @@ -33,7 +34,7 @@ import numpy as np from pandas._libs import lib, reduction as libreduction -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Manager from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -79,7 +80,7 @@ ) from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase -from pandas.core.internals import BlockManager +from pandas.core.internals import ArrayManager, BlockManager from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba @@ -157,6 +158,31 @@ def pinner(cls): return pinner +def _cast_agg_result( + result, values: ArrayLike, how: str, reshape1d: bool = True +) -> ArrayLike: + # see if we can cast the values to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + + dtype = maybe_cast_result_dtype(values.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if isinstance(values, Categorical) and isinstance(result, np.ndarray): + # If the Categorical op didn't raise, it is dtype-preserving + result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) + # Note this will have result.dtype == dtype from above + + elif reshape1d and isinstance(result, np.ndarray) and result.ndim == 1: + # We went through a SeriesGroupByPath and need to reshape + # GH#32223 includes case with IntegerArray values + result = result.reshape(1, -1) + # test_groupby_duplicate_columns gets here with + # result.dtype == int64, values.dtype=object, how="min" + + return result + + @pin_allowlisted_properties(Series, base.series_apply_allowlist) class SeriesGroupBy(GroupBy[Series]): _apply_allowlist = base.series_apply_allowlist @@ -1041,10 +1067,18 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - agg_mgr = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only, min_count=min_count - ) - return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) + if isinstance(self.obj._mgr, BlockManager): + agg_mgr = self._cython_agg_blocks( + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) + return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) + elif isinstance(self.obj._mgr, ArrayManager): + agg_arrays, columns = self._cython_agg_arrays( + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) + return self._wrap_agged_arrays(agg_arrays, columns) + else: + raise TypeError("Unknown manager type") def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1055,28 +1089,6 @@ def _cython_agg_blocks( if numeric_only: data = data.get_numeric_data(copy=False) - def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: - # see if we can cast the values to the desired dtype - # this may not be the original dtype - assert not isinstance(result, DataFrame) - - dtype = maybe_cast_result_dtype(values.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above - - elif isinstance(result, np.ndarray) and result.ndim == 1: - # We went through a SeriesGroupByPath and need to reshape - # GH#32223 includes case with IntegerArray values - result = result.reshape(1, -1) - # test_groupby_duplicate_columns gets here with - # result.dtype == int64, values.dtype=object, how="min" - - return result - def py_fallback(bvalues: ArrayLike) -> ArrayLike: # if self.grouper.aggregate fails, we fall back to a pure-python # solution @@ -1139,7 +1151,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: result = py_fallback(bvalues) - return cast_agg_result(result, bvalues, how) + return _cast_agg_result(result, bvalues, how) # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block @@ -1151,6 +1163,66 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: return new_mgr + def _cython_agg_arrays( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ) -> Tuple[List[ArrayLike], Index]: + + data: ArrayManager = self._get_data_to_aggregate() + + if numeric_only: + data = data.get_numeric_data(copy=False) + + def py_fallback(bvalues: ArrayLike) -> ArrayLike: + raise NotImplementedError + + def array_func(values: ArrayLike) -> ArrayLike: + + try: + result = self.grouper._cython_operation( + "aggregate", + values, + how, + axis=1, + min_count=min_count, + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + + if alt is None: + # we cannot perform the operation + # in an alternate way, exclude the block + assert how == "ohlc" + raise + + result = py_fallback(values) + + return _cast_agg_result(result, values, how, reshape1d=False) + + result_arrays = [array_func(arr) for arr in data.arrays] + + if not len(result_arrays): + raise DataError("No numeric types to aggregate") + + return result_arrays, data.axes[0] + + def _wrap_agged_arrays(self, arrays: List[ArrayLike], columns: Index) -> DataFrame: + if not self.as_index: + index = np.arange(arrays[0].shape[0]) + mgr = ArrayManager(arrays, axes=[index, columns]) + result = self.obj._constructor(mgr) + self._insert_inaxis_grouper_inplace(result) + else: + index = self.grouper.result_index + mgr = ArrayManager(arrays, axes=[index, columns]) + result = self.obj._constructor(mgr) + + if self.axis == 1: + result = result.T + + return self._reindex_output(result)._convert(datetime=True) + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") @@ -1633,7 +1705,7 @@ def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: else: return self.obj._constructor(result, index=obj.index, columns=result_index) - def _get_data_to_aggregate(self) -> BlockManager: + def _get_data_to_aggregate(self) -> Manager: obj = self._obj_with_exclusions if self.axis == 1: return obj.T._mgr diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 083f32488acd4..1c2c7010e3b87 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -302,7 +302,7 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] # DatetimeArray needs to be converted to ndarray for DatetimeBlock arr = arr._data # type: ignore[union-attr] - elif arr.dtype.kind == "m": + elif arr.dtype.kind == "m" and not isinstance(arr, np.ndarray): # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock arr = arr._data # type: ignore[union-attr] if isinstance(arr, np.ndarray): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 48527de6b2047..40c130456676d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -38,6 +39,7 @@ def test_agg_regression1(tsframe): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile/describe def test_agg_must_agg(df): grouped = df.groupby("A")["C"] @@ -127,6 +129,7 @@ def test_groupby_aggregation_multi_level_column(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) non-cython agg def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA grouped = ts.groupby(ts * np.nan) @@ -203,6 +206,7 @@ def test_aggregate_str_func(tsframe, groupbyfunc): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) non-cython agg def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func): gb = df.groupby(level=0) if reduction_func in ("idxmax", "idxmin"): @@ -482,6 +486,7 @@ def test_agg_index_has_complex_internals(index): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) agg py_fallback def test_agg_split_block(): # https://github.com/pandas-dev/pandas/issues/31522 df = DataFrame( @@ -499,6 +504,7 @@ def test_agg_split_block(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) agg py_fallback def test_agg_split_object_part_datetime(): # https://github.com/pandas-dev/pandas/pull/31616 df = DataFrame( @@ -1189,6 +1195,7 @@ def test_aggregate_datetime_objects(): tm.assert_series_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) agg py_fallback def test_aggregate_numeric_object_dtype(): # https://github.com/pandas-dev/pandas/issues/39329 # simplified case: multiple object columns where one is all-NaN diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 8799f6faa775c..31b01072e9844 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -33,7 +33,10 @@ "max", ], ) -def test_cythonized_aggers(op_name): +def test_cythonized_aggers(op_name, using_array_manager): + if using_array_manager and op_name in {"count", "sem"}: + # TODO(ArrayManager) groupby count/sem + pytest.skip("ArrayManager groupby count/sem not yet implemented") data = { "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], "B": ["A", "B"] * 6, @@ -265,7 +268,7 @@ def test_cython_with_timestamp_and_nat(op, data): "cummax", ], ) -def test_read_only_buffer_source_agg(agg): +def test_read_only_buffer_source_agg(agg, using_array_manager): # https://github.com/pandas-dev/pandas/issues/36014 df = DataFrame( { @@ -273,7 +276,10 @@ def test_read_only_buffer_source_agg(agg): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - df._mgr.blocks[0].values.flags.writeable = False + if using_array_manager: + df._mgr.arrays[0].flags.writeable = False + else: + df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 04f17865b088a..b56051ec6c9b3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -412,6 +414,7 @@ def __call__(self, x): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) columns with ndarrays def test_agg_over_numpy_arrays(): # GH 3788 df = DataFrame( From 692175e8f55e2b587d25a0355e4b01a706a46235 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Feb 2021 15:30:08 +0100 Subject: [PATCH 2/9] style --- pandas/core/groupby/generic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dad2b484c0098..373e032a0ceeb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1207,17 +1207,14 @@ def _cython_agg_arrays( data = data.get_numeric_data(copy=False) def py_fallback(bvalues: ArrayLike) -> ArrayLike: + # TODO(ArrayManager) raise NotImplementedError def array_func(values: ArrayLike) -> ArrayLike: try: result = self.grouper._cython_operation( - "aggregate", - values, - how, - axis=1, - min_count=min_count, + "aggregate", values, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False From a7bf71ee79321f484b2f2fad6fa901fb9e7290f2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 20:08:42 +0100 Subject: [PATCH 3/9] common _cython_agg_manager --- pandas/core/groupby/generic.py | 150 ++++++++----------------- pandas/core/internals/array_manager.py | 26 +++++ 2 files changed, 71 insertions(+), 105 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5e7412793e263..909dbec88e63c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -24,7 +24,6 @@ List, Mapping, Optional, - Tuple, Type, TypeVar, Union, @@ -186,31 +185,6 @@ def pinner(cls): return pinner -def _cast_agg_result( - result, values: ArrayLike, how: str, reshape1d: bool = True -) -> ArrayLike: - # see if we can cast the values to the desired dtype - # this may not be the original dtype - assert not isinstance(result, DataFrame) - - dtype = maybe_cast_result_dtype(values.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above - - elif reshape1d and isinstance(result, np.ndarray) and result.ndim == 1: - # We went through a SeriesGroupByPath and need to reshape - # GH#32223 includes case with IntegerArray values - result = result.reshape(1, -1) - # test_groupby_duplicate_columns gets here with - # result.dtype == int64, values.dtype=object, how="min" - - return result - - @pin_allowlisted_properties(Series, base.series_apply_allowlist) class SeriesGroupBy(GroupBy[Series]): _apply_allowlist = base.series_apply_allowlist @@ -1104,28 +1078,48 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - if isinstance(self.obj._mgr, BlockManager): - agg_mgr = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only, min_count=min_count - ) - return self._wrap_agged_manager(agg_mgr) - elif isinstance(self.obj._mgr, ArrayManager): - agg_arrays, columns = self._cython_agg_arrays( - how, alt=alt, numeric_only=numeric_only, min_count=min_count - ) - return self._wrap_agged_arrays(agg_arrays, columns) - else: - raise TypeError("Unknown manager type") + agg_mgr = self._cython_agg_manager( + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) + return self._wrap_agged_manager(agg_mgr) - def _cython_agg_blocks( + def _cython_agg_manager( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> BlockManager: - data: BlockManager = self._get_data_to_aggregate() + data: Manager = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) + using_array_manager = isinstance(data, ArrayManager) + + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + + dtype = maybe_cast_result_dtype(values.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if isinstance(values, Categorical) and isinstance(result, np.ndarray): + # If the Categorical op didn't raise, it is dtype-preserving + result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) + # Note this will have result.dtype == dtype from above + + elif ( + not using_array_manager + and isinstance(result, np.ndarray) + and result.ndim == 1 + ): + # We went through a SeriesGroupByPath and need to reshape + # GH#32223 includes case with IntegerArray values + result = result.reshape(1, -1) + # test_groupby_duplicate_columns gets here with + # result.dtype == int64, values.dtype=object, how="min" + + return result + def py_fallback(bvalues: ArrayLike) -> ArrayLike: # if self.grouper.aggregate fails, we fall back to a pure-python # solution @@ -1169,11 +1163,11 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike: result = mgr.blocks[0].values return result - def blk_func(bvalues: ArrayLike) -> ArrayLike: + def array_func(values: ArrayLike) -> ArrayLike: try: result = self.grouper._cython_operation( - "aggregate", bvalues, how, axis=1, min_count=min_count + "aggregate", values, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1186,77 +1180,20 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: assert how == "ohlc" raise - result = py_fallback(bvalues) + result = py_fallback(values) - return _cast_agg_result(result, bvalues, how) + return cast_agg_result(result, values, how) # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype - new_mgr = data.grouped_reduce(blk_func, ignore_failures=True) + new_mgr = data.grouped_reduce(array_func, ignore_failures=True) if not len(new_mgr): raise DataError("No numeric types to aggregate") return new_mgr - def _cython_agg_arrays( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ) -> Tuple[List[ArrayLike], Index]: - - data: ArrayManager = self._get_data_to_aggregate() - - if numeric_only: - data = data.get_numeric_data(copy=False) - - def py_fallback(bvalues: ArrayLike) -> ArrayLike: - # TODO(ArrayManager) - raise NotImplementedError - - def array_func(values: ArrayLike) -> ArrayLike: - - try: - result = self.grouper._cython_operation( - "aggregate", values, how, axis=1, min_count=min_count - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - - if alt is None: - # we cannot perform the operation - # in an alternate way, exclude the block - assert how == "ohlc" - raise - - result = py_fallback(values) - - return _cast_agg_result(result, values, how, reshape1d=False) - - result_arrays = [array_func(arr) for arr in data.arrays] - - if not len(result_arrays): - raise DataError("No numeric types to aggregate") - - return result_arrays, data.axes[0] - - def _wrap_agged_arrays(self, arrays: List[ArrayLike], columns: Index) -> DataFrame: - if not self.as_index: - index = np.arange(arrays[0].shape[0]) - mgr = ArrayManager(arrays, axes=[index, columns]) - result = self.obj._constructor(mgr) - self._insert_inaxis_grouper_inplace(result) - else: - index = self.grouper.result_index - mgr = ArrayManager(arrays, axes=[index, columns]) - result = self.obj._constructor(mgr) - - if self.axis == 1: - result = result.T - - return self._reindex_output(result)._convert(datetime=True) - def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") @@ -1828,17 +1765,20 @@ def _wrap_transformed_output( return result - def _wrap_agged_manager(self, mgr: BlockManager) -> DataFrame: + def _wrap_agged_manager(self, mgr: Manager) -> DataFrame: + # TODO clean this up + axis = 1 if isinstance(mgr, BlockManager) else 0 + axes = mgr.axes if isinstance(mgr, BlockManager) else mgr._axes if not self.as_index: index = np.arange(mgr.shape[1]) - mgr.axes[1] = ibase.Index(index) + axes[axis] = ibase.Index(index) result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - mgr.axes[1] = index + axes[axis] = index result = self.obj._constructor(mgr) if self.axis == 1: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b2e01c55f7aad..bf6ded2f26bdf 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -215,6 +215,32 @@ def reduce( indexer = np.arange(self.shape[0]) return new_mgr, indexer + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function columnwise, returning a new ArrayManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop columns where func raises TypeError. + + Returns + ------- + ArrayManager + """ + result_arrays: List[ArrayLike] = [] + + # TODO ignore_failures + result_arrays = [func(arr) for arr in self.arrays] + + if len(result_arrays) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_arrays[0].shape[0])) + + return type(self)(result_arrays, [index, self.items]) + def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: """ Apply array_op blockwise with another (aligned) BlockManager. From 8c1b8a21aab236379a6d5b12306f51d3bdd3f360 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 20:12:09 +0100 Subject: [PATCH 4/9] clean-up test --- pandas/tests/groupby/aggregate/test_cython.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d8eee81274f92..fc92abdff6eab 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -276,7 +276,7 @@ def test_cython_with_timestamp_and_nat(op, data): "cummax", ], ) -def test_read_only_buffer_source_agg(agg, using_array_manager): +def test_read_only_buffer_source_agg(agg): # https://github.com/pandas-dev/pandas/issues/36014 df = DataFrame( { @@ -284,10 +284,7 @@ def test_read_only_buffer_source_agg(agg, using_array_manager): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - if using_array_manager: - df._mgr.arrays[0].flags.writeable = False - else: - df._mgr.blocks[0].values.flags.writeable = False + df._mgr.arrays[0].flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) From 06b6f3f389b3a538cf59f9fda62d63407f60b327 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 20:15:50 +0100 Subject: [PATCH 5/9] clean-up setting of index axis --- pandas/core/groupby/generic.py | 7 ++----- pandas/core/internals/array_manager.py | 20 +++++++++++--------- pandas/core/internals/managers.py | 19 +++++++++++-------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 909dbec88e63c..e26a421de63a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1766,19 +1766,16 @@ def _wrap_transformed_output( return result def _wrap_agged_manager(self, mgr: Manager) -> DataFrame: - # TODO clean this up - axis = 1 if isinstance(mgr, BlockManager) else 0 - axes = mgr.axes if isinstance(mgr, BlockManager) else mgr._axes if not self.as_index: index = np.arange(mgr.shape[1]) - axes[axis] = ibase.Index(index) + mgr.set_axis(1, ibase.Index(index), verify_integrity=False) result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - axes[axis] = index + mgr.set_axis(1, index, verify_integrity=False) result = self.obj._constructor(mgr) if self.axis == 1: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index bf6ded2f26bdf..57a1ce26c14cd 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -141,18 +141,20 @@ def _normalize_axis(axis): axis = 1 if axis == 0 else 0 return axis - # TODO can be shared - def set_axis(self, axis: int, new_labels: Index) -> None: + def set_axis( + self, axis: int, new_labels: Index, verify_integrity: bool = True + ) -> None: # Caller is responsible for ensuring we have an Index object. axis = self._normalize_axis(axis) - old_len = len(self._axes[axis]) - new_len = len(new_labels) + if verify_integrity: + old_len = len(self._axes[axis]) + new_len = len(new_labels) - if new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) self._axes[axis] = new_labels diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 81fccc335b7c8..89faf7c89e402 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -235,16 +235,19 @@ def shape(self) -> Shape: def ndim(self) -> int: return len(self.axes) - def set_axis(self, axis: int, new_labels: Index) -> None: + def set_axis( + self, axis: int, new_labels: Index, verify_integrity: bool = True + ) -> None: # Caller is responsible for ensuring we have an Index object. - old_len = len(self.axes[axis]) - new_len = len(new_labels) + if verify_integrity: + old_len = len(self.axes[axis]) + new_len = len(new_labels) - if new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) self.axes[axis] = new_labels From 244152ba239fa1f11306cbccf2efec03a9aeccc9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 21:39:22 +0100 Subject: [PATCH 6/9] fix BM.arrays for use in tests --- pandas/core/internals/managers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 89faf7c89e402..09b8ebe776b55 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -286,7 +286,7 @@ def get_dtypes(self): return algos.take_nd(dtypes, self.blknos, allow_fill=False) @property - def arrays(self): + def arrays(self) -> List[ArrayLike]: """ Quick access to the backing arrays of the Blocks. @@ -294,8 +294,7 @@ def arrays(self): Not to be used in actual code, and return value is not the same as the ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). """ - for blk in self.blocks: - yield blk.values + return [blk.values for blk in self.blocks] def __getstate__(self): block_values = [b.values for b in self.blocks] From 32bf7d136fa59c9a42867cdba2ae909f00ea77a3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 22:56:12 +0100 Subject: [PATCH 7/9] typing --- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/array_manager.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e26a421de63a4..45914519c0a94 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1085,7 +1085,7 @@ def _cython_agg_general( def _cython_agg_manager( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ) -> BlockManager: + ) -> Manager: data: Manager = self._get_data_to_aggregate() diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 57a1ce26c14cd..e1770c67f8107 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -231,8 +231,6 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: ------- ArrayManager """ - result_arrays: List[ArrayLike] = [] - # TODO ignore_failures result_arrays = [func(arr) for arr in self.arrays] From b44804ee0916d379443fffd6ce60cb958ac6262e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Feb 2021 08:24:48 +0100 Subject: [PATCH 8/9] use add_marker --- pandas/tests/groupby/aggregate/test_cython.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index fc92abdff6eab..558e01d08660c 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -41,10 +41,12 @@ "max", ], ) -def test_cythonized_aggers(op_name, using_array_manager): +def test_cythonized_aggers(op_name, using_array_manager, request): if using_array_manager and op_name in {"count", "sem"}: # TODO(ArrayManager) groupby count/sem - pytest.skip("ArrayManager groupby count/sem not yet implemented") + request.node.add_marker( + pytest.mark.xfail(reason="ArrayManager groupby count/sem not implemented") + ) data = { "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], "B": ["A", "B"] * 6, From 50fb97fc45de993d9ff1d4784276ff409e2300a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Feb 2021 08:26:56 +0100 Subject: [PATCH 9/9] remove xfail marker - count is actually implemented now --- pandas/tests/groupby/aggregate/test_cython.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 558e01d08660c..4a8aabe41b754 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -41,12 +41,7 @@ "max", ], ) -def test_cythonized_aggers(op_name, using_array_manager, request): - if using_array_manager and op_name in {"count", "sem"}: - # TODO(ArrayManager) groupby count/sem - request.node.add_marker( - pytest.mark.xfail(reason="ArrayManager groupby count/sem not implemented") - ) +def test_cythonized_aggers(op_name): data = { "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], "B": ["A", "B"] * 6,