From 311368aa7b7d99152f895dbadbc47f8c6de53208 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Feb 2021 09:26:25 -0800 Subject: [PATCH] REF: extract_array earlier in block construction --- pandas/_testing/asserters.py | 5 ++-- pandas/core/internals/blocks.py | 8 +++--- pandas/core/internals/managers.py | 36 ++++++++++++++---------- pandas/tests/extension/test_numpy.py | 14 +++++++++ pandas/tests/internals/test_internals.py | 2 +- 5 files changed, 43 insertions(+), 22 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 829472f24852a..c883d246a2daa 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -18,6 +18,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import PandasDtype from pandas.core.dtypes.missing import array_equivalent import pandas as pd @@ -630,12 +631,12 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None) if isinstance(left, np.ndarray): left = pprint_thing(left) - elif is_categorical_dtype(left): + elif is_categorical_dtype(left) or isinstance(left, PandasDtype): left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) - elif is_categorical_dtype(right): + elif is_categorical_dtype(right) or isinstance(right, PandasDtype): right = repr(right) msg += f""" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8d5cadce823c7..689a067e1c211 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -178,7 +178,7 @@ def _maybe_coerce_values(cls, values): Parameters ---------- - values : np.ndarray, ExtensionArray, Index + values : np.ndarray or ExtensionArray Returns ------- @@ -350,7 +350,7 @@ def __getstate__(self): @final def __setstate__(self, state): self.mgr_locs = libinternals.BlockPlacement(state[0]) - self.values = state[1] + self.values = extract_array(state[1], extract_numpy=True) self.ndim = self.values.ndim def _slice(self, slicer): @@ -1623,7 +1623,7 @@ def _maybe_coerce_values(cls, values): Parameters ---------- - values : Index, Series, ExtensionArray + values : np.ndarray or ExtensionArray Returns ------- @@ -2105,7 +2105,7 @@ def _maybe_coerce_values(cls, values): Parameters ---------- - values : array-like + values : np.ndarray or ExtensionArray Must be convertible to datetime64/timedelta64 Returns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c43261716076c..09275d288b718 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -45,7 +45,6 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -316,6 +315,8 @@ def __getstate__(self): def __setstate__(self, state): def unpickle_block(values, mgr_locs, ndim: int): # TODO(EA2D): ndim would be unnecessary with 2D EAs + # older pickles may store e.g. DatetimeIndex instead of DatetimeArray + values = extract_array(values, extract_numpy=True) return make_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: @@ -1177,6 +1178,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False # TODO(EA2D): special case not needed with 2D EAs value = ensure_block_shape(value, ndim=2) + # TODO: type value as ArrayLike block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): @@ -1638,6 +1640,11 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: raise construction_error(tot_items, blocks[0].shape[1:], axes, e) +# We define this here so we can override it in tests.extension.test_numpy +def _extract_array(obj): + return extract_array(obj, extract_numpy=True) + + def create_block_manager_from_arrays( arrays, names: Index, axes: List[Index] ) -> BlockManager: @@ -1645,9 +1652,8 @@ def create_block_manager_from_arrays( assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) - # ensure we dont have any PandasArrays when we call get_block_type - # Note: just calling extract_array breaks tests that patch PandasArray._typ. - arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] + arrays = [_extract_array(x) for x in arrays] + try: blocks = _form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) @@ -1657,7 +1663,12 @@ def create_block_manager_from_arrays( raise construction_error(len(arrays), arrays[0].shape, axes, e) -def construction_error(tot_items, block_shape, axes, e=None): +def construction_error( + tot_items: int, + block_shape: Shape, + axes: List[Index], + e: Optional[ValueError] = None, +): """ raise a helpful message about our construction """ passed = tuple(map(int, [tot_items] + list(block_shape))) # Correcting the user facing error message during dataframe construction @@ -1681,7 +1692,9 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def _form_blocks(arrays, names: Index, axes: List[Index]) -> List[Block]: +def _form_blocks( + arrays: List[ArrayLike], names: Index, axes: List[Index] +) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1801,13 +1814,6 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None): def _stack_arrays(tuples, dtype: np.dtype): - # fml - def _asarray_compat(x): - if isinstance(x, ABCSeries): - return x._values - else: - return np.asarray(x) - placement, arrays = zip(*tuples) first = arrays[0] @@ -1815,7 +1821,7 @@ def _asarray_compat(x): stacked = np.empty(shape, dtype=dtype) for i, arr in enumerate(arrays): - stacked[i] = _asarray_compat(arr) + stacked[i] = arr return stacked, placement @@ -1839,7 +1845,7 @@ def _interleaved_dtype(blocks: Sequence[Block]) -> Optional[DtypeObj]: return find_common_type([b.dtype for b in blocks]) -def _consolidate(blocks): +def _consolidate(blocks: Tuple[Block, ...]) -> List[Block]: """ Merge blocks having same dtype, exclude non-consolidating blocks """ diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 17f29e02a2883..e1712483c4e44 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -20,13 +20,26 @@ ExtensionDtype, PandasDtype, ) +from pandas.core.dtypes.generic import ABCPandasArray import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray +from pandas.core.internals import managers from pandas.tests.extension import base +def _extract_array_patched(obj): + if isinstance(obj, (pd.Index, pd.Series)): + obj = obj._values + if isinstance(obj, ABCPandasArray): + # TODO for reasons unclear, we get here in a couple of tests + # with PandasArray._typ *not* patched + obj = obj.to_numpy() + + return obj + + @pytest.fixture(params=["float", "object"]) def dtype(request): return PandasDtype(np.dtype(request.param)) @@ -51,6 +64,7 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") + m.setattr(managers, "_extract_array", _extract_array_patched) yield diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a24c711df7b55..54130bb075666 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -123,7 +123,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): assert m is not None, f"incompatible typestr -> {typestr}" tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" - values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) + values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)._data elif typestr in ("timedelta", "td", "m8[ns]"): values = (mat * 1).astype("m8[ns]") elif typestr in ("category",):