Skip to content

REF: extract_array earlier in block construction #40026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
5 changes: 3 additions & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
is_numeric_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import PandasDtype
from pandas.core.dtypes.missing import array_equivalent

import pandas as pd
Expand Down Expand Up @@ -630,12 +631,12 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)

if isinstance(left, np.ndarray):
left = pprint_thing(left)
elif is_categorical_dtype(left):
elif is_categorical_dtype(left) or isinstance(left, PandasDtype):
left = repr(left)

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif is_categorical_dtype(right):
elif is_categorical_dtype(right) or isinstance(right, PandasDtype):
right = repr(right)

msg += f"""
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def _maybe_coerce_values(cls, values):

Parameters
----------
values : np.ndarray, ExtensionArray, Index
values : np.ndarray or ExtensionArray

Returns
-------
Expand Down Expand Up @@ -350,7 +350,7 @@ def __getstate__(self):
@final
def __setstate__(self, state):
self.mgr_locs = libinternals.BlockPlacement(state[0])
self.values = state[1]
self.values = extract_array(state[1], extract_numpy=True)
self.ndim = self.values.ndim

def _slice(self, slicer):
Expand Down Expand Up @@ -1623,7 +1623,7 @@ def _maybe_coerce_values(cls, values):

Parameters
----------
values : Index, Series, ExtensionArray
values : np.ndarray or ExtensionArray

Returns
-------
Expand Down Expand Up @@ -2105,7 +2105,7 @@ def _maybe_coerce_values(cls, values):

Parameters
----------
values : array-like
values : np.ndarray or ExtensionArray
Must be convertible to datetime64/timedelta64

Returns
Expand Down
36 changes: 21 additions & 15 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCPandasArray,
ABCSeries,
)
from pandas.core.dtypes.missing import (
Expand Down Expand Up @@ -316,6 +315,8 @@ def __getstate__(self):
def __setstate__(self, state):
def unpickle_block(values, mgr_locs, ndim: int):
# TODO(EA2D): ndim would be unnecessary with 2D EAs
# older pickles may store e.g. DatetimeIndex instead of DatetimeArray
values = extract_array(values, extract_numpy=True)
return make_block(values, placement=mgr_locs, ndim=ndim)

if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
Expand Down Expand Up @@ -1177,6 +1178,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False
# TODO(EA2D): special case not needed with 2D EAs
value = ensure_block_shape(value, ndim=2)

# TODO: type value as ArrayLike
block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))

for blkno, count in _fast_count_smallints(self.blknos[loc:]):
Expand Down Expand Up @@ -1638,16 +1640,20 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
raise construction_error(tot_items, blocks[0].shape[1:], axes, e)


# We define this here so we can override it in tests.extension.test_numpy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pretty janky but i get it

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #40021

def _extract_array(obj):
return extract_array(obj, extract_numpy=True)


def create_block_manager_from_arrays(
arrays, names: Index, axes: List[Index]
) -> BlockManager:
assert isinstance(names, Index)
assert isinstance(axes, list)
assert all(isinstance(x, Index) for x in axes)

# ensure we dont have any PandasArrays when we call get_block_type
# Note: just calling extract_array breaks tests that patch PandasArray._typ.
arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays]
arrays = [_extract_array(x) for x in arrays]

try:
blocks = _form_blocks(arrays, names, axes)
mgr = BlockManager(blocks, axes)
Expand All @@ -1657,7 +1663,12 @@ def create_block_manager_from_arrays(
raise construction_error(len(arrays), arrays[0].shape, axes, e)


def construction_error(tot_items, block_shape, axes, e=None):
def construction_error(
tot_items: int,
block_shape: Shape,
axes: List[Index],
e: Optional[ValueError] = None,
):
""" raise a helpful message about our construction """
passed = tuple(map(int, [tot_items] + list(block_shape)))
# Correcting the user facing error message during dataframe construction
Expand All @@ -1681,7 +1692,9 @@ def construction_error(tot_items, block_shape, axes, e=None):
# -----------------------------------------------------------------------


def _form_blocks(arrays, names: Index, axes: List[Index]) -> List[Block]:
def _form_blocks(
arrays: List[ArrayLike], names: Index, axes: List[Index]
) -> List[Block]:
# put "leftover" items in float bucket, where else?
# generalize?
items_dict: DefaultDict[str, List] = defaultdict(list)
Expand Down Expand Up @@ -1801,21 +1814,14 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None):

def _stack_arrays(tuples, dtype: np.dtype):

# fml
def _asarray_compat(x):
if isinstance(x, ABCSeries):
return x._values
else:
return np.asarray(x)

placement, arrays = zip(*tuples)

first = arrays[0]
shape = (len(arrays),) + first.shape

stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
stacked[i] = _asarray_compat(arr)
stacked[i] = arr

return stacked, placement

Expand All @@ -1839,7 +1845,7 @@ def _interleaved_dtype(blocks: Sequence[Block]) -> Optional[DtypeObj]:
return find_common_type([b.dtype for b in blocks])


def _consolidate(blocks):
def _consolidate(blocks: Tuple[Block, ...]) -> List[Block]:
"""
Merge blocks having same dtype, exclude non-consolidating blocks
"""
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,26 @@
ExtensionDtype,
PandasDtype,
)
from pandas.core.dtypes.generic import ABCPandasArray

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.internals import managers
from pandas.tests.extension import base


def _extract_array_patched(obj):
if isinstance(obj, (pd.Index, pd.Series)):
obj = obj._values
if isinstance(obj, ABCPandasArray):
# TODO for reasons unclear, we get here in a couple of tests
# with PandasArray._typ *not* patched
obj = obj.to_numpy()

return obj


@pytest.fixture(params=["float", "object"])
def dtype(request):
return PandasDtype(np.dtype(request.param))
Expand All @@ -51,6 +64,7 @@ def allow_in_pandas(monkeypatch):
"""
with monkeypatch.context() as m:
m.setattr(PandasArray, "_typ", "extension")
m.setattr(managers, "_extract_array", _extract_array_patched)
yield


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0):
assert m is not None, f"incompatible typestr -> {typestr}"
tz = m.groups()[0]
assert num_items == 1, "must have only 1 num items for a tz-aware"
values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)._data
elif typestr in ("timedelta", "td", "m8[ns]"):
values = (mat * 1).astype("m8[ns]")
elif typestr in ("category",):
Expand Down