Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[backport 2.3.x] API: ignore empty range/object dtype in Index setop operations (string dtype compat) (#60797) #60948

Open
wants to merge 4 commits into
base: 2.3.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ Increased minimum version for Python

pandas 2.3.0 supports Python 3.10 and higher.

.. _whatsnew_230.api_changes:

API changes
~~~~~~~~~~~

- When the ``future.infer_string`` option is enabled, ``Index`` set operations (like
union or intersection) now ignore the dtype of an empty ``RangeIndex`` or
empty ``Index`` with object dtype when determining the dtype of the resulting
``Index`` (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

Expand Down
26 changes: 26 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6387,6 +6387,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

if using_string_dtype():
# special case: if left or right is a zero-length RangeIndex or
# Index[object], those can be created by the default empty constructors
# -> for that case ignore this dtype and always return the other
# (https://github.com/pandas-dev/pandas/pull/60797)
from pandas.core.indexes.range import RangeIndex

if len(self) == 0 and (
isinstance(self, RangeIndex) or self.dtype == np.object_
):
return target_dtype
if (
isinstance(target, Index)
and len(target) == 0
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
):
return self.dtype

# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
Expand Down Expand Up @@ -7005,6 +7023,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
# special case: if we are an empty object-dtype Index, also
# take into account the inserted item for the resulting dtype
# (https://github.com/pandas-dev/pandas/pull/60797)
dtype = self._find_common_type_compat(item)
if dtype != self.dtype:
return self.astype(dtype).insert(loc, item)

try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -44,7 +42,6 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,7 @@ def test_26395(indexer_al):
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame(
{"D": [0, 0, 2]},
index=["A", "B", "C"],
columns=pd.Index(["D"], dtype=object),
dtype=np.int64,
)
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

with tm.assert_produces_warning(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,7 +1206,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=Index(list("ABCDEFGH"), dtype=object),
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ def test_insert_with_columns_dups(self):
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
columns=Index(["A", "A", "A"], dtype=object),
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

Expand Down
32 changes: 22 additions & 10 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,18 +146,32 @@ def test_setitem_different_dtype(self):
)
tm.assert_series_equal(result, expected)

def test_setitem_empty_columns(self):
# GH 13522
def test_setitem_overwrite_index(self):
# GH 13522 - assign the index as a column and then overwrite the values
# -> should not affect the index
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(
data={"X": ["x", "y", "z"]},
index=["A", "B", "C"],
columns=Index(["X"], dtype=object),
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
)
tm.assert_frame_equal(df, exp)

def test_setitem_empty_columns(self):
# Starting from an empty DataFrame and setting a column should result
# in a default string dtype for the columns' Index
# https://github.com/pandas-dev/pandas/issues/60338
#
# Both cases compare against a frame built directly from a dict, so the
# columns produced by the setitem must match whatever the constructor
# gives for the same single "foo" key.

# Case 1: frame created by the no-argument constructor
df = DataFrame()
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

# Case 2: frame whose columns are an explicitly empty Index([])
df = DataFrame(columns=Index([]))
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

def test_setitem_dt64_index_empty_columns(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
Expand All @@ -171,9 +185,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3,
index=range(3),
columns=Index(["now"], dtype=object),
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -212,7 +224,7 @@ def test_setitem_period_preserves_dtype(self):
result = DataFrame([])
result["a"] = data

expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
expected = DataFrame({"a": data}, columns=["a"])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -939,7 +951,7 @@ def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)

def test_setitem_newcol_tuple_key(self, float_frame):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/frame/methods/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
with pytest.raises(TypeError, match="supplying multiple axes"):
inp.dropna(how="all", axis=(0, 1), inplace=True)

def test_dropna_tz_aware_datetime(self, using_infer_string):
def test_dropna_tz_aware_datetime(self):
# GH13407

df = DataFrame()
if using_infer_string:
df.columns = df.columns.astype("str")
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
df["Time"] = [dt1]
Expand Down
34 changes: 31 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -646,7 +644,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
tm.assert_frame_equal(res, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
@pytest.mark.parametrize(
"array, dtype",
[
Expand Down Expand Up @@ -783,3 +780,34 @@ def test_reset_index_false_index_name():
result_frame.reset_index()
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)


@pytest.mark.parametrize("columns", [None, Index([])])
def test_reset_index_with_empty_frame(columns):
# Currently empty DataFrame has RangeIndex or object dtype Index, but when
# resetting the index we still want to end up with the default string dtype
# https://github.com/pandas-dev/pandas/issues/60338
#
# ``columns`` is either None or an explicitly empty ``Index([])``; each case
# below expects the same result for both parametrizations.

# Case 1: empty flat index named "foo" -> frame with only a "foo" column label
index = Index([], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo"])
tm.assert_frame_equal(result, expected)

# Case 2: non-empty flat index named "foo" -> its values become column "foo"
index = Index([1, 2, 3], name="foo")
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(result, expected)

# Case 3: empty MultiIndex with names "foo"/"bar" -> two column labels, no rows
index = MultiIndex.from_tuples([], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame(columns=["foo", "bar"])
tm.assert_frame_equal(result, expected)

# Case 4: non-empty MultiIndex -> each level becomes a column named after it
index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
df = DataFrame(index=index, columns=columns)
result = df.reset_index()
expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
tm.assert_frame_equal(result, expected)
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
import pytest
import pytz

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
Expand Down Expand Up @@ -2002,7 +2000,6 @@ def test_constructor_with_datetimes4(self):
df = DataFrame({"value": dr})
assert str(df.iat[0, 0].tz) == "US/Eastern"

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_constructor_with_datetimes5(self):
# GH 7822
# preserve an index with a tz on dict construction
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)

expected = DataFrame(df_index)
expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,7 +1623,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_insert(self):

# test empty
null_index = Index([])
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))

def test_insert_missing(self, request, nulls_fixture, using_infer_string):
if using_infer_string and nulls_fixture is pd.NA:
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -251,7 +250,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name, dtype=expected_dtype)
expected = Index(vals, name=expected_name)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/datetimes/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz is timezone.utc

def test_datetimeindex_union_join_empty(self, sort):
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
empty = Index([])

result = dti.union(empty, sort=sort)
expected = dti.astype("O")
tm.assert_index_equal(result, expected)
if using_infer_string:
assert isinstance(result, DatetimeIndex)
tm.assert_index_equal(result, dti)
else:
expected = dti.astype("O")
tm.assert_index_equal(result, expected)

result = dti.join(empty)
assert isinstance(result, DatetimeIndex)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,10 +442,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
else:
msg = "slice indices must be integers or None or have an __index__ method"

if using_infer_string and (
index.dtype == "string" or index.dtype == "category" # noqa: PLR1714
):
msg = "loc must be an integer between"
if using_infer_string:
if index.dtype == "string" or index.dtype == "category": # noqa: PLR1714
msg = "loc must be an integer between"
elif index.dtype == "object" and len(index) == 0:
msg = "loc must be an integer between"
err = TypeError

with pytest.raises(err, match=msg):
index.insert(0.5, "foo")
Expand Down
14 changes: 13 additions & 1 deletion pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ def test_intersection_difference_match_empty(self, index, sort):
@pytest.mark.parametrize(
"method", ["intersection", "union", "difference", "symmetric_difference"]
)
def test_setop_with_categorical(index_flat, sort, method):
def test_setop_with_categorical(index_flat, sort, method, using_infer_string):
# MultiIndex tested separately in tests.indexes.multi.test_setops
index = index_flat

Expand All @@ -533,10 +533,22 @@ def test_setop_with_categorical(index_flat, sort, method):

result = getattr(index, method)(other, sort=sort)
expected = getattr(index, method)(index, sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)

result = getattr(index, method)(other[:5], sort=sort)
expected = getattr(index, method)(index[:5], sort=sort)
if (
using_infer_string
and index.empty
and method in ("union", "symmetric_difference")
):
expected = expected.astype("category")
tm.assert_index_equal(result, expected, exact=exact)


Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/indexing/test_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
Expand Down Expand Up @@ -71,11 +70,7 @@ def test_at_setitem_item_cache_cleared(self):
df.at[0, "x"] = 4
df.at[0, "cost"] = 789

expected = DataFrame(
{"x": [4], "cost": 789},
index=[0],
columns=Index(["x", "cost"], dtype=object),
)
expected = DataFrame({"x": [4], "cost": 789}, index=[0])
tm.assert_frame_equal(df, expected)

# And in particular, check that the _item_cache has updated correctly.
Expand Down
Loading
Loading