diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..e6dfe004dffcc 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -222,6 +222,7 @@ Deprecations - Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated 'quantile' keyword in :meth:`Rolling.quantile` and :meth:`Expanding.quantile`, renamed as 'q' instead (:issue:`52550`) +- Deprecated :func:`concat` behavior when any of the objects being concatenated have length 0; in the past the dtypes of empty objects were ignored when determining the resulting dtype, in a future version they will not (:issue:`39122`) - Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`) - Deprecated :meth:`.Groupby.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) - Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index cba7c44a219bf..35ebd9a4f4f52 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -8,10 +8,12 @@ Sequence, cast, ) +import warnings import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( @@ -107,6 +109,17 @@ def concat_compat( if len(to_concat) < len(orig): _, _, alt_dtype = _get_result_dtype(orig, non_empties) + if alt_dtype != target_dtype: + # GH#39122 + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty items when determining the result dtype. " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) if target_dtype is not None: to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat] diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 1d22ed3fe8897..cb19a70c7a7ed 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -387,7 +387,9 @@ def is_na(self) -> bool: values = blk.values if values.size == 0: + # GH#39122 this case will return False once deprecation is enforced return True + if isinstance(values.dtype, SparseDtype): return False @@ -406,16 +408,14 @@ def is_na(self) -> bool: return all(isna_all(row) for row in values) @cache_readonly - def is_na_without_isna_all(self) -> bool: + def is_na_after_size_and_isna_all_deprecation(self) -> bool: + """ + Will self.is_na be True after values.size == 0 deprecation and isna_all + deprecation are enforced? + """ blk = self.block if blk.dtype.kind == "V": return True - if not blk._can_hold_na: - return False - - values = blk.values - if values.size == 0: - return True return False def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: @@ -477,17 +477,16 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike if empty_dtype != empty_dtype_future: if empty_dtype == concat_values.dtype: - # GH#40893 + # GH#39122, GH#40893 warnings.warn( - "The behavior of DataFrame concatenation with all-NA entries is " - "deprecated. In a future version, this will no longer exclude " - "all-NA columns when determining the result dtypes. " - "To retain the old behavior, cast the all-NA columns to the " - "desired dtype before the concat operation.", + "The behavior of DataFrame concatenation with empty or all-NA " + "entries is deprecated. In a future version, this will no longer " + "exclude empty or all-NA columns when determining the result dtypes. " + "To retain the old behavior, exclude the relevant entries before " + "the concat operation.", FutureWarning, stacklevel=find_stack_level(), ) - return concat_values @@ -543,7 +542,9 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj dtype_future = dtype if len(dtypes) != len(join_units): dtypes_future = [ - unit.block.dtype for unit in join_units if not unit.is_na_without_isna_all + unit.block.dtype + for unit in join_units + if not unit.is_na_after_size_and_isna_all_deprecation ] if not len(dtypes_future): dtypes_future = [ diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 772dfdfe8fb03..97718386dabb7 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -12,8 +12,11 @@ def test_concat_mismatched_categoricals_with_empty(): ser1 = Series(["a", "b", "c"], dtype="category") ser2 = Series([], dtype="category") - result = _concat.concat_compat([ser1._values, ser2._values]) - expected = pd.concat([ser1, ser2])._values + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = _concat.concat_compat([ser1._values, ser2._values]) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = pd.concat([ser1, ser2])._values tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0c6661b49d917..7ead31ec3244c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -374,9 +374,13 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) + depr_msg = "The behavior of array concatenation with empty entries is deprecated" + # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -390,7 +394,8 @@ def f3(x): with pytest.raises(AssertionError, match=msg): df.groupby("a").apply(f3) with pytest.raises(AssertionError, match=msg): - df2.groupby("a").apply(f3) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + df2.groupby("a").apply(f3) def test_attr_wrapper(ts): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index db317a819c520..e3fc27671d869 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -616,7 +616,9 @@ def test_append_empty_preserve_name(self, name, expected): left = Index([], name="foo") right = Index([1, 2, 3], name=name) - result = left.append(right) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = left.append(right) assert result.name == expected @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index fff10c7ea4bb3..e6faeedd09525 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -162,7 +162,9 @@ def test_append_preserve_index_name(self): df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) df2 = df2.set_index(["A"]) - result = df1._append(df2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df1._append(df2) assert result.index.name == "A" indexes_can_append = [ diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 2d84de8145111..e5d42d9cb1bfb 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -693,11 +693,14 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) s1 = Series([], dtype="category") s2 = Series([], dtype="category") @@ -719,11 +722,13 @@ def test_concat_categorical_empty(self): # empty Series is ignored exp = Series([np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) def test_categorical_concat_append(self): cat = Categorical(["a", "b"], categories=["a", "b"]) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index dc14e6e74302e..4d8f5da8848c0 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -756,8 +756,14 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype) empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype) - result = concat([empty, df]) - + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" + warn = None + if df_dtype == "datetime64[ns]" or ( + df_dtype == "float64" and empty_dtype != "float64" + ): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = concat([empty, df]) expected = df if df_dtype == "int64": # TODO what exact behaviour do we want for integer eventually? @@ -782,7 +788,7 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): else: df_dtype = "float64" - msg = "The behavior of DataFrame concatenation with all-NA entries" + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" warn = None if empty_dtype != df_dtype and empty_dtype is not None: warn = FutureWarning @@ -804,7 +810,7 @@ def test_concat_ignore_empty_from_reindex(): aligned = df2.reindex(columns=df1.columns) - msg = "The behavior of DataFrame concatenation with all-NA entries" + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" with tm.assert_produces_warning(FutureWarning, match=msg): result = concat([df1, aligned], ignore_index=True) expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]}) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 9ec0071ba9afa..a06fc5eede55c 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -578,7 +578,9 @@ def test_concat_float_datetime64(using_array_manager): if not using_array_manager: expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) - result = concat([df_time, df_float.iloc[:0]]) + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([df_time, df_float.iloc[:0]]) tm.assert_frame_equal(result, expected) else: expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype( diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 919bcb8b2e577..6ef54b907cf34 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -58,7 +58,9 @@ def test_concat_empty_series(self): s1 = Series([1, 2, 3], name="x") s2 = Series(name="y", dtype="float64") - res = concat([s1, s2], axis=0) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = concat([s1, s2], axis=0) # name will be reset exp = Series([1, 2, 3]) tm.assert_series_equal(res, exp) @@ -238,9 +240,11 @@ def test_concat_inner_join_empty(self): df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64") - for how, expected in [("inner", df_expected), ("outer", df_a)]: - result = concat([df_a, df_empty], axis=1, join=how) - tm.assert_frame_equal(result, expected) + result = concat([df_a, df_empty], axis=1, join="inner") + tm.assert_frame_equal(result, df_expected) + + result = concat([df_a, df_empty], axis=1, join="outer") + tm.assert_frame_equal(result, df_a) def test_empty_dtype_coerce(self): # xref to #12411 diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index c5d3a8a7c74d1..2711b6a34c62c 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -40,7 +40,9 @@ def test_concat_empty_and_non_empty_series_regression(self): s2 = Series([], dtype=object) expected = s1 - result = concat([s1, s2]) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([s1, s2]) tm.assert_series_equal(result, expected) def test_concat_series_axis1(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 896f1a9be52be..bb8fc28109e1d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -679,8 +679,13 @@ def test_join_append_timedeltas(self, using_array_manager): {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]} ) df = DataFrame(columns=list("dt")) - df = concat([df, d], ignore_index=True) - result = concat([df, d], ignore_index=True) + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" + warn = FutureWarning + if using_array_manager: + warn = None + with tm.assert_produces_warning(warn, match=msg): + df = concat([df, d], ignore_index=True) + result = concat([df, d], ignore_index=True) expected = DataFrame( { "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 46af5f509d6ab..fb6f7e386d5d5 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -63,7 +63,9 @@ def test_combine_first(self): # corner case ser = Series([1.0, 2, 3], index=[0, 1, 2]) empty = Series([], index=[], dtype=object) - result = ser.combine_first(empty) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.combine_first(empty) ser.index = ser.index.astype("O") tm.assert_series_equal(ser, result) @@ -110,7 +112,9 @@ def test_combine_first_timezone_series_with_empty_series(self): ) s1 = Series(range(10), index=time_index) s2 = Series(index=time_index) - result = s1.combine_first(s2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s1.combine_first(s2) tm.assert_series_equal(result, s1) def test_combine_first_preserves_dtype(self):