diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 54640ff576338..cc270366ac940 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -313,6 +313,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) - Changed the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`) - Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`) +- A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`) - Removed :meth:`Series.from_array` (:issue:`18258`) - Removed :meth:`DataFrame.from_items` (:issue:`18458`) - Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 21c085c775399..232315660da8d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,8 +14,10 @@ class providing the base-class of operations. 
import re import types from typing import ( + Callable, Dict, FrozenSet, + Hashable, Iterable, List, Mapping, @@ -343,6 +345,15 @@ def _group_selection_context(groupby): groupby._reset_group_selection() +_KeysArgType = Union[ + Hashable, + List[Hashable], + Callable[[Hashable], Hashable], + List[Callable[[Hashable], Hashable]], + Mapping[Hashable, Hashable], +] + + class _GroupBy(PandasObject, SelectionMixin): _group_selection = None _apply_whitelist = frozenset() # type: FrozenSet[str] @@ -350,7 +361,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__( self, obj: NDFrame, - keys=None, + keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, grouper: "Optional[ops.BaseGrouper]" = None, @@ -2504,7 +2515,7 @@ def _reindex_output( @Appender(GroupBy.__doc__) def get_groupby( obj: NDFrame, - by=None, + by: Optional[_KeysArgType] = None, axis: int = 0, level=None, grouper: "Optional[ops.BaseGrouper]" = None, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2b946d1ff0a7a..74195b0746091 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -4,7 +4,6 @@ """ from typing import Hashable, List, Optional, Tuple -import warnings import numpy as np @@ -14,7 +13,6 @@ ensure_categorical, is_categorical_dtype, is_datetime64_dtype, - is_hashable, is_list_like, is_scalar, is_timedelta64_dtype, @@ -514,28 +512,6 @@ def get_grouper( elif isinstance(key, ops.BaseGrouper): return key, [], obj - # In the future, a tuple key will always mean an actual key, - # not an iterable of keys. In the meantime, we attempt to provide - # a warning. We can assume that the user wanted a list of keys when - # the key is not in the index. We just have to be careful with - # unhashable elements of `key`. Any unhashable elements implies that - # they wanted a list of keys. 
- # https://github.com/pandas-dev/pandas/issues/18314 - if isinstance(key, tuple): - all_hashable = is_hashable(key) - if ( - all_hashable and key not in obj and set(key).issubset(obj) - ) or not all_hashable: - # column names ('a', 'b') -> ['a', 'b'] - # arrays like (a, b) -> [a, b] - msg = ( - "Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by=[...]' instead of 'by=(...)'. In " - "the future, a tuple will always mean a single key." - ) - warnings.warn(msg, FutureWarning, stacklevel=5) - key = list(key) - if not isinstance(key, list): keys = [key] match_axis_length = False diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b848e9caad9be..5f454f7aefae4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1734,34 +1734,23 @@ def test_empty_dataframe_groupby(): tm.assert_frame_equal(result, expected) -def test_tuple_warns(): +def test_tuple_as_grouping(): # https://github.com/pandas-dev/pandas/issues/18314 df = pd.DataFrame( { - ("a", "b"): [1, 1, 2, 2], - "a": [1, 1, 1, 2], - "b": [1, 2, 2, 2], + ("a", "b"): [1, 1, 1, 1], + "a": [2, 2, 2, 2], + "b": [2, 2, 2, 2], "c": [1, 1, 1, 1], } ) - with tm.assert_produces_warning(FutureWarning) as w: - df[["a", "b", "c"]].groupby(("a", "b")).c.mean() - assert "Interpreting tuple 'by' as a list" in str(w[0].message) + with pytest.raises(KeyError): + df[["a", "b", "c"]].groupby(("a", "b")) - with tm.assert_produces_warning(None): - df.groupby(("a", "b")).c.mean() - - -def test_tuple_warns_unhashable(): - # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start="4/1/2014", end="6/30/2014", freq="B") - df = DataFrame(1, index=business_dates, columns=["a", "b"]) - - with tm.assert_produces_warning(FutureWarning) as w: - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) - - assert "Interpreting tuple 'by' as a list" in str(w[0].message) + result = df.groupby(("a", 
"b"))["c"].sum() + expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b"))) + tm.assert_series_equal(result, expected) def test_tuple_correct_keyerror():