diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4bc50695e1ecd..cf5973921fe8a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -403,6 +403,7 @@ Other Enhancements - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e145a479cd3cb..f88249d0fa6b2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -603,13 +603,13 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, return cls(codes, dtype=dtype, fastpath=True) @classmethod - def from_codes(cls, codes, categories, ordered=False): + def from_codes(cls, codes, categories=None, ordered=None, dtype=None): """ - Make a Categorical type from codes and categories arrays. + Make a Categorical type from codes and categories or dtype. - This constructor is useful if you already have codes and categories and - so do not need the (computation intensive) factorization step, which is - usually done on the constructor. + This constructor is useful if you already have codes and + categories/dtype and so do not need the (computation intensive) + factorization step, which is usually done on the constructor. 
If your data does not follow this convention, please use the normal constructor. @@ -618,16 +618,38 @@ def from_codes(cls, codes, categories, ordered=False): ---------- codes : array-like, integers An integer array, where each integer points to a category in - categories or -1 for NaN - categories : index-like + categories or dtype.categories, or else is -1 for NaN + categories : index-like, optional The categories for the categorical. Items need to be unique. - ordered : boolean, (default False) - Whether or not this categorical is treated as a ordered - categorical. If not given, the resulting categorical will be - unordered. - """ - dtype = CategoricalDtype._from_values_or_dtype(codes, categories, - ordered) + If the categories are not given here, then they must be provided + in `dtype`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or the string "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + + .. versionadded:: 0.24.0 + + When `dtype` is provided, neither `categories` nor `ordered` + should be provided. + + Examples + -------- + >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) + [a, b, a, b] + Categories (2, object): [a < b] + """ + dtype = CategoricalDtype._from_values_or_dtype(categories=categories, + ordered=ordered, + dtype=dtype) + if dtype.categories is None: + msg = ("The categories must be provided in 'categories' or " + "'dtype'. 
Both were None.") + raise ValueError(msg) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): @@ -642,12 +664,6 @@ def from_codes(cls, codes, categories, ordered=False): if msg: raise ValueError(msg) - try: - codes = coerce_indexer_dtype(codes, categories) - except (ValueError, TypeError): - raise ValueError( - "codes need to be convertible to an arrays of integers") - if len(codes) and ( codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -1265,8 +1281,7 @@ def shift(self, periods, fill_value=None): else: codes[periods:] = fill_value - return self.from_codes(codes, categories=self.categories, - ordered=self.ordered) + return self.from_codes(codes, dtype=self.dtype) def __array__(self, dtype=None): """ @@ -1887,9 +1902,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = type(self).from_codes(codes, - categories=dtype.categories, - ordered=dtype.ordered) + result = type(self).from_codes(codes, dtype=dtype) return result take = take_nd @@ -2078,9 +2091,7 @@ def __setitem__(self, key, value): new_codes = _recode_for_categories( value.codes, value.categories, self.categories ) - value = Categorical.from_codes(new_codes, - categories=self.categories, - ordered=self.ordered) + value = Categorical.from_codes(new_codes, dtype=self.dtype) rvalue = value if is_list_like(value) else [value] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f76085f9889dd..e43b64827d02a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None): dtype = self.dtype if name is None: name = self.name - cat = Categorical.from_codes(codes, categories=dtype.categories, - ordered=dtype.ordered) + cat = Categorical.from_codes(codes, dtype=dtype) return CategoricalIndex(cat, name=name) 
@classmethod diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index f8e9e393091e5..25c299692ceca 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -77,7 +77,9 @@ def test_constructor_unsortable(self): assert not factor.ordered # this however will raise as cannot be sorted - with pytest.raises(TypeError): + msg = ("'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument.") + with pytest.raises(TypeError, match=msg): Categorical(arr, ordered=True) def test_constructor_interval(self): @@ -99,10 +101,11 @@ def test_constructor(self): tm.assert_numpy_array_equal(c2.__array__(), exp_arr) # categories must be unique - with pytest.raises(ValueError): + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): Categorical([1, 2], [1, 2, 2]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical(["a", "b"], ["a", "b", "b"]) # The default should be unordered @@ -211,21 +214,23 @@ def test_constructor(self): def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 - with pytest.raises(TypeError): + msg = r"^Parameter 'categories' must be list-like, was" + with pytest.raises(TypeError, match=msg): Categorical(['a', 'b'], categories='a') def test_constructor_with_null(self): # Cannot have NaN in categories - with pytest.raises(ValueError): + msg = "Categorial categories cannot be null" + with pytest.raises(ValueError, match=msg): Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical(DatetimeIndex(['nat', '20160101']), 
categories=[NaT, Timestamp('20160101')]) @@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered): def test_constructor_dtype_and_others_raises(self): dtype = CategoricalDtype(['a', 'b'], ordered=True) - with pytest.raises(ValueError, match="Cannot"): + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype) - with pytest.raises(ValueError, match="Cannot"): + with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], ordered=True, dtype=dtype) - with pytest.raises(ValueError, match="Cannot"): + with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], ordered=False, dtype=dtype) @pytest.mark.parametrize('categories', [ @@ -417,33 +423,44 @@ def test_constructor_with_categorical_categories(self): def test_from_codes(self): # too few categories - with pytest.raises(ValueError): - Categorical.from_codes([1, 2], [1, 2]) + dtype = CategoricalDtype(categories=[1, 2]) + msg = "codes need to be between " + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], dtype=dtype) # no int codes - with pytest.raises(ValueError): - Categorical.from_codes(["a"], [1, 2]) + msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], dtype=dtype) # no unique categories - with pytest.raises(ValueError): - Categorical.from_codes([0, 1, 2], ["a", "a", "b"]) + with pytest.raises(ValueError, + match="Categorical categories must be unique"): + Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) # NaN categories included - with pytest.raises(ValueError): - Categorical.from_codes([0, 1, 2], ["a", "b", np.nan]) + with pytest.raises(ValueError, + 
match="Categorial categories cannot be null"): + Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) # too negative - with pytest.raises(ValueError): - Categorical.from_codes([-2, 1, 2], ["a", "b", "c"]) + dtype = CategoricalDtype(categories=["a", "b", "c"]) + msg = r"codes need to be between -1 and len\(categories\)-1" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], dtype=dtype) exp = Categorical(["a", "b", "c"], ordered=False) - res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"]) + res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) tm.assert_categorical_equal(exp, res) - # Not available in earlier numpy versions - if hasattr(np.random, "choice"): - codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) - Categorical.from_codes(codes, categories=["train", "test"]) + res = Categorical.from_codes([0, 1, 2], dtype=dtype) + tm.assert_categorical_equal(exp, res) def test_from_codes_with_categorical_categories(self): # GH17884 @@ -458,28 +475,56 @@ def test_from_codes_with_categorical_categories(self): tm.assert_categorical_equal(result, expected) # non-unique Categorical still raises - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Categorical categories must be unique"): Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] - categories = ['a', 'b', 'c'] - with pytest.raises(ValueError): - Categorical.from_codes(codes, categories) + dtype = CategoricalDtype(categories=['a', 'b', 'c']) + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, categories=dtype.categories) + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_float(self): # 
GH21767 codes = [1.0, 2.0, 0] # integer, but in float dtype - categories = ['a', 'b', 'c'] + dtype = CategoricalDtype(categories=['a', 'b', 'c']) + + with tm.assert_produces_warning(FutureWarning): + cat = Categorical.from_codes(codes, dtype.categories) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) with tm.assert_produces_warning(FutureWarning): - cat = Categorical.from_codes(codes, categories) + cat = Categorical.from_codes(codes, dtype=dtype) tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) codes = [1.1, 2.0, 0] # non-integer - with pytest.raises(ValueError): - Categorical.from_codes(codes, categories) + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype.categories) + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) + + def test_from_codes_with_dtype_raises(self): + msg = 'Cannot specify' + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1], categories=['a', 'b'], + dtype=CategoricalDtype(['a', 'b'])) + + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1], ordered=True, + dtype=CategoricalDtype(['a', 'b'])) + + def test_from_codes_neither(self): + msg = "Both were None" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1]) @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): @@ -515,14 +560,11 @@ def test_from_inferred_categories_coerces(self): expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) - def test_construction_with_ordered(self): + @pytest.mark.parametrize('ordered', [None, True, False]) + def test_construction_with_ordered(self, ordered): # GH 9347, 9190 - cat = Categorical([0, 1, 2]) - assert not cat.ordered - cat = Categorical([0, 1, 2], ordered=False) - assert not cat.ordered - cat = Categorical([0, 1, 2], 
ordered=True) - assert cat.ordered + cat = Categorical([0, 1, 2], ordered=ordered) + assert cat.ordered == bool(ordered) @pytest.mark.xfail(reason="Imaginary values not supported in Categorical") def test_constructor_imaginary(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 8518c1fa369c2..d85568ce67d16 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self): tm.assert_index_equal(result, expected, exact=True) # error when combining categories/ordered and dtype kwargs - msg = 'Cannot specify `categories` or `ordered` together with `dtype`.' + msg = "Cannot specify `categories` or `ordered` together with `dtype`." with pytest.raises(ValueError, match=msg): CategoricalIndex(data, categories=cats, dtype=dtype)