Skip to content

Commit

Permalink
Backport PR #60940: ENH: Add dtype argument to str.decode (#60968)
Browse files Browse the repository at this point in the history
* ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829)

* Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep is too large for the PyTables column. Add and modify applicable test code.

* Fix missed tests and correct mistake in error message.

* Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function.

(cherry picked from commit 57340ec)

* TST(string dtype): Resolve xfails in pytables (#60795)

(cherry picked from commit 4511251)

* BUG(string dtype): Resolve pytables xfail when reading with condition (#60943)

(cherry picked from commit 0ec5f26)

* Backport PR #60940: ENH: Add dtype argument to str.decode

---------

Co-authored-by: Jake Thomas Trevallion <[email protected]>
  • Loading branch information
rhshadrach and JakeTT404 authored Feb 20, 2025
1 parent b8624cb commit 81229e6
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Other enhancements
updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`)
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
- The :meth:`Series.str.decode` method has gained the ``dtype`` argument to control the dtype of the result (:issue:`60940`)
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)

Expand Down
18 changes: 16 additions & 2 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
is_list_like,
is_object_dtype,
is_re,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
Expand Down Expand Up @@ -1981,7 +1982,9 @@ def slice_replace(self, start=None, stop=None, repl=None):
result = self._data.array._str_slice_replace(start, stop, repl)
return self._wrap_result(result)

def decode(self, encoding, errors: str = "strict"):
def decode(
self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None
):
"""
Decode character string in the Series/Index using indicated encoding.
Expand All @@ -1992,6 +1995,14 @@ def decode(self, encoding, errors: str = "strict"):
----------
encoding : str
errors : str, optional
Specifies the error handling scheme.
Possible values are those supported by :meth:`bytes.decode`.
dtype : str or dtype, optional
The dtype of the result. When not ``None``, must be either a string or
object dtype. When ``None``, the dtype of the result is determined by
``pd.options.future.infer_string``.
.. versionadded:: 2.3.0
Returns
-------
Expand All @@ -2008,6 +2019,10 @@ def decode(self, encoding, errors: str = "strict"):
2 ()
dtype: object
"""
if dtype is not None and not is_string_dtype(dtype):
raise ValueError(f"dtype must be string or object, got {dtype=}")
if dtype is None and get_option("future.infer_string"):
dtype = "str"
# TODO: Add a similar _bytes interface.
if encoding in _cpython_optimized_decoders:
# CPython optimized implementation
Expand All @@ -2017,7 +2032,6 @@ def decode(self, encoding, errors: str = "strict"):
f = lambda x: decoder(x, errors)[0]
arr = self._data.array
result = arr._str_map(f)
dtype = "str" if get_option("future.infer_string") else None
return self._wrap_result(result, dtype=dtype)

@forbid_nonstring_types(["bytes"])
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,30 @@ def test_decode_errors_kwarg():
tm.assert_series_equal(result, expected)


def test_decode_string_dtype(string_dtype):
# https://github.com/pandas-dev/pandas/pull/60940
ser = Series([b"a", b"b"])
result = ser.str.decode("utf-8", dtype=string_dtype)
expected = Series(["a", "b"], dtype=string_dtype)
tm.assert_series_equal(result, expected)


def test_decode_object_dtype(object_dtype):
# https://github.com/pandas-dev/pandas/pull/60940
ser = Series([b"a", rb"\ud800"])
result = ser.str.decode("utf-8", dtype=object_dtype)
expected = Series(["a", r"\ud800"], dtype=object_dtype)
tm.assert_series_equal(result, expected)


def test_decode_bad_dtype():
    """A non-string, non-object ``dtype`` (here ``int64``) raises ``ValueError``."""
    # https://github.com/pandas-dev/pandas/pull/60940
    byte_ser = Series([b"a", b"b"])
    with pytest.raises(
        ValueError, match="dtype must be string or object, got dtype='int64'"
    ):
        byte_ser.str.decode("utf-8", dtype="int64")


@pytest.mark.parametrize(
"form, expected",
[
Expand Down

0 comments on commit 81229e6

Please sign in to comment.