From d4c8d82b52045f49a0bb1d762968918a06886ae9 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 22 Dec 2023 11:18:49 -0800 Subject: [PATCH 001/152] RLS: 2.2.0rc0 From b59e594f6af21f04e33697bf6bd4cbcf02b8cff2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 24 Dec 2023 17:46:25 +0100 Subject: [PATCH 002/152] Backport PR #56595 on branch 2.2.x (TST/CLN: Inline seldom used fixture) (#56612) Backport PR #56595: TST/CLN: Inline seldom used fixture Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/arrays/categorical/conftest.py | 9 --------- pandas/tests/arrays/categorical/test_api.py | 3 ++- pandas/tests/arrays/categorical/test_indexing.py | 6 ++++-- pandas/tests/arrays/categorical/test_operators.py | 3 ++- pandas/tests/arrays/categorical/test_repr.py | 3 ++- pandas/tests/indexes/datetimes/test_ops.py | 3 +-- pandas/tests/tseries/offsets/conftest.py | 13 ------------- pandas/tests/tseries/offsets/test_common.py | 3 ++- 8 files changed, 13 insertions(+), 30 deletions(-) delete mode 100644 pandas/tests/arrays/categorical/conftest.py delete mode 100644 pandas/tests/tseries/offsets/conftest.py diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py deleted file mode 100644 index 37249210f28f4..0000000000000 --- a/pandas/tests/arrays/categorical/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b4215b4a6fe21..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -385,7 +385,8 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # string type desc = factor.describe() assert factor.ordered diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3377c411a7084..5e1c5c64fa660 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ def test_getitem(self, factor): subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 16b941eab4830..4174d2adc810b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,8 @@ def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def 
test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d6f93fbbd912f..ef0315130215c 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -17,7 +17,8 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor, using_infer_string): + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 5db0aa5cf510f..bac9548b932c1 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,8 +10,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: def test_infer_freq(self, freq_sample): @@ -26,6 +24,7 @@ def test_infer_freq(self, freq_sample): class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py deleted file mode 100644 index 2fc846353dcb5..0000000000000 --- a/pandas/tests/tseries/offsets/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. 
- """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py index 5b80b8b1c4ab4..aa4e22f71ad66 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -250,7 +250,8 @@ def test_sub(date, offset_box, offset2): [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) From 8f8b514b645db913a3b671b19b32eb7f67808df0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 26 Dec 2023 20:32:52 +0100 Subject: [PATCH 003/152] Backport PR #56615 on branch 2.2.x (CI: Fix deprecation warnings) (#56620) Backport PR #56615: CI: Fix deprecation warnings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/io/parser/common/test_chunksize.py | 5 +++-- pandas/tests/io/parser/common/test_read_errors.py | 2 +- pandas/tests/io/test_parquet.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 5e47bcc1c5b0e..9660b283a491b 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -223,7 +223,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame" + depr_msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): with monkeypatch.context() as m: m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) @@ -254,7 +254,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): if parser.engine == "pyarrow": df = parser.read_csv_check_warnings( DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated", + "Passing a BlockManager to DataFrame is deprecated|" + "make_block is deprecated", buf, check_stacklevel=False, ) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4a4ae2b259289..db8b586d22fc0 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -171,7 +171,7 @@ def test_suppress_error_output(all_parsers): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame" + msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): result = parser.read_csv(StringIO(data), on_bad_lines="skip") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ad7cdad363e78..e4b94177eedb2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1000,9 +1000,7 @@ def test_filter_row_groups(self, pa): df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 def test_read_parquet_manager(self, pa, using_array_manager): From 
18aa834fbac273b1ecfd50d6469cf5bf579bcac8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 00:50:47 +0100 Subject: [PATCH 004/152] Backport PR #56617 on branch 2.2.x (TYP: some return types from ruff) (#56624) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #56617: TYP: some return types from ruff Co-authored-by: Torsten Wörtwein --- .pre-commit-config.yaml | 2 ++ doc/source/whatsnew/v2.2.0.rst | 2 +- environment.yml | 2 +- pandas/_testing/asserters.py | 7 ++++--- pandas/_version.py | 2 +- pandas/core/computation/expr.py | 4 ++-- pandas/io/html.py | 8 ++++---- pandas/io/json/_json.py | 10 +++++----- pandas/io/parsers/arrow_parser_wrapper.py | 6 +++--- pandas/io/pytables.py | 2 +- pandas/io/sas/sas_xport.py | 2 +- pandas/io/sql.py | 8 ++++---- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/core.py | 8 ++++---- pandas/util/_validators.py | 6 +++--- requirements-dev.txt | 2 +- 16 files changed, 38 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a070e9a49b97..7f3fc95ce00cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,8 @@ repos: # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture rev: 'v2.10' diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d1481639ca5a0..5ee94b74c527e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -431,7 +431,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| mypy (dev) | 1.7.1 | X | +| mypy (dev) | 1.8.0 | X | +-----------------+-----------------+---------+ | | | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index 74317d47e2e53..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -76,7 +76,7 @@ dependencies: # code checks - flake8=6.1.0 # run in subprocess over docstring examples - - mypy=1.7.1 # pre-commit uses locally installed mypy + - mypy=1.8.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=3.6.0 diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..800b03707540f 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -4,6 +4,7 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) @@ -143,7 +144,7 @@ def assert_almost_equal( ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -576,7 +577,7 @@ def assert_timedelta_array_equal( def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different @@ -664,7 +665,7 @@ def _get_base(obj): if left_base is right_base: raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - def _raise(left, right, err_msg): + def _raise(left, right, err_msg) -> 
NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( diff --git a/pandas/_version.py b/pandas/_version.py index 5d610b5e1ea7e..f8a960630126d 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -386,7 +386,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4770f403b1bdb..b5861fbaebe9c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -695,8 +695,8 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..26e71c9546ffd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -269,7 +269,7 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. @@ -392,7 +392,7 @@ def _parse_tables(self, document, match, attrs): """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -629,7 +629,7 @@ def _href_getter(self, obj) -> str | None: def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -758,7 +758,7 @@ def _parse_tables(self, document, match, kwargs): raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ed66e46b300f7..4c490c6b2cda2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -255,7 +255,7 @@ def __init__( self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -287,7 +287,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -304,7 +304,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. 
""" @@ -1193,7 +1193,7 @@ def parse(self): self._try_convert_types() return self.obj - def _parse(self): + def _parse(self) -> None: raise AbstractMethodError(self) @final @@ -1217,7 +1217,7 @@ def _convert_axes(self) -> None: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) setattr(self.obj, axis_name, new_axis) - def _try_convert_types(self): + def _try_convert_types(self) -> None: raise AbstractMethodError(self) @final diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 66a7ccacf675b..890b22154648e 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -41,7 +41,7 @@ def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. """ @@ -104,7 +104,7 @@ def _get_pyarrow_options(self) -> None: ] = None # PyArrow raises an exception by default elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: - def handle_warning(invalid_row): + def handle_warning(invalid_row) -> str: warnings.warn( f"Expected {invalid_row.expected_columns} columns, but found " f"{invalid_row.actual_columns}: {invalid_row.text}", @@ -219,7 +219,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: raise ValueError(e) return frame - def _validate_usecols(self, usecols): + def _validate_usecols(self, usecols) -> None: if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): raise ValueError( "The pyarrow engine does not allow 'usecols' to be integer " diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 50611197ad7dd..1139519d2bcd3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1707,7 +1707,7 @@ def info(self) -> str: # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e68f4789f0a06..11b2ed0ee7316 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -288,7 +288,7 @@ def close(self) -> None: def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): + def _read_header(self) -> None: self.filepath_or_buffer.seek(0) # read file header diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b0fa6bc6e90c4..3a58daf681cfb 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1514,7 +1514,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -2073,7 +2073,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2433,7 +2433,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: raise NotImplementedError("not implemented for adbc") @@ -2879,7 +2879,7 @@ def _create_sql_schema( keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f097c6059c7c..a4d8054ea4f8c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -687,7 +687,7 @@ def __init__( self._prepare_value_labels() - def 
_prepare_value_labels(self): + def _prepare_value_labels(self) -> None: """Encode value labels.""" self.text_len = 0 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 479a5e19dc1c5..2979903edf360 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -662,7 +662,7 @@ def _ensure_frame(self, data) -> DataFrame: return data @final - def _compute_plot_data(self): + def _compute_plot_data(self) -> None: data = self.data # GH15079 reconstruct data if by is defined @@ -699,7 +699,7 @@ def _compute_plot_data(self): self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) @final @@ -745,7 +745,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. Overridden in child classes""" @final - def _adorn_subplots(self, fig: Figure): + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: all_axes = self._get_subplots(fig) @@ -1323,7 +1323,7 @@ def __init__( c = self.data.columns[c] self.c = c - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index a47f622216ef7..cb0b4d549f49e 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -26,7 +26,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +46,7 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. @@ -125,7 +125,7 @@ def validate_args(fname, args, max_fname_arg_count, compat_args) -> None: _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. 
diff --git a/requirements-dev.txt b/requirements-dev.txt index cbfb6336b2e16..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -53,7 +53,7 @@ moto flask asv>=0.6.1 flake8==6.1.0 -mypy==1.7.1 +mypy==1.8.0 tokenize-rt pre-commit>=3.6.0 gitpython From d212e1f90aafec8f128582ce8485ffcd9d4f9765 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:43:08 +0100 Subject: [PATCH 005/152] Backport PR #56636 on branch 2.2.x (DOC: Fixup CoW userguide) (#56639) Backport PR #56636: DOC: Fixup CoW userguide Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/copy_on_write.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 050c3901c3420..a083297925007 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -317,7 +317,7 @@ you are modifying one object inplace. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -328,7 +328,7 @@ held by the object. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. From 95cc20073c4fc2468dc1396dc031bb9b0ad0ce54 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:55:19 +0100 Subject: [PATCH 006/152] Backport PR #56632 on branch 2.2.x (DOC: Minor fixups for 2.2.0 whatsnew) (#56640) Backport PR #56632: DOC: Minor fixups for 2.2.0 whatsnew Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 118 ++++++++++++--------------------- 1 file changed, 43 insertions(+), 75 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5ee94b74c527e..5b955aa45219a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -123,7 +123,7 @@ nullability handling. with pg_dbapi.connect(uri) as conn: df.to_sql("pandas_table", conn, index=False) - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn) @@ -176,7 +176,7 @@ leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` .. code-block:: ipython - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") @@ -306,22 +306,21 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) -- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) -- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) -- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) -- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -386,6 +385,8 @@ index levels when joining on two indexes with different levels (:issue:`34133`). left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right result = left.join(right) *Old Behavior* @@ -415,15 +416,6 @@ Backwards incompatible API changes Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. -If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| | | X | X | -+-----------------+-----------------+----------+---------+ - For `optional libraries `_ the general recommendation is to use the latest version. 
The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. @@ -433,8 +425,6 @@ Optional libraries below the lowest tested version may still work, but are not c +=================+=================+=========+ | mypy (dev) | 1.8.0 | X | +-----------------+-----------------+---------+ -| | | X | -+-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -606,20 +596,20 @@ Other Deprecations - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. 
(:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) @@ -692,31 +682,30 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) -- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) -- +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) - Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) -- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. 
(:issue:`52093`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) -- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) -- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) @@ -739,14 +728,12 @@ Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) -- Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- Strings ^^^^^^^ @@ -763,13 +750,12 @@ Strings Interval ^^^^^^^^ -- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. 
Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) -- Indexing ^^^^^^^^ @@ -781,25 +767,23 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) -- MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) -- I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) -- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) -- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) -- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell 
contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) @@ -808,12 +792,11 @@ Period - Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) - Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) -- Plotting ^^^^^^^^ -- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) -- Bug in :meth:`DataFrame.plot.scatter` discaring string columns (:issue:`56142`) +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) Groupby/resample/rolling @@ -821,9 +804,9 @@ Groupby/resample/rolling - Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond 
``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) @@ -845,22 +828,11 @@ Reshaping - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) -- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ - Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- Other ^^^^^ @@ -871,15 +843,11 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) -- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) -.. ***DO NOT USE THIS SECTION*** - -- -- .. --------------------------------------------------------------------------- .. 
_whatsnew_220.contributors: From f8e9892dd1bdea536b3f7c25eb244d36b05b53d6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 02:25:53 +0100 Subject: [PATCH 007/152] Backport PR #56644 on branch 2.2.x (BUG: Series.to_numpy raising for arrow floats to numpy floats) (#56648) Backport PR #56644: BUG: Series.to_numpy raising for arrow floats to numpy floats Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 23b5448029dd9..de1ed9ecfdaf1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -37,6 +37,7 @@ CategoricalDtype, is_array_like, is_bool_dtype, + is_float_dtype, is_integer, is_list_like, is_numeric_dtype, @@ -1320,6 +1321,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + original_na_value = na_value dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): @@ -1345,7 +1347,14 @@ def to_numpy( if dtype is not None and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) - elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or original_na_value is lib.no_default + and is_float_dtype(dtype) + ) + ): result = data._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b03272f18203..5624acfb64764 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3153,6 +3153,14 @@ def test_string_to_time_parsing_cast(): tm.assert_series_equal(result, expected) +def test_to_numpy_float(): + # GH#56267 + ser = pd.Series([32, 40, None], dtype="float[pyarrow]") + result = ser.astype("float64") + expected = pd.Series([32, 40, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + def test_to_numpy_timestamp_to_int(): # GH 55997 ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") From 3732cc4150fb58bdfe5c30251a8703f694c694be Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:16:27 +0100 Subject: [PATCH 008/152] Backport PR #56650 on branch 2.2.x (ENH: Implement dt methods for pyarrow duration types) (#56656) Backport PR #56650: ENH: Implement dt methods for pyarrow duration types Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 87 ++++++++++++++++++++++ pandas/core/indexes/accessors.py | 39 +++++++++- pandas/tests/extension/test_arrow.py | 105 +++++++++++++++++++++++++++ 4 files changed, 231 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5b955aa45219a..f7e1cc9cbe36d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -316,6 +316,7 @@ Other enhancements - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and 
:meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index de1ed9ecfdaf1..32a4cadff8270 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -17,6 +17,7 @@ from pandas._libs import lib from pandas._libs.tslibs import ( + NaT, Timedelta, Timestamp, timezones, @@ -2498,6 +2499,92 @@ def _str_wrap(self, width: int, **kwargs): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) + @property + def _dt_days(self): + return type(self)( + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + ) + + @property + def _dt_hours(self): + return type(self)( + pa.array( + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self): + return type(self)( + pa.array( + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + ) + ) + + @property + def _dt_milliseconds(self): + return type(self)( + pa.array( + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + ) + ) + + def _dt_to_pytimedelta(self): + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self): + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + @property def _dt_year(self): return type(self)(pc.year(self._pa_array)) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 929c7f4a63f8f..7e3ba4089ff60 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, 
**kwargs): return result +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) @delegate_names( delegate=ArrowExtensionArray, accessors=DatetimeArray._datetimelike_ops, @@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs): return result + def to_pytimedelta(self): + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + def to_pydatetime(self): # GH#20306 warnings.warn( @@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame: ) return iso_calendar_df + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + @delegate_names( delegate=DatetimeArray, @@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor index=orig.index, ) - if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": return ArrowTemporalProperties(data, orig) if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5624acfb64764..20cdcb9ce9ab8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2723,6 +2723,111 @@ def test_dt_tz_convert(unit): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"]) +def test_as_unit(dtype): + # GH 52284 + ser = pd.Series([1000, None], dtype=dtype) + result = ser.dt.as_unit("ns") + expected = ser.astype(dtype.replace("ms", "ns")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "prop, expected", + [ + ["days", 1], + ["seconds", 2], + ["microseconds", 3], + ["nanoseconds", 4], + ], +) +def test_dt_timedelta_properties(prop, expected): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = getattr(ser.dt, prop) + expected = pd.Series( + ArrowExtensionArray(pa.array([expected, None], type=pa.int32())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_timedelta_total_seconds(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.total_seconds() + expected = pd.Series( + ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_to_pytimedelta(): + # GH 52284 + data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] + ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) + + result = ser.dt.to_pytimedelta() + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + assert all(type(res) is timedelta for res in result) + + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + tm.assert_numpy_array_equal(result, expected) + + +def 
test_dt_components(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 From f99f4d62c9dcc7633ae328535a5bf9ff9365a6b7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:16:37 +0100 Subject: [PATCH 009/152] Backport PR #56647 on branch 2.2.x (floordiv fix for large values) (#56655) Backport PR #56647: floordiv fix for large values Co-authored-by: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 7 ++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index f7e1cc9cbe36d..34c9c142d3870 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -728,6 +728,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 32a4cadff8270..b1164301e6d79 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -115,7 +115,12 @@ def cast_for_truediv( if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) + # https://github.com/apache/arrow/issues/35563 + # Arrow does not allow safe casting large integral values to float64. + # Intentionally not using arrow_array.cast because it could be a scalar + # value in reflected case, and safe=False only added to + # scalar cast in pyarrow 13. 
+ return pc.cast(arrow_array, pa.float64(), safe=False) return arrow_array def floordiv_compat( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 20cdcb9ce9ab8..ed1b7b199a16f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3238,6 +3238,14 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 55561 + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") + result = a // 1_000_000 + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 80ba45002e7f16cad9af3e6d9f6be499f445bb05 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 19:52:43 +0100 Subject: [PATCH 010/152] Backport PR #56613 on branch 2.2.x (BUG: Added raising when merging datetime columns with timedelta columns) (#56658) Backport PR #56613: BUG: Added raising when merging datetime columns with timedelta columns Co-authored-by: Huanghz2001 <83120995+Huanghz2001@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 5 +++++ pandas/tests/reshape/merge/test_merge.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 34c9c142d3870..d60dbefd83195 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -824,6 +824,7 @@ Reshaping - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690e3c2700c6c..320e4e33a29fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1526,6 +1526,11 @@ def _maybe_coerce_merge_keys(self) -> None: ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..ab8d22e567d27 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2988,3 +2988,23 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): elif right_empty: expected.loc[:, ["C", "D"]] = np.nan tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", 
"right", "inner", "outer"]) +def test_merge_datetime_and_timedelta(how): + left = DataFrame({"key": Series([1, None], dtype="datetime64[ns]")}) + right = DataFrame({"key": Series([1], dtype="timedelta64[ns]")}) + + msg = ( + f"You are trying to merge on {left['key'].dtype} and {right['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + left.merge(right, on="key", how=how) + + msg = ( + f"You are trying to merge on {right['key'].dtype} and {left['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + right.merge(left, on="key", how=how) From d84425d666d6d1b8107b7a1031a68373f523cc80 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:22:58 +0100 Subject: [PATCH 011/152] Backport PR #56635 on branch 2.2.x (CoW: Boolean indexer in MultiIndex raising read-only error) (#56660) Backport PR #56635: CoW: Boolean indexer in MultiIndex raising read-only error Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/multi.py | 2 ++ pandas/tests/copy_view/test_indexing.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d60dbefd83195..263da59e6455c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -761,6 +761,7 @@ Interval Indexing ^^^^^^^^ +- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`) - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a4e027e2b806..02a841a2075fd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3488,6 +3488,8 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: "is not the same length as the index" ) lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() elif is_list_like(k): # a collection of labels to include from this level (these are or'd) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6f3850ab64daa..2681c07f01990 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1224,6 +1224,27 @@ def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): tm.assert_series_equal(ser, expected) +def test_midx_read_only_bool_indexer(): + # GH#56635 + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + expected_mask = mask.copy() + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + 
expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + tm.assert_series_equal(mask, expected_mask) + + def test_loc_enlarging_with_dataframe(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) From 5043600e694cde9db4d26dd2bec8f2d7a75cdb44 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:23:31 +0100 Subject: [PATCH 012/152] Backport PR #56641 on branch 2.2.x (DOC: Add optional dependencies table in 2.2 whatsnew) (#56662) Backport PR #56641: DOC: Add optional dependencies table in 2.2 whatsnew Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 66 +++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 263da59e6455c..1da18cd9be8f9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -417,15 +417,63 @@ Backwards incompatible API changes Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| mypy (dev) | 1.8.0 | X | -+-----------------+-----------------+---------+ +For `optional dependencies `_ the general recommendation is to use the latest version. +Optional dependencies below the lowest tested version may still work but are not considered supported. +The following table lists the optional dependencies that have had their minimum tested version increased. 
+ ++-----------------+---------------------+ +| Package | New Minimum Version | ++=================+=====================+ +| beautifulsoup4 | 4.11.2 | ++-----------------+---------------------+ +| blosc | 1.21.3 | ++-----------------+---------------------+ +| bottleneck | 1.3.6 | ++-----------------+---------------------+ +| fastparquet | 2022.12.0 | ++-----------------+---------------------+ +| fsspec | 2022.11.0 | ++-----------------+---------------------+ +| gcsfs | 2022.11.0 | ++-----------------+---------------------+ +| lxml | 4.9.2 | ++-----------------+---------------------+ +| matplotlib | 3.6.3 | ++-----------------+---------------------+ +| numba | 0.56.4 | ++-----------------+---------------------+ +| numexpr | 2.8.4 | ++-----------------+---------------------+ +| qtpy | 2.3.0 | ++-----------------+---------------------+ +| openpyxl | 3.1.0 | ++-----------------+---------------------+ +| psycopg2 | 2.9.6 | ++-----------------+---------------------+ +| pyreadstat | 1.2.0 | ++-----------------+---------------------+ +| pytables | 3.8.0 | ++-----------------+---------------------+ +| pyxlsb | 1.0.10 | ++-----------------+---------------------+ +| s3fs | 2022.11.0 | ++-----------------+---------------------+ +| scipy | 1.10.0 | ++-----------------+---------------------+ +| sqlalchemy | 2.0.0 | ++-----------------+---------------------+ +| tabulate | 0.9.0 | ++-----------------+---------------------+ +| xarray | 2022.12.0 | ++-----------------+---------------------+ +| xlsxwriter | 3.0.5 | ++-----------------+---------------------+ +| zstandard | 0.19.0 | ++-----------------+---------------------+ +| pyqt5 | 5.15.8 | ++-----------------+---------------------+ +| tzdata | 2022.7 | ++-----------------+---------------------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. 
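Read together, the backports above change user-visible behavior in two places worth illustrating. A minimal sketch follows (variable names invented for illustration; the expected outputs mirror the tests added above for GH 52284 and GH 56455):

    import pandas as pd
    import pyarrow as pa

    # ArrowDtype Series with a pyarrow.duration type now support the .dt
    # accessor (GH 52284), via the _dt_* properties added in arrow/array.py.
    ser = pd.Series(
        [pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4), None],
        dtype=pd.ArrowDtype(pa.duration("ns")),
    )
    ser.dt.days             # [1, <NA>], dtype int32[pyarrow]
    ser.dt.total_seconds()  # [86402.000003, <NA>], dtype double[pyarrow]
    ser.dt.components       # DataFrame: days ... nanoseconds, all int32[pyarrow]

    # Merging a datetime64 key with a timedelta64 key now raises instead of
    # silently coercing (GH 56455).
    left = pd.DataFrame({"key": pd.Series([1], dtype="datetime64[ns]")})
    right = pd.DataFrame({"key": pd.Series([1], dtype="timedelta64[ns]")})
    try:
        left.merge(right, on="key")
    except ValueError as err:
        print(err)  # "...If you wish to proceed you should use pd.concat"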
From 722d337e91e927ab882abe24c550c0845d093357 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 22:44:56 +0100 Subject: [PATCH 013/152] Backport PR #56370 on branch 2.2.x (BUG: rolling with datetime ArrowDtype) (#56665) Backport PR #56370: BUG: rolling with datetime ArrowDtype Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 7 +++++- pandas/core/window/rolling.py | 23 +++++++++++-------- pandas/tests/window/test_timeseries_window.py | 16 +++++++++++++ 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 1da18cd9be8f9..129f5cedb86c2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -865,6 +865,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 11a0c7bf18fcb..a0e0a1434e871 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -92,6 +92,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -2531,7 +2532,7 @@ def _validate_inferred_freq( return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution. @@ -2546,4 +2547,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..68cec16ec9eca 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,7 +14,6 @@ Any, Callable, Literal, - cast, ) import numpy as np @@ -39,6 +38,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -104,6 +104,7 @@ NDFrameT, QuantileInterpolation, WindowingRankType, + npt, ) from pandas import ( @@ -404,11 +405,12 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? 
- if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ def _apply_series( index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -614,7 +616,7 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -1232,7 +1234,9 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1868,6 +1872,7 @@ def _validate(self): if ( self.obj.empty or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c99fc8a8eb60f..bd0fadeb3e475 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -697,3 +700,16 @@ def test_nat_axis_error(msg, axis): with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): df.rolling("D", axis=axis).mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) From 944c40f381996ba5c64b582200a4fd168341254c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 23:17:14 +0100 Subject: [PATCH 014/152] Backport PR #56654 on branch 2.2.x (BUG: assert_series_equal not properly respecting check-dtype) (#56668) Backport PR #56654: BUG: assert_series_equal not properly respecting check-dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_testing/asserters.py | 10 ++++++++-- pandas/tests/extension/test_numpy.py | 10 ---------- pandas/tests/util/test_assert_frame_equal.py | 10 ++-------- pandas/tests/util/test_assert_series_equal.py | 16 +++++++++++----- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git 
a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 800b03707540f..d0f38c85868d4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -949,9 +949,15 @@ def assert_series_equal( obj=str(obj), ) else: + # convert both to NumPy if not, check_dtype would raise earlier + lv, rv = left_values, right_values + if isinstance(left_values, ExtensionArray): + lv = left_values.to_numpy() + if isinstance(right_values, ExtensionArray): + rv = right_values.to_numpy() assert_numpy_array_equal( - left_values, - right_values, + lv, + rv, check_dtype=check_dtype, obj=str(obj), index_values=left.index, diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aaf49f53ba02b..e38144f4c615b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -421,16 +421,6 @@ def test_index_from_listlike_with_dtype(self, data): def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - @pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - def test_compare_array(self, data, comparison_op): - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if data.dtype.kind == "f" or comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index a074898f6046d..79132591b15b3 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -211,10 +211,7 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="classes are different"): - tm.assert_frame_equal(left, right, check_dtype=False) + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -246,7 +243,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") @@ -300,9 +296,7 @@ def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_ea_dtype, indexer dtypes = (any_numeric_ea_dtype, "int64") obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) - msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' - with pytest.raises(AssertionError, match=msg): - tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) def test_assert_frame_equal_check_like_different_indexes(): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index f722f619bc456..c4ffc197298f0 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -290,10 +290,7 @@ def 
test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="Series classes are different"): - tm.assert_series_equal(left, right, check_dtype=False) + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -372,7 +369,6 @@ def test_assert_series_equal_ignore_extension_dtype_mismatch(): tm.assert_series_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") @@ -456,3 +452,13 @@ def test_large_unequal_ints(dtype): right = Series([1577840521123543], dtype=dtype) with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(left, right) + + +@pytest.mark.parametrize("dtype", [None, object]) +@pytest.mark.parametrize("check_exact", [True, False]) +@pytest.mark.parametrize("val", [3, 3.5]) +def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): + # GH#56651 + left = Series([1, 2, val], dtype=dtype) + right = Series(pd.array([1, 2, val])) + tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) From 903d1520ff21a715d6ee17f2d5692f32afcc7739 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 23:43:47 +0100 Subject: [PATCH 015/152] Backport PR #56664 on branch 2.2.x (CI: Run jobs on 2.2.x branch) (#56669) Backport PR #56664: CI: Run jobs on 2.2.x branch Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index b49b9a67c4743..8e29d56f47dcf 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index da232404e6ff5..73acd9acc129a 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.1.x + - 2.2.x tags: - '*' pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04d8b8e006985..d59ddf272f705 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6ca4d19196874..12e645dc9da81 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 
2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x paths-ignore: - "doc/**" - "web/**" From 80ad64f05f44cc2ca3183fd6c63b8e3fa44dc486 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 29 Dec 2023 23:58:59 +0100 Subject: [PATCH 016/152] Backport PR #56666 on branch 2.2.x (STY: Use ruff instead of pygrep check for future annotation import) (#56683) Backport PR #56666: STY: Use ruff instead of pygrep check for future annotation import Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 12 ------------ pyproject.toml | 2 ++ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f3fc95ce00cc..4b02ad7cf886f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -358,18 +358,6 @@ repos: files: ^pandas/ exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) types: [python] - - id: future-annotations - name: import annotations from __future__ - entry: 'from __future__ import annotations' - language: pygrep - args: [--negate] - files: ^pandas/ - types: [python] - exclude: | - (?x) - /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ - |/tests/ - |/_testing/ - id: check-test-naming name: check that test names start with 'test' entry: python -m scripts.check_test_naming diff --git a/pyproject.toml b/pyproject.toml index 5e65edf81f9c7..8724a25909543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -259,6 +259,8 @@ select = [ "FLY", # flake8-logging-format "G", + # flake8-future-annotations + "FA", ] ignore = [ From 8c990dff7e48386c27cc9dd89b31831119ca4b62 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 30 Dec 2023 00:47:36 +0100 Subject: [PATCH 017/152] Backport PR #56682 on branch 2.2.x (CLN: NEP 50 followups) (#56684) Backport PR #56682: CLN: NEP 50 followups Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311-sanitizers.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- pandas/core/dtypes/cast.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 12e645dc9da81..dd5d090e098b0 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,7 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - test_args: "-W error::FutureWarning" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 4b62ecc79e4ef..45f114322015b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml 
b/ci/deps/actions-311-downstream_compat.yaml index 95c0319d6f5b8..d6bf9ec7843de 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..d84063ac2a9ba 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz - pip diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml index dcd381066b0ea..f5f04c90bffad 100644 --- a/ci/deps/actions-311-sanitizers.yaml +++ b/ci/deps/actions-311-sanitizers.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # pandas dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 52074ae00ea18..d14686696e669 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 4c51e9e6029e3..86aaf24b4e15c 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index fd71315d2e7ac..7067048c4434d 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4, <2 + - numpy=1.22.4 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index cbe8f77c15730..31ee74174cd46 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 5a5a01f7aec72..d9c8dd81b7c33 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -20,7 +20,7 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy<2 + - numpy - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 8e106445cd4e0..a19ffd485262d 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a088bf84c48e..259e83a5936d7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1332,7 +1332,7 @@ def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: right = left_dtype elif ( not np.issubdtype(left_dtype, np.unsignedinteger) - and 0 < right <= 2 ** (8 * right_dtype.itemsize - 1) - 1 + and 0 < right <= np.iinfo(right_dtype).max ): # If left dtype isn't unsigned, check if it fits in the signed dtype right = np.dtype(f"i{right_dtype.itemsize}") From 0d0c79222e00f30614f1c94d2016039f5215dd65 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 30 Dec 2023 20:05:18 +0100 Subject: [PATCH 018/152] Backport PR #56312 on branch 2.2.x (DOC: Add whatsnew for concat regression) (#56686) Backport PR #56312: DOC: Add whatsnew for concat regression Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 129f5cedb86c2..649ad37a56b35 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -761,6 +761,7 @@ Datetimelike - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`) Timedelta ^^^^^^^^^ From ee4c37723ce62e21717ea3fb584dc9a1b004de5b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 2 Jan 2024 21:39:07 +0100 Subject: [PATCH 019/152] Backport PR #56167 on branch 2.2.x ([ENH]: Expand types allowed in Series.struct.field) (#56698) Backport PR #56167: [ENH]: Expand types allowed in Series.struct.field Co-authored-by: Tom Augspurger --- doc/source/whatsnew/v2.2.0.rst | 8 ++ pandas/core/arrays/arrow/accessors.py | 125 +++++++++++++++--- .../series/accessors/test_struct_accessor.py | 48 ++++++- 3 files changed, 165 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 649ad37a56b35..15e98cbb2a4d7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -251,6 +251,14 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() +Use :meth:`Series.struct.field` to index into a (possible nested) +struct field. + + +.. ipython:: python + + series.struct.field("project") + .. _whatsnew_220.enhancements.list_accessor: Series.list accessor for PyArrow list data diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..124f8fb6ad8bc 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -6,13 +6,18 @@ ABCMeta, abstractmethod, ) -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, ) +from pandas.core.dtypes.common import is_list_like + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -267,15 +272,27 @@ def dtypes(self) -> Series: names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) - def field(self, name_or_index: str | int) -> Series: + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: """ Extract a child field of a struct as a Series. Parameters ---------- - name_or_index : str | int + name_or_index : str | bytes | int | expression | list Name or index of the child field to extract. + For list-like inputs, this will index into a nested + struct. 
+ Returns ------- pandas.Series @@ -285,6 +302,19 @@ def field(self, name_or_index: str | int) -> Series: -------- Series.struct.explode : Return all child fields as a DataFrame. + Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. + - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + Examples -------- >>> import pyarrow as pa @@ -314,27 +344,92 @@ def field(self, name_or_index: str | int) -> Series: 1 2 2 1 Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct([ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ]) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", version_type), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] """ from pandas import Series + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. + level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. 
+ level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + pa_arr = self._data.array._pa_array - if isinstance(name_or_index, int): - index = name_or_index - elif isinstance(name_or_index, str): - index = pa_arr.type.get_field_index(name_or_index) - else: - raise ValueError( - "name_or_index must be an int or str, " - f"got {type(name_or_index).__name__}" - ) + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) - pa_field = pa_arr.type[index] - field_arr = pc.struct_field(pa_arr, [index]) return Series( field_arr, dtype=ArrowDtype(field_arr.type), index=self._data.index, - name=pa_field.name, + name=name, ) def explode(self) -> DataFrame: diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 1ec5b3b726d17..80aea75fda406 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,6 +2,11 @@ import pytest +from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +) + from pandas import ( ArrowDtype, DataFrame, @@ -11,6 +16,7 @@ import pandas._testing as tm pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") def test_struct_accessor_dtypes(): @@ -53,6 +59,7 @@ def test_struct_accessor_dtypes(): tm.assert_series_equal(actual, expected) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") def test_struct_accessor_field(): index = Index([-100, 42, 123]) ser = Series( @@ -94,10 +101,11 @@ def test_struct_accessor_field(): def test_struct_accessor_field_with_invalid_name_or_index(): ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) - with pytest.raises(ValueError, match="name_or_index must be an int or str"): + with pytest.raises(ValueError, match="name_or_index must be an int, str,"): ser.struct.field(1.1) +@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( @@ -148,3 +156,41 @@ def test_struct_accessor_api_for_invalid(invalid): ), ): invalid.struct + + +@pytest.mark.parametrize( + ["indices", "name"], + [ + (0, "int_col"), + ([1, 2], "str_col"), + (pc.field("int_col"), "int_col"), + ("int_col", "int_col"), + (b"string_col", b"string_col"), + ([b"string_col"], "string_col"), + ], +) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field_expanded(indices, name): + arrow_type = pa.struct( + [ + ("int_col", pa.int64()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ("str_col", pa.string()), + ] + ), + ), + (b"string_col", pa.string()), + ] + ) + + data = pa.array([], type=arrow_type) + ser = Series(data, dtype=ArrowDtype(arrow_type)) + expected = pc.struct_field(data, indices) + result = ser.struct.field(indices) + tm.assert_equal(result.array._pa_array.combine_chunks(), expected) + assert result.name == name From d43af631d6fcb6d8b6a90594b82737cfee63724b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:38:27 -1000 Subject: 
[PATCH 020/152] Backport PR #56691 on branch 2.2.x (Bug pyarrow implementation of str.fullmatch matches partial string. issue #56652) (#56715) Backport PR #56691: Bug pyarrow implementation of str.fullmatch matches partial string. issue #56652 Co-authored-by: JackCollins91 <112877841+JackCollins91@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_arrow.py | 19 ++++++++++++------- pandas/tests/strings/test_find_replace.py | 9 +++++++++ 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 15e98cbb2a4d7..043646457f604 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -805,6 +805,7 @@ Strings - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) +- Bug in :meth:`str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`) - Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`) Interval diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b1164301e6d79..5427cee55dfb1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2277,7 +2277,7 @@ def _str_match( def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5a76811a12e6..e8f614ff855c0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -433,7 +433,7 @@ def _str_match( def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ed1b7b199a16f..e709e6fcfe456 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1903,16 +1903,21 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, None]], - ["Abc", True, None, [False, None]], - ["bc", True, None, [False, None]], - ["ab", False, True, [True, True]], - ["a[a-z]{2}", False, None, [True, None]], - ["A[a-z]{1}", True, None, [False, None]], + ["abc", False, None, [True, True, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [True, True, False, None]], + ["a[a-z]{2}", False, None, [True, True, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + 
["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f58c6d703f8f..cd4707ac405de 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -730,6 +730,15 @@ def test_fullmatch(any_string_dtype): tm.assert_series_equal(result, expected) +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected = Series([False, False, np.nan, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype From 341939a3d5432dced806a72151bf8c2f1f336f5a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 3 Jan 2024 23:16:55 +0100 Subject: [PATCH 021/152] Backport PR #56699 on branch 2.2.x (DOC: Corrected typo in warning on coerce) (#56719) Backport PR #56699: DOC: Corrected typo in warning on coerce Co-authored-by: aaron-robeson-8451 <65349876+aaron-robeson-8451@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/internals/blocks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 51b4c4f297b07..d4eb5742ef928 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d In [3]: ser[0] = 'not an int64' FutureWarning: - Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. + Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. In [4]: ser diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 20eff9315bc80..b7af545bd523e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -512,7 +512,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " + "and will raise an error in a future version of pandas. 
" f"Value '{other}' has dtype incompatible with {self.values.dtype}, " "please explicitly cast to a compatible dtype first.", FutureWarning, From 24ce4e10a4f9ac1bc175edf6762f982903855ef5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:31:08 +0100 Subject: [PATCH 022/152] Backport PR #56616 on branch 2.2.x (BUG: Add limit_area to EA ffill/bfill) (#56720) Backport PR #56616: BUG: Add limit_area to EA ffill/bfill Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/_mixins.py | 9 +- pandas/core/arrays/arrow/array.py | 13 +- pandas/core/arrays/base.py | 16 ++- pandas/core/arrays/interval.py | 11 +- pandas/core/arrays/masked.py | 21 +++- pandas/core/arrays/period.py | 11 +- pandas/core/arrays/sparse/array.py | 11 +- pandas/core/internals/blocks.py | 15 ++- pandas/core/missing.py | 117 +++++++++++++----- pandas/tests/extension/base/missing.py | 22 ++++ .../tests/extension/decimal/test_decimal.py | 30 +++++ pandas/tests/extension/json/array.py | 4 + pandas/tests/extension/json/test_json.py | 23 ++++ pandas/tests/frame/methods/test_fillna.py | 40 ++---- scripts/validate_unwanted_patterns.py | 1 + 16 files changed, 266 insertions(+), 80 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 043646457f604..75ba7c9f72c1b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -321,7 +321,7 @@ Other enhancements - :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) -- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) +- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9ece12cf51a7b..0da121c36644a 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -305,7 +305,12 @@ def _fill_mask_inplace( func(self._ndarray.T, limit=limit, mask=mask.T) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self.isna() if mask.any(): @@ -315,7 +320,7 @@ def _pad_or_backfill( npvalues = self._ndarray.T if copy: npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) + func(npvalues, limit=limit, limit_area=limit_area, 
mask=mask.T) npvalues = npvalues.T if copy: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5427cee55dfb1..0bc01d2da330a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1005,13 +1005,18 @@ def dropna(self) -> Self: return type(self)(pc.drop_null(self._pa_array)) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is None: + if limit is None and limit_area is None: method = missing.clean_fill_method(method) try: if method == "pad": @@ -1027,7 +1032,9 @@ def _pad_or_backfill( # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) @doc(ExtensionArray.fillna) def fillna( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 59c6d911cfaef..ea0e2e54e3339 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -70,6 +70,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, nargsort, @@ -954,7 +955,12 @@ def interpolate( ) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -1012,6 +1018,12 @@ def _pad_or_backfill( DeprecationWarning, stacklevel=find_stack_level(), ) + if limit_area is not None: + raise NotImplementedError( + f"{type(self).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." + ) return self.fillna(method=method, limit=limit) mask = self.isna() @@ -1021,6 +1033,8 @@ def _pad_or_backfill( meth = missing.clean_fill_method(method) npmask = np.asarray(mask) + if limit_area is not None and not npmask.all(): + _fill_limit_area_1d(npmask, limit_area) if meth == "pad": indexer = libalgos.get_fill_indexer(npmask, limit=limit) return self.take(indexer, allow_fill=True) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a19b304529383..904c87c68e211 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -890,11 +890,18 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr return obj[indexer] def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. 
- return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 03c09c5b2fd18..fc092ef6eb463 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -192,7 +192,12 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self._mask @@ -204,7 +209,21 @@ def _pad_or_backfill( if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + if copy: return self._simple_new(npvalues.T, new_mask.T) else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2930b979bfe78..28f25d38b2363 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -810,12 +810,19 @@ def searchsorted( return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) if copy: return cast("Self", result.view(self.dtype)) else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5db77db2a9c66..98d84d899094b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -716,11 +716,18 @@ def isna(self) -> Self: # type: ignore[override] return type(self)(mask, fill_value=False, dtype=dtype) def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. 
-        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
+        return super()._pad_or_backfill(
+            method=method, limit=limit, limit_area=limit_area, copy=copy
+        )

     def fillna(
         self,
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index b7af545bd523e..06fd9ebe47eae 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from functools import wraps
+import inspect
 import re
 from typing import (
     TYPE_CHECKING,
@@ -2256,11 +2257,21 @@ def pad_or_backfill(
     ) -> list[Block]:
         values = self.values

+        kwargs: dict[str, Any] = {"method": method, "limit": limit}
+        if "limit_area" in inspect.signature(values._pad_or_backfill).parameters:
+            kwargs["limit_area"] = limit_area
+        elif limit_area is not None:
+            raise NotImplementedError(
+                f"{type(values).__name__} does not implement limit_area "
+                "(added in pandas 2.2). 3rd-party ExtensionArray authors "
+                "need to add this argument to _pad_or_backfill."
+            )
+
         if values.ndim == 2 and axis == 1:
             # NDArrayBackedExtensionArray.fillna assumes axis=0
-            new_values = values.T._pad_or_backfill(method=method, limit=limit).T
+            new_values = values.T._pad_or_backfill(**kwargs).T
         else:
-            new_values = values._pad_or_backfill(method=method, limit=limit)
+            new_values = values._pad_or_backfill(**kwargs)
         return [self.make_block_same_class(new_values)]

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index d275445983b6f..5dd9aaf5fbb4a 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -3,10 +3,7 @@
 """
 from __future__ import annotations

-from functools import (
-    partial,
-    wraps,
-)
+from functools import wraps
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -823,6 +820,7 @@ def _interpolate_with_limit_area(
             values,
             method=method,
             limit=limit,
+            limit_area=limit_area,
         )

         if limit_area == "inside":
@@ -863,27 +861,6 @@ def pad_or_backfill_inplace(
     -----
     Modifies values in-place.
""" - if limit_area is not None: - np.apply_along_axis( - # error: Argument 1 to "apply_along_axis" has incompatible type - # "partial[None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], - # Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_ - # SupportsArray[dtype[]]]]]]]]" - partial( # type: ignore[arg-type] - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - return - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -897,8 +874,7 @@ def pad_or_backfill_inplace( func = get_fill_func(method, ndim=2) # _pad_2d and _backfill_2d both modify tvalues inplace - func(tvalues, limit=limit) - return + func(tvalues, limit=limit, limit_area=limit_area) def _fillna_prep( @@ -909,7 +885,6 @@ def _fillna_prep( if mask is None: mask = isna(values) - mask = mask.view(np.uint8) return mask @@ -919,16 +894,23 @@ def _datetimelike_compat(func: F) -> F: """ @wraps(func) - def new_func(values, limit: int | None = None, mask=None): + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): if needs_i8_conversion(values.dtype): if mask is None: # This needs to occur before casting to int64 mask = isna(values) - result, mask = func(values.view("i8"), limit=limit, mask=mask) + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) return result.view(values.dtype), mask - return func(values, limit=limit, mask=mask) + return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) @@ -937,9 +919,12 @@ def new_func(values, limit: int | None = None, mask=None): def _pad_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -948,9 +933,12 @@ def _pad_1d( def _backfill_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.backfill_inplace(values, mask, limit=limit) return values, mask @@ -959,9 +947,12 @@ def _backfill_1d( def _pad_2d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.pad_2d_inplace(values, mask, limit=limit) @@ -973,9 +964,14 @@ def _pad_2d( @_datetimelike_compat def _backfill_2d( - values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.backfill_2d_inplace(values, mask, limit=limit) @@ -985,6 +981,63 @@ def _backfill_2d( 
     return values, mask


+def _fill_limit_area_1d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 1d mask for ffill/bfill with limit_area.
+
+    Caller is responsible for checking that at least one value of mask is False.
+    After the call, mask no longer faithfully represents whether
+    the corresponding values are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=1]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outermost non-NA value.
+    """
+    neg_mask = ~mask
+    first = neg_mask.argmax()
+    last = len(neg_mask) - neg_mask[::-1].argmax() - 1
+    if limit_area == "inside":
+        mask[:first] = False
+        mask[last + 1 :] = False
+    elif limit_area == "outside":
+        mask[first + 1 : last] = False
+
+
+def _fill_limit_area_2d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 2d mask for ffill/bfill with limit_area.
+
+    After the call, mask no longer faithfully represents whether
+    the corresponding values are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=2]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outermost non-NA value.
+    """
+    neg_mask = ~mask.T
+    if limit_area == "outside":
+        # Identify inside
+        la_mask = (
+            np.maximum.accumulate(neg_mask, axis=0)
+            & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    else:
+        # Identify outside
+        la_mask = (
+            ~np.maximum.accumulate(neg_mask, axis=0)
+            | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    mask[la_mask.T] = False
+
+
 _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py
index ffb7a24b4b390..dbd6682c12123 100644
--- a/pandas/tests/extension/base/missing.py
+++ b/pandas/tests/extension/base/missing.py
@@ -77,6 +77,28 @@ def test_fillna_limit_pad(self, data_missing):
         expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
         tm.assert_series_equal(result, expected)

+    @pytest.mark.parametrize(
+        "limit_area, input_ilocs, expected_ilocs",
+        [
+            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
+            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
+            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
+            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
+            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
+        ],
+    )
+    def test_ffill_limit_area(
+        self, data_missing, limit_area, input_ilocs, expected_ilocs
+    ):
+        # GH#56616
+        arr = data_missing.take(input_ilocs)
+        result = pd.Series(arr).ffill(limit_area=limit_area)
+        expected = pd.Series(data_missing.take(expected_ilocs))
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.filterwarnings(
         "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
     )
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index b3c57ee49a724..9907e345ada63 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -156,6 +156,36 @@ def test_fillna_limit_pad(self, data_missing):
         ):
             super().test_fillna_limit_pad(data_missing)

+    @pytest.mark.parametrize(
+        "limit_area, input_ilocs, expected_ilocs",
+        [
+            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+            ("outside", [1, 0, 1, 0, 
1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "ExtensionArray.fillna 'method' keyword is deprecated" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + msg = "DecimalArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + def test_fillna_limit_backfill(self, data_missing): msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d3d9dcc4a4712..31f44f886add7 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -235,6 +235,10 @@ def _values_for_argsort(self): frozen = [tuple(x.items()) for x in self] return construct_1d_object_array_from_listlike(frozen) + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7686bc5abb44c..a18edac9aef93 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,29 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "JSONArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + @unhashable def test_value_counts(self, all_data, dropna): super().test_value_counts(all_data, dropna) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6757669351c5c..89c50a8c21e1c 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -862,41 +862,29 @@ def test_pad_backfill_deprecated(func): @pytest.mark.parametrize( "data, expected_data, method, kwargs", ( - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, 
np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], "ffill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], "ffill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ( [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -910,41 +898,29 @@ def test_pad_backfill_deprecated(func): "ffill", {"limit_area": "outside", "limit": 1}, ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ), ) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 89b67ddd9f5b6..0d724779abfda 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", + "_fill_limit_area_1d", } From 1c3c9884bdd29478ee7d93be9a4e6dc5976af075 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:33:36 +0100 Subject: [PATCH 023/152] Backport PR #56721 on branch 2.2.x (DOC: Fixup read_csv docstring) (#56725) Backport PR #56721: DOC: Fixup read_csv docstring Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a9b41b45aba2f..e26e7e7470461 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -396,7 +396,7 @@ - Callable, function with signature as described in `pyarrow documentation _` when ``engine='pyarrow'`` + #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. 
``' '`` or ``'\\t'``) will be From 0cd02c56ceeb6666872cf428acc3e07ffd568082 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:48:10 +0100 Subject: [PATCH 024/152] Backport PR #56672 on branch 2.2.x (BUG: dictionary type astype categorical using dictionary as categories) (#56723) Backport PR #56672: BUG: dictionary type astype categorical using dictionary as categories Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/categorical.py | 46 +++++++++++++++++----------- pandas/tests/extension/test_arrow.py | 16 ++++++++++ 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 75ba7c9f72c1b..4222de8ce324f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -740,6 +740,7 @@ Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) - Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) +- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 065a942cae768..b87c5375856dc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,7 +44,9 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, + CategoricalDtypeType, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -443,24 +445,32 @@ def __init__( values = arr if dtype.categories is None: - if not isinstance(values, ABCIndex): - # in particular RangeIndex xref test_index_equal_range_categories - values = sanitize_array(values, None) - try: - codes, categories = factorize(values, sort=True) - except TypeError as err: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError( - "'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument." - ) from err - - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + arr = values._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." 
+ ) from err + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e709e6fcfe456..6689fb34f2ae3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3234,6 +3234,22 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_dictionary_astype_categorical(): + # GH#56672 + arrs = [ + pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(), + pa.array(np.array(["a", "d", "c"])).dictionary_encode(), + ] + ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs))) + result = ser.astype("category") + categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string())) + expected = pd.Series( + ["a", "x", "c", "a", "a", "d", "c"], + dtype=pd.CategoricalDtype(categories=categories), + ) + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv(): # GH 55561 a = pd.Series([-7], dtype="int64[pyarrow]") From 97eb3315c957bf01831c544d70245e5ebb9f3735 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 09:21:26 +0100 Subject: [PATCH 025/152] Backport PR #56543 on branch 2.2.x (DOC: Update docstring for read_excel) (#56730) Backport PR #56543: DOC: Update docstring for read_excel Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/io.rst | 19 +++++++------------ pandas/io/excel/_base.py | 32 ++++++++++---------------------- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6148086452d54..b3ad23e0d4104 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. -.. warning:: - - The `xlrd `__ package is now only for reading - old-style ``.xls`` files. +.. note:: - Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` - would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. + When ``engine=None``, the following logic will be used to determine the engine: - It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ - (``.xlsx``) files. - **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** - This is no longer supported, switch to using ``openpyxl`` instead. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. .. _io.excel_reader: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bce890c6f73b0..786f719337b84 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -160,36 +160,24 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. 
-engine : str, default None +engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - - ``xlr`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``pyxlsb`` supports Binary Excel files. - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, - ``pyxlsb`` will be used. - - .. versionadded:: 1.3.0 - - Otherwise ``openpyxl`` will be used. - - .. versionchanged:: 1.3.0 + When ``engine=None``, the following logic will be used to determine the engine: + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one From 54dbe4527b171717535ff3bbc1e13aa748a95ed3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 5 Jan 2024 19:56:14 +0100 Subject: [PATCH 026/152] Backport PR #56677 on branch 2.2.x (Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv) (#56744) Backport PR #56677: Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv Co-authored-by: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 46 +++++++++++++----- pandas/tests/extension/test_arrow.py | 71 +++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4222de8ce324f..0b04a1d313a6d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -786,6 +786,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0bc01d2da330a..3858ce4cf0ea1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ 
-109,30 +109,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): + # GH: 56645. # https://github.com/apache/arrow/issues/35563 - # Arrow does not allow safe casting large integral values to float64. - # Intentionally not using arrow_array.cast because it could be a scalar - # value in reflected case, and safe=False only added to - # scalar cast in pyarrow 13. - return pc.cast(arrow_array, pa.float64(), safe=False) - return arrow_array + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -142,8 +162,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6689fb34f2ae3..05a112e464677 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3260,13 +3260,82 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): - # GH 55561 + # GH 56645 a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") expected = pd.Series([1425801600000], dtype="int64[pyarrow]") result = a // 1_000_000 tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) + result = a // 24 + 
expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 57079b6b91673b13109de5196b0679a191d714dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 7 Jan 2024 22:02:02 +0100 Subject: [PATCH 027/152] Backport PR #56761 on branch 2.2.x (BUG: fix subclass metadata preservation in groupby column selection) (#56770) Backport PR #56761: BUG: fix subclass metadata preservation in groupby column selection Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 4 +++- pandas/tests/groupby/test_groupby_subclass.py | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0b04a1d313a6d..2b436bc5d1855 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -873,6 +873,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) +- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for 
:class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e2e589440bd9..15ccbd602c9c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4016,7 +4016,9 @@ def _getitem_nocopy(self, key: list): copy=False, only_slice=True, ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result def __getitem__(self, key): check_dict_or_set_indexers(key) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index bf809bd5db437..17ef6ee913463 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -69,6 +69,7 @@ def test_groupby_preserves_metadata(): def func(group): assert isinstance(group, tm.SubclassedDataFrame) assert hasattr(group, "testattr") + assert group.testattr == "hello" return group.testattr msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -79,6 +80,13 @@ def func(group): expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) + result = custom_df.groupby("c").apply(func, include_groups=False) + tm.assert_series_equal(result, expected) + + # https://github.com/pandas-dev/pandas/pull/56761 + result = custom_df.groupby("c")[["a", "b"]].apply(func) + tm.assert_series_equal(result, expected) + def func2(group): assert isinstance(group, tm.SubclassedSeries) assert hasattr(group, "testattr") From 41f22b302b18310d58b894b3b04412ca413cbbfe Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 20:25:45 +0100 Subject: [PATCH 028/152] Backport PR #56769 on branch 2.2.x (BUG: replace matching Floats with bools for ea dtypes) (#56780) Backport PR #56769: BUG: replace matching Floats with bools for ea dtypes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/missing.py | 42 ++++++++++++++------- pandas/tests/series/methods/test_replace.py | 12 ++++++ 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 2b436bc5d1855..b138e91b41661 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -789,6 +789,7 @@ Numeric - Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5dd9aaf5fbb4a..ff45662d0bdc8 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import 
( is_array_like, + is_bool_dtype, is_numeric_dtype, is_numeric_v_string_like, is_object_dtype, @@ -100,21 +101,34 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - if potential_na: - new_mask = np.zeros(arr.shape, dtype=np.bool_) - new_mask[arr_mask] = arr[arr_mask] == x + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - new_mask = arr == x - - if not isinstance(new_mask, np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x + + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4330153c186ca..b0f4e233ba5eb 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -799,3 +799,15 @@ def test_replace_numeric_column_with_na(self, val): ser.replace(to_replace=1, value=pd.NA, inplace=True) tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) From 6dbeeb4009bbfac5ea1ae2111346f5e9f05b81f4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:24:22 +0100 Subject: [PATCH 029/152] Backport PR #56767 on branch 2.2.x (BUG: Series.round raising for nullable bool dtype) (#56782) Backport PR #56767: BUG: Series.round raising for nullable bool dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/masked.py | 2 ++ pandas/core/series.py | 6 ++---- pandas/tests/series/methods/test_round.py | 9 +++++++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b138e91b41661..93b63f99ea399 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -790,6 +790,7 @@ Numeric - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) +- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fc092ef6eb463..545d45e450f3f 100644 --- 
a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -403,6 +403,8 @@ def round(self, decimals: int = 0, *args, **kwargs): DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ + if self.dtype.kind == "b": + return self nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) diff --git a/pandas/core/series.py b/pandas/core/series.py index e3b401cd3c88b..a6762dd1b48a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2788,13 +2788,11 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: dtype: float64 """ nv.validate_round(args, kwargs) - result = self._values.round(decimals) - result = self._constructor(result, index=self.index, copy=False).__finalize__( + new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) - return result - @overload def quantile( self, q: float = ..., interpolation: QuantileInterpolation = ... diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 7f60c94f10e4f..c330b7a7dfbbb 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -63,3 +63,12 @@ def test_round_nat(self, method, freq, unit): round_method = getattr(ser.dt, method) result = round_method(freq) tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + result.iloc[0] = False + tm.assert_series_equal(ser, expected) From 58c9ef79cb976e14850929bde5d6f9b416122dd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:24:40 +0100 Subject: [PATCH 030/152] Backport PR #56771 on branch 2.2.x (BUG: to_stata not handling ea dtypes correctly) (#56783) Backport PR #56771: BUG: to_stata not handling ea dtypes correctly Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/stata.py | 23 ++++++++++++--------- pandas/tests/io/test_stata.py | 37 ++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 93b63f99ea399..c85fd75a3685f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -848,6 +848,7 @@ I/O - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) - Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) +- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`) - Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 
a4d8054ea4f8c..4abf9af185a01 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -47,9 +47,11 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_object, is_numeric_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype @@ -62,8 +64,6 @@ to_datetime, to_timedelta, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.indexes.range import RangeIndex @@ -591,17 +591,22 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) + is_nullable_int = ( + isinstance(data[col].dtype, ExtensionDtype) + and data[col].dtype.kind in "iub" + ) # We need to find orig_missing before altering data below orig_missing = data[col].isna() if is_nullable_int: - missing_loc = data[col].isna() - if missing_loc.any(): - # Replace with always safe value - fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False - data.loc[missing_loc, col] = fv + fv = 0 if data[col].dtype.kind in "iu" else False # Replace with NumPy-compatible column - data[col] = data[col].astype(data[col].dtype.numpy_dtype) + data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype) + elif isinstance(data[col].dtype, ExtensionDtype): + if getattr(data[col].dtype, "numpy_dtype", None) is not None: + data[col] = data[col].astype(data[col].dtype.numpy_dtype) + elif is_string_dtype(data[col].dtype): + data[col] = data[col].astype("object") + dtype = data[col].dtype empty_df = data.shape[0] == 0 for c_data in conversion_data: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3e4e1a107da9d..6bd74faa8a3db 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalDtype import pandas._testing as tm @@ -1921,6 +1923,41 @@ def test_writer_118_exceptions(self): with pytest.raises(ValueError, match="You must use version 119"): StataWriterUTF8(path, df, version=118) + @pytest.mark.parametrize( + "dtype_backend", + ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], + ) + def test_read_write_ea_dtypes(self, dtype_backend): + df = DataFrame( + { + "a": [1, 2, None], + "b": ["a", "b", "c"], + "c": [True, False, None], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index"), + ) + df = df.convert_dtypes(dtype_backend=dtype_backend) + df.to_stata("test_stata.dta", version=118) + + with tm.ensure_clean() as path: + df.to_stata(path) + written_and_read_again = self.read_dta(path) + + expected = DataFrame( + { + "a": [1, 2, np.nan], + "b": ["a", "b", "c"], + "c": [1.0, 0, np.nan], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index", dtype=np.int32), + ) + + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) def test_backward_compat(version, datapath): From bf28e02bd592eba25a8eb8ca296316ff2671f469 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:26:47 +0100 Subject: [PATCH 031/152] Backport PR #56766 on branch 2.2.x (BUG: IntervalIndex.from_tuples raising with masked subtype) (#56785) Backport PR #56766: BUG: IntervalIndex.from_tuples raising with masked subtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/interval.py | 18 +++++++++++++----- .../indexes/interval/test_constructors.py | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c85fd75a3685f..dbb11d3d0788d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -817,6 +817,7 @@ Interval - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) +- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 904c87c68e211..e69f996441703 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -79,6 +79,7 @@ unique, value_counts_internal as value_counts, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, @@ -370,11 +371,18 @@ def _ensure_simple_new_inputs( right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 778c07b46e57c..e47a014f18045 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import 
is_unsigned_integer_dtype from pandas.core.dtypes.dtypes import IntervalDtype @@ -517,3 +519,17 @@ def test_dtype_closed_mismatch(): with pytest.raises(ValueError, match=msg): IntervalArray([], dtype=dtype, closed="neither") + + +@pytest.mark.parametrize( + "dtype", + ["Float64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow"))], +) +def test_ea_dtype(dtype): + # GH#56765 + bins = [(0.0, 0.4), (0.4, 0.6)] + interval_dtype = IntervalDtype(subtype=dtype, closed="left") + result = IntervalIndex.from_tuples(bins, closed="left", dtype=interval_dtype) + assert result.dtype == interval_dtype + expected = IntervalIndex.from_tuples(bins, closed="left").astype(interval_dtype) + tm.assert_index_equal(result, expected) From b89079bf0da316aff5433f3ee2b52cb9f89e34d0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 00:27:06 +0100 Subject: [PATCH 032/152] Backport PR #56724 on branch 2.2.x (TST: Don't ignore tolerance for integer series) (#56786) Backport PR #56724: TST: Don't ignore tolerance for integer series Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_testing/asserters.py | 98 ++++++++++++++----- pandas/tests/util/test_assert_series_equal.py | 12 +++ 2 files changed, 84 insertions(+), 26 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index d0f38c85868d4..3de982498e996 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._libs.missing import is_matching_na from pandas._libs.sparse import SparseIndex import pandas._libs.testing as _testing @@ -698,9 +699,9 @@ def assert_extension_array_equal( right, check_dtype: bool | Literal["equiv"] = True, index_values=None, - check_exact: bool = False, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + check_exact: bool | lib.NoDefault = lib.no_default, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "ExtensionArray", ) -> None: """ @@ -715,7 +716,12 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
atol : float, default 1e-8 @@ -739,6 +745,23 @@ def assert_extension_array_equal( >>> b, c = a.array, a.array >>> tm.assert_extension_array_equal(b, c) """ + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: @@ -784,10 +807,7 @@ def assert_extension_array_equal( left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -811,14 +831,14 @@ def assert_series_equal( check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, @@ -841,7 +861,12 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
check_categorical : bool, default True @@ -877,6 +902,22 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 if not check_index and check_like: raise ValueError("check_like must be False if check_index is False") @@ -931,10 +972,7 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -1061,14 +1099,14 @@ def assert_frame_equal( check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: """ @@ -1103,7 +1141,12 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
check_categorical : bool, default True @@ -1158,6 +1201,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2, check_dtype=False) """ __tracebackhide__ = True + _rtol = rtol if rtol is not lib.no_default else 1.0e-5 + _atol = atol if atol is not lib.no_default else 1.0e-8 + _check_exact = check_exact if check_exact is not lib.no_default else False # instance validation _check_isinstance(left, right, DataFrame) @@ -1181,11 +1227,11 @@ def assert_frame_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.index", ) @@ -1195,11 +1241,11 @@ def assert_frame_equal( right.columns, exact=check_column_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.columns", ) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index c4ffc197298f0..784a0347cf92b 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -462,3 +462,15 @@ def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): left = Series([1, 2, val], dtype=dtype) right = Series(pd.array([1, 2, val])) tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) + + +def test_assert_series_equal_int_tol(): + # GH#56646 + left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74]) + right = Series([72, 9, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72]) + tm.assert_series_equal(left, right, rtol=1.5) + + tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5) + tm.assert_extension_array_equal( + left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 + ) From c4e04e0ea10a1d9ff6862cc4c5c6c82aee9e1ba8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 03:28:18 +0100 Subject: [PATCH 033/152] Backport PR #56402 on branch 2.2.x (TST/CoW: expand test for chained inplace methods) (#56790) Backport PR #56402: TST/CoW: expand test for chained inplace methods Co-authored-by: Joris Van den Bossche --- .../test_chained_assignment_deprecation.py | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 80e38380ed27c..0a37f6b813e55 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import PY311 from pandas.errors import ( ChainedAssignmentError, SettingWithCopyWarning, @@ -42,7 +43,9 @@ def test_methods_iloc_warn(using_copy_on_write): ("ffill", ()), ], ) -def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): +def test_methods_iloc_getitem_item_cache( + func, args, using_copy_on_write, warn_copy_on_write +): # ensure we don't incorrectly raise chained assignment warning because # of the item cache / iloc not setting the item cache df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) @@ -66,14 +69,74 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): ser = df["a"] getattr(ser, func)(*args, 
inplace=True) + df = df_orig.copy() + df["a"] # populate the item_cache + # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+ + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + with tm.assert_cow_warning(not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + # ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + even in warning mode this doesn't trigger + # the warning of Py3.11+ (see above) + with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + +def test_methods_iloc_getitem_item_cache_fillna( + using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + ser.fillna(1, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(1, inplace=True) + df = df_orig.copy() df["a"] # populate the item_cache if using_copy_on_write: with tm.raises_chained_assignment_error(): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) else: with tm.assert_cow_warning(match="A value"): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + # TODO(CoW-warn) ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + with tm.assert_cow_warning(warn_copy_on_write, match="A value"): + df["a"].fillna(1, inplace=True) # TODO(CoW-warn) expand the cases From 3c89432d1adf277d785145f5c62abca1157e19dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:57:14 +0100 Subject: [PATCH 034/152] Backport PR #56772 on branch 2.2.x (Support large strings in interchange protocol) (#56795) Backport PR #56772: Support large strings in interchange protocol Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/dtypes.py | 4 +++- pandas/core/interchange/column.py | 9 +++------ pandas/core/interchange/utils.py | 1 + pandas/tests/interchange/test_impl.py | 9 +++++++++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index dbb11d3d0788d..36e677fa2a7a9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -906,6 +906,7 @@ Sparse Other ^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100%
(:issue:`55765`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ed5256922377a..e90e92fa0ee1c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype: # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..7f524d6823f30 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -301,12 +301,9 @@ def _get_data_buffer( buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - ArrowCTypes.STRING, - Endianness.NATIVE, - ) # note: currently only support native endianness + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 4ac063080e62d..2e73e560e5740 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -37,6 +37,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 15c2b8d000b37..27ea8ccdd17b1 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -362,3 +362,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="object") + tm.assert_frame_equal(result, expected) From 3945d5e16b0aca22166ce1e6a03ee978fb8853fd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:47:34 +0100 Subject: [PATCH 035/152] Backport PR #56442 on branch 2.2.x (BUG: merge not sorting for new string dtype) (#56799) BUG: merge not sorting for new string dtype (#56442) * BUG: merge not sorting for new string dtype * Fixup * Update test_multi.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit b7e2202459eadc9dd599cbe58251ecc930798b97) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 18 +++- pandas/tests/reshape/merge/test_join.py | 43
++++++---- pandas/tests/reshape/merge/test_multi.py | 100 ++++++++++++----------- 4 files changed, 94 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 36e677fa2a7a9..6a232365fbfeb 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -893,6 +893,7 @@ Reshaping - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) - Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 320e4e33a29fb..410301b7697f2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2488,18 +2488,30 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5a1f47e341222..1d5ed2d7373ce 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -16,6 +16,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -563,24 +564,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", 
"baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..5973f13c9d495 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -88,67 +91,70 @@ def test_merge_on_multikey(self, left, right, join_type): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), - ) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit 
cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i].copy() - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - # inject some nulls - left.loc[1::4, "1st"] = np.nan - left.loc[2::5, "2nd"] = np.nan - left.loc[3::6, "3rd"] = np.nan - left["4th"] = bind_cols(left) + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): From 2ddeb4577363e1c3e3a839ec45c55deec4aa67c1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:29:45 +0100 Subject: [PATCH 036/152] CI: Add fixture back in (#56803) --- pandas/tests/reshape/merge/test_multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 5973f13c9d495..b1aa6b88bc4ee 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -94,6 +94,7 @@ def test_merge_on_multikey(self, left, right, join_type): @pytest.mark.parametrize( "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] ) + @pytest.mark.parametrize("sort", [True, False]) def test_left_join_multi_index(self, sort, infer_string): with option_context("future.infer_string", infer_string): icols = ["1st", "2nd", "3rd"] From 66df0bda738c43506a22448b1ba2c7c050a07baa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 10 Jan 2024 00:46:09 +0100 Subject: [PATCH 037/152] Backport PR #56059 on branch 2.2.x (ENH: Add case_when method) (#56800) ENH: Add case_when method (#56059) (cherry picked from commit e3a55a4cbfc83ec4ab1bcf73a1a0ec96e670903a) Co-authored-by: Samuel Oranyeli --- doc/source/reference/series.rst | 1 + doc/source/whatsnew/v2.2.0.rst | 20 +++ pandas/core/series.py | 124 ++++++++++++++- pandas/tests/series/methods/test_case_when.py | 148 ++++++++++++++++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/series/methods/test_case_when.py diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index af262f9e6c336..a4ea0ec396ceb 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -177,6 +177,7 @@ Reindexing / selection / label manipulation :toctree: api/ Series.align + Series.case_when Series.drop Series.droplevel Series.drop_duplicates diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6a232365fbfeb..e244794664b34 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ 
-188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv Implementation Status `_ documentation. +.. _whatsnew_220.enhancements.case_when: + +Create a pandas Series based on one or more conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`) + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6])) + default=pd.Series('default', index=df.index) + default.case_when( + caselist=[ + (df.a == 1, 'first'), # condition, replacement + (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement + ], + ) + .. _whatsnew_220.enhancements.to_numpy_ea: ``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype diff --git a/pandas/core/series.py b/pandas/core/series.py index a6762dd1b48a2..83eb545b9b681 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -67,6 +67,9 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, maybe_box_native, maybe_cast_pointwise_result, ) @@ -84,7 +87,10 @@ CategoricalDtype, ExtensionDtype, ) -from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, @@ -113,6 +119,7 @@ from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( + array as pd_array, extract_array, sanitize_array, ) @@ -5627,6 +5634,121 @@ def between( return lmask & rmask + def case_when( + self, + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], + ) -> Series: + """ + Replace values where the conditions are True. + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + .. versionadded:: 2.2.0 + + Returns + ------- + Series + + See Also + -------- + Series.mask : Replace values where the condition is True. + + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name='c') + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement + ... (b.gt(0), b)]) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" + ) + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." 
+ ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." + ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy() + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = reversed(range(len(conditions))) + for position, condition, replacement in zip( + counter, conditions[::-1], replacements[::-1] + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default + # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def isna(self) -> Series: diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py new file mode 100644 index 0000000000000..7cb60a11644a3 --- /dev/null +++ b/pandas/tests/series/methods/test_case_when.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + base dataframe for testing + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise ValueError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if no of caselist is odd. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." + + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise Error from within Series.mask + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition. 
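For orientation, the API exercised by these tests looks like this in plain usage (a hedged sketch with illustrative values, assuming pandas 2.2 where case_when first landed):

    import pandas as pd

    ser = pd.Series([10, 20, 30])
    ser.case_when([(ser > 15, 0)])   # -> [10, 0, 0]; rows matching no condition keep the caller's values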
+ """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when booleans are derived from a computation + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when replacement is a list + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when replacement has an extension dtype + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when replacement is a Series + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if index is not RangeIndex + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) From 596ea0ba0511aa3c6c94baa0222a26c2d6291522 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 10 Jan 2024 12:47:41 +0000 Subject: [PATCH 038/152] 'Backport PR #56146: BUG raise pdep6 warning for loc full setter' (#56807) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexing.py | 20 +++++++++++++++ pandas/core/internals/blocks.py | 3 +++ pandas/tests/copy_view/test_indexing.py | 7 +++++- pandas/tests/frame/indexing/test_indexing.py | 21 ++++++++-------- pandas/tests/frame/indexing/test_setitem.py | 20 +++++++++++++++ pandas/tests/frame/methods/test_update.py | 5 +--- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/indexing/test_iloc.py | 6 +++-- pandas/tests/indexing/test_loc.py | 25 +++++++++++++------ pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 4 +-- pandas/tests/series/indexing/test_indexing.py | 4 +-- 13 files changed, 89 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e244794664b34..9d577aa5ac426 
100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -817,6 +817,7 @@ Conversion - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) +- Bug in ``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) Strings ^^^^^^^ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4be7e17035128..934ba3a4d7f29 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2141,6 +2141,26 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object + dtype = self.obj.dtypes.iloc[loc] + if dtype not in (np.void, object) and not self.obj.empty: + # - Exclude np.void, as that is a special case for expansion. + # We want to warn for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'a'] = .3 + # but not for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'b'] = .3 + # - Exclude `object`, as then no upcasting happens. + # - Exclude empty initial object with enlargement, + # as then there's nothing to be inconsistent with. + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. " + f"Value '{value}' has dtype incompatible with {dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + stacklevel=find_stack_level(), + ) self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 06fd9ebe47eae..70a27300bd60f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -499,6 +499,9 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_integer_dtype(self.values.dtype) and isna(other) and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) ): warn_on_upcast = False elif ( diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 2681c07f01990..479fa148f994a 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1144,11 +1144,16 @@ def test_set_value_copy_only_necessary_column( df_orig = df.copy() view = df[:] - if val == "a" and indexer[0] != slice(None): + if val == "a" and not warn_copy_on_write: with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val + if val == "a" and warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype|Setting a value on a view" + ): + indexer_func(df)[indexer] = val else: with tm.assert_cow_warning(warn_copy_on_write and val == 100): indexer_func(df)[indexer] = val diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 97e7ae15c6c63..22d9c7f26a57c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ 
b/pandas/tests/frame/indexing/test_indexing.py @@ -949,7 +949,8 @@ def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 expected["C"] = df["C"] @@ -1387,20 +1388,20 @@ def test_loc_expand_empty_frame_keep_midx_names(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "val, idxr, warn", + "val, idxr", [ - ("x", "a", None), # TODO: this should warn as well - ("x", ["a"], None), # TODO: this should warn as well - (1, "a", None), # TODO: this should warn as well - (1, ["a"], FutureWarning), + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), ], ) - def test_loc_setitem_rhs_frame(self, idxr, val, warn): + def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) with tm.assert_produces_warning( - warn, match="Setting an item of incompatible dtype" + FutureWarning, match="Setting an item of incompatible dtype" ): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) expected = DataFrame({"a": [np.nan, val]}) @@ -1996,7 +1997,7 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -2010,7 +2011,7 @@ def test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not pd.NaT: + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e802a56ecbc81..99233d3cd4cf3 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1381,3 +1381,23 @@ def test_frame_setitem_empty_dataframe(self): index=dti[:0], ) tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = True + expected = DataFrame({"a": [True, True]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = {0: 3.5, 1: 4.5} + expected = DataFrame({"a": [3.5, 4.5]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 7c7a0d23ff75f..20ba550beeb30 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -160,11 +160,8 @@ def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) 
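# A hedged sketch (an illustration, not part of the original hunk) of the
# PDEP6 rule these backported tests pin down: a full-column .loc assignment
# with a dtype-incompatible value now emits a FutureWarning before upcasting.
#     df = DataFrame({"a": [1, 2]})
#     df.loc[:, "a"] = 0.5   # FutureWarning: incompatible dtype
#     df.loc[:, "a"] = 3     # compatible value, no warning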
df["c"] = np.nan - if using_copy_on_write: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df.update({"c": Series(["foo"], index=[0])}) - else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - df["c"].update(Series(["foo"], index=[0])) expected = DataFrame( { diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6e818d79d5ba8..acd0675fd43ec 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2857,7 +2857,7 @@ def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type): ) result = DataFrame({key_val: [1, 2]}, columns=cols) expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols) - expected.iloc[:, 1] = expected.iloc[:, 1].astype(object) + expected.isetitem(1, expected.iloc[:, 1].astype(object)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 409eca42f404b..43dd3812e8b7d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -535,7 +535,8 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # if the assigned values cannot be held by existing integer arrays, # we cast - df.iloc[:, 0] = df.iloc[:, 0] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 if not using_array_manager: assert len(df._mgr.blocks) == 2 @@ -1471,6 +1472,7 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index fb0adc56c401b..61c44c8a2a8f4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -584,7 +584,8 @@ def test_loc_setitem_consistency(self, frame_for_consistency, val): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = val + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = val tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): @@ -598,7 +599,8 @@ def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = "foo" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): @@ -611,14 +613,16 @@ def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = 1.0 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - df.loc[:, "date"] = "string" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "string" expected = 
DataFrame({"date": Series(["string"])}) tm.assert_frame_equal(df, expected) @@ -678,9 +682,10 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - df.loc[:, ("Respondent", "Duration")] = df.loc[ - :, ("Respondent", "Duration") - ] / Timedelta(60_000_000_000) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ] / Timedelta(60_000_000_000) expected = Series( [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") @@ -1487,7 +1492,11 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - result.loc[:, idxer] = expected + with tm.assert_produces_warning( + FutureWarning if idxer == "var" else None, match="incompatible dtype" + ): + # See https://github.com/pandas-dev/pandas/issues/56223 + result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self, using_array_manager): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0eefb0b52c483..1da27ad173235 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -179,7 +179,7 @@ def test_frame_non_unique_columns(self, orient, data): # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000 + expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000) elif orient == "split": expected = df expected.columns = ["x", "x.1"] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ab8d22e567d27..27959609422f3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2984,9 +2984,9 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: - expected.loc[:, "B"] = np.nan + expected["B"] = np.nan elif right_empty: - expected.loc[:, ["C", "D"]] = np.nan + expected[["C", "D"]] = np.nan tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c52e47a812183..f4992b758af74 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -491,7 +491,7 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -505,7 +505,7 @@ def test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not NaT: + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning From 8757a3c9b35ae2be257f35495f1e017b4ca765fe Mon Sep 
17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 10 Jan 2024 17:27:15 +0100 Subject: [PATCH 039/152] Backport PR #56757 on branch 2.2.x (ENH: Implement interpolation for arrow and masked dtypes) (#56809) ENH: Implement interpolation for arrow and masked dtypes (#56757) * ENH: Implement interpolation for arrow and masked dtypes * Fixup * Fix typing * Update (cherry picked from commit 5fc2ed2703a1370207f4ebad834e665b6c2ad42f) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 40 ++++++++++++++ pandas/core/arrays/masked.py | 54 +++++++++++++++++++ pandas/core/missing.py | 14 +++-- .../tests/frame/methods/test_interpolate.py | 41 ++++++++++++-- 5 files changed, 143 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9d577aa5ac426..9a9ac769a4893 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -343,6 +343,7 @@ Other enhancements - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) +- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3858ce4cf0ea1..a5ce46ed612f3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -182,6 +182,7 @@ def floordiv_compat( AxisInt, Dtype, FillnaOptions, + InterpolateOptions, Iterator, NpDtype, NumpySorter, @@ -2048,6 +2049,45 @@ def _maybe_convert_setitem_value(self, value): raise TypeError(msg) from err return value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. 
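At the Series level the change above enables, roughly, the following (a hedged sketch, assuming pandas 2.2 with pyarrow installed):

    import pandas as pd

    ser = pd.Series([1, None, None, 3], dtype="int64[pyarrow]")
    ser.interpolate()   # -> [1.0, 1.666..., 2.333..., 3.0] as float64[pyarrow]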
+ """ + # NB: we return type(self) even if copy=False + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + @classmethod def _if_else( cls, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 545d45e450f3f..234d96e53a67c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,6 +22,7 @@ AxisInt, DtypeObj, FillnaOptions, + InterpolateOptions, NpDtype, PositionalIndexer, Scalar, @@ -98,6 +99,7 @@ NumpySorter, NumpyValueArrayLike, ) + from pandas.core.arrays import FloatingArray from pandas.compat.numpy import function as nv @@ -1491,6 +1493,58 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): else: return self.dtype.na_value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ff45662d0bdc8..c016aab8ad074 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -349,6 +349,7 @@ def interpolate_2d_inplace( limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, + mask=None, **kwargs, ) -> None: """ @@ -396,6 +397,7 @@ def func(yvalues: np.ndarray) -> None: limit_area=limit_area_validated, fill_value=fill_value, bounds_error=False, + mask=mask, **kwargs, ) @@ -440,6 +442,7 @@ def _interpolate_1d( fill_value: Any | None = None, bounds_error: bool = False, order: int | None = None, + mask=None, **kwargs, ) -> None: """ @@ -453,8 +456,10 @@ def _interpolate_1d( ----- Fills 'yvalues' in-place. 
""" - - invalid = isna(yvalues) + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) valid = ~invalid if not valid.any(): @@ -531,7 +536,10 @@ def _interpolate_1d( **kwargs, ) - if is_datetimelike: + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: yvalues[preserve_nans] = NaT.value else: yvalues[preserve_nans] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index e0641fcb65bd3..252b950004bea 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -508,8 +508,41 @@ def test_interpolate_empty_df(self): assert result is None tm.assert_frame_equal(df, expected) - def test_interpolate_ea_raise(self): + def test_interpolate_ea(self, any_int_ea_dtype): # GH#55347 - df = DataFrame({"a": [1, None, 2]}, dtype="Int64") - with pytest.raises(NotImplementedError, match="does not implement"): - df.interpolate() + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) From 24ea67fcf0cf982d011d249f2a711ef178e13065 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 10 Jan 2024 09:53:27 -0800 Subject: [PATCH 040/152] BLD: Pin numpy on 2.2.x (#56812) --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8724a25909543..2f70ade7b3afe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version=='3.11'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.22.4,<2; python_version<'3.11'", + "numpy>=1.23.2,<2; python_version=='3.11'", + "numpy>=1.26.0,<2; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.7" From 922a671b41dbdeb20856a07eb91b1949bc827a3c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Jan 2024 19:07:53 +0100 Subject: [PATCH 041/152] Backport PR #56594 on branch 2.2.x (DEPR: the method is_anchored() for offsets) (#56813) Backport PR #56594: DEPR: the method is_anchored() for offsets Co-authored-by: Natalia Mokeeva 
<91160475+natmokval@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/_libs/tslibs/offsets.pyx | 54 ++++++++++++++++++- .../indexes/interval/test_interval_range.py | 4 +- .../tseries/offsets/test_business_quarter.py | 19 ++++--- pandas/tests/tseries/offsets/test_fiscal.py | 22 ++++---- pandas/tests/tseries/offsets/test_offsets.py | 7 ++- pandas/tests/tseries/offsets/test_quarter.py | 19 ++++--- pandas/tests/tseries/offsets/test_ticks.py | 5 +- pandas/tests/tseries/offsets/test_week.py | 12 +++-- 9 files changed, 111 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9a9ac769a4893..b5df0a319bc18 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -665,11 +665,13 @@ Other Deprecations - Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) - Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) - Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) +- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b3788b6003e67..3a339171d0da2 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -756,11 +756,14 @@ cdef class BaseOffset: raise ValueError(f"{self} is a non-fixed frequency") def is_anchored(self) -> bool: - # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what is_anchored means. + # GH#55388 """ Return boolean whether the frequency is a unit frequency (n=1). + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``obj.n == 1`` instead. 
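For reference, the migration the new warning asks for (an illustrative sketch,
not part of the diff, assuming pandas 2.2):

    import pandas as pd

    offset = pd.offsets.QuarterEnd(startingMonth=1)

    # Deprecated in 2.2 (emits FutureWarning):
    #     offset.is_anchored()
    # Replacement spelled out by the warning for Quarter offsets:
    anchored = offset.n == 1 and offset.startingMonth is not None

    # For Tick subclasses (Hour, Minute, ...) the result was always False:
    hour_anchored = False  # instead of pd.offsets.Hour(2).is_anchored()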
+ Examples -------- >>> pd.DateOffset().is_anchored() @@ -768,6 +771,12 @@ cdef class BaseOffset: >>> pd.DateOffset(2).is_anchored() False """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 # ------------------------------------------------------------------ @@ -954,6 +963,27 @@ cdef class Tick(SingleConstructorOffset): return True def is_anchored(self) -> bool: + # GH#55388 + """ + Return False. + + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``False`` instead. + + Examples + -------- + >>> pd.offsets.Hour().is_anchored() + False + >>> pd.offsets.Hour(2).is_anchored() + False + """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use False instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return False # This is identical to BaseOffset.__hash__, but has to be redefined here @@ -2663,6 +2693,13 @@ cdef class QuarterOffset(SingleConstructorOffset): return f"{self._prefix}-{month}" def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.startingMonth is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.startingMonth is not None def is_on_offset(self, dt: datetime) -> bool: @@ -3308,6 +3345,13 @@ cdef class Week(SingleConstructorOffset): self._cache = state.pop("_cache", {}) def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.weekday is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.weekday is not None @apply_wraps @@ -3597,6 +3641,12 @@ cdef class FY5253Mixin(SingleConstructorOffset): self.variation = state.pop("variation") def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index d4d4a09c44d13..e8de59f84bcc6 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -84,9 +84,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.is_anchored() and tz is None: - # matches expected only for non-anchored offsets and tz naive - # (anchored/DST transitions cause unequal spacing in expected) + if not breaks.freq.n == 1 and tz is None: result = interval_range( start=start, end=end, periods=periods, name=name, closed=closed ) diff --git a/pandas/tests/tseries/offsets/test_business_quarter.py b/pandas/tests/tseries/offsets/test_business_quarter.py index 44a7f16ab039d..6d7a115054b7f 100644 --- a/pandas/tests/tseries/offsets/test_business_quarter.py +++ b/pandas/tests/tseries/offsets/test_business_quarter.py @@ -9,6 +9,7 @@ 
import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -54,9 +55,12 @@ def test_repr(self): assert repr(BQuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterBegin(startingMonth=1).is_anchored() - assert BQuarterBegin().is_anchored() - assert not BQuarterBegin(2, startingMonth=1).is_anchored() + msg = "BQuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -177,9 +181,12 @@ def test_repr(self): assert repr(BQuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterEnd(startingMonth=1).is_anchored() - assert BQuarterEnd().is_anchored() - assert not BQuarterEnd(2, startingMonth=1).is_anchored() + msg = "BQuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 7f8c34bc6832e..824e66a1ddef1 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -7,6 +7,7 @@ import pytest from pandas import Timestamp +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -295,15 +296,18 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter: def test_is_anchored(self): - assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).is_anchored() - assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() + msg = "FY5253Quarter.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).is_anchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() def test_equality(self): assert makeFY5253LastOfMonthQuarter( diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index ddf56e68b1611..62afb8b83d576 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -625,8 +625,11 @@ def test_default_constructor(self, dt): assert (dt + DateOffset(2)) == datetime(2008, 1, 4) def test_is_anchored(self): - assert not DateOffset(2).is_anchored() - assert DateOffset(1).is_anchored() + msg = "DateOffset.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() def test_copy(self): assert DateOffset(months=2).copy() == DateOffset(months=2) diff --git a/pandas/tests/tseries/offsets/test_quarter.py 
b/pandas/tests/tseries/offsets/test_quarter.py index d183645da507d..5fd3ba0a5fb87 100644 --- a/pandas/tests/tseries/offsets/test_quarter.py +++ b/pandas/tests/tseries/offsets/test_quarter.py @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -53,9 +54,12 @@ def test_repr(self): assert repr(QuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterBegin(startingMonth=1).is_anchored() - assert QuarterBegin().is_anchored() - assert not QuarterBegin(2, startingMonth=1).is_anchored() + msg = "QuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -161,9 +165,12 @@ def test_repr(self): assert repr(QuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterEnd(startingMonth=1).is_anchored() - assert QuarterEnd().is_anchored() - assert not QuarterEnd(2, startingMonth=1).is_anchored() + msg = "QuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index b68b91826bc6f..399b7038d3426 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -339,7 +339,10 @@ def test_tick_equalities(cls): @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().is_anchored() + msg = f"{cls.__name__}.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) diff --git a/pandas/tests/tseries/offsets/test_week.py b/pandas/tests/tseries/offsets/test_week.py index f42ff091af277..0cd6f769769ae 100644 --- a/pandas/tests/tseries/offsets/test_week.py +++ b/pandas/tests/tseries/offsets/test_week.py @@ -21,6 +21,7 @@ WeekOfMonth, ) +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -42,10 +43,13 @@ def test_corner(self): Week(weekday=-1) def test_is_anchored(self): - assert Week(weekday=0).is_anchored() - assert not Week().is_anchored() - assert not Week(2, weekday=2).is_anchored() - assert not Week(2).is_anchored() + msg = "Week.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week From e28b4016f7594df9f1a57af55a3f17ab30c0fc89 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Jan 2024 23:40:55 +0100 Subject: [PATCH 042/152] Backport PR #56788 on branch 2.2.x (Bug: Interchange protocol implementation does not allow for empty string columns) (#56816) Backport PR #56788: Bug: Interchange protocol implementation does not allow for empty string columns Co-authored-by: yashb <74137864+roadrollerdafjorst@users.noreply.github.com> --- 
doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/interchange/column.py | 2 +- pandas/tests/interchange/test_impl.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b5df0a319bc18..3a4c9438dbc21 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -936,6 +936,7 @@ Other - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) +- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) @@ -944,7 +945,6 @@ Other - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7f524d6823f30..ee1b5cd34a7f7 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -116,7 +116,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 27ea8ccdd17b1..c7b13f9fd7b2d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -364,6 +364,14 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +def test_empty_string_column(): + # https://github.com/pandas-dev/pandas/issues/56703 + df = pd.DataFrame({"a": []}, dtype=str) + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) + + def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") From 59c0a2dddc6736e189431cb65e3e943b94786db5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:14:18 +0100 Subject: [PATCH 043/152] Backport PR #56481 on branch 2.2.x (Revert "DEPR: make_block (#56422)") (#56814) Backport PR #56481: Revert "DEPR: make_block (#56422)" Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/internals/api.py | 11 +--------- pandas/tests/internals/test_api.py | 4 +--- pandas/tests/internals/test_internals.py | 20 ++++++------------- .../tests/io/parser/common/test_chunksize.py | 1 - pandas/tests/io/parser/test_parse_dates.py | 9 +++------ 6 files changed, 12 insertions(+), 35 
deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3a4c9438dbc21..4265447f05b8b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -662,7 +662,6 @@ Other Deprecations - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) -- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) @@ -722,6 +721,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index e5ef44d07061e..b0b3937ca47ea 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -9,12 +9,10 @@ from __future__ import annotations from typing import TYPE_CHECKING -import warnings import numpy as np from pandas._libs.internals import BlockPlacement -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( @@ -52,14 +50,6 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ - warnings.warn( - # GH#40226 - "make_block is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - if dtype is not None: dtype = pandas_dtype(dtype) @@ -123,6 +113,7 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): # GH#55139 + import warnings if name in [ "Block", diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index f816cef38b9ab..1251a6ae97a1c 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -68,9 +68,7 @@ def test_deprecations(name): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - msg = "make_block is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - blk = api.make_block(dti, placement=[0]) + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2265522bc7ecb..ce88bae6e02f2 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1383,11 +1383,9 @@ def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" - depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): - make_block(values, placement, ndim=2) + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1402,12 +1400,8 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) - warn = None if block_maker is not make_block else DeprecationWarning - msg = "make_block is deprecated and will be removed in a future version" - # NumpyExtensionArray, no dtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1415,16 +1409,14 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer taked dtype keyword # ndarray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 9660b283a491b..d5dc723e2c7c5 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -233,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.filterwarnings("ignore:make_block is deprecated:FutureWarning") def 
test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d8f362039ba13..623657b412682 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -33,12 +33,9 @@ from pandas.io.parsers import read_csv -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning"), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") From b7fa3b9bbc7b2474beb106f6b0da6c90c5caab20 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 03:23:04 +0100 Subject: [PATCH 044/152] Backport PR #56818 on branch 2.2.x (CI: Fix failing builds) (#56819) Backport PR #56818: CI: Fix failing builds Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/io/parser/common/test_chunksize.py | 17 ++++------------- .../tests/io/parser/common/test_read_errors.py | 17 +++-------------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index d5dc723e2c7c5..9f42cf674b0a7 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -220,14 +220,9 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) # Coercions should work without warnings. 
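For reference, what the make_block revert above restores (an illustrative
sketch, not part of the diff, assuming pandas 2.2 with the revert applied):

    import pandas as pd
    from pandas.core.internals import api

    dti = pd.date_range("2012", periods=3, tz="UTC")

    # make_block is callable again without a DeprecationWarning, and the
    # tz-aware values are broadcast to a 2D block (see test_api.py above).
    blk = api.make_block(dti, placement=[0])
    assert blk.shape == (1, 3)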
- warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame|make_block is deprecated" - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with monkeypatch.context() as m: - m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) - result = parser.read_csv(StringIO(data)) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) assert type(result.a[0]) is np.float64 assert result.a.dtype == float @@ -251,12 +246,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): buf = StringIO(data) if parser.engine == "pyarrow": - df = parser.read_csv_check_warnings( - DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated|" - "make_block is deprecated", + df = parser.read_csv( buf, - check_stacklevel=False, ) else: df = parser.read_csv_check_warnings( diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index db8b586d22fc0..f5a724bad4fa2 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -130,14 +130,9 @@ def test_catch_too_many_names(all_parsers): else "Number of passed names did not match " "number of header fields in the file" ) - depr_msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) @skip_pyarrow # CSV parse error: Empty CSV file or block @@ -168,13 +163,7 @@ def test_suppress_error_output(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame|make_block is deprecated" - - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) From 1c34627fd911c1a1cdeb62a4cee4e67e7699dc68 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 05:29:35 +0100 Subject: [PATCH 045/152] Backport PR #55327 on branch 2.2.x (COMPAT: Fix warning with numba >= 0.58.0) (#56820) Backport PR #55327: COMPAT: Fix warning with numba >= 0.58.0 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/util/numba_.py | 9 +++++++++ pandas/tests/window/test_numba.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4265447f05b8b..5de5bd58bd35f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -944,6 +944,7 @@ Other - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` 
with ``pyarrow.time64`` type (:issue:`56463`) +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`pandas.core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index b8d489179338b..4825c9fee24b1 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,11 +1,14 @@ """Common utilities for Numba operations""" from __future__ import annotations +import types from typing import ( TYPE_CHECKING, Callable, ) +import numpy as np + from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -83,6 +86,12 @@ def jit_user_function(func: Callable) -> Callable: if numba.extending.is_jitted(func): # Don't jit a user passed jitted function numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func else: numba_func = numba.extending.register_jitable(func) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index b1cc7ec186f19..139e1ff7f65fd 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -446,3 +446,10 @@ def test_table_method_ewm(self, data, method, axis, nogil, parallel, nopython): engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_npfunc_no_warnings(): + df = DataFrame({"col1": [1, 2, 3, 4, 5]}) + with tm.assert_produces_warning(False): + df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") From d11887b1f908ee8391103176e9c55b798bdc3be6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 13 Jan 2024 22:02:29 +0100 Subject: [PATCH 046/152] Backport PR #56849 on branch 2.2.x (REGR: freq "m" (as alias of deprecated "M") raises an error) (#56862) Backport PR #56849: REGR: freq "m" (as alias of deprecated "M") raises an error Co-authored-by: Marco Edward Gorelli --- pandas/_libs/tslibs/offsets.pyx | 6 +++--- pandas/tests/indexes/datetimes/test_date_range.py | 11 +++++++++++ pandas/tests/tslibs/test_to_offset.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 3a339171d0da2..205ab6f01f8c6 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4845,15 +4845,15 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name in c_OFFSET_DEPR_FREQSTR: + if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", + f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR[name] + name = c_OFFSET_DEPR_FREQSTR[name.upper()] if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: if name.startswith("Y"): raise ValueError( diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 
44dd64e162413..d26bee80003e9 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -822,6 +822,17 @@ def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): result = date_range("1/1/2000", periods=2, freq=freq_depr) tm.assert_index_equal(result, expected) + def test_to_offset_with_lowercase_deprecated_freq(self) -> None: + # https://github.com/pandas-dev/pandas/issues/56847 + msg = ( + "'m' is deprecated and will be removed in a future version, please use " + "'ME' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("2010-01-01", periods=2, freq="m") + expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + tm.assert_index_equal(result, expected) + def test_date_range_bday(self): sdate = datetime(1999, 12, 25) idx = date_range(start=sdate, freq="1B", periods=20) diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ef68408305232..6e654e65a36d6 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -45,6 +45,7 @@ def test_to_offset_negative(freqstr, expected): assert result.n == expected +@pytest.mark.filterwarnings("ignore:.*'m' is deprecated.*:FutureWarning") @pytest.mark.parametrize( "freqstr", [ From b9dd271d170002e1ddaa44ed76f665204d897acb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 15 Jan 2024 01:23:47 +0100 Subject: [PATCH 047/152] Backport PR #56873 on branch 2.2.x (CI: unxfail adbc-driver-postgresql test) (#56875) Backport PR #56873: CI: unxfail adbc-driver-postgresql test Co-authored-by: Marco Edward Gorelli --- pandas/tests/io/test_sql.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6645aefd4f0a7..791b6da3deeca 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2229,12 +2229,14 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="categorical dtype not implemented for ADBC postgres driver", - strict=True, + adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore") + if adbc is not None and Version(adbc.__version__) < Version("0.9.0"): + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC postgres driver", + strict=True, + ) ) - ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) From 8e25417474e6d015f70d6b22463648a5b7371470 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 15 Jan 2024 20:57:41 +0100 Subject: [PATCH 048/152] Backport PR #56891 on branch 2.2.x (DOC: Add deprecated markers for downcast keyword) (#56893) Backport PR #56891: DOC: Add deprecated markers for downcast keyword Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..f8728c61e46fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7187,6 +7187,8 @@ def fillna( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. 
float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7522,6 +7524,8 @@ def ffill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7713,6 +7717,8 @@ def bfill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None From 55c9fbd382cc65d896088927c56e811373308a47 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 16 Jan 2024 22:54:47 +0100 Subject: [PATCH 049/152] Backport PR #56906 on branch 2.2.x (DEPR: freq ''2BQ-SEP" for to_period should raise an error) (#56916) Backport PR #56906: DEPR: freq ''2BQ-SEP" for to_period should raise an error Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- pandas/core/arrays/datetimes.py | 11 +++++------ .../indexes/datetimes/methods/test_to_period.py | 17 ----------------- .../tests/indexes/period/test_constructors.py | 9 +++++++++ 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b7ddc4a72957..02656b655a0c6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -39,10 +39,7 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import ( - abbrev_to_npy_unit, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -1232,8 +1229,10 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq - if isinstance(self.freq, BaseOffset): - freq = freq_to_period_freqstr(self.freq.n, self.freq.name) + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr if freq is None: raise ValueError( diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 42a3f3b0f7b42..00c0216a9b3b5 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -111,23 +111,6 @@ def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): assert prng.freq == freq_depr - @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BQE-SEP", "2BQ-SEP"), - ("2BYE-MAR", "2BY-MAR"), - ], - ) - def test_to_period_frequency_BQ_BY_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
- - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - prng.freq == freq_depr - def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 rng = date_range( diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 387dc47c48d20..0cd4ddef5a3c9 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -48,6 +48,15 @@ def test_period_index_frequency_invalid_freq(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq[1:]}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + class TestPeriodIndex: def test_from_ordinals(self): From d7dd696176baa62411b873e6b4487ee309caa227 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:00:40 +0100 Subject: [PATCH 050/152] Backport PR #56910 on branch 2.2.x (DEPR: lowercase freqs 'ye', 'qe', etc. raise a ValueError) (#56924) Backport PR #56910: DEPR: lowercase freqs 'ye', 'qe', etc. raise a ValueError Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- pandas/_libs/tslibs/offsets.pyx | 91 ++++++++++++------- pandas/tests/arrays/test_datetimes.py | 42 +++++++++ .../indexes/datetimes/test_partial_slicing.py | 2 +- .../tests/indexes/period/test_constructors.py | 15 ++- .../tests/indexes/period/test_period_range.py | 44 ++++++--- pandas/tests/resample/test_period_index.py | 29 ++++++ pandas/tests/scalar/period/test_period.py | 4 +- pandas/tests/tslibs/test_to_offset.py | 44 +++++++++ 8 files changed, 218 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 205ab6f01f8c6..764a044f32c82 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4711,29 +4711,7 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = { - "h", - "bh", - "cbh", - "MS", - "ms", - "s", - "me", - "qe", - "qe-dec", - "qe-jan", - "qe-feb", - "qe-mar", - "qe-apr", - "qe-may", - "qe-jun", - "qe-jul", - "qe-aug", - "qe-sep", - "qe-oct", - "qe-nov", - "ye", -} +_dont_uppercase = _dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4752,7 +4730,29 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if name.lower() not in _dont_uppercase: + if ( + name not in _lite_rule_alias + and (name.upper() in _lite_rule_alias) + and name != "ms" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif ( + name not in _lite_rule_alias + and (name.lower() in _lite_rule_alias) + and name != "MS" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.lower()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name not in _dont_uppercase: name = name.upper() name = _lite_rule_alias.get(name, name) name = _lite_rule_alias.get(name.lower(), name) @@ 
-4845,7 +4845,7 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR: + if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " @@ -4854,31 +4854,52 @@ cpdef to_offset(freq, bint is_period=False): stacklevel=find_stack_level(), ) name = c_OFFSET_DEPR_FREQSTR[name.upper()] - if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.startswith("Y"): + if (not is_period and + name != name.upper() and + name.lower() not in {"s", "ms", "us", "ns"} and + name.upper().split("-")[0].endswith(("S", "E"))): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = name.upper() + if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("Y"): raise ValueError( - f"for Period, please use \'Y{name[2:]}\' " + f"for Period, please use \'Y{name.upper()[2:]}\' " f"instead of \'{name}\'" ) - if (name.startswith("B") or - name.startswith("S") or name.startswith("C")): + if (name.upper().startswith("B") or + name.upper().startswith("S") or + name.upper().startswith("C")): raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) else: raise ValueError( f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " f"instead of \'{name}\'" ) - elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: - if name.startswith("A"): + elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("A"): warnings.warn( f"\'{name}\' is deprecated and will be removed in a future " - f"version, please use \'{c_DEPR_ABBREVS.get(name)}\' " + f"version, please use " + f"\'{c_DEPR_ABBREVS.get(name.upper())}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name.upper() != name: + warnings.warn( + f"\'{name}\' is deprecated and will be removed in " + f"a future version, please use \'{name.upper()}\' " f"instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR.get(name) + name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9a576be10d5ca..8f0576cc65a27 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -766,12 +766,18 @@ def test_iter_zoneinfo_fold(self, tz): "freq, freq_depr", [ ("2ME", "2M"), + ("2SME", "2SM"), + ("2SME", "2sm"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1YE", "1Y"), ("2YE-MAR", "2Y-MAR"), ("1YE", "1A"), ("2YE-MAR", "2A-MAR"), + ("2ME", "2m"), + ("2QE-SEP", "2q-sep"), + ("2YE-MAR", "2a-mar"), + ("2YE", "2y"), ], ) def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): @@ -784,6 +790,42 @@ def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + def test_date_range_uppercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated 
and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], + ) + def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 0ebb88afb6c86..8b493fc61cb58 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -236,7 +236,7 @@ def test_partial_slice_second_precision(self): rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, - freq="US", + freq="us", ) s = Series(np.arange(20), rng) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 0cd4ddef5a3c9..892eb7b4a00d1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -26,9 +26,12 @@ class TestPeriodIndexDisallowedFreqs: ("2M", "2ME"), ("2Q-MAR", "2QE-MAR"), ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), ], ) - def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" @@ -38,7 +41,7 @@ def test_period_index_frequency_ME_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) + @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) def test_period_index_frequency_invalid_freq(self, freq_depr): # GH#9586 msg = f"Invalid frequency: {freq_depr[1:]}" @@ -547,7 +550,9 @@ def test_period_range_length(self): assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -576,7 +581,9 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - vals = [end_intv, Period("2006-12-31", "w")] + msg = "'w' is deprecated and will be removed in a future version." 
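For reference, the casing convention these tests pin down (an illustrative
sketch, not part of the diff, assuming pandas 2.2): offset aliases are
upper-case, tick aliases are lower-case, and the opposite casing now warns
but still resolves.

    import pandas as pd

    # Canonical spellings -- no warning:
    pd.date_range("2000-01-01", periods=3, freq="2ME")  # month end
    pd.date_range("2000-01-01", periods=3, freq="2h")   # hourly

    # Deprecated casings -- FutureWarning, then resolved as above:
    pd.date_range("2000-01-01", periods=3, freq="2m")   # use "2ME"
    pd.date_range("2000-01-01", periods=3, freq="2H")   # use "2h"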
+ with tm.assert_produces_warning(FutureWarning, match=msg): + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 2543b49089948..6f8e6d07da8bf 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,7 +181,9 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - end_w = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_w = Period("2006-12-31", "1w") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") @@ -203,19 +205,37 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - def test_H_deprecated_from_time_series(self): + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2Y", "2A"), + ("2Y", "2a"), + ("2Y-AUG", "2A-AUG"), + ("2Y-AUG", "2A-aug"), + ], + ) + def test_a_deprecated_from_time_series(self, freq, freq_depr): # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq="2H", start="1/1/2001", end="12/1/2009") + period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) + def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.upper()[1:]}' instead." - @pytest.mark.parametrize("freq_depr", ["2A", "A-DEC", "200A-AUG"]) - def test_a_deprecated_from_time_series(self, freq_depr): - # GH#52536 - freq_msg = freq_depr[freq_depr.index("A") :] - msg = ( - f"'{freq_msg}' is deprecated and will be removed in a future version, " - f"please use 'Y{freq_msg[1:]}' instead." 
- ) with tm.assert_produces_warning(FutureWarning, match=msg): period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index eb80f56dd7d4b..451d2a83c1d5e 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1006,6 +1006,32 @@ def test_resample_t_l_deprecated(self): result = ser.resample("T").mean() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "freq, freq_depr, freq_res, freq_depr_res, data", + [ + ("2Q", "2q", "2Y", "2y", [0.5]), + ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ], + ) + def test_resample_lowercase_frequency_deprecated( + self, freq, freq_depr, freq_res, freq_depr_res, data + ): + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_res[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) + expected = Series(data=data, index=rng) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): + result = ser.resample(freq_depr_res).mean() + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "offset", [ @@ -1031,6 +1057,9 @@ def test_asfreq_invalid_period_freq(self, offset, series_and_frame): ("2Q-FEB", "2QE-FEB"), ("2Y", "2YE"), ("2Y-MAR", "2YE-MAR"), + ("2M", "2me"), + ("2Q", "2qe"), + ("2Y-MAR", "2ye-mar"), ], ) def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index aa4a8b152b19f..d819e903a0bae 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -106,7 +106,9 @@ def test_construction(self): assert i1 == i3 i1 = Period("1982", freq="min") - i2 = Period("1982", freq="MIN") + msg = "'MIN' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = Period("1982", freq="MIN") assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 6e654e65a36d6..8ca55648f3780 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -173,3 +173,47 @@ def test_to_offset_pd_timedelta(kwargs, expected): def test_anchored_shortcuts(shortcut, expected): result = to_offset(shortcut) assert result == expected + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." 
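For reference, the Period-specific rule exercised in the tests above (an
illustrative sketch, not part of the diff, assuming pandas 2.2): the
end-anchored aliases such as "ME" are offset-only, so period constructors
raise outright instead of warning.

    import pandas as pd

    pd.period_range("2020-01", periods=3, freq="2M")  # periods keep "M"

    try:
        pd.period_range("2020-01", periods=3, freq="2ME")
    except ValueError as exc:
        print(exc)  # for Period, please use 'M' instead of 'ME'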
+ + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2H", + "2BH", + "2MIN", + "2S", + "2Us", + "2NS", + ], +) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) From 797cbb7c3cfb7b4a97240603828e1d1641267147 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:05:16 +0100 Subject: [PATCH 051/152] Backport PR #56930 on branch 2.2.x (DOC: update install instruction with correct Python version support (including 3.12)) (#56931) Backport PR #56930: DOC: update install instruction with correct Python version support (including 3.12) Co-authored-by: Joris Van den Bossche --- doc/source/getting_started/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1d7eca5223544..b9f7d64d4b2f8 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -21,7 +21,7 @@ Instructions for installing :ref:`from source `, Python version support ---------------------- -Officially Python 3.9, 3.10 and 3.11. +Officially Python 3.9, 3.10, 3.11 and 3.12. Installing pandas ----------------- From 988c3a4feefeeaa53a7fee76c7c977010b53c6c8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 23:29:35 +0100 Subject: [PATCH 052/152] Backport PR #56824 on branch 2.2.x (DOC: 2.2.0 whatsnew cleanups) (#56933) Backport PR #56824: DOC: 2.2.0 whatsnew cleanups Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.4.rst | 2 +- doc/source/whatsnew/v2.2.0.rst | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 57b83a294963b..73b1103c1bd37 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -42,4 +42,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.3..v2.1.4|HEAD +.. contributors:: v2.1.3..v2.1.4 diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5de5bd58bd35f..ceb67b4ef956c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_220: -What's new in 2.2.0 (Month XX, 2024) ------------------------------------- +What's new in 2.2.0 (January XX, 2024) +-------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -436,12 +436,6 @@ index levels when joining on two indexes with different levels (:issue:`34133`). result -.. --------------------------------------------------------------------------- -.. _whatsnew_220.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. 
From 988c3a4feefeeaa53a7fee76c7c977010b53c6c8 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Wed, 17 Jan 2024 23:29:35 +0100
Subject: [PATCH 052/152] Backport PR #56824 on branch 2.2.x (DOC: 2.2.0 whatsnew cleanups) (#56933)

Backport PR #56824: DOC: 2.2.0 whatsnew cleanups

Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.4.rst |  2 +-
 doc/source/whatsnew/v2.2.0.rst | 24 ++++++++++--------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index 57b83a294963b..73b1103c1bd37 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -42,4 +42,4 @@ Bug fixes
 Contributors
 ~~~~~~~~~~~~
 
-.. contributors:: v2.1.3..v2.1.4|HEAD
+.. contributors:: v2.1.3..v2.1.4
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 5de5bd58bd35f..ceb67b4ef956c 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -1,7 +1,7 @@
 .. _whatsnew_220:
 
-What's new in 2.2.0 (Month XX, 2024)
-------------------------------------
+What's new in 2.2.0 (January XX, 2024)
+--------------------------------------
 
 These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog
 including other versions of pandas.
@@ -436,12 +436,6 @@ index levels when joining on two indexes with different levels (:issue:`34133`).
 
     result
 
-.. ---------------------------------------------------------------------------
-.. _whatsnew_220.api_breaking:
-
-Backwards incompatible API changes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 .. _whatsnew_220.api_breaking.deps:
 
 Increased minimum versions for dependencies
@@ -820,7 +814,7 @@ Conversion
 - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
 - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
 - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
-- Bug in ``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
+- Bug in :meth:`DataFrame.loc` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
 
 Strings
 ^^^^^^^
@@ -830,10 +824,10 @@
 - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
 - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
 - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
+- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when regex ends in literal ``\$`` (:issue:`56652`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
-- Bug in :meth:`str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
 - Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)
 
 Interval
@@ -892,7 +886,6 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
 - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
 - Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`)
@@ -906,6 +899,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
 - Bug in :meth:`DataFrame.resample` where bin edges
were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping @@ -927,16 +921,16 @@ Reshaping Sparse ^^^^^^ -- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) +- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) Other ^^^^^ - :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) +- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) -- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) @@ -944,10 +938,12 @@ Other - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) -- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`pandas.core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: Contributors ~~~~~~~~~~~~ + +.. 
contributors:: v2.1.4..v2.2.0|HEAD From 74fa7402e487c2a6336ea5291990c9f269c5001a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 18 Jan 2024 07:21:14 -0800 Subject: [PATCH 053/152] Backport PR #56445: Adjust merge tests for new string option (#56938) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 46 +++++++++++-------- pandas/tests/reshape/merge/test_merge_asof.py | 15 ++++-- pandas/tests/reshape/merge/test_multi.py | 2 +- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 1d5ed2d7373ce..9a2f18f33bce5 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -631,7 +631,7 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -776,7 +776,7 @@ def test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 27959609422f3..ed49f3b758cc5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -316,14 +319,15 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_array_manager, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -667,11 +671,13 @@ def test_merge_nan_right(self): "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" with pytest.raises(MergeError, match=msg): 
merge(df, df2) @@ -1498,7 +1504,7 @@ def test_different(self, right_vals): # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] @@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result = merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1867,25 +1873,27 @@ def right(): class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered): ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right): lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "string" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns(): expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - tm.assert_frame_equal(result, 
expected)
+    tm.assert_frame_equal(result, expected, check_dtype=False)
 
 
 def test_merge_datetime_upcast_dtype():
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index b656191cc739d..a2e22ea73fd86 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -3081,8 +3081,11 @@ def test_on_float_by_int(self):
 
         tm.assert_frame_equal(result, expected)
 
-    def test_merge_datatype_error_raises(self):
-        msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
+    def test_merge_datatype_error_raises(self, using_infer_string):
+        if using_infer_string:
+            msg = "incompatible merge keys"
+        else:
+            msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
 
         left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
         right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
@@ -3134,7 +3137,7 @@ def test_merge_on_nans(self, func, side):
         else:
             merge_asof(df, df_null, on="a")
 
-    def test_by_nullable(self, any_numeric_ea_dtype):
+    def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string):
         # Note: this test passes if instead of using pd.array we use
         # np.array([np.nan, 1]). Other than that, I (@jbrockmendel)
         # have NO IDEA what the expected behavior is.
@@ -3176,6 +3179,8 @@ def test_by_nullable(self, any_numeric_ea_dtype):
             }
         )
         expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object)
+        if using_infer_string:
+            expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]")
         tm.assert_frame_equal(result, expected)
 
     def test_merge_by_col_tz_aware(self):
@@ -3201,7 +3206,7 @@ def test_merge_by_col_tz_aware(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_by_mixed_tz_aware(self):
+    def test_by_mixed_tz_aware(self, using_infer_string):
         # GH 26649
         left = pd.DataFrame(
             {
@@ -3225,6 +3230,8 @@ def test_by_mixed_tz_aware(self):
             columns=["by_col1", "by_col2", "on_col", "value_x"],
         )
         expected["value_y"] = np.array([np.nan], dtype=object)
+        if using_infer_string:
+            expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]")
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"])
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index b1aa6b88bc4ee..402ff049884ba 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -639,7 +639,7 @@ def test_join_multi_levels_outer(self, portfolio, household, expected):
             axis=0,
             sort=True,
         ).reindex(columns=expected.columns)
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_index_type=False)
 
     def test_join_multi_levels_invalid(self, portfolio, household):
         portfolio = portfolio.copy()
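The fixture-driven branches added throughout this patch all hinge on the opt-in string inference option. A minimal sketch of the behaviour difference the tests account for (assuming pandas 2.2 with pyarrow installed; frame contents are illustrative):

    import pandas as pd

    with pd.option_context("future.infer_string", True):
        left = pd.DataFrame({"key": [1, 2], "a": ["x", "y"]})
        right = pd.DataFrame({"key": [1, 2], "b": ["u", "v"]})
        merged = left.merge(right, on="key")

    # Under the option, "a" and "b" come out as string[pyarrow_numpy]
    # rather than object, which is why the expected dtypes above
    # became conditional on using_infer_string.
    print(merged.dtypes)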
From 160d7a154f61fb55611626c311a1b0216828dea8 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Thu, 18 Jan 2024 21:47:02 +0100
Subject: [PATCH 054/152] Backport PR #56587 on branch 2.2.x (ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export)) (#56944)

Backport PR #56587: ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export)

Co-authored-by: Joris Van den Bossche
---
 pandas/compat/_optional.py                 |  5 ++-
 pandas/core/frame.py                       | 27 +++++++++++++
 pandas/tests/frame/test_arrow_interface.py | 45 ++++++++++++++++++++++
 pandas/tests/test_optional_dependency.py   | 14 +++++++
 4 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 pandas/tests/frame/test_arrow_interface.py

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 9d04d7c0a1216..2bc6cd46f09a7 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -120,9 +120,8 @@ def import_optional_dependency(
     The imported module, when found and the version is correct.
     None is returned when the package is not found and `errors`
     is False, or when the package's version is too old and `errors`
-    is ``'warn'``.
+    is ``'warn'`` or ``'ignore'``.
     """
-    assert errors in {"warn", "raise", "ignore"}
 
     package_name = INSTALL_MAPPING.get(name)
@@ -163,5 +162,7 @@
             return None
         elif errors == "raise":
             raise ImportError(msg)
+        else:
+            return None
 
     return module
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 15ccbd602c9c8..734756cb8f7c8 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -987,6 +987,33 @@ def __dataframe_consortium_standard__(
         )
         return convert_to_standard_compliant_dataframe(self, api_version=api_version)
 
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas DataFrame as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas DataFrame to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
+        in its handling of the index, i.e. store the index as a column except
+        for RangeIndex).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if requested_schema is not None:
+            requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
+        table = pa.Table.from_pandas(self, schema=requested_schema)
+        return table.__arrow_c_stream__()
+
     # ----------------------------------------------------------------------
 
     @property
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
new file mode 100644
index 0000000000000..ac7b51cbdfa92
--- /dev/null
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -0,0 +1,45 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_arrow_interface():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    capsule = df.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    table = pa.table(df)
+    expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.table(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="15.0")
+def test_dataframe_to_arrow():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    table = pa.RecordBatchReader.from_stream(df)
+    expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.RecordBatchReader.from_stream(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
diff --git
a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index c1d1948d6c31a..52b5f636b1254 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule From a95029a77f1c00678dda82f76a1b53b4b161b2a0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 03:56:51 +0100 Subject: [PATCH 055/152] Backport PR #56947 on branch 2.2.x (DOC: Set date for 2.2) (#56950) Backport PR #56947: DOC: Set date for 2.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ceb67b4ef956c..d9ab0452c8334 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_220: -What's new in 2.2.0 (January XX, 2024) +What's new in 2.2.0 (January 19, 2024) -------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog From cc37a13c5890a1d8fff7f8314008161b8fb1e21e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 18:14:09 +0100 Subject: [PATCH 056/152] Backport PR #56949 on branch 2.2.x (CI: avoid FutureWarnings in to_xarray tests) (#56961) Backport PR #56949: CI: avoid FutureWarnings in to_xarray tests Co-authored-by: Luke Manley --- pandas/tests/generic/test_to_xarray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index e0d79c3f15282..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -41,7 +41,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): df.index.name = "foo" df.columns.name = "bar" result = df.to_xarray() - assert result.dims["foo"] == 4 + assert result.sizes["foo"] == 4 assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) @@ -62,7 +62,7 @@ def test_to_xarray_empty(self, df): df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 + assert result.sizes["foo"] == 0 assert isinstance(result, Dataset) def test_to_xarray_with_multiindex(self, df, using_infer_string): @@ -71,8 +71,8 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): # MultiIndex df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 4 + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 assert len(result.coords) == 2 assert len(result.data_vars) == 8 
tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) From 859c030f5ea378a0f83aeb1de0c6c0fa6b420604 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 19:21:44 +0100 Subject: [PATCH 057/152] Backport PR #56922 on branch 2.2.x (REGR: DatetimeTZDtype __from_arrow__ interprets UTC values as wall time) (#56962) Backport PR #56922: REGR: DatetimeTZDtype __from_arrow__ interprets UTC values as wall time Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/arrays/datetimes/test_constructors.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e90e92fa0ee1c..1c43ef55c11d7 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -919,7 +919,7 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: else: np_arr = array.to_numpy() - return DatetimeArray._from_sequence(np_arr, dtype=self, copy=False) + return DatetimeArray._simple_new(np_arr, dtype=self) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index daf4aa3b47f56..3652b5fec46bb 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -223,7 +223,7 @@ def test_2d(self, order): ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), ], ) -def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( +def test_from_arrow_with_different_units_and_timezones_with( pa_unit, pd_unit, pa_tz, pd_tz, data ): pa = pytest.importorskip("pyarrow") @@ -233,9 +233,8 @@ def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence( - np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), - dtype=dtype, + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False ) tm.assert_extension_array_equal(result, expected) From dfd0aeda19e475314bb874d46507269777795793 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 19:22:16 +0100 Subject: [PATCH 058/152] Backport PR #56896 on branch 2.2.x (DEPS: Add warning if pyarrow is not installed) (#56963) Backport PR #56896: DEPS: Add warning if pyarrow is not installed Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 5 ++++- pandas/__init__.py | 33 ++++++++++++++++++++++++++++++-- pandas/compat/pyarrow.py | 2 ++ pandas/tests/test_common.py | 22 +++++++++++++++++++++ 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index dd5d090e098b0..a3cffb4b03b93 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,7 +92,10 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # Currently restricted the warnings that error to Deprecation Warnings from numpy + # done since pyarrow isn't compatible with 
numpydev always + # TODO: work with pyarrow to revert this? + test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/pandas/__init__.py b/pandas/__init__.py index 7fab662ed2de4..ed524c2bb3619 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -202,8 +202,37 @@ FutureWarning, stacklevel=2, ) -# Don't allow users to use pandas.os or pandas.warnings -del os, warnings + +# DeprecationWarning for missing pyarrow +from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found + +if pa_version_under10p1: + # pyarrow is either too old or nonexistent, warn + from pandas.compat._optional import VERSIONS + + if pa_not_found: + pa_msg = "was not found to be installed on your system." + else: + pa_msg = ( + f"was too old on your system - pyarrow {VERSIONS['pyarrow']} " + "is the current minimum supported version as of this release." + ) + + warnings.warn( + f""" +Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), +(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) +but {pa_msg} +If this would cause problems for you, +please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 + """, # noqa: E501 + DeprecationWarning, + stacklevel=2, + ) + del VERSIONS, pa_msg + +# Delete all unnecessary imported modules +del pa_version_under10p1, pa_not_found, warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..2e151123ef2c9 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,6 +8,7 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) + pa_not_found = False pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") @@ -16,6 +17,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: + pa_not_found = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e8a1c961c8cb6..fe24755e8cc23 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import Series import pandas._testing as tm @@ -265,3 +267,23 @@ def test_bz2_missing_import(): code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) + + +@td.skip_if_installed("pyarrow") +@pytest.mark.parametrize("module", ["pandas", "pandas.arrays"]) +def test_pyarrow_missing_warn(module): + # GH56896 + response = subprocess.run( + [sys.executable, "-c", f"import {module}"], + capture_output=True, + check=True, + ) + msg = """ +Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), +(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) +but was not found to be installed on your system. 
+If this would cause problems for you, +please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 +""" # noqa: E501 + stderr_msg = response.stderr.decode("utf-8") + assert msg in stderr_msg, stderr_msg From b070774d9aa6e3ed0667c9ebdaa82ddce79a6d4b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:05:34 -0800 Subject: [PATCH 059/152] =?UTF-8?q?Backport=20PR=20#56952:=20DEPR:=20Make?= =?UTF-8?q?=20FutureWarning=20into=20DeprecationWarning=20=E2=80=A6=20(#56?= =?UTF-8?q?964)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Backport PR #56952: DEPR: Make FutureWarning into DeprecationWarning for groupby.apply * Update test_groupby.py * fix finally? --------- Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 2 +- pandas/core/resample.py | 2 +- pandas/tests/extension/base/groupby.py | 12 +- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 4 +- .../groupby/methods/test_value_counts.py | 2 +- pandas/tests/groupby/test_apply.py | 132 +++++++++--------- pandas/tests/groupby/test_apply_mutate.py | 18 +-- pandas/tests/groupby/test_categorical.py | 6 +- pandas/tests/groupby/test_counting.py | 2 +- pandas/tests/groupby/test_groupby.py | 32 ++--- pandas/tests/groupby/test_groupby_dropna.py | 2 +- pandas/tests/groupby/test_groupby_subclass.py | 10 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 10 +- .../tests/groupby/transform/test_transform.py | 4 +- pandas/tests/resample/test_datetime_index.py | 10 +- pandas/tests/resample/test_resample_api.py | 2 +- .../tests/resample/test_resampler_grouper.py | 37 ++--- pandas/tests/resample/test_time_grouper.py | 2 +- pandas/tests/window/test_groupby.py | 32 ++--- 21 files changed, 167 insertions(+), 158 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 089e15afd465b..5b18455dbe8a8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1831,7 +1831,7 @@ def f(g): message=_apply_groupings_depr.format( type(self).__name__, "apply" ), - category=FutureWarning, + category=DeprecationWarning, stacklevel=find_stack_level(), ) except TypeError: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 48a5f85e1c388..3e9507bd4347f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2906,7 +2906,7 @@ def _apply( new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") with rewrite_warning( target_message=target_message, - target_category=FutureWarning, + target_category=DeprecationWarning, new_message=new_message, ): result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 75628ea177fc2..414683b02dcba 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -114,13 +114,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", 
group_keys=False).A.apply(groupby_apply_op) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6e1e743eb60de..d8b92091260a3 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1825,7 +1825,7 @@ def test_unstack_bug(self, future_stack): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 0596193c137e1..00136e572288e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -502,7 +502,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] @@ -510,7 +510,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 2fa79c815d282..8e25177368d8b 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -330,7 +330,7 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = FutureWarning if groupby == "column" else None + warn = DeprecationWarning if groupby == "column" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gp.apply( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 34b6e7c4cde5f..0ddacfab8c102 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -28,7 +28,7 @@ def store(group): groups.append(group) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): 
df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) @@ -115,7 +115,7 @@ def test_apply_index_date_object(using_infer_string): ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("date", group_keys=False).apply( lambda x: x["time"][x["value"].idxmax()] ) @@ -227,7 +227,7 @@ def f_constant_df(group): del names[:] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -247,7 +247,7 @@ def test_group_apply_once_per_group2(capsys): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("group_by_column", group_keys=False).apply( lambda df: print("function_called") ) @@ -271,9 +271,9 @@ def fast(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -297,7 +297,7 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -342,9 +342,9 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_not_as, exp) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering @@ -359,7 +359,7 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -390,17 +390,17 @@ def desc3(group): return result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -432,7 +432,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -445,7 +445,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) @@ -456,7 +456,7 @@ def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan @@ -481,7 +481,7 @@ def trans2(group): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) @@ -512,7 +512,7 @@ def test_apply_chunk_view(group_keys): df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: @@ -535,7 +535,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grouped.apply(lambda x: x.sort_values("value", inplace=True)) @@ -554,7 +554,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -580,7 +580,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -620,9 +620,9 @@ def filt2(x): return x[x.category == "c"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -643,7 +643,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): tm.assert_series_equal(result, expected) else: msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis @@ -690,7 +690,7 @@ def f(g): return g msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(f) assert "value3" in result @@ -706,11 +706,11 @@ def test_apply_numeric_coercion_when_datetime(): {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -723,7 +723,7 @@ def get_B(g): return g.iloc[0][["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A @@ -750,9 +750,9 @@ def predictions(tool): df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = 
df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -769,7 +769,7 @@ def test_apply_aggregating_timedelta_and_datetime(): ) df["time_delta_zero"] = df.datetime - df.datetime msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("clientid").apply( lambda ddf: Series( {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} @@ -818,13 +818,13 @@ def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] @@ -870,7 +870,7 @@ def test_func(x): pass msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -887,9 +887,9 @@ def test_func(x): return x.iloc[[0, -1]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) @@ -904,7 +904,7 @@ def test_groupby_apply_return_empty_chunk(): df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], @@ -933,7 +933,7 @@ def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) @@ -952,7 +952,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) dtype = "string" if using_infer_string else "object" @@ -992,7 +992,7 @@ def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" @@ -1035,7 +1035,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): columns=["observation", "color", "mood", "intensity", "score"], ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = "string" if using_infer_string else object expected = Series( @@ -1058,7 +1058,7 @@ def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1083,7 +1083,7 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values) # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1097,7 +1097,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") @@ -1110,7 +1110,7 @@ def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], @@ -1148,7 +1148,7 @@ def test_apply_result_type(group_keys, udf): # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) @@ -1165,9 +1165,9 @@ def test_result_order_group_keys_false(): # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1181,11 +1181,11 @@ def test_apply_with_timezones_aware(): df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df1.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df2.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) @@ -1205,7 +1205,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): ) expected = DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + {"b": [15, 6], "c": [150, 60]}, index=Index([88, 99], name="a"), ) @@ -1213,7 +1213,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): grp = df.groupby(by="a") msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1221,7 +1221,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) @@ -1244,7 +1244,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): grp = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] @@ -1294,7 +1294,7 @@ def test_apply_dropna_with_indexed_same(dropna): index=list("xxyxz"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if 
dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1321,7 +1321,7 @@ def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1333,7 +1333,7 @@ def test_sort_index_groups(): index=range(5), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), @@ -1355,7 +1355,7 @@ def test_positional_slice_groups_datetimelike(): } ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = expected.groupby( [expected.let, expected.date.dt.date], group_keys=False ).apply(lambda x: x.iloc[0:]) @@ -1402,9 +1402,9 @@ def test_apply_na(dropna): ) dfgrp = df.groupby("grp", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) @@ -1412,7 +1412,7 @@ def test_apply_na(dropna): def test_apply_empty_string_nan_coerce_bug(): # GH#24903 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = ( DataFrame( { @@ -1449,7 +1449,7 @@ def test_apply_index_key_error_bug(index_values): index=Index(["a2", "a3", "aa"], name="a"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = result.groupby("a").apply( lambda df: Series([df["b"].mean()], index=["b_mean"]) ) @@ -1501,7 +1501,7 @@ def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1554,7 +1554,7 @@ def test_include_groups(include_groups): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = FutureWarning if include_groups else None + warn = DeprecationWarning if include_groups else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): result = gb.apply(lambda x: x.sum(), include_groups=include_groups) @@ -1590,11 +1590,11 @@ def test_builtins_apply(keys, f): npfunc = lambda x: getattr(np, fname)(x, axis=0) # 
numpy's equivalent function msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 09d5e06bf6ddd..cfd1a4bca9d91 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -14,12 +14,12 @@ def test_group_by_copy(): ).set_index("name") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grp_by_same_value = df.groupby(["age"], group_keys=False).apply( lambda group: group ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grp_by_copy = df.groupby(["age"], group_keys=False).apply( lambda group: group.copy() ) @@ -54,9 +54,9 @@ def f_no_copy(x): return x.groupby("cat2")["rank"].min() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -68,14 +68,14 @@ def test_no_mutate_but_looks_like(): df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) -def test_apply_function_with_indexing(): +def test_apply_function_with_indexing(warn_copy_on_write): # GH: 33058 df = pd.DataFrame( {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} @@ -86,7 +86,9 @@ def fn(x): return x.col2 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write + ): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7a91601bf688f..f60ff65536f20 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -125,7 +125,7 @@ def f(x): return x.drop_duplicates("person_name").iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + 
with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") @@ -333,7 +333,7 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -2050,7 +2050,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if method == "apply" and index_kind == "range" else None + warn = DeprecationWarning if method == "apply" and index_kind == "range" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 16d7fe61b90ad..2622895f9f8d2 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -290,7 +290,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c903e691add1..ed9acdd0c9dde 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -163,7 +163,7 @@ def max_value(group): return group.loc[group["value"].idxmax()] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes @@ -186,7 +186,7 @@ def f_0(grp): expected = df.groupby("A").first()[["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) @@ -196,7 +196,7 @@ def f_1(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan @@ -208,7 +208,7 @@ def f_2(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan @@ -221,7 +221,7 @@ def f_3(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT @@ -234,7 +234,7 @@ def f_4(grp): return grp.iloc[0].loc["C"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan @@ -421,9 +421,9 @@ def f3(x): # correct result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) @@ -1377,13 +1377,13 @@ def summarize_random_name(df): return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1678,7 +1678,7 @@ def test_dont_clobber_name_column(): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1762,7 +1762,7 @@ def freducex(x): # make sure all these work msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) @@ -1785,7 +1785,7 @@ def f(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] @@ -1993,7 +1993,7 @@ def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): g.apply(test_sort) @@ -2180,7 +2180,7 @@ def test_empty_groupby_apply_nonunique_columns(): df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = gb.apply(lambda x: x) assert (res.dtypes == 
df.dtypes).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 73638eba0a3b3..9155f2cccf117 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -325,7 +325,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 17ef6ee913463..0832b67b38098 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -74,7 +74,10 @@ def func(group): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) @@ -123,7 +126,10 @@ def test_groupby_resample_preserves_subclass(obj): # Confirm groupby.resample() preserves dataframe type msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 363ff883385db..d763b67059375 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -238,7 +238,7 @@ def test_grouper_creation_bug(self): tm.assert_frame_equal(result, expected) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d357a65e79796..8ef7c2b8ce859 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -478,10 +478,10 @@ def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -499,9 +499,9 @@ def sumfunc_value(x): return x.value.sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - 
with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -929,7 +929,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( # function that returns a Series msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a2ecd6c65db60..fd9bd5cc55538 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -668,7 +668,7 @@ def f(group): grouped = df.groupby("c") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -826,7 +826,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): warn = None else: - warn = FutureWarning + warn = DeprecationWarning msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gb.apply(targop) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 80583f5d3c5f2..ddd81ab1d347d 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1080,10 +1080,10 @@ def test_resample_segfault(unit): ).set_index("timestamp") df.index = df.index.as_unit(unit) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("ID").resample("5min").sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1104,7 +1104,7 @@ def test_resample_dtype_preservation(unit): assert result.val.dtype == np.int32 msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1881,10 +1881,10 @@ def f(data, add_arg): df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) msg = 
"DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7e8779ab48b7e..d3e906827b754 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -78,7 +78,7 @@ def test_groupby_resample_api(): index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 337c5ff53bd14..550523a432a89 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -70,10 +70,10 @@ def f_0(x): return x.set_index("date").resample("D").asfreq() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("id").apply(f_0) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) @@ -89,10 +89,10 @@ def f_1(x): return x.resample("1D").ffill() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("group").apply(f_1) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -109,7 +109,7 @@ def test_getitem(test_frame): tm.assert_series_equal(result, expected) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -235,10 +235,10 @@ def test_methods(f, test_frame): r = g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -257,10 +257,10 @@ def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = 
g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -271,14 +271,14 @@ def test_apply(test_frame): # reduction msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = r.apply(f_0) tm.assert_frame_equal(result, expected) @@ -286,7 +286,7 @@ def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") @@ -356,7 +356,7 @@ def test_resample_groupby_with_label(unit): index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("col0").resample("1W", label="left").sum() mi = [ @@ -379,7 +379,7 @@ def test_consistency_with_window(test_frame): df = test_frame expected = Index([1, 2, 3], name="A") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -479,7 +479,7 @@ def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) @@ -503,7 +503,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - with tm.assert_produces_warning(FutureWarning): + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ @@ -555,7 +556,7 @@ def test_resample_no_index(keys): df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) @@ -604,7 +605,7 @@ def test_groupby_resample_size_all_index_same(): index=date_range("31/12/2000 18:00", freq="h", periods=12), ) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3d9098917a12d..3f9340b800eae 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -346,7 +346,7 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = ( df.set_index("week_starting") .groupby("volume") diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 400bf10817ab8..45e7e07affd75 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -101,7 +101,7 @@ def test_rolling(self, f, roll_frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -117,7 +117,7 @@ def test_rolling_ddof(self, f, roll_frame): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -135,7 +135,7 @@ def test_rolling_quantile(self, interpolation, roll_frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) ) @@ -182,7 +182,7 @@ def func(x): return getattr(x.rolling(4), f)(roll_frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) @@ -200,7 +200,7 @@ def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func) tm.assert_series_equal(result, expected) @@ -247,7 +247,7 @@ def test_rolling_apply(self, raw, 
roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -793,11 +793,11 @@ def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @@ -975,7 +975,7 @@ def test_groupby_monotonic(self): df = df.sort_values("date") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = ( df.set_index("date") .groupby("name") @@ -1000,7 +1000,7 @@ def test_datelike_on_monotonic_within_each_group(self): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = ( df.set_index("B") .groupby("A") @@ -1036,7 +1036,7 @@ def test_expanding(self, f, frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1052,7 +1052,7 @@ def test_expanding_ddof(self, f, frame): result = getattr(r, f)(ddof=0) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1070,7 +1070,7 @@ def test_expanding_quantile(self, interpolation, frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.expanding().quantile(0.4, interpolation=interpolation) ) @@ -1092,7 +1092,7 @@ def func_0(x): return getattr(x.expanding(), f)(frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values @@ -1109,7 +1109,7 @@ def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) msg = "DataFrameGroupBy.apply 
operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func_1) tm.assert_series_equal(result, expected) @@ -1120,7 +1120,7 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) ) From fd3f57170aa1af588ba877e8e28c158a20a4886d Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 19 Jan 2024 12:10:08 -0800 Subject: [PATCH 060/152] RLS: 2.2.0 From 2fa26fd1fdaaef67b9cff812006b21fe0a76ebe9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 01:20:40 +0100 Subject: [PATCH 061/152] Backport PR #56967 on branch 2.2.x (CI: Adjust pyarrow depr warning to account for different newlines on …) (#56969) Backport PR #56967: CI: Adjust pyarrow depr warning to account for different newlines on … Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/test_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index fe24755e8cc23..4af71be11fe6b 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -286,4 +286,7 @@ def test_pyarrow_missing_warn(module): please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 """ # noqa: E501 stderr_msg = response.stderr.decode("utf-8") - assert msg in stderr_msg, stderr_msg + # Split by \n to avoid \r\n vs \n differences on Windows/Unix + # https://stackoverflow.com/questions/11989501/replacing-r-n-with-n + stderr_msg = "\n".join(stderr_msg.splitlines()) + assert msg in stderr_msg From f538741432edf55c6b9fb5d0d496d2dd1d7c2457 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 19 Jan 2024 16:38:41 -0800 Subject: [PATCH 062/152] RLS: 2.2.0 From 22ae7890ddc15422431a689f5f7e6aff65e2917b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:07:16 +0100 Subject: [PATCH 063/152] Backport PR #56980 on branch 2.2.x (WEB: Add version 2.2 to the dropdown) (#56983) Backport PR #56980: WEB: Add version 2.2 to the dropdown Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index e355005c7c937..09b3d8492a2c7 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.1 (stable)", - "version": "2.1", + "name": "2.2 (stable)", + "version": "2.2", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + }, { "name": "2.0", "version": "2.0", From 6c563e3299543746befc6822fe53dea5ddc48979 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:36:53 +0100 Subject: [PATCH 064/152] Backport PR #56986 on branch 2.2.x (WEB: Fix typo in dropdown page) (#56987) Backport PR #56986: WEB: Fix typo in dropdown page Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 09b3d8492a2c7..2d2599ae8585b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -13,7 +13,7 @@ { "name": "2.1", "version": "2.1", - "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" }, { "name": "2.0", From bfe6c4f057e477544ee53a346d615cf875971aaf Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:24:51 +0100 Subject: [PATCH 065/152] Backport PR #56982 on branch 2.2.x (DOC: Add release notes for 2.2.1) (#56998) Backport PR #56982: DOC: Add release notes for 2.2.1 Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.1.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ec024f36d78b1..3a2ab4c17d1bd 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.1 v2.2.0 Version 2.1 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst new file mode 100644 index 0000000000000..22a6c0cd1028f --- /dev/null +++ b/doc/source/whatsnew/v2.2.1.rst @@ -0,0 +1,36 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February XX, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_221.contributors: + +Contributors +~~~~~~~~~~~~ From 987dcbbce3c9bdb5e422ff6dee65bd057064b7dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:10:25 +0100 Subject: [PATCH 066/152] Backport PR #57005 on branch 2.2.x (CI: pyarrow nightly failures) (#57013) Backport PR #57005: CI: pyarrow nightly failures Co-authored-by: Luke Manley --- pandas/tests/frame/test_arrow_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index ac7b51cbdfa92..098d1829b973c 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -35,11 +35,11 @@ def test_dataframe_arrow_interface(): def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - table = pa.RecordBatchReader.from_stream(df) + table = pa.RecordBatchReader.from_stream(df).read_all() expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) - table = pa.RecordBatchReader.from_stream(df, schema=schema) + table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all() expected = expected.cast(schema) assert table.equals(expected) From 662e3f8632f112b3f85c68b62445766c8186396c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:04 +0100 Subject: [PATCH 067/152] Backport PR #57011 on branch 2.2.x (Remove SKIP summary from CI logs) (#57020) Backport PR #57011: Remove SKIP summary from CI logs Co-authored-by: William Ayd --- ci/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,7 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" From 3b833cfcc81ba55a1f46603b06fb9a043aa1bba9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:19 +0100 Subject: [PATCH 068/152] Backport PR #57018 on branch 2.2.x (REGR: merge_ordered with fill_method="ffill" and how="left") (#57021) Backport PR #57018: REGR: merge_ordered with fill_method="ffill" and how="left" Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/core/reshape/merge.py | 7 +++--- .../tests/reshape/merge/test_merge_ordered.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 22a6c0cd1028f..75445c978d262 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) .. 
--------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 410301b7697f2..646f40f6141d8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1930,10 +1930,9 @@ def get_result(self, copy: bool | None = True) -> DataFrame: if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) if right_indexer is None: right_join_indexer = None else: diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index abd61026b4e37..0bd3ca3cf2c1b 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -219,3 +219,26 @@ def test_ffill_validate_fill_method(self, left, right, invalid_method): ValueError, match=re.escape("fill_method must be 'ffill' or None") ): merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) From fb2cf0fca6f231d1c24fdbed06035457e47f3970 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:12:31 +0100 Subject: [PATCH 069/152] Backport PR #57057 on branch 2.2.x (COMPAT: Make argsort compatible with numpy 2.0) (#57062) Backport PR #57057: COMPAT: Make argsort compatible with numpy 2.0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/numpy/function.py | 2 ++ pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index a36e25a9df410..4df30f7f4a8a7 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..c36dcda6e2972 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ 
-956,7 +956,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 83eb545b9b681..f06a1eb533ba4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4066,6 +4066,7 @@ def argsort( axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -4082,6 +4083,8 @@ def argsort( information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- From 2df78e8821d0e8d5ef51e3842db142faf32a64c1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:13:44 +0100 Subject: [PATCH 070/152] Backport PR #57058 on branch 2.2.x (BUG: Series.pct_change with empty Series) (#57059) Backport PR #57058: BUG: Series.pct_change with empty Series Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 27 ++++++++++--------- .../tests/series/methods/test_pct_change.py | 8 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 75445c978d262..d3065cdd8b624 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8728c61e46fc..55693d4cdb753 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12126,19 +12126,20 @@ def pct_change( if limit is lib.no_default: cols = self.items() if self.ndim == 2 else [(None, self)] for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. 
Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 9727ef3d5c27c..6c80e711c3684 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -118,3 +118,11 @@ def test_pct_change_no_warning_na_beginning(): result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) From 441f65df5c6dea850e6334ab770528184dfb0d33 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:03:14 +0100 Subject: [PATCH 071/152] Backport PR #57034 on branch 2.2.x (REGR: perf regression in Series.combine_first) (#57072) Backport PR #57034: REGR: perf regression in Series.combine_first Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/series.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d3065cdd8b624..93965ffed23d3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/series.py b/pandas/core/series.py index f06a1eb533ba4..c6a905cbb6ec1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -86,6 +86,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -3510,6 +3511,13 @@ def combine_first(self, other) -> Series: """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self From c45129431ef4b089b14f0f3a892cc115e155315b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:41:39 +0100 Subject: [PATCH 072/152] Backport PR #57046 on branch 2.2.x (REGR: groupby.idxmin/idxmax wrong result on extreme values) (#57086) Backport PR #57046: REGR: groupby.idxmin/idxmax wrong result on extreme values Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 + pandas/_libs/groupby.pyx | 17 ++++--- pandas/core/groupby/ops.py | 1 + pandas/tests/groupby/test_reductions.py | 62 +++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst 
b/doc/source/whatsnew/v2.2.1.rst index 93965ffed23d3..23da4a7f6ab25 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,8 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 19d71b0a6fde3..ac24c0bb8df88 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1767,6 +1767,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1780,13 +1781,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1802,6 +1800,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1810,10 +1809,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..e2ddf9aa5c0c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 425079f943aba..422322d03c4c0 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -195,6 +195,68 @@ def test_empty(frame_or_series, bool_agg_func): tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or 
any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ From b44512726e9d55421f67683ff6d26ec7ce82046b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 20:50:08 +0100 Subject: [PATCH 073/152] Backport PR #57084 on branch 2.2.x (Fix mem leak in read_csv) (#57090) Backport PR #57084: Fix mem leak in read_csv Co-authored-by: William Ayd --- asv_bench/benchmarks/io/csv.py | 3 +++ doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9ac83db4f85b9..dae6107db4d92 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 23da4a7f6ab25..b9b2821ebc468 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
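As an illustration of the GH#57040 fix exercised by the tests above (a sketch, not part of the patch, assuming a pandas build with this fix applied)::

    import numpy as np
    import pandas as pd

    # values deliberately include the dtype's own min/max, which used to
    # collide with the kernel's seed value and corrupt idxmin/idxmax
    lo, hi = np.iinfo(np.int64).min, np.iinfo(np.int64).max
    df = pd.DataFrame({"a": [2, 1, 1, 2], "b": [lo, hi, hi, lo]})
    df.groupby("a").idxmax()  # expected: group 1 -> 1, group 2 -> 0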
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0e4188bea4dc7..c9f7a796a9b1c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); +} + static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); @@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) { self->skipset = NULL; } + parser_clear_data_buffers(self); if (self->cb_cleanup != NULL) { self->cb_cleanup(self->source); self->cb_cleanup = NULL; From 1550858f6e34f85bb9d489891d4c5ac3146b01b6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 27 Jan 2024 20:38:01 +0100 Subject: [PATCH 074/152] Backport PR #57078 on branch 2.2.x (54628 fix find stack level memory leak) (#57105) Backport PR #57078: 54628 fix find stack level memory leak Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_testing/_warnings.py | 7 ++++++- pandas/util/_exceptions.py | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..c9a287942f2da 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -218,7 +218,12 @@ def _assert_raised_with_correct_stacklevel( frame = inspect.currentframe() for _ in range(4): frame = frame.f_back # type: ignore[union-attr] - caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. 
" f"File where warning is raised: {actual_warning.filename} != " diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 573f76a63459b..5f50838d37315 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager @@ -42,15 +43,20 @@ def find_stack_level() -> int: test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n From f577be25dfe97e6e4fa4192e007df2882fb3f20f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 04:44:40 +0100 Subject: [PATCH 075/152] Backport PR #57089 on branch 2.2.x (BUG: wide_to_long with string columns) (#57120) Backport PR #57089: BUG: wide_to_long with string columns Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/reshape/melt.py | 3 +-- pandas/core/strings/accessor.py | 4 ++-- pandas/tests/reshape/test_melt.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b9b2821ebc468..660594c98e0f2 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..e54f847895f1a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -458,8 +458,7 @@ def wide_to_long( def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return df.columns[df.columns.str.match(pattern)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( diff --git a/pandas/core/strings/accessor.py 
b/pandas/core/strings/accessor.py index 1b7d632c0fa80..da10a12d02ae4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1336,14 +1336,14 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=None): """ Determine if each string starts with a match of a regular expression. Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ff9f927597956..272c5b3403293 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1220,3 +1220,33 @@ def test_missing_stubname(self, dtype): new_level = expected.index.levels[0].astype(dtype) expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_pyarrow_string_columns(): + # GH 57066 + pytest.importorskip("pyarrow") + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype("string[pyarrow_numpy]") + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"], dtype=object), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) From df0762d18eecb567b6813cc5fd5fabc6af214aa5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:39 +0100 Subject: [PATCH 076/152] Backport PR #57126 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.2 to 2.16.4) (#57132) Backport PR #57126: Bump pypa/cibuildwheel from 2.16.2 to 2.16.4 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 841559c8e9799..6d3b9048a2122 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,7 +139,7 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -148,7 +148,7 @@ jobs: - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From c1723cd09e83248dd3fbc81bb43ba537082a4fe0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:49 +0100 Subject: [PATCH 077/152] Backport PR #57101 on branch 2.2.x (REGR: Index.join raising TypeError when joining an empty index to a mixed type index) (#57133) Backport PR 
#57101: REGR: Index.join raising TypeError when joining an empty index to a mixed type index Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 65 +++++++++++++------------ pandas/tests/reshape/merge/test_join.py | 19 ++++++++ 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 660594c98e0f2..ff5fadec735e6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c36dcda6e2972..5e7f2e27f1275 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4615,38 +4615,12 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - lidx: np.ndarray | None - ridx: np.ndarray | None - - if len(other) == 0: - if how in ("left", "outer"): - if sort and not self.is_monotonic_increasing: - lidx = self.argsort() - join_index = self.take(lidx) - else: - lidx = None - join_index = self._view() - ridx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("right", "inner", "cross"): - join_index = other._view() - lidx = np.array([], dtype=np.intp) - return join_index, lidx, None - - if len(self) == 0: - if how in ("right", "outer"): - if sort and not other.is_monotonic_increasing: - ridx = other.argsort() - join_index = other.take(ridx) - else: - ridx = None - join_index = other._view() - lidx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("left", "inner", "cross"): - join_index = self._view() - ridx = np.array([], dtype=np.intp) - return join_index, None, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4681,6 +4655,33 @@ def join( return self._join_via_get_indexer(other, how, sort) + @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not 
self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9a2f18f33bce5..db5a0437a14f0 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1042,6 +1042,25 @@ def test_join_empty(left_empty, how, exp): tm.assert_frame_equal(result, expected) +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) + tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how, values", [ From 10b58730b6a302a6c372d6d4e1b04cb87007de97 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 23:48:32 +0100 Subject: [PATCH 078/152] Backport PR #57122 on branch 2.2.x (CI: autouse add_doctest_imports) (#57135) Backport PR #57122: CI: autouse add_doctest_imports Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..a11d9c2b4b2a1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -190,10 +190,6 @@ def pytest_collection_modifyitems(items, config) -> None: if is_doctest: for item in items: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) @@ -250,7 +246,14 @@ def pytest_collection_modifyitems(items, config) -> None: ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. 
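As an illustration of the GH#57048 join fix above (a sketch mirroring the new test, assuming pandas 2.2.1)::

    import pandas as pd

    df_empty = pd.DataFrame()
    df_mixed = pd.DataFrame(columns=["foo", ("bar", "baz")])  # non-comparable labels
    # Frame arithmetic aligns the column indexes; the empty side now takes
    # the _join_empty fast path instead of raising TypeError during the sort.
    result = df_empty + df_mixed  # columns: [("bar", "baz"), "foo"]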
@@ -259,9 +262,6 @@ def add_doctest_imports(doctest_namespace) -> None: doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ From acd914dedba087b3642e110aadac665006da23c0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:36:42 +0100 Subject: [PATCH 079/152] Backport PR #57102 on branch 2.2.x (ENH: Add skipna to groupby.first and groupby.last) (#57141) Backport PR #57102: ENH: Add skipna to groupby.first and groupby.last Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +- pandas/_libs/groupby.pyi | 2 ++ pandas/_libs/groupby.pyx | 41 ++++++++++++++-------- pandas/_testing/__init__.py | 7 ++++ pandas/conftest.py | 32 +++++++++++++++++ pandas/core/groupby/groupby.py | 36 ++++++++++++++----- pandas/core/resample.py | 10 ++++-- pandas/tests/groupby/test_reductions.py | 31 ++++++++++++++++ pandas/tests/resample/test_base.py | 29 +++++++++++++++ pandas/tests/resample/test_resample_api.py | 4 +-- 10 files changed, 167 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ff5fadec735e6..589903ddcca71 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -34,7 +34,8 @@ Bug fixes Other ~~~~~ -- +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) .. --------------------------------------------------------------------------- .. _whatsnew_221.contributors: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 135828a23648a..a494b61fa7e3d 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -136,6 +136,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +148,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... 
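The new ``skipna`` flag threaded through these kernels behaves as in this sketch (hypothetical data; comments show the expected pandas 2.2.1 output)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [np.nan, 3.0, 4.0, np.nan]})
    gb = df.groupby("a")["b"]
    gb.first()              # NA skipped: group 1 -> 3.0, group 2 -> 4.0
    gb.first(skipna=False)  # literal first row: group 1 -> NaN, group 2 -> 4.0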
def group_rank( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ac24c0bb8df88..b855d64d0be18 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1424,6 +1424,7 @@ def group_last( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1458,14 +1459,19 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1486,6 +1492,7 @@ def group_nth( int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1520,15 +1527,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 672c16a85086c..361998db8e38b 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -236,11 +236,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", diff --git a/pandas/conftest.py b/pandas/conftest.py index a11d9c2b4b2a1..7c35dfdde90ba 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1642,6 +1642,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. 
+ + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5b18455dbe8a8..db8949788567b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3335,9 +3335,13 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3345,12 +3349,17 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. See Also -------- @@ -3402,12 +3411,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3416,12 +3430,17 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. 
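The reworded docstrings distinguish the first/last *entry* from the first/last *valid value*; ``min_count`` keys off valid values, as in this sketch (hypothetical data)::

    import numpy as np
    import pandas as pd

    ser = pd.Series([4.0, np.nan, 3.0, 5.0], index=[1, 1, 2, 2])
    ser.groupby(level=0).first()             # 1 -> 4.0, 2 -> 3.0
    # group 1 has only one valid value, fewer than min_count -> NA
    ser.groupby(level=0).first(min_count=2)  # 1 -> NaN, 2 -> 3.0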
See Also -------- @@ -3461,6 +3480,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3e9507bd4347f..2d430ef4dcff6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1306,12 +1306,15 @@ def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1319,12 +1322,15 @@ def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.median) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 422322d03c4c0..25b0f80639cff 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,9 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd from pandas import ( DataFrame, @@ -327,6 +330,34 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) + df = DataFrame( + { + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + }, + dtype=any_real_nullable_dtype, + ) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) + + def test_idxmin_idxmax_axis1(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 50644e33e45e1..dcf6c6099abab 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -429,3 +432,29 @@ def test_resample_quantile(series): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = 
Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d3e906827b754..12abd1c98784b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1040,11 +1040,11 @@ def test_args_kwargs_depr(method, raises): if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): From c0a269b354ad1b50b9a7fd56789c5137e829fcc2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:02 +0100 Subject: [PATCH 080/152] Backport PR #57061 on branch 2.2.x (REGR: non-unique, masked dtype index raising IndexError) (#57142) Backport PR #57061: REGR: non-unique, masked dtype index raising IndexError Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 63 +++++++++++++++---------------- pandas/tests/indexing/test_loc.py | 12 ++++++ 3 files changed, 44 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 589903ddcca71..6d6f0f1ee758f 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..e4dfe9dec3623 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t 
loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -450,27 +464,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -1193,13 +1198,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1207,23 +1211,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 61c44c8a2a8f4..952251a58e981 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3364,3 +3364,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) From 4bad5fcf172ac443c384da8482a3cc617da908bb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:17 +0100 Subject: [PATCH 081/152] Backport PR #57139 on branch 2.2.x (BUG: Index(Series) makes array read only for object dtype) (#57143) Backport PR #57139: BUG: Index(Series) makes array read only for object dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/_libs/ops.pyx | 2 +- pandas/core/common.py | 2 ++ pandas/tests/indexes/base_class/test_constructors.py | 7 +++++++ pandas/tests/indexes/test_common.py | 9 +++++++++ 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 
6d6f0f1ee758f..1302648c3fc9a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- .. _whatsnew_221.other: diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 9154e836b3477..567bfc02a2950 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..9f024498d66ed 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -233,6 +233,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index fd5176a28565e..338509dd239e6 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -71,3 +71,10 @@ def test_inference_on_pandas_objects(self): with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): result = Index(ser) assert result.dtype != np.object_ + + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + with pd.option_context("mode.copy_on_write", True): + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 412a59d15307d..80c39322b9b81 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -500,3 +500,12 @@ def test_ndarray_compat_properties(index): # test for validity idx.nbytes idx.values.nbytes + + +def test_compare_read_only_array(): + # GH#57130 + arr = np.array([], dtype=object) + arr.flags.writeable = False + idx = pd.Index(arr) + result = idx > 69 + assert result.dtype == bool From 27cea3a8d338e2ebaa809e9079de19fcb3991cee Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 18:46:41 +0100 Subject: [PATCH 082/152] Backport PR #57144 on branch 2.2.x (CI: Fix _get_dst_hours for numpy 2.0 change) (#57153) Backport PR #57144: CI: Fix _get_dst_hours for numpy 2.0 change Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/tslibs/tzconversion.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 2c4f0cd14db13..e3facd3d9599b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -607,7 +607,8 @@ cdef ndarray[int64_t] _get_dst_hours( ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, 
trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right From f6fd475680761f12dbc4abe3db4a26e55b68fd90 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 02:52:25 +0100 Subject: [PATCH 083/152] Backport PR #57157 on branch 2.2.x (BUG: Fix to_dict with datelike types and orient=list) (#57160) Backport PR #57157: BUG: Fix to_dict with datelike types and orient=list Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/methods/to_dict.py | 8 ++------ pandas/tests/frame/methods/test_to_dict.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 1302648c3fc9a..19b7e3493f964 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 7bd4851425c3b..accbd92a91ed6 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -171,13 +171,9 @@ def to_dict( return into_c( ( k, - list( - map( - maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() - ) - ) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) if i in object_dtype_indices_as_set - else v.to_numpy().tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 61f0ad30b4519..570f85a4a31ee 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -12,8 +12,11 @@ NA, DataFrame, Index, + Interval, MultiIndex, + Period, Series, + Timedelta, Timestamp, ) import pandas._testing as tm @@ -519,3 +522,14 @@ def test_to_dict_pos_args_deprecation(self): ) with tm.assert_produces_warning(FutureWarning, match=msg): df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + 
expected = {"a": [val]} + assert result == expected From 59e6c80a47dd5bf0799e622eae7b3a0c5864309f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:06:35 +0100 Subject: [PATCH 084/152] Backport PR #57175 on branch 2.2.x (BUG: Interchange protocol implementation handles empty dataframes incorrectly) (#57179) Backport PR #57175: BUG: Interchange protocol implementation handles empty dataframes incorrectly Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/buffer.py | 2 +- pandas/tests/interchange/test_impl.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 19b7e3493f964..4a20eda08937a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -29,6 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index a54e4428bd836..5c97fc17d7070 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -23,7 +23,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): + if x.strides[0] and not x.strides == (x.dtype.itemsize,): # The protocol does not support strided buffers, so a copy is # necessary. If that's not allowed, we need to raise an exception. 
if allow_copy: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c7b13f9fd7b2d..c6365233728d2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -379,3 +379,12 @@ def test_large_string(): result = pd.api.interchange.from_dataframe(df.__dataframe__()) expected = pd.DataFrame({"a": ["x"]}, dtype="object") tm.assert_frame_equal(result, expected) + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) From bc09d5765197c39daae8326fc9ee44bb2c3153c4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:20:42 +0100 Subject: [PATCH 085/152] Backport PR #57169 on branch 2.2.x (REGR: DataFrame.sort_index not producing stable sort) (#57180) Backport PR #57169: REGR: DataFrame.sort_index not producing stable sort Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 11 ++++----- pandas/tests/frame/methods/test_sort_index.py | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 4a20eda08937a..9002d9af2c602 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5e7f2e27f1275..6f9078bf5a2a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5914,17 +5914,14 @@ def sort_values( (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ if key is None and ( - self.is_monotonic_increasing or self.is_monotonic_decreasing + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) ): - reverse = ascending != self.is_monotonic_increasing - sorted_index = self[::-1] if reverse else self.copy() if return_indexer: indexer = np.arange(len(self), dtype=np.intp) - if reverse: - indexer = indexer[::-1] - return sorted_index, indexer + return self.copy(), indexer else: - return sorted_index + return self.copy() # GH 35584. 
Sort missing values according to na_position kwarg # ignore na_position for MultiIndex diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 49e292057e4dc..830561a1349ee 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1002,3 +1002,27 @@ def test_axis_columns_ignore_index(): result = df.sort_index(axis="columns", ignore_index=True) expected = DataFrame([[2, 1]]) tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) From 62aea0f06dc8bb7b029f551ef2bcfb8b08ee8ddf Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 1 Feb 2024 21:15:39 +0000 Subject: [PATCH 086/152] =?UTF-8?q?Backport=20PR=20#57173:=20BUG:=20pandas?= =?UTF-8?q?=20int=20extension=20dtypes=20has=20no=20attribute=E2=80=A6=20(?= =?UTF-8?q?#57198)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #57173: BUG: pandas int extension dtypes has no attribute byteorder --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 3 +++ pandas/tests/interchange/test_impl.py | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9002d9af2c602..56e645d4c55db 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -30,6 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index ee1b5cd34a7f7..96757072dd854 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, + BaseMaskedDtype, DatetimeTZDtype, ) @@ -143,6 +144,8 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder else: byteorder = dtype.byteorder diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c6365233728d2..209944427d5dc 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -9,6 +9,7 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_lt1p23 +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -381,6 +382,17 @@ def 
test_large_string(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] +) +def test_nullable_integers(dtype: str) -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype=dtype) + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From be8f9f267473133f5436cef564bd13e2872f9bec Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:37:48 +0100 Subject: [PATCH 087/152] Backport PR #57163 on branch 2.2.x (CI: Add macOS M1 CI) (#57202) Backport PR #57163: CI: Add macOS M1 CI Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 9 +++++---- .github/workflows/wheels.yml | 18 ++++++++++-------- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 3 +-- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- pyproject.toml | 4 ---- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a3cffb4b03b93..4c7aa1e1e49ee 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -214,7 +214,8 @@ jobs: timeout-minutes: 90 strategy: matrix: - os: [macos-latest, windows-latest] + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, macos-14, windows-latest] env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} @@ -227,8 +228,7 @@ jobs: PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }} + PYTEST_WORKERS: 'auto' steps: - name: Checkout @@ -354,7 +354,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] timeout-minutes: 90 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6d3b9048a2122..f79b2c51b5f92 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,9 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [macos-12, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? 
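The GH#55069 interchange fix above can be exercised with a small round trip (a sketch, assuming pandas 2.2.1 with the masked-dtype ``byteorder`` handling)::

    import pandas as pd

    df = pd.DataFrame({"a": [1]}, dtype="Int8")
    # masked dtypes now report a byteorder, so this no longer raises
    result = pd.api.interchange.from_dataframe(df.__dataframe__())
    # result dtypes: a -> int8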
python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] @@ -128,7 +130,7 @@ jobs: # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -139,18 +141,18 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: # The nightly wheels should be build witht he NumPy 2.0 pre-releases # which requires the additional URL. @@ -183,7 +185,7 @@ jobs: $TST_CMD = @" python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; + python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} @@ -191,7 +193,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels & sdist diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 45f114322015b..a3e44e6373145 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d14686696e669..95cd1a4d46ef4 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -60,4 +59,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 86aaf24b4e15c..a442ed6feeb5d 100644 --- 
a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 31ee74174cd46..b162a78e7f115 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/pyproject.toml b/pyproject.toml index 2f70ade7b3afe..8c9a79aa2b059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,10 +162,6 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ -[tool.cibuildwheel.macos] -archs = "x86_64 arm64" -test-skip = "*_arm64" - [tool.cibuildwheel.windows] before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" From 5034b780e90f499d4da8dc2e461aca19304c5263 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:38:20 +0100 Subject: [PATCH 088/152] Backport PR #57174 on branch 2.2.x (BUG: Interchange protocol implementation allows non-string column names) (#57203) Backport PR #57174: BUG: Interchange protocol implementation allows non-string column names Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 8 ++++++++ pandas/core/interchange/dataframe.py | 2 +- pandas/tests/interchange/test_impl.py | 26 ++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 56e645d4c55db..13d5024b5a131 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 96757072dd854..e273ecad8b51e 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol."
+ ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..1ffe0e8e8dbb0 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ - self._df = df + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy def __dataframe__( diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 209944427d5dc..d47b533f92235 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -180,8 +180,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -382,6 +380,30 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='object'\). Please rename these columns before using the " + "interchange protocol." 
+ ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + @pytest.mark.parametrize( "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] ) From e54e0e2a63af60e88fad5d5a592bdf957521f3a9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 4 Feb 2024 04:08:14 +0100 Subject: [PATCH 089/152] Backport PR #57232 on branch 2.2.x (REGR: to_json converting nullable ints to floats) (#57240) Backport PR #57232: REGR: to_json converting nullable ints to floats Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +++++ pandas/core/arrays/masked.py | 3 +++ pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 13d5024b5a131..3cc11974b14e5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) +- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a5ce46ed612f3..621a32a049f3b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1348,6 +1348,11 @@ def _to_timedeltaarray(self) -> TimedeltaArray: np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 234d96e53a67c..8c41de9c9fffa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -430,6 +430,9 @@ def __abs__(self) -> Self: # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1da27ad173235..caa25841d3596 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2172,3 +2172,19 @@ def test_json_pos_args_deprecation(): with 
tm.assert_produces_warning(FutureWarning, match=msg): buf = BytesIO() df.to_json(buf, "split") + + +@td.skip_if_no("pyarrow") +def test_to_json_ea_null(): + # GH#57224 + df = DataFrame( + { + "a": Series([1, NA], dtype="int64[pyarrow]"), + "b": Series([2, NA], dtype="Int64"), + } + ) + result = df.to_json(orient="records", lines=True) + expected = """{"a":1,"b":2} +{"a":null,"b":null} +""" + assert result == expected From c1b17ae8dcebb7faaa90ea520a7e407321daa594 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 6 Feb 2024 03:28:16 +0100 Subject: [PATCH 090/152] Backport PR #57265 on branch 2.2.x (COMPAT: Numpy 2.0 casting compat) (#57271) Backport PR #57265: COMPAT: Numpy 2.0 casting compat Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/cast.py | 1 + pandas/core/internals/blocks.py | 9 ++++++++- pandas/tests/indexing/test_loc.py | 16 ++-------------- pandas/tests/series/test_constructors.py | 12 +++++++++--- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 259e83a5936d7..690af6b0ebdc7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1682,6 +1682,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n arr = np.asarray(arr) if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above if (casted.astype(str) == arr).all(): return casted raise ValueError(f"string values cannot be losslessly cast to {dtype}") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 70a27300bd60f..259e969112dd7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1425,7 +1425,14 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." + ) from err + raise return self def putmask( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 952251a58e981..0cd1390d41461 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1235,13 +1235,7 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1575,16 +1569,10 @@ def test_loc_setitem_2d_to_1d_raises(self): # float64 dtype to avoid upcast when trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." 
with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..4d3839553a0af 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1958,9 +1958,15 @@ def test_constructor_int64_dtype(self, any_int_dtype): def test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def test_constructor_dtype_timedelta_alternative_construct(self): From 45fc954839e79605b14af4b771da56b12b7283cb Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 7 Feb 2024 15:12:00 +0000 Subject: [PATCH 091/152] Backport PR #56945 on branch 2.2.x (ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex) (#57292) 'Backport PR #56945: ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex' (cherry picked from commit cb97ce6e496e7d1df76f06550c0fe8c4590ff3ce) Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 32 +++++++----- pandas/core/arrays/period.py | 11 ++-- pandas/plotting/_matplotlib/timeseries.py | 5 +- pandas/tests/dtypes/test_dtypes.py | 10 ++-- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/period/methods/test_asfreq.py | 51 +++++++++++++++++++ pandas/tests/resample/test_period_index.py | 4 +- pandas/tests/scalar/period/test_asfreq.py | 5 +- pandas/tests/scalar/period/test_period.py | 16 +++--- 10 files changed, 96 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3cc11974b14e5..3f70c72a55a4c 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -35,6 +35,7 @@ Bug fixes - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) +- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`) .. --------------------------------------------------------------------------- ..
_whatsnew_221.other: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 764a044f32c82..70e1ca1c4012a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4271,9 +4271,7 @@ cdef class CustomBusinessDay(BusinessDay): @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay is not supported as period frequency" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4822,19 +4820,19 @@ cpdef to_offset(freq, bint is_period=False): if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4935,21 +4933,27 @@ cpdef to_offset(freq, bint is_period=False): offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") ) else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + if is_period and not hasattr(result, "_period_dtype_code"): + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 28f25d38b2363..d635eb4a25df3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -733,8 +733,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -1186,12 +1186,7 @@ def dt64arr_to_periodarr( reso = get_unit_from_dtype(data.dtype) freq = Period._maybe_convert_freq(freq) - try: - base = freq._period_dtype_code - except (AttributeError, TypeError): - # AttributeError: _period_dtype_code might not exist - # TypeError: _period_dtype_code might intentionally raise - raise TypeError(f"{freq.name} is not supported as period frequency") + base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index bf1c0f6346f02..c7ddfa55d0417 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -205,7 +205,10 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq, is_period=True).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code 
return get_period_alias(freqstr) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0dad0b05303ad..de1ddce724a5b 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -445,12 +445,12 @@ def test_construction(self): def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 00c0216a9b3b5..de8d32f64cde2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -221,5 +221,5 @@ def test_to_period_offsets_not_supported(self, freq): # GH#56243 msg = f"{freq[1:]} is not supported as period frequency" ts = date_range("1/1/2012", periods=4, freq=freq) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ed078a3e8fb8b..865bae69d91c7 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas import ( @@ -7,6 +9,8 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): @@ -136,3 +140,50 @@ def test_asfreq_with_different_n(self): excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 451d2a83c1d5e..6b7cce7d15a5b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1040,8 +1040,8 @@ def test_resample_lowercase_frequency_deprecated( offsets.BusinessHour(2), ], ) - 
def test_asfreq_invalid_period_freq(self, offset, series_and_frame): - # GH#9586 + def test_asfreq_invalid_period_offset(self, offset, series_and_frame): + # GH#55785 msg = f"Invalid offset: '{offset.base}' for converting time series " df = series_and_frame diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4489c307172d7..73c4d8061c257 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -820,10 +820,9 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = INVALID_FREQ_ERR_MSG + msg = "MS is not supported as period frequency" with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") - msg = "MonthBegin is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index d819e903a0bae..2c3a0816737fc 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest @@ -40,21 +41,22 @@ class TestPeriodDisallowedFreqs: ) def test_offsets_not_supported(self, freq, freq_msg): # GH#55785 - msg = f"{freq_msg} is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): Period(year=2014, freq=freq) def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) def test_invalid_frequency_error_message(self): - msg = "WeekOfMonth is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): From 11a6136ce2efc25052849ff760b260ca49e371f0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Feb 2024 19:02:06 +0100 Subject: [PATCH 092/152] Backport PR #57233 on branch 2.2.x (REGR: Fix to_numpy conversion for arrow ea with float dtype given) (#57294) Backport PR #57233: REGR: Fix to_numpy conversion for arrow ea with float dtype given Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/_utils.py | 2 ++ pandas/tests/arrays/boolean/test_construction.py | 2 -- pandas/tests/arrays/floating/test_to_numpy.py | 8 ++++---- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/series/methods/test_to_numpy.py | 11 +++++++++++ 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3f70c72a55a4c..883627bd4b19b 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions 
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) +- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index c75ec7f843ed2..88091a88a4e12 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,6 +39,8 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) + if na_value is lib.no_default and hasna and dtype.kind == "f": + na_value = np.nan dtype_given = True else: dtype_given = True diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..645e763fbf00c 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -308,8 +308,6 @@ def test_to_numpy(box): # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index a25ac40cb3e7c..e954cecba417a 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -33,10 +33,10 @@ def test_to_numpy_float(box): tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ def test_to_numpy_dtype(box, dtype): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3848cdfe3aa9..8620763988e06 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -271,7 +271,7 @@ def test_to_numpy_dtype(dtype, in_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", 
["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 5fe3e19b0a20b..8dcc1dd551315 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, @@ -23,3 +25,12 @@ def test_to_numpy_cast_before_setting_na(): result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) From bbc655d782c2c62ded6b4e305234cbce23ac9f22 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:29:56 +0100 Subject: [PATCH 093/152] Backport PR #57121 on branch 2.2.x (REGR: Fix to_numpy for masked array with non-numeric dtype) (#57325) Backport PR #57121: REGR: Fix to_numpy for masked array with non-numeric dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +++ pandas/core/arrays/masked.py | 2 ++ pandas/tests/arrays/masked/test_function.py | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 883627bd4b19b..009d4794dbd1d 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,12 +17,15 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or 
maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8c41de9c9fffa..7c49ce5343158 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -501,6 +501,8 @@ def to_numpy( """ hasna = self._hasna dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object if hasna: if ( diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 4c7bd6e293ef4..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -55,3 +56,19 @@ def test_tolist(data): result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) From 361b0899dfd66cdc8eebcfb0ef81285cf126d8a1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:30:08 +0100 Subject: [PATCH 094/152] Backport PR #57250 on branch 2.2.x (REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False) (#57326) Backport PR #57250: REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/api.py | 1 + pandas/core/reshape/concat.py | 6 ++++-- pandas/tests/reshape/concat/test_datetimes.py | 12 ++++++------ 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 009d4794dbd1d..69e14a9028dd3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, 
:meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 560285bd57a22..15292953e72d0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -295,6 +295,7 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): + sort = True result = indexes[0] elif len(dtis) > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..dc18bb65b35bc 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -205,8 +205,10 @@ def concat( Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatenation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. copy : bool, default True If False, do not copy data unnecessarily. diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..4c94dc0d51f7e 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -73,23 +73,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", ] ).as_unit("ns") expected = DataFrame( [ - [1, np.nan], - [2, np.nan], - [3, np.nan], [np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], ], index=exp_idx, columns=["a", "b"], From 044342727b2b7efacefa2c2dc485193897c47b0c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 12:25:24 +0100 Subject: [PATCH 095/152] Backport PR #57322 on branch 2.2.x (REGR: Fix astype conversion of ea int to td/dt with missing values) (#57331) Backport PR #57322: REGR: Fix astype conversion of ea int to td/dt with missing values Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/_utils.py | 13 ++++++++++--- pandas/tests/extension/test_arrow.py | 9 +++++++++ pandas/tests/series/methods/test_to_numpy.py | 13 +++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 88091a88a4e12..6b46396d5efdf 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,14 +39,21 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) - if na_value is lib.no_default and hasna and dtype.kind == "f": - na_value = np.nan dtype_given = True else: dtype_given = True if na_value is lib.no_default: - na_value = arr.dtype.na_value + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f":  # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M":  # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind
== "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value if not dtype_given and hasna: try: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 05a112e464677..e041093bbf5bc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3305,6 +3305,15 @@ def test_arrow_floordiv_floating_0_divisor(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) def test_arrow_integral_floordiv_large_values(pa_type): # GH 56676 diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 8dcc1dd551315..4bc7631090761 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -6,6 +6,7 @@ from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -34,3 +35,15 @@ def test_to_numpy_arrow_dtype_given(): result = ser.to_numpy(dtype="float64") expected = np.array([1.0, np.nan]) tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) From 28a7ec7d34be480b9a14e88236500fd74a2e0109 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 16:06:36 +0100 Subject: [PATCH 096/152] Backport PR #57329 on branch 2.2.x (REGR: CategoricalIndex.difference with null values) (#57336) Backport PR #57329: REGR: CategoricalIndex.difference with null values Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 7 +++++-- .../tests/indexes/categorical/test_setops.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/indexes/categorical/test_setops.py diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 69e14a9028dd3..335dada439029 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes 
where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6f9078bf5a2a2..4b3d1a9e006dc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3663,9 +3663,12 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..2e87b90efd54c --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) From 10b26fefce26e1b2d29d2f7ca39be3b28d875def Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Feb 2024 16:09:36 -0500 Subject: [PATCH 097/152] Backport PR #57333 on branch 2.2.x (REGR: merge with 3rd party EA's can sometimes raise ValueError) (#57337) Backport PR #57333: REGR: merge with 3rd party EA's can sometimes raise ValueError --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 2 +- pandas/tests/indexes/test_base.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 335dada439029..b342eef850459 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to 
floats (:issue:`57224`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e4dfe9dec3623..ee6a11ddab004 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -295,7 +295,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666d92064c86c..1fa48f98942c2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1722,3 +1722,13 @@ def test_nan_comparison_same_object(op): result = op(idx, idx.copy()) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_is_monotonic_pyarrow_list_type(): + # GH 57333 + import pyarrow as pa + + idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + assert not idx.is_monotonic_increasing + assert not idx.is_monotonic_decreasing From 947f5aeda9d924ce7d93a46457574123bb7fb07a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 22:58:09 +0100 Subject: [PATCH 098/152] Backport PR #57323 on branch 2.2.x (REGR: Fix regression when grouping over a Series) (#57339) Backport PR #57323: REGR: Fix regression when grouping over a Series Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/internals/managers.py | 5 ++--- pandas/tests/copy_view/test_methods.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b342eef850459..eb6e50c1be160 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..220bb1133dfd5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,7 +12,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -282,8 +281,8 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: Checks if two blocks from two different block managers 
reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..5d1eefccbb1e7 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -280,6 +280,17 @@ def test_reset_index_series_drop(using_copy_on_write, index): tm.assert_series_equal(ser, ser_orig) +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result From 58b182b0e823e25081b9a6df809d45b203733d03 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:18 +0100 Subject: [PATCH 099/152] Backport PR #57341 on branch 2.2.x (REGR: assert_series_equal defaulting to check_exact=True for Index) (#57380) Backport PR #57341: REGR: assert_series_equal defaulting to check_exact=True for Index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_testing/asserters.py | 3 ++- pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index eb6e50c1be160..47ca96c6eef44 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 3de982498e996..41d2a7344a4ed 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -902,6 +902,7 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + check_exact_index = False if check_exact is lib.no_default else 
check_exact if ( check_exact is lib.no_default and rtol is lib.no_default @@ -944,7 +945,7 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 784a0347cf92b..1878e7d838064 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -474,3 +474,11 @@ def test_assert_series_equal_int_tol(): tm.assert_extension_array_equal( left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 ) + + +def test_assert_series_equal_index_exact_default(): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) + ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) From 169bb9cf8ca6ee7adbc9e1328e9fff378793ecd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:27 +0100 Subject: [PATCH 100/152] Backport PR #57340 on branch 2.2.x (REGR: shift raising for axis=1 and empty df) (#57381) Backport PR #57340: REGR: shift raising for axis=1 and empty df Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 3 +++ pandas/tests/frame/methods/test_shift.py | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 47ca96c6eef44..94e26ff6aa46a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 734756cb8f7c8..b531ddc418df1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5859,6 +5859,9 @@ def shift( ) fill_value = lib.no_default + if self.empty: + return self.copy() + axis = self._get_axis_number(axis) if is_list_like(periods): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..abb30595fdcb8 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -756,3 +756,9 @@ def test_shift_with_iterable_check_other_arguments(self): msg = "Cannot specify `suffix` if `periods` is an int." 
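        # `suffix` is only supported together with list-like `periods`,
        # so the scalar `periods=1` below is expected to raise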
with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) From 09debec463762f209787534dc335b3bb7cd3961f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:50:47 +0100 Subject: [PATCH 101/152] Backport PR #57379 on branch 2.2.x (Fix numpy-dev CI warnings) (#57383) Backport PR #57379: Fix numpy-dev CI warnings Co-authored-by: William Ayd --- .../src/vendored/ujson/python/objToJSON.c | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8bba95dd456de..74ca8ead3d936 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -447,8 +447,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -467,12 +474,19 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; + + if (PyArray_ISDATETIME(arrayobj)) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; @@ -505,8 +519,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim++; npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; @@ -1610,7 +1631,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!values) { goto INVALID; } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + + if (!PyArray_Check(pc->newObj)) { + 
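+    // As in the earlier hunks, this guard-and-cast exists to silence the
+    // numpy-dev CI warnings this patch targets: the PyArray_DIM family of
+    // accessors expects a PyArrayObject *, so the object is validated with
+    // PyArray_Check before the explicit cast that follows.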
PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array object"); + goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { From 3dca4f0f8d583c67d2df56bf9ba51839cdaa6d26 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 14 Feb 2024 03:12:16 +0100 Subject: [PATCH 102/152] Backport PR #57388 on branch 2.2.x (BUG: map(na_action=ignore) not respected for Arrow & masked types) (#57413) Backport PR #57388: BUG: map(na_action=ignore) not respected for Arrow & masked types Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/tests/extension/test_arrow.py | 7 +++++++ pandas/tests/extension/test_masked.py | 9 +++++++++ 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 94e26ff6aa46a..9733aff0e6eb5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 621a32a049f3b..f8b07bd73d315 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1414,7 +1414,7 @@ def to_numpy( def map(self, mapper, na_action=None): if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) else: return super().map(mapper, na_action) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7c49ce5343158..0bc0d9f8d7a7d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1333,7 +1333,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e041093bbf5bc..d9a3033b8380e 100644 --- 
a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3379,3 +3379,10 @@ def test_to_numpy_timestamp_to_int(): result = ser.to_numpy(dtype=np.int64) expected = np.array([1577853000000000000]) tm.assert_numpy_array_equal(result, expected) + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3efc561d6a125..651f783b44d1f 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -179,6 +179,15 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) From 11818adf539b701f7dd68ee46d920d05c52fd7ba Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:26:32 +0100 Subject: [PATCH 103/152] Backport PR #57450 on branch 2.2.x (DOC: Set verbose parameter as deprecated in docstring) (#57455) Backport PR #57450: DOC: Set verbose parameter as deprecated in docstring Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e26e7e7470461..e04f27b560610 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -240,6 +240,8 @@ performance of reading a large file. verbose : bool, default False Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. 
parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ From 5550bdb9bebe02ec57c54ca75835bbf12e05ceab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 20:21:45 +0100 Subject: [PATCH 104/152] Backport PR #57402 on branch 2.2.x (BUG: wrong future Warning on string assignment in certain condition) (#57460) Backport PR #57402: BUG: wrong future Warning on string assignment in certain condition Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/dtypes/missing.py | 14 ++++++++++++ pandas/core/indexing.py | 25 +++++++++++++-------- pandas/tests/frame/indexing/test_setitem.py | 16 +++++++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9733aff0e6eb5..5e814ec2a1b92 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4dc0d477f89e8..655a53997620a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -647,6 +647,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. 
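(A minimal sketch of what the new ``construct_1d_array_from_inferred_fill_value``
helper above returns; the example values are inferred from its implementation
rather than taken from the patch:

    construct_1d_array_from_inferred_fill_value(1, 3)
    # array([nan, nan, nan]); the int64 base is upcast to float64 by the NA fill
    construct_1d_array_from_inferred_fill_value("x", 2)
    # array([nan, nan], dtype=object)

that is, an all-NA array whose dtype is what ``value`` would take alongside
missing values. The indexing.py changes below use it to seed newly created
columns before the real values are written in.)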
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 934ba3a4d7f29..869e511fc0720 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -68,7 +69,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -844,7 +844,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -862,7 +861,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -1878,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): self.obj[key] = empty_value elif not is_list_like(value): - # Find our empty_value dtype by constructing an array - # from our value and doing a .take on it - arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj[key] = empty_value + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -2165,6 +2161,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. 
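+            # (construct_1d_array_from_inferred_fill_value returns an
+            # all-NA array in the dtype inferred from `value`, so rows
+            # not covered by `plane_indexer` become missing values
+            # rather than np.void once the real values are set.)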
+ self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 99233d3cd4cf3..a58dd701f0f22 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1401,3 +1401,19 @@ def test_full_setter_loc_incompatible_dtype(): df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) From 32d2b9912fc6292778f3d0ba67b928dbcf9e401d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 17 Feb 2024 23:33:12 +0100 Subject: [PATCH 105/152] Backport PR #57311 on branch 2.2.x (Fixing multi method for to_sql for non-oracle databases) (#57466) Backport PR #57311: Fixing multi method for to_sql for non-oracle databases Co-authored-by: Samuel Chai <121340503+kassett@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/io/sql.py | 11 ++++------- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 5e814ec2a1b92..ca4fef4f57fb6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -31,6 +31,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 55693d4cdb753..62dd0c3bf0161 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2969,6 +2969,9 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- .. 
[1] https://docs.sqlalchemy.org diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3a58daf681cfb..195a7c5040853 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1012,22 +1012,19 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: """ - Alternative to _execute_insert for DBs support multivalue INSERT. + Alternative to _execute_insert for DBs support multi-value INSERT. Note: multi-value insert is usually faster for analytics DBs and tables containing a few columns but performance degrades quickly with increase of columns. + """ from sqlalchemy import insert data = [dict(zip(keys, row)) for row in data_iter] - stmt = insert(self.table) - # conn.execute is used here to ensure compatibility with Oracle. - # Using stmt.values(data) would produce a multi row insert that - # isn't supported by Oracle. - # see: https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values - result = conn.execute(stmt, data) + stmt = insert(self.table).values(data) + result = conn.execute(stmt) return result.rowcount def insert_data(self) -> tuple[list[str], list[np.ndarray]]: From ab8541ce51b70a8be728e8a15d78f7a5f82b62de Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 18 Feb 2024 13:35:55 +0100 Subject: [PATCH 106/152] Backport PR #57454 on branch 2.2.x (Release the gil in take for axis=1) (#57484) Backport PR #57454: Release the gil in take for axis=1 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/algos_take_helper.pxi.in | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 88c3abba506a3..385727fad3c50 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -184,6 +184,17 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +206,7 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) From b79fe7e19db5931a4f6165e48b04a514d235e1a8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 18 Feb 2024 10:13:56 -0500 Subject: [PATCH 107/152] REGR: DataFrame.update emits spurious warning about downcasting (#57485) --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 13 ++++++++++++- pandas/tests/frame/methods/test_update.py | 12 +++++++----- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ca4fef4f57fb6..8c8db76ccc3d0 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle 
(:issue:`57310`) +- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b531ddc418df1..c09989fe87fb0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8962,6 +8962,7 @@ def update( 1 2 500.0 2 3 6.0 """ + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( @@ -9010,7 +9011,17 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 20ba550beeb30..8af1798aa8e00 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -48,16 +48,18 @@ def test_update(self): def test_update_dtypes(self): # gh 3016 df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) df.update(other) expected = DataFrame( - [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) tm.assert_frame_equal(df, expected) From dfc66f6e5da81f8e7cfc8ff5b4e557ac34f80a47 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 19 Feb 2024 04:51:13 -0500 Subject: [PATCH 108/152] Backport PR #57474 on branch 2.2.x (REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs) (#57496) Backport PR #57474: REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/masked.py | 17 ++++++++++++++--- pandas/tests/frame/methods/test_transpose.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 8c8db76ccc3d0..e1da0af59b3b8 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning 
integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) - Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0bc0d9f8d7a7d..320d8cb10b8c2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1621,13 +1621,24 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. """ masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index d0caa071fae1c..3e74094f266d1 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -3,6 +3,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -190,3 +191,19 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): dtype=object, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] From 25df68eb19c7c783afe276762f3c0a3f03ac8706 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 21:15:56 +0100 Subject: [PATCH 109/152] Backport PR #57486 on branch 2.2.x (CI: Run excel tests on single cpu for windows) (#57507) Backport PR #57486: CI: Run excel tests on single cpu for windows Co-authored-by: Patrick Hoefler 
<61934744+phofl@users.noreply.github.com> --- pandas/tests/io/excel/test_odf.py | 5 +++++ pandas/tests/io/excel/test_odswriter.py | 5 +++++ pandas/tests/io/excel/test_openpyxl.py | 5 +++++ pandas/tests/io/excel/test_readers.py | 4 ++++ pandas/tests/io/excel/test_style.py | 4 ++++ pandas/tests/io/excel/test_writers.py | 4 ++++ pandas/tests/io/excel/test_xlrd.py | 5 +++++ pandas/tests/io/excel/test_xlsxwriter.py | 5 +++++ 8 files changed, 37 insertions(+) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index f01827fa4ca2f..b5bb9b27258d8 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -3,11 +3,16 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 271353a173d2a..1c728ad801bc1 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -6,6 +6,8 @@ import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -13,6 +15,9 @@ odf = pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 2df9ec9e53516..e53b5830ec6a4 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -17,6 +19,9 @@ openpyxl = pytest.importorskip("openpyxl") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 15712f36da4ca..8da8535952dcf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -18,6 +18,7 @@ from pandas._config import using_pyarrow_string_dtype +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -34,6 +35,9 @@ StringArray, ) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 3ca8637885639..89615172688d7 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,6 +21,9 @@ # could compute styles and render to excel without jinja2, since there is no # 'template' file, but this needs the import error to delayed until render time. 
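# (The single-CPU pin added below matches the other excel test modules in
# this patch; presumably it avoids flaky parallel excel runs on Windows CI,
# though the commit message does not state the failure mode.)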
+if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8c003723c1c71..292eab2d88152 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -34,6 +35,9 @@ ) from pandas.io.excel._util import _writers +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def get_exp_unit(path: str) -> str: return "ns" diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 6d5008ca9ee68..066393d91eead 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -11,6 +13,9 @@ xlrd = pytest.importorskip("xlrd") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(params=[".xls"]) def read_ext_xlrd(request): diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 94f6bdfaf069c..529367761fc02 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -2,6 +2,8 @@ import pytest +from pandas.compat import is_platform_windows + from pandas import DataFrame import pandas._testing as tm @@ -9,6 +11,9 @@ xlsxwriter = pytest.importorskip("xlsxwriter") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): From 6766c92fbac05118374c3963ac4b59524d80cf82 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:12:25 +0100 Subject: [PATCH 110/152] Backport PR #57490 on branch 2.2.x (DOC: Add a few deprecation notes) (#57509) Backport PR #57490: DOC: Add a few deprecation notes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/dtypes/common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2245359fd8eac..df0251d141984 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -169,6 +169,9 @@ def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + Check that the one-dimensional array-like is a pandas sparse array. Returns True if it is a pandas sparse array, not another type of sparse array. @@ -295,6 +298,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -381,6 +387,9 @@ def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.Period) instead. 
+ Parameters ---------- arr_or_dtype : array-like or dtype @@ -424,6 +433,9 @@ def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -470,6 +482,9 @@ def is_categorical_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Categorical dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype From 2ae7a1057a1c64b4cddf94aa5fa439f5a14ea5d1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 02:20:20 +0100 Subject: [PATCH 111/152] Backport PR #57488 on branch 2.2.x (REGR: query raising for all NaT in object column) (#57515) Backport PR #57488: REGR: query raising for all NaT in object column Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 2 +- pandas/tests/frame/test_query_eval.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e1da0af59b3b8..6c6a37b2b2f89 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all ``NaT`` column with object dtype (:issue:`57068`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62dd0c3bf0161..0ec17173287f5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -657,7 +657,7 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a498296e09c52..2c807c72582c5 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1415,3 +1415,11 @@ def test_query_ea_equality_comparison(self, dtype, engine): } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + 
tm.assert_frame_equal(result, expected) From 9b1ce062d3985e7556fc7fa6f9d4c6bc184576eb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:51:27 +0100 Subject: [PATCH 112/152] Backport PR #57489 on branch 2.2.x (REGR: astype introducing decimals when casting from int with na to string) (#57533) Backport PR #57489: REGR: astype introducing decimals when casting from int with na to string Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/lib.pyx | 2 +- pandas/tests/series/methods/test_astype.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 6c6a37b2b2f89..85b9346aa01a5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -39,6 +39,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c483f35513a40..7656e8d986117 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -759,7 +759,7 @@ cpdef ndarray[object] ensure_string_array( out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 46f55fff91e41..4c8028e74ee55 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -673,3 +673,11 @@ def test_astype_timedelta64_with_np_nan(self): result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) From 0b49cf35bdc025e69ef104d59660cb08b973fec5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:29 +0100 Subject: [PATCH 113/152] Backport PR #57536 on branch 2.2.x (BUG: dt64 + DateOffset with milliseconds) (#57537) Backport PR #57536: BUG: dt64 + DateOffset with milliseconds Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 15 
+++++++++- pandas/tests/arithmetic/test_datetime64.py | 32 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 85b9346aa01a5..ab62a7c7598d5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -42,6 +42,7 @@ Fixed regressions - Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) +- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 70e1ca1c4012a..c37a4b285daef 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1458,13 +1458,22 @@ cdef class RelativeDeltaOffset(BaseOffset): "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } td_kwds = { key: val for key, val in kwds.items() - if key in ["days", "hours", "minutes", "seconds", "microseconds"] + if key in td_args } if "weeks" in kwds: days = td_kwds.get("days", 0) @@ -1474,6 +1483,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(**td_kwds) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") else: @@ -1491,6 +1502,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(self._offset * self.n) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") return delta diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index dbff88dc6f4f6..a468449efd507 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1586,6 +1586,38 @@ def test_dti_add_sub_nonzero_mth_offset( expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate From 
c101d3022b0a6b3a2375db64111c3f6fe516acf3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:46 +0100 Subject: [PATCH 114/152] Backport PR #57510 on branch 2.2.x (DOC: Fix xarray example) (#57538) DOC: Fix xarray example (#57510) * DOC: Fix xarray example * Update * Skip doctest * Igore another doctest --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 33a1cd7163ce712e5a38fdb5d2e04de203b3ddf9) --- pandas/core/generic.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0ec17173287f5..1dd471a09f1b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3278,18 +3278,18 @@ def to_xarray(self): 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df['max_speed'].to_xarray() # doctest: +SKIP array([389. , 24. , 80.5, nan]) Coordinates: @@ -3311,7 +3311,7 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: From 3a4033c7296a665fcbbbcd49a05a420611fa3f3c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:46:10 +0100 Subject: [PATCH 115/152] Backport PR #57439 on branch 2.2.x (BUG: read_json returning Index instead of RangeIndex) (#57552) Backport PR #57439: BUG: read_json returning Index instead of RangeIndex Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/io/json/_json.py | 25 ++++++++++++++----------- pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++-- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ab62a7c7598d5..d81032fafa730 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, 
:meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4c490c6b2cda2..9414f45215029 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1266,6 +1266,7 @@ def _try_convert_data( if result: return new_data, True + converted = False if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True @@ -1273,16 +1274,17 @@ def _try_convert_data( # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): @@ -1291,14 +1293,15 @@ def _try_convert_data( new_data = data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1307,7 +1310,7 @@ def _try_convert_data( if self.orient == "split": return data, False - return data, True + return data, converted @final def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index caa25841d3596..5279f3f1cdfbe 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -24,6 +24,7 @@ DataFrame, DatetimeIndex, Index, + RangeIndex, Series, Timestamp, date_range, @@ -493,12 +494,12 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) - right.index = pd.RangeIndex(len(df)) + right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) - right.columns = pd.RangeIndex(df.shape[1]) + right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) @@ -2188,3 +2189,14 @@ def test_to_json_ea_null(): {"a":null,"b":null} """ assert result == expected + + +def test_read_json_lines_rangeindex(): + # GH 57429 + data = """ +{"a": 1, "b": 2} +{"a": 3, "b": 4} +""" + result = read_json(StringIO(data), lines=True).index + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) From 3bfedfef7401f6af7e551b9fd2182442b70f0409 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:49:28 -1000 Subject: [PATCH 116/152] Backport PR #57551: BLD: Add pyarrow extra for pip installation (#57557) --- .github/workflows/package-checks.yml | 2 +- 
doc/source/whatsnew/v2.2.1.rst | 6 ++++++ pyproject.toml | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index d59ddf272f705..7c1da5678a2aa 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d81032fafa730..e381f9df16383 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -9,6 +9,12 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + .. _whatsnew_221.regressions: Fixed regressions diff --git a/pyproject.toml b/pyproject.toml index 8c9a79aa2b059..c225ed80dcb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] From ea56e0cd720b8a4e1c26df7387e8ccb4badf6eb2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 22 Feb 2024 19:45:36 +0100 Subject: [PATCH 117/152] Backport PR #57556 on branch 2.2.x (Remove PyArrow deprecation warning) (#57568) --- .github/workflows/unit-tests.yml | 5 +---- doc/source/whatsnew/v2.2.1.rst | 10 ++++++++++ pandas/__init__.py | 31 +------------------------------ pandas/compat/pyarrow.py | 2 -- pandas/tests/test_common.py | 25 ------------------------- 5 files changed, 12 insertions(+), 61 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4c7aa1e1e49ee..8736674bbf965 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,10 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - # Currently restricted the warnings that error to Deprecation Warnings from numpy - # done since pyarrow isn't compatible with numpydev always - # TODO: work with pyarrow to revert this? 
- test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e381f9df16383..35d64c99f2002 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -67,6 +67,16 @@ Bug fixes Other ~~~~~ + +.. note:: + + The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being + installed has been removed. This decision was made because the warning was too noisy for too + many users and a lot of feedback was collected about the decision to make PyArrow a required + dependency. Pandas is currently considering the decision whether or not PyArrow should be added + as a hard dependency in 3.0. Interested users can follow the discussion + `here `_. + - Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) - Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) diff --git a/pandas/__init__.py b/pandas/__init__.py index ed524c2bb3619..ca2eba2043292 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -203,36 +203,7 @@ stacklevel=2, ) -# DeprecationWarning for missing pyarrow -from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found - -if pa_version_under10p1: - # pyarrow is either too old or nonexistent, warn - from pandas.compat._optional import VERSIONS - - if pa_not_found: - pa_msg = "was not found to be installed on your system." - else: - pa_msg = ( - f"was too old on your system - pyarrow {VERSIONS['pyarrow']} " - "is the current minimum supported version as of this release." 
- ) - - warnings.warn( - f""" -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but {pa_msg} -If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 - """, # noqa: E501 - DeprecationWarning, - stacklevel=2, - ) - del VERSIONS, pa_msg - -# Delete all unnecessary imported modules -del pa_version_under10p1, pa_not_found, warnings, os +del warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 2e151123ef2c9..beb4814914101 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,7 +8,6 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_not_found = False pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") @@ -17,7 +16,6 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: - pa_not_found = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 4af71be11fe6b..e8a1c961c8cb6 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import Series import pandas._testing as tm @@ -267,26 +265,3 @@ def test_bz2_missing_import(): code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) - - -@td.skip_if_installed("pyarrow") -@pytest.mark.parametrize("module", ["pandas", "pandas.arrays"]) -def test_pyarrow_missing_warn(module): - # GH56896 - response = subprocess.run( - [sys.executable, "-c", f"import {module}"], - capture_output=True, - check=True, - ) - msg = """ -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but was not found to be installed on your system. 
-If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 -""" # noqa: E501 - stderr_msg = response.stderr.decode("utf-8") - # Split by \n to avoid \r\n vs \n differences on Windows/Unix - # https://stackoverflow.com/questions/11989501/replacing-r-n-with-n - stderr_msg = "\n".join(stderr_msg.splitlines()) - assert msg in stderr_msg From 5521dc98f9fe73cc3a0adbbf61a2e15f3ae0736e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 01:38:10 +0100 Subject: [PATCH 118/152] Backport PR #57314 on branch 2.2.x (BUG: Fix near-minimum timestamp handling) (#57573) Backport PR #57314: BUG: Fix near-minimum timestamp handling Co-authored-by: Robert Schmidtke --- doc/source/whatsnew/v2.2.1.rst | 1 + .../src/vendored/numpy/datetime/np_datetime.c | 18 ++++++++++++++---- pandas/tests/tslibs/test_array_to_datetime.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 35d64c99f2002..fa5304c1c3b56 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..277d01807f2f3 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -482,10 +482,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. 
+ PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 632d3b4cc3c84..82175c67764f8 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -296,6 +296,23 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass From bdbb17993cacf916648c1f3e0aa637f459346a4f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 05:47:35 +0100 Subject: [PATCH 119/152] Backport PR #57576 on branch 2.2.x (DOC: Add release date for 2.2.1) (#57579) Backport PR #57576: DOC: Add release date for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index fa5304c1c3b56..f4ed594613349 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_221: -What's new in 2.2.1 (February XX, 2024) +What's new in 2.2.1 (February 22, 2024) --------------------------------------- These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog From 541448e41cefbff27e45866ad173826fd65e807e Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 08:36:39 -0500 Subject: [PATCH 120/152] RLS: 2.2.1 From 470b886470f202e578c77952db8163e49f3efeab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:48:32 +0100 Subject: [PATCH 121/152] Backport PR #57582 on branch 2.2.x (DOC: Add contributors for 2.2.1) (#57583) Backport PR #57582: DOC: Add contributors for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- doc/source/whatsnew/v2.2.1.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9ab0452c8334..e015afb17dce5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -946,4 +946,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.4..v2.2.0|HEAD +.. contributors:: v2.1.4..v2.2.0 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index f4ed594613349..310dd921e44f6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -86,3 +86,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. 
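As a quick illustration of the near-minimum timestamp fix above (GH 57150): values within one microsecond of the nanosecond lower bound now parse instead of overflowing. A sketch, assuming pandas >= 2.2.1:

import pandas as pd

# Timestamp.min is 1677-09-21 00:12:43.145224193; scaling its microsecond
# part to nanoseconds used to overflow even though the final value fits.
ts = pd.Timestamp("1677-09-21 00:12:43.145224193")
assert ts == pd.Timestamp.min

# In bounds as well, so no OutOfBoundsDatetime is raised.
print(pd.to_datetime(["1677-09-21 00:12:43.145224999"]))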
contributors:: v2.2.0..v2.2.1|HEAD From bdc79c146c2e32f2cab629be240f01658cfb6cc2 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 09:49:27 -0500 Subject: [PATCH 122/152] RLS: 2.2.1 From 9a0718468d67608f25dd56c3d912e43950da154a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 1 Mar 2024 15:36:48 +0000 Subject: [PATCH 123/152] Backport PR #57689 on branch 2.2.x (CI: fix ci (calamine typing)) (#57692) Backport PR #57689: CI: fix ci (calamine typing) --- pandas/io/excel/_calamine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 4f65acf1aa40e..5259469f7a569 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -74,9 +74,7 @@ def load_workbook( ) -> CalamineWorkbook: from python_calamine import load_workbook - return load_workbook( - filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] - ) + return load_workbook(filepath_or_buffer, **engine_kwargs) @property def sheet_names(self) -> list[str]: From 4ac5ee21a073a93fbf47b1e95ea1637a02d870da Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 3 Mar 2024 13:54:00 +0100 Subject: [PATCH 124/152] Backport PR #57668 on branch 2.2.x (CLN: More numpy 2 stuff) (#57707) Backport PR #57668: CLN: More numpy 2 stuff Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 74ca8ead3d936..fa91db5fe34e3 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -74,7 +74,6 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; - PyArray_GetItemFunc *getitem; char **rowLabels; char **columnLabels; @@ -405,7 +404,6 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; @@ -492,7 +490,7 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); } npyarr->dataptr += npyarr->stride; From 6db283c367ab178e5464bb5ff323bdc6c9ebce1f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:45:38 -1000 Subject: [PATCH 125/152] Backport PR #57726: TST/CI: Fix test_repr on musl for dateutil 2.9 (#57728) --- pandas/tests/io/pytables/test_timezones.py | 2 +- pandas/tests/scalar/timestamp/test_formats.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8c61830ebe038..c5613daf62207 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -104,7 +104,7 @@ def test_append_with_timezones(setup_path, gettz): msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value 
\[(dateutil/.*)?US/Eastern\] " + r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] " r"conflicts with new value \[(dateutil/.*)?EET\]" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index d7160597ea6d6..e7ebcccef1c86 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -88,7 +88,7 @@ def test_isoformat(ts, timespec, expected_iso): class TestTimestampRendering: - timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"] @pytest.mark.parametrize("tz", timezones) @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) From 3cc5afa53e123a89e594df87630f9ac61e718a09 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:46:47 +0100 Subject: [PATCH 126/152] Backport PR #57721 on branch 2.2.x (update from 2022 to 2024 image) (#57729) Backport PR #57721: update from 2022 to 2024 image Co-authored-by: Thomas Baumann --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 90afb1ce29684..ea93575ac9430 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,7 +3,7 @@ version: 2.1 jobs: test-arm: machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: ENV_FILE: ci/deps/circle-310-arm64.yaml @@ -46,7 +46,7 @@ jobs: cibw-build: type: string machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: TRIGGER_SOURCE: << pipeline.trigger_source >> From 301f9145b26f70c18e2a34c0e2fd1a346ba413d9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Mar 2024 14:08:17 -1000 Subject: [PATCH 127/152] Backport PR #57172: MAINT: Adjust the codebase to the new `np.array`'s `copy` keyword meaning (#57740) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- pandas/core/array_algos/quantile.py | 6 ++--- pandas/core/arrays/arrow/array.py | 4 +++- pandas/core/arrays/base.py | 5 +++- pandas/core/arrays/categorical.py | 4 +++- pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/arrays/datetimes.py | 6 ++--- pandas/core/arrays/interval.py | 4 +++- pandas/core/arrays/masked.py | 4 +++- pandas/core/arrays/numeric.py | 14 +++++++---- pandas/core/arrays/numpy_.py | 4 +++- pandas/core/arrays/period.py | 9 ++++++-- pandas/core/arrays/sparse/array.py | 4 +++- pandas/core/arrays/timedeltas.py | 7 ++++-- pandas/core/construction.py | 14 ++++++++--- pandas/core/dtypes/cast.py | 7 ++++-- pandas/core/dtypes/missing.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 +++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 6 ++--- pandas/core/internals/managers.py | 2 ++ pandas/core/series.py | 7 +++++- pandas/io/pytables.py | 2 +- .../tests/arrays/integer/test_arithmetic.py | 1 + pandas/tests/arrays/test_datetimelike.py | 23 +++++++++++-------- pandas/tests/dtypes/test_inference.py | 4 ++-- .../tests/extension/array_with_attr/array.py | 5 +++- pandas/tests/extension/json/array.py | 8 ++++--- pandas/tests/extension/list/array.py | 5 +++- pandas/tests/extension/test_common.py | 8 ++++--- .../tests/frame/methods/test_select_dtypes.py | 2 +-
pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/indexes/test_index_new.py | 2 +- 33 files changed, 125 insertions(+), 58 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..5c933294fb944 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -102,7 +102,7 @@ def quantile_with_mask( interpolation=interpolation, ) - result = np.array(result, copy=False) + result = np.asarray(result) result = result.T return result @@ -201,9 +201,9 @@ def _nanpercentile( ] if values.dtype.kind == "f": # preserve itemsize - result = np.array(result, dtype=values.dtype, copy=False).T + result = np.asarray(result, dtype=values.dtype).T else: - result = np.array(result, copy=False).T + result = np.asarray(result).T if ( result.dtype != values.dtype and not mask.all() diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f8b07bd73d315..f2b8aa75ca5bf 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -656,7 +656,9 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._pa_array - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ea0e2e54e3339..abfe2369b0d0d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -719,7 +719,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - return np.array(self, dtype=dtype, copy=copy) + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b87c5375856dc..f191f7277743f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1636,7 +1636,9 @@ def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndar # ------------------------------------------------------------- @ravel_compat - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ The numpy array interface. 
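The mechanical edits running through the files below track NumPy 2.0's reinterpretation of the ``copy`` keyword: ``np.array(x, copy=False)`` now raises when a copy cannot be avoided, so call sites that only want to avoid a gratuitous copy switch to ``np.asarray``. A sketch of the semantics being accommodated, assuming NumPy >= 2.0:

import numpy as np

arr = np.arange(3, dtype="int64")

# Copies only when the requested cast makes it unavoidable.
as_float = np.asarray(arr, dtype="float64")

try:
    # Under NumPy 2.0, copy=False means "never copy" rather than
    # "avoid copying when possible", so a required cast raises.
    np.array(arr, dtype="float64", copy=False)
except ValueError:
    print("copy=False now raises when a copy is unavoidable")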
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a0e0a1434e871..1042a1b3fde61 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -351,7 +351,9 @@ def _formatter(self, boxed: bool = False): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 02656b655a0c6..a146220d249e2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -635,12 +635,12 @@ def _resolution_obj(self) -> Resolution: # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object - return super().__array__(dtype=dtype) + return super().__array__(dtype=dtype, copy=copy) def __iter__(self) -> Iterator: """ @@ -2393,7 +2393,7 @@ def objects_to_datetime64( assert errors in ["raise", "ignore", "coerce"] # if str-dtype, convert - data = np.array(data, copy=False, dtype=np.object_) + data = np.asarray(data, dtype=np.object_) result, tz_parsed = tslib.array_to_datetime( data, diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index e69f996441703..91db7f11bcbe0 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1567,7 +1567,9 @@ def is_non_overlapping_monotonic(self) -> bool: # --------------------------------------------------------------------- # Conversion - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 320d8cb10b8c2..d7e816b9d3781 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -593,7 +593,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ the array interface, return my values We return an object array here to preserve our scalar values diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..68fa7fcb6573c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -159,7 +159,10 @@ def _coerce_to_data_and_mask( return values, mask, dtype, inferred_type original = values - values = np.array(values, copy=copy) + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) inferred_type = None if values.dtype == object or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) @@ -168,7 +171,10 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif 
values.dtype.kind == "b" and checker(dtype): - values = np.array(values, dtype=default_dtype, copy=copy) + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) elif values.dtype.kind not in "iuf": name = dtype_cls.__name__.strip("_") @@ -207,9 +213,9 @@ def _coerce_to_data_and_mask( inferred_type not in ["floating", "mixed-integer-float"] and not mask.any() ): - values = np.array(original, dtype=dtype, copy=False) + values = np.asarray(original, dtype=dtype) else: - values = np.array(original, dtype="object", copy=False) + values = np.asarray(original, dtype="object") # we copy as need to coerce here if mask.any(): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d83a37088daec..07eb91e0cb13b 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,7 +150,9 @@ def dtype(self) -> NumpyEADtype: # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d635eb4a25df3..c1229e27ab51a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -256,7 +256,10 @@ def __init__( raise raise_on_incompatible(values, dtype.freq) values, dtype = values._ndarray, values.dtype - values = np.array(values, dtype="int64", copy=copy) + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) if dtype is None: raise ValueError("dtype is not specified and cannot be inferred") dtype = cast(PeriodDtype, dtype) @@ -400,7 +403,9 @@ def freq(self) -> BaseOffset: def freqstr(self) -> str: return freq_to_period_freqstr(self.freq.n, self.freq.name) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if dtype == "i8": return self.asi8 elif dtype == bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 98d84d899094b..82fcfa74ec7d2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -551,7 +551,9 @@ def from_spmatrix(cls, data: spmatrix) -> Self: return cls._simple_new(arr, index, dtype) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1b885a2bdcd47..e9260a3ec50a2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1072,7 +1072,10 @@ def sequence_to_td64ns( # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) assert data.dtype.kind == "m" assert data.dtype != "m8" # i.e. not unit-less @@ -1150,7 +1153,7 @@ def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): higher level. 
""" # coerce Index to np.ndarray, converting string-dtype if necessary - values = np.array(data, dtype=np.object_, copy=False) + values = np.asarray(data, dtype=np.object_) result = array_to_timedelta64(values, unit=unit, errors=errors) return result.view("timedelta64[ns]") diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d41a9c80a10ec..f8250ae475a10 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -626,7 +626,10 @@ def sanitize_array( elif hasattr(data, "__array__"): # e.g. dask array GH#38645 - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) return sanitize_array( data, index=index, @@ -744,8 +747,11 @@ def _sanitize_str_dtypes( # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - result = np.array(data, dtype=object, copy=copy) + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) return result @@ -810,6 +816,8 @@ def _try_cast( # this will raise if we have e.g. floats subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) else: subarr = np.array(arr, dtype=dtype, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 690af6b0ebdc7..7dd81ec59bc49 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1501,7 +1501,10 @@ def construct_2d_arraylike_from_scalar( # Attempt to coerce to a numpy array try: - arr = np.array(value, dtype=dtype, copy=copy) + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: raise TypeError( f"DataFrame constructor called with incompatible data and dtype: {err}" @@ -1651,7 +1654,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n "out-of-bound Python int", DeprecationWarning, ) - casted = np.array(arr, dtype=dtype, copy=False) + casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 655a53997620a..c341ff9dff7e6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -632,7 +632,7 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] - val = np.array(val, copy=False) + val = np.asarray(val) if val.dtype.kind in "mM": return np.array("NaT", dtype=val.dtype) elif val.dtype == object: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c09989fe87fb0..5c510d98596df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1980,7 +1980,7 @@ def to_numpy( dtype = np.dtype(dtype) result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) + result = np.asarray(result, dtype=dtype) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1dd471a09f1b1..2a86f75badecd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2145,7 +2145,9 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__: int = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, 
dtype: npt.DTypeLike | None = None, copy: bool_t | None = None + ) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) if ( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4b3d1a9e006dc..6822c2c99427e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -912,7 +912,7 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 02a841a2075fd..091ddbcc099be 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -774,7 +774,7 @@ def _values(self) -> np.ndarray: ): vals = vals.astype(object) - vals = np.array(vals, copy=False) + vals = np.asarray(vals) vals = algos.take_nd(vals, codes, fill_value=index._na_value) values.append(vals) @@ -1309,7 +1309,7 @@ def copy( # type: ignore[override] new_index._id = self._id return new_index - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" return self.values @@ -3397,7 +3397,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): locs = (level_codes >= idx.start) & (level_codes < idx.stop) return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.asarray(level_codes == idx, dtype=bool) if not locs.any(): # The label is present in self.levels[level] but unused: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 220bb1133dfd5..2e0e04717373f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1682,6 +1682,8 @@ def as_array( na_value=na_value, copy=copy, ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) diff --git a/pandas/core/series.py b/pandas/core/series.py index c6a905cbb6ec1..236085c2a62e1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -971,7 +971,9 @@ def view(self, dtype: Dtype | None = None) -> Series: # ---------------------------------------------------------------------- # NDArray Compat - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the values as a NumPy array. @@ -984,6 +986,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy : bool or None, optional + Unused. 
+ Returns ------- numpy.ndarray diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1139519d2bcd3..13c2f10785124 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4065,7 +4065,7 @@ def _create_axes( if isinstance(data_converted.dtype, CategoricalDtype): ordered = data_converted.ordered meta = "category" - metadata = np.array(data_converted.categories, copy=False).ravel() + metadata = np.asarray(data_converted.categories).ravel() data, dtype_name = _get_data_and_dtype_name(data_converted) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index d979dd445a61a..8acd298f37a07 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,6 +197,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Addition/subtraction of integers and integer-arrays with Timestamp", "has no kernel", "not implemented", + "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", ] ) with pytest.raises(errs, match=msg): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 82524ea115019..7f85c891afeed 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -12,6 +12,7 @@ Timestamp, ) from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -638,13 +639,14 @@ def test_round(self, arr1d): def test_array_interface(self, datetime_index): arr = datetime_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -653,7 +655,7 @@ def test_array_interface(self, datetime_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") @@ -696,6 +698,7 @@ def test_array_tz(self, arr1d): # GH#23524 arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -704,17 +707,18 @@ def test_array_tz(self, arr1d): result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - # check that we are not making copies when setting copy=False - result = np.array(arr, dtype="M8[ns]", copy=False) + # check that we are not making copies when setting copy=copy_false + result = np.array(arr, dtype="M8[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, arr1d): arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -723,8 +727,8 @@ def test_array_i8_dtype(self, 
arr1d): result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # check that we are still making copies when setting copy=False - result = np.array(arr, dtype="i8", copy=False) + # check that we are still making copies when setting copy=copy_false + result = np.array(arr, dtype="i8", copy=copy_false) assert result.base is not expected.base assert result.base is None @@ -950,13 +954,14 @@ def test_int_properties(self, timedelta_index, propname): def test_array_interface(self, timedelta_index): arr = timedelta_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -965,7 +970,7 @@ def test_array_interface(self, timedelta_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="timedelta64[ns]", copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 49eb06c299886..0567be737c681 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -112,8 +112,8 @@ def it_outer(): def __len__(self) -> int: return len(self._values) - def __array__(self, t=None): - return np.asarray(self._values, dtype=t) + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) @property def ndim(self): diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py index d0249d9af8098..2789d51ec2ce3 100644 --- a/pandas/tests/extension/array_with_attr/array.py +++ b/pandas/tests/extension/array_with_attr/array.py @@ -49,7 +49,10 @@ def __init__(self, values, attr=None) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - data = np.array(scalars, dtype="float64", copy=copy) + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) return cls(data) def __getitem__(self, item): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 31f44f886add7..e43b50322bb92 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -146,7 +146,7 @@ def __eq__(self, other): def __ne__(self, other): return NotImplemented - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): if dtype is None: dtype = object if dtype == object: @@ -210,8 +210,10 @@ def astype(self, dtype, copy=True): value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() return arr_cls._from_sequence(value, dtype=dtype, copy=False) - - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): # Parent method doesn't work since np.array will try to infer diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index f07585c0aec10..b3bb35c9396f4 
100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -115,7 +115,10 @@ def astype(self, dtype, copy=True): elif is_string_dtype(dtype) and not is_object_dtype(dtype): # numpy has problems with astype(str) for nested elements return np.array([str(x) for x in self.data], dtype=dtype) - return np.array(self.data, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self.data, dtype=dtype) + else: + return np.array(self.data, dtype=dtype, copy=copy) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 3d8523f344d46..5eda0f00f54ca 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -17,7 +17,7 @@ class DummyArray(ExtensionArray): def __init__(self, data) -> None: self.data = data - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -30,8 +30,10 @@ def astype(self, dtype, copy=True): if copy: return type(self)(self.data) return self - - return np.array(self, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 47c479faed1ef..d1bee6a3de613 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -32,7 +32,7 @@ def __init__(self, data, dtype) -> None: self.data = data self._dtype = dtype - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 42ce658701355..0593de7556406 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -59,7 +59,7 @@ def __init__(self, value, dtype) -> None: self.value = value self.dtype = np.dtype(dtype) - def __array__(self): + def __array__(self, dtype=None, copy=None): return np.array(self.value, dtype=self.dtype) def __str__(self) -> str: diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 72641077c90fe..6042e5b9cc679 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -413,7 +413,7 @@ class ArrayLike: def __init__(self, array) -> None: self.array = array - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: return self.array expected = Index(array) From 63b9eba683019ef1b3ce3b66fe71dc7712f2d4aa Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:13:44 +0100 Subject: [PATCH 128/152] Backport PR #57759 on branch 2.2.x (DOC: add whatsnew for v2.2.2) (#57763) * Backport PR #57759: DOC: add whatsnew for v2.2.2 * [skip-ci] --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.2.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 3a2ab4c17d1bd..34a2845290d5a 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 
+16,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.2 v2.2.1 v2.2.0 diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst new file mode 100644 index 0000000000000..058f7aebcd538 --- /dev/null +++ b/doc/source/whatsnew/v2.2.2.rst @@ -0,0 +1,36 @@ +.. _whatsnew_222: + +What's new in 2.2.2 (April XX, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.contributors: + +Contributors +~~~~~~~~~~~~ From e44f91d70ef02081bebec8137b59db1c8cc2161d Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 8 Mar 2024 07:02:16 +0000 Subject: [PATCH 129/152] Backport PR #57665 on branch 2.2.x (BUG: interchange protocol with nullable datatypes a non-null validity) (#57769) BUG: interchange protocol with nullable datatypes a non-null validity (#57665) * BUG: interchange protocol with nullable datatypes a non-null validity provides nonsense results * whatsnew * :label: typing * parametrise over more types * move whatsnew (cherry picked from commit 03717bcc5ae762d8a0ab8d259ca000af66e8ba82) --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/core/interchange/column.py | 18 ++++++++++- pandas/tests/interchange/test_impl.py | 44 +++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 058f7aebcd538..96f210ce6b7b9 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`) - ..
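A short sketch of the fixed interchange behavior for masked extension dtypes (assuming pandas >= 2.2.2; outputs noted in comments):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, None]}, dtype="Int64")
col = df.__dataframe__().get_column_by_name("a")

# Masked dtypes now advertise a byte validity mask and hand consumers the
# underlying integer data plus the boolean mask as separate buffers.
print(col.null_count)     # 1
print(col.describe_null)  # (ColumnNullType.USE_BYTEMASK, 1)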
--------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index e273ecad8b51e..7b39403ca1916 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -190,6 +190,10 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -290,7 +294,13 @@ def _get_data_buffer( if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + raise NotImplementedError("ArrowDtype not handled yet") + else: + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: @@ -328,6 +338,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: """ null, invalid = self.describe_null + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype + if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. # TODO: maybe store as bit array to save space?.. diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d47b533f92235..a1dedb6be456c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -9,7 +9,6 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_lt1p23 -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -404,17 +403,50 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -@pytest.mark.parametrize( - "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] -) -def test_nullable_integers(dtype: str) -> None: +def test_nullable_integers() -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype="Int8") + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") +def test_nullable_integers_pyarrow() -> None: # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype=dtype) + df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") expected = pd.DataFrame({"a": [1]}, dtype="int8") result = pd.api.interchange.from_dataframe(df.__dataframe__()) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ], +) +def test_pandas_nullable_w_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + df = 
pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From d600189edeb0768477d929b667f57d0f6566eb23 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Mar 2024 22:35:59 +0100 Subject: [PATCH 130/152] Backport PR #57780 on branch 2.2.x (COMPAT: Adapt to Numpy 2.0 dtype changes) (#57784) Backport PR #57780: COMPAT: Adapt to Numpy 2.0 dtype changes Co-authored-by: Sebastian Berg --- pandas/_libs/src/datetime/pd_datetime.c | 4 ++++ .../_libs/src/vendored/numpy/datetime/np_datetime.c | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 19de51be6e1b2..4c1969f6d9f57 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -20,6 +20,9 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include #include "datetime.h" +/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include "numpy/ndarrayobject.h" #include "pandas/datetime/pd_datetime.h" #include "pandas/portable.h" @@ -255,5 +258,6 @@ static struct PyModuleDef pandas_datetimemodule = { PyMODINIT_FUNC PyInit_pandas_datetime(void) { PyDateTime_IMPORT; + import_array(); return PyModuleDef_Init(&pandas_datetimemodule); } diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 277d01807f2f3..934c54fafb634 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -16,8 +16,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt // Licence at LICENSES/NUMPY_LICENSE -#define NO_IMPORT - #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API @@ -25,7 +23,10 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include "pandas/vendored/numpy/datetime/np_datetime.h" -#include + +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include #include #if defined(_WIN32) @@ -1070,5 +1071,8 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); +#if NPY_ABI_VERSION < 0x02000000 +#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) +#endif + return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; } From 33006cd14d5febd30176b0003be668770c0e1671 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:15:24 +0100 Subject: [PATCH 131/152] Backport PR #57821 on branch 2.2.x (Fix doc build) (#57822) Backport PR #57821: Fix doc build Co-authored-by: Trinh Quoc Anh --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 58eb69ad1f070..9cec788ca600b 100644 --- a/environment.yml +++ b/environment.yml @@ -62,6 +62,7 @@ dependencies: # downstream packages - dask-core - seaborn-base + - dask-expr # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a63e59e1db88..2e7d06ea1c12d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -49,6 +49,7 @@ xlsxwriter>=3.0.5 zstandard>=0.19.0 dask seaborn +dask-expr moto flask asv>=0.6.1 From 9ed53829a2e2f2115c7805c3b255f3a5ef53e390 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 13 Mar 2024 20:30:32 +0100 Subject: [PATCH 132/152] Backport PR #57830 on branch 2.2.x (DOC: Pin dask/dask-expr for scale.rst) (#57832) Backport PR #57830: DOC: Pin dask/dask-expr for scale.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 9cec788ca600b..7db2767661d62 100644 --- a/environment.yml +++ b/environment.yml @@ -60,9 +60,9 @@ dependencies: - zstandard>=0.19.0 # downstream packages - - dask-core + - dask-core<=2024.2.1 - seaborn-base - - dask-expr + - dask-expr<=0.5.3 # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 2e7d06ea1c12d..68e386edb0c31 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,9 +47,9 @@ xarray>=2022.12.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 -dask +dask<=2024.2.1 seaborn -dask-expr +dask-expr<=0.5.3 moto flask asv>=0.6.1 From 4fdbe560664b12d26e68921a7d1f015d1f5030cd Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 14 Mar 2024 01:39:40 +0100 Subject: [PATCH 133/152] Backport PR #57796 on branch 2.2.x (Fix issue with Tempita recompilation) (#57834) Backport PR #57796: Fix issue with Tempita recompilation Co-authored-by: William Ayd --- pandas/_libs/meson.build | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..7621915ebcfdb 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -54,25 +54,37 @@ _intervaltree_helper = custom_target('intervaltree_helper_pxi', py, tempita, '@INPUT@', 
'-o', '@OUTDIR@' ] ) -_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) + +_algos_pxi_dep = declare_dependency(sources: [_algos_take_helper, _algos_common_helper]) +_khash_pxi_dep = declare_dependency(sources: _khash_primitive_helper) +_hashtable_pxi_dep = declare_dependency( + sources: [_hashtable_class_helper, _hashtable_func_helper] +) +_index_pxi_dep = declare_dependency(sources: _index_class_helper) +_intervaltree_pxi_dep = declare_dependency(sources: _intervaltree_helper) +_sparse_pxi_dep = declare_dependency(sources: _sparse_op_helper) + subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, + 'algos': {'sources': ['algos.pyx'], + 'deps': [_khash_pxi_dep, _algos_pxi_dep]}, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, - 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, + 'hashtable': {'sources': ['hashtable.pyx'], + 'deps': [_khash_pxi_dep, _hashtable_pxi_dep]}, + 'index': {'sources': ['index.pyx'], + 'deps': [_khash_pxi_dep, _index_pxi_dep]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper], - 'deps': _khash_primitive_helper_dep}, - 'join': {'sources': ['join.pyx', _khash_primitive_helper], - 'deps': _khash_primitive_helper_dep}, + 'interval': {'sources': ['interval.pyx'], + 'deps': [_khash_pxi_dep, _intervaltree_pxi_dep]}, + 'join': {'sources': ['join.pyx'], + 'deps': [_khash_pxi_dep]}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', @@ -83,7 +95,7 @@ libs_sources = { 'src/parser/io.c', 'src/parser/pd_parser.c']}, 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': _khash_primitive_helper_dep}, + 'deps': [_khash_pxi_dep]}, 'json': {'sources': ['src/vendored/ujson/python/ujson.c', 'src/vendored/ujson/python/objToJSON.c', 'src/vendored/ujson/python/JSONtoObj.c', @@ -95,7 +107,8 @@ libs_sources = { 'reshape': {'sources': ['reshape.pyx']}, 'sas': {'sources': ['sas.pyx']}, 'byteswap': {'sources': ['byteswap.pyx']}, - 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, + 'sparse': {'sources': ['sparse.pyx'], + 'deps': [_sparse_pxi_dep]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, 'writers': {'sources': ['writers.pyx']} From b6488afd3bdcca877683ea2efe4183a0d8ea84d8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 15 Mar 2024 19:16:46 +0100 Subject: [PATCH 134/152] Backport PR #57848 on branch 2.2.x (DOC: Remove duplicated Series.dt.normalize from docs) (#57854) Backport PR #57848: DOC: Remove duplicated Series.dt.normalize from docs Co-authored-by: Marc Garcia --- doc/source/reference/series.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index a4ea0ec396ceb..d40f6e559b8fa 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst 
@@ -342,7 +342,6 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit - Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ From 962e23398ea07a610af238dd8e6188479a46cbb9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:43:38 -1000 Subject: [PATCH 135/152] Backport PR #57843: DOC: Remove Dask and Modin sections in scale.rst in favor of linking to ecosystem docs. (#57861) Co-authored-by: Yuki Kitayama <47092819+yukikitayama@users.noreply.github.com> --- doc/source/user_guide/scale.rst | 164 ++------------------------------ environment.yml | 3 +- requirements-dev.txt | 3 +- 3 files changed, 9 insertions(+), 161 deletions(-) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index b262de5d71439..29df2994fbc35 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -156,7 +156,7 @@ fits in memory, you can work with datasets that are much larger than memory. Chunking works well when the operation you're performing requires zero or minimal coordination between chunks. For more complicated workflows, you're better off - :ref:`using another library `. + :ref:`using other libraries `. Suppose we have an even larger "logical dataset" on disk that's a directory of parquet files. Each file in the directory represents a different year of the entire dataset. @@ -219,160 +219,10 @@ different library that implements these out-of-core algorithms for you. .. _scale.other_libraries: -Use Dask --------- +Use Other Libraries +------------------- -pandas is just one library offering a DataFrame API. Because of its popularity, -pandas' API has become something of a standard that other libraries implement. -The pandas documentation maintains a list of libraries implementing a DataFrame API -in `the ecosystem page `_. - -For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a -pandas-like API for working with larger than memory datasets in parallel. Dask -can use multiple threads or processes on a single machine, or a cluster of -machines to process data in parallel. - - -We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. -We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. - -.. ipython:: python - :okwarning: - - import dask.dataframe as dd - - ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow") - ddf - -Inspecting the ``ddf`` object, we see a few things - -* There are familiar attributes like ``.columns`` and ``.dtypes`` -* There are familiar methods like ``.groupby``, ``.sum``, etc. -* There are new attributes like ``.npartitions`` and ``.divisions`` - -The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a -Dask DataFrame ends up making many pandas method calls, and Dask knows how to -coordinate everything to get the result. - -.. ipython:: python - - ddf.columns - ddf.dtypes - ddf.npartitions - -One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the -repr above, you'll notice that the values aren't actually printed out; just the -column names and dtypes. That's because Dask hasn't actually read the data yet. -Rather than executing immediately, doing operations build up a **task graph**. - -.. 
ipython:: python - :okwarning: - - ddf - ddf["name"] - ddf["name"].value_counts() - -Each of these calls is instant because the result isn't being computed yet. -We're just building up a list of computation to do when someone needs the -result. Dask knows that the return type of a :class:`pandas.Series.value_counts` -is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version -returns a Dask Series with the same dtype and the same name. - -To get the actual result you can call ``.compute()``. - -.. ipython:: python - :okwarning: - - %time ddf["name"].value_counts().compute() - -At that point, you get back the same thing you'd get with pandas, in this case -a concrete pandas :class:`pandas.Series` with the count of each ``name``. - -Calling ``.compute`` causes the full task graph to be executed. This includes -reading the data, selecting the columns, and doing the ``value_counts``. The -execution is done *in parallel* where possible, and Dask tries to keep the -overall memory footprint small. You can work with datasets that are much larger -than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory. - -By default, ``dask.dataframe`` operations use a threadpool to do operations in -parallel. We can also connect to a cluster to distribute the work on many -machines. In this case we'll connect to a local "cluster" made up of several -processes on this single machine. - -.. code-block:: python - - >>> from dask.distributed import Client, LocalCluster - - >>> cluster = LocalCluster() - >>> client = Client(cluster) - >>> client - - -Once this ``client`` is created, all of Dask's computation will take place on -the cluster (which is just processes in this case). - -Dask implements the most used parts of the pandas API. For example, we can do -a familiar groupby aggregation. - -.. ipython:: python - :okwarning: - - %time ddf.groupby("name")[["x", "y"]].mean().compute().head() - -The grouping and aggregation is done out-of-core and in parallel. - -When Dask knows the ``divisions`` of a dataset, certain optimizations are -possible. When reading parquet datasets written by dask, the divisions will be -known automatically. In this case, since we created the parquet files manually, -we need to supply the divisions manually. - -.. ipython:: python - :okwarning: - - N = 12 - starts = [f"20{i:>02d}-01-01" for i in range(N)] - ends = [f"20{i:>02d}-12-13" for i in range(N)] - - divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) - ddf.divisions = divisions - ddf - -Now we can do things like fast random access with ``.loc``. - -.. ipython:: python - :okwarning: - - ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() - -Dask knows to just look in the 3rd partition for selecting values in 2002. It -doesn't need to look at any other data. - -Many workflows involve a large amount of data and processing it in a way that -reduces the size to something that fits in memory. In this case, we'll resample -to daily frequency and take the mean. Once we've taken the mean, we know the -results will fit in memory, so we can safely call ``compute`` without running -out of memory. At that point it's just a regular pandas object. - -.. ipython:: python - :okwarning: - - @savefig dask_resample.png - ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() - -.. 
ipython:: python - :suppress: - - import shutil - - shutil.rmtree("data/timeseries") - -These Dask examples have all be done using multiple processes on a single -machine. Dask can be `deployed on a cluster -`_ to scale up to even larger -datasets. - -You see more dask examples at https://examples.dask.org. - -.. _Dask: https://dask.org -.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html +There are other libraries which provide APIs similar to pandas and work nicely with pandas DataFrames, +and which can give you the ability to scale large dataset processing and analytics +via parallel runtimes, distributed memory, clustering, etc. You can find more information +in `the ecosystem page `_. diff --git a/environment.yml b/environment.yml index 7db2767661d62..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -60,9 +60,8 @@ dependencies: - zstandard>=0.19.0 # downstream packages - - dask-core<=2024.2.1 + - dask-core - seaborn-base - - dask-expr<=0.5.3 # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 68e386edb0c31..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,9 +47,8 @@ xarray>=2022.12.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 -dask<=2024.2.1 +dask seaborn -dask-expr<=0.5.3 moto flask asv>=0.6.1 From cd6eeae3d3501ee316bbcd135ed7cbe969e2c856 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:43:02 +0100 Subject: [PATCH 136/152] Backport PR #57883 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.5 to 2.17.0) (#57888) Backport PR #57883: Bump pypa/cibuildwheel from 2.16.5 to 2.17.0 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f79b2c51b5f92..470c044d2e99e 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -141,7 +141,7 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -150,7 +150,7 @@ jobs: - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 71a6797dfa55b8d937abc226462b322e32feb4da Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:36:47 +0100 Subject: [PATCH 137/152] Backport PR #57892 on branch 2.2.x (CI: xfail Pyarrow slicing test) (#57898) Backport PR #57892: CI: xfail Pyarrow slicing test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/tests/indexes/object/test_indexing.py | 12 +++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 738442fab8c70..eb890c8b8c0ab 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,7 @@
pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, + pa_version_under16p0, ) if TYPE_CHECKING: @@ -186,6 +187,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", + "pa_version_under16p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..a2dfa69bbf236 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,7 @@ pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -23,3 +24,4 @@ pa_version_under14p0 = True pa_version_under14p1 = True pa_version_under15p0 = True + pa_version_under16p0 = True diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..443cacf94d239 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,6 +7,7 @@ NA, is_matching_na, ) +from pandas.compat import pa_version_under16p0 import pandas.util._test_decorators as td import pandas as pd @@ -200,7 +201,16 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): + def test_slice_locs_negative_step(self, in_slice, expected, dtype, request): + if ( + not pa_version_under16p0 + and dtype == "string[pyarrow_numpy]" + and in_slice == slice("a", "a", -1) + ): + request.applymarker( + pytest.mark.xfail(reason="https://github.com/apache/arrow/issues/40642") + ) + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) From cc5632159fb9f24586be29b80223e910c566a6e1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 23:01:56 +0100 Subject: [PATCH 138/152] Backport PR #57889 on branch 2.2.x (BUG: Handle Series construction with Dask, dict-like, Series) (#57899) Backport PR #57889: BUG: Handle Series construction with Dask, dict-like, Series Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 236085c2a62e1..c1782206d4b67 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -533,7 +533,7 @@ def __init__( data = data.reindex(index, copy=copy) copy = False data = data._mgr - elif is_dict_like(data): + elif isinstance(data, Mapping): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -605,7 +605,7 @@ def __init__( ) def _init_dict( - self, data, index: Index | None = None, dtype: DtypeObj | None = None + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): """ Derive the "_mgr" and "index" attributes of a new Series from a From 83497f53dee16f447f2df29af60ff5521c4f1177 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 19 Mar 2024 01:55:34 +0100 Subject: [PATCH 139/152] Backport PR #57905 on branch 2.2.x (Revert "Fix issue with Tempita recompilation (#57796)") (#57907) Backport PR #57905: Revert "Fix issue with Tempita recompilation (#57796)" Co-authored-by: William 
Ayd --- pandas/_libs/meson.build | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 7621915ebcfdb..c27386743c6e9 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -54,37 +54,25 @@ _intervaltree_helper = custom_target('intervaltree_helper_pxi', py, tempita, '@INPUT@', '-o', '@OUTDIR@' ] ) - -_algos_pxi_dep = declare_dependency(sources: [_algos_take_helper, _algos_common_helper]) -_khash_pxi_dep = declare_dependency(sources: _khash_primitive_helper) -_hashtable_pxi_dep = declare_dependency( - sources: [_hashtable_class_helper, _hashtable_func_helper] -) -_index_pxi_dep = declare_dependency(sources: _index_class_helper) -_intervaltree_pxi_dep = declare_dependency(sources: _intervaltree_helper) -_sparse_pxi_dep = declare_dependency(sources: _sparse_op_helper) - +_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx'], - 'deps': [_khash_pxi_dep, _algos_pxi_dep]}, + 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx'], - 'deps': [_khash_pxi_dep, _hashtable_pxi_dep]}, - 'index': {'sources': ['index.pyx'], - 'deps': [_khash_pxi_dep, _index_pxi_dep]}, + 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, + 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx'], - 'deps': [_khash_pxi_dep, _intervaltree_pxi_dep]}, - 'join': {'sources': ['join.pyx'], - 'deps': [_khash_pxi_dep]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, + 'join': {'sources': ['join.pyx', _khash_primitive_helper], + 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', @@ -95,7 +83,7 @@ libs_sources = { 'src/parser/io.c', 'src/parser/pd_parser.c']}, 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': [_khash_pxi_dep]}, + 'deps': _khash_primitive_helper_dep}, 'json': {'sources': ['src/vendored/ujson/python/ujson.c', 'src/vendored/ujson/python/objToJSON.c', 'src/vendored/ujson/python/JSONtoObj.c', @@ -107,8 +95,7 @@ libs_sources = { 'reshape': {'sources': ['reshape.pyx']}, 'sas': {'sources': ['sas.pyx']}, 'byteswap': {'sources': ['byteswap.pyx']}, - 'sparse': {'sources': ['sparse.pyx'], - 'deps': [_sparse_pxi_dep]}, + 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, 'writers': {'sources': ['writers.pyx']} From 2a6d800df89a4878fbf432ae71f93e3803d67672 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:29:17 +0100 Subject: [PATCH 140/152] Backport PR #57886 on branch 2.2.x (CI: Remove ASAN job) (#57910) 
Backport PR #57886: CI: Remove ASAN job Co-authored-by: William Ayd --- .github/actions/run-tests/action.yml | 9 +------- .github/workflows/unit-tests.yml | 14 ------------ ci/deps/actions-311-sanitizers.yaml | 32 ---------------------------- 3 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 ci/deps/actions-311-sanitizers.yaml diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index b4778b74df335..fd7c3587f2254 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -1,16 +1,9 @@ name: Run tests and report results -inputs: - preload: - description: Preload arguments for sanitizer - required: false - asan_options: - description: Arguments for Address Sanitizer (ASAN) - required: false runs: using: composite steps: - name: Test - run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh + run: ci/run_tests.sh shell: bash -el {0} - name: Publish test results diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8736674bbf965..bacc3d874a60d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -96,14 +96,6 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" - - name: "ASAN / UBSAN" - env_file: actions-311-sanitizers.yaml - pattern: "not slow and not network and not single_cpu and not skip_ubsan" - asan_options: "ASAN_OPTIONS=detect_leaks=0" - preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so) - meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined" - cflags_adds: -fno-sanitize-recover=all - pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: @@ -190,18 +182,12 @@ jobs: - name: Test (not single_cpu) uses: ./.github/actions/run-tests if: ${{ matrix.name != 'Pypy' }} - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: # Set pattern to not single_cpu if not already set PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - name: Test (single_cpu) uses: ./.github/actions/run-tests - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: PATTERN: 'single_cpu' PYTEST_WORKERS: 0 diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml deleted file mode 100644 index f5f04c90bffad..0000000000000 --- a/ci/deps/actions-311-sanitizers.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - hypothesis>=6.46.1 - - pyqt>=5.15.9 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # pandas dependencies - - pip - - - pip: - - "tzdata>=2022.7" From 78f7a02d1f36cdcdf5c3420afbf4965a85db7f01 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 21 Mar 2024 04:05:31 +0100 Subject: [PATCH 141/152] Backport PR #57029 on branch 2.2.x (DOC: Add `DataFrame.to_numpy` method) (#57940) Backport PR #57029: DOC: Add `DataFrame.to_numpy` method Co-authored-by: Zhengbo Wang <77875500+luke396@users.noreply.github.com> --- 
doc/source/reference/frame.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index fefb02dd916cd..1d9019ff22c23 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -49,6 +49,7 @@ Conversion DataFrame.infer_objects DataFrame.copy DataFrame.bool + DataFrame.to_numpy Indexing, iteration ~~~~~~~~~~~~~~~~~~~ From 7e8d492b26bb933884492ffcb5ce8fc501c281d5 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 21 Mar 2024 16:37:58 +0000 Subject: [PATCH 142/152] Backport PR #57764 on branch 2.2.x (BUG: PyArrow dtypes were not supported in the interchange protocol) (#57947) --- doc/source/whatsnew/v2.2.2.rst | 4 +- pandas/core/interchange/buffer.py | 58 +++++++ pandas/core/interchange/column.py | 66 ++++++-- pandas/core/interchange/dataframe.py | 5 + pandas/core/interchange/from_dataframe.py | 17 +- pandas/core/interchange/utils.py | 28 ++++ pandas/tests/interchange/test_impl.py | 186 +++++++++++++++++++--- 7 files changed, 326 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 96f210ce6b7b9..54084abab7817 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`) +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable one with missing values (:issue:`57664`) - .. --------------------------------------------------------------------------- @@ -21,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) +- :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` without missing values (:issue:`57761`) .. --------------------------------------------------------------------------- .. _whatsnew_222.other: diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 5c97fc17d7070..5d24325e67f62 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: import numpy as np + import pyarrow as pa class PandasBuffer(Buffer): @@ -76,3 +77,60 @@ def __repr__(self) -> str: ) + ")" ) + + +class PandasBufferPyarrow(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buffer: pa.Buffer, + *, + length: int, + ) -> None: + """ + Handle pyarrow chunked arrays. + """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides.
+ """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" + ) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7b39403ca1916..d59a3df694bb3 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -9,15 +12,18 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ( +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( ArrowDtype, - BaseMaskedDtype, DatetimeTZDtype, ) - -import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, @@ -30,6 +36,9 @@ dtype_to_arrow_c_fmt, ) +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + _NP_KINDS = { "i": DtypeKind.INT, "u": DtypeKind.UINT, @@ -157,6 +166,16 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: else: byteorder = dtype.byteorder + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues] + ArrowCTypes.BOOL, + byteorder, + ) + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property @@ -194,6 +213,12 @@ def describe_null(self): column_null_dtype = ColumnNullType.USE_BYTEMASK null_value = 1 return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -278,10 +303,11 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ + buffer: Buffer if self.dtype[0] in ( DtypeKind.INT, DtypeKind.UINT, @@ -291,6 +317,7 @@ def _get_data_buffer( ): # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters + dtype = self.dtype if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: @@ -298,11 +325,17 @@ def _get_data_buffer( if isinstance(self._col.dtype, BaseMaskedDtype): np_arr = arr._data # type: ignore[attr-defined] elif isinstance(self._col.dtype, ArrowDtype): - raise NotImplementedError("ArrowDtype not handled yet") + # We already rechunk (if necessary / allowed) upon initialization, + # so this is already single-chunk by the time we get here. 
+ arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype else: np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) - dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) @@ -330,13 +363,26 @@ def _get_data_buffer( return buffer, dtype - def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype if isinstance(self._col.dtype, BaseMaskedDtype): mask = self._col.array._mask # type: ignore[attr-defined] diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 1ffe0e8e8dbb0..1abacddfc7e3b 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -5,6 +5,7 @@ from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk if TYPE_CHECKING: from collections.abc import ( @@ -34,6 +35,10 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: """ self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index d45ae37890ba7..4162ebc33f0d6 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -295,13 +295,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert buffers["validity"], "Validity buffers cannot be empty for masks" - valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray( - valid_buff, valid_dtype, offset=col.offset, length=col.size() - ) - if sentinel_val == 0: - null_pos = ~null_pos + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos # Assemble the strings from the code units str_list: list[None | float | str] = [None] * col.size() @@ -486,6 +487,8 @@ def set_nulls( np.ndarray or pd.Series Data with the nulls being set. 
""" + if validity is None: + return data null_kind, sentinel_val = col.describe_null null_pos = None diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 2e73e560e5740..2a19dd5046aa3 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -16,6 +16,8 @@ DatetimeTZDtype, ) +import pandas as pd + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj @@ -145,3 +147,29 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is a + based by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`." + ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a1dedb6be456c..1ccada9116d4c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -301,6 +304,51 @@ def test_multi_chunk_pyarrow() -> None: pd.api.interchange.from_dataframe(table, allow_copy=False) +def test_multi_chunk_column() -> None: + pytest.importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() + with pytest.raises( + RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False" + ): + pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False)) + result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True)) + # Interchange protocol defaults to creating numpy-backed columns, so currently this + # is 'float64'. + expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64") + tm.assert_frame_equal(result, expected) + + # Check that the rechunking we did didn't modify the original DataFrame. 
+ tm.assert_frame_equal(df, df_orig) + assert len(df["a"].array._pa_array.chunks) == 2 + assert len(df_orig["a"].array._pa_array.chunks) == 2 + + +def test_timestamp_ns_pyarrow(): + # GH 56712 + pytest.importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, + "day": 1, + "hour": 1, + "minute": 1, + "second": 1, + } + df = pd.Series( + [datetime(**timestamp_args)], + dtype="timestamp[ns][pyarrow]", + name="col0", + ).to_frame() + + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi)["col0"].item() + + expected = pd.Timestamp(**timestamp_args) + assert result == expected + + @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_datetimetzdtype(tz, unit): @@ -403,42 +451,60 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -def test_nullable_integers() -> None: - # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype="Int8") - expected = pd.DataFrame({"a": [1]}, dtype="int8") - result = pd.api.interchange.from_dataframe(df.__dataframe__()) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") -def test_nullable_integers_pyarrow() -> None: - # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") - expected = pd.DataFrame({"a": [1]}, dtype="int8") - result = pd.api.interchange.from_dataframe(df.__dataframe__()) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( ("data", "dtype", "expected_dtype"), [ ([1, 2, None], "Int64", "int64"), + ([1, 2, None], "Int64[pyarrow]", "int64"), + ([1, 2, None], "Int8", "int8"), + ([1, 2, None], "Int8[pyarrow]", "int8"), ( [1, 2, None], "UInt64", "uint64", ), + ( + [1, 2, None], + "UInt64[pyarrow]", + "uint64", + ), ([1.0, 2.25, None], "Float32", "float32"), + ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean[pyarrow]", "bool"), + (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + None, + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), ], ) -def test_pandas_nullable_w_missing_values( +def test_pandas_nullable_with_missing_values( data: list, dtype: str, expected_dtype: str ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 - pytest.importorskip("pyarrow", "11.0.0") + # https://github.com/pandas-dev/pandas/issues/57664 + pa = pytest.importorskip("pyarrow", "11.0.0") import pyarrow.interchange as pai + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + df = pd.DataFrame({"a": data}, dtype=dtype) result = pai.from_dataframe(df.__dataframe__())["a"] assert result.type == expected_dtype @@ -447,6 +513,86 @@ def test_pandas_nullable_w_missing_values( assert result[2].as_py() is None +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, 3], "Int64", "int64"), + ([1, 2, 3], "Int64[pyarrow]", "int64"), + ([1, 2, 3], "Int8", "int8"), + ([1, 2, 3], 
"Int8[pyarrow]", "int8"), + ( + [1, 2, 3], + "UInt64", + "uint64", + ), + ( + [1, 2, 3], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, 5.0], "Float32", "float32"), + ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean[pyarrow]", "bool"), + (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + datetime(2020, 1, 3, tzinfo=timezone.utc), + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_without_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() == data[2] + + +def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None + + +def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None + result = validity[1] + expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=") + assert result == expected + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From 40e621fc9f77d01e80516a8ca99294386a438958 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Mar 2024 18:51:23 +0100 Subject: [PATCH 143/152] Backport PR #57548 on branch 2.2.x (Fix accidental loss-of-precision for to_datetime(str, unit=...)) (#58034) Backport PR #57548: Fix accidental loss-of-precision for to_datetime(str, unit=...) 
Co-authored-by: Elliott Sales de Andrade --- doc/source/whatsnew/v2.2.2.rst | 2 +- pandas/_libs/tslib.pyx | 2 +- pandas/tests/tools/test_to_datetime.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 54084abab7817..19539918b8c8f 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -15,7 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`) - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable one with missing values (:issue:`57664`) -- +- Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) .. --------------------------------------------------------------------------- .. _whatsnew_222.bug_fixes: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 017fdc4bc834f..dd23c2f27ca09 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -277,7 +277,7 @@ def array_with_unit_to_datetime( bint is_raise = errors == "raise" ndarray[int64_t] iresult tzinfo tz = None - float fval + double fval assert is_ignore or is_coerce or is_raise diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6791ac0340640..a1ed996dade8e 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1912,6 +1912,14 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + def test_unit_str(self, cache): + # GH 57051 + # Test that strs aren't dropping precision to 32-bit accidentally.
+ with tm.assert_produces_warning(FutureWarning): + res = to_datetime(["1704660000"], unit="s", origin="unix") + expected = to_datetime([1704660000], unit="s", origin="unix") + tm.assert_index_equal(res, expected) + def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] result = to_datetime(values, unit="D", errors="ignore", cache=cache) From e1a7302c8c0b5d33c5bc57aa81ee371782ad9289 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Mar 2024 20:02:51 +0100 Subject: [PATCH 144/152] Backport PR #57758 on branch 2.2.x (BUG: DataFrame Interchange Protocol errors on Boolean columns) (#58036) Backport PR #57758: BUG: DataFrame Interchange Protocol errors on Boolean columns Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/core/interchange/utils.py | 3 +++ pandas/tests/interchange/test_impl.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 19539918b8c8f..d0f8951ac07ad 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`) - :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` without missing values (:issue:`57761`) diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 2a19dd5046aa3..fd1c7c9639242 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -144,6 +144,9 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented."
) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 1ccada9116d4c..25418b8bb2b37 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -470,6 +470,7 @@ def test_non_str_names_w_duplicates(): ), ([1.0, 2.25, None], "Float32", "float32"), ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), @@ -532,6 +533,7 @@ def test_pandas_nullable_with_missing_values( ), ([1.0, 2.25, 5.0], "Float32", "float32"), ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), From f4554014bd15149e28fc5b77a3e30a88987ff8a3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Mar 2024 23:26:08 +0100 Subject: [PATCH 145/152] Backport PR #57974 on branch 2.2.x (BUG: Fixed ADBC to_sql creation of table when using public schema) (#58050) Backport PR #57974: BUG: Fixed ADBC to_sql creation of table when using public schema Co-authored-by: Shabab Karim --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/io/sql.py | 4 +++- pandas/tests/io/test_sql.py | 24 ++++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index d0f8951ac07ad..9e1a883d47cf8 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -25,6 +25,7 @@ Bug fixes - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`) - :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` without missing values (:issue:`57761`) +- :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`) .. --------------------------------------------------------------------------- ..
_whatsnew_222.other: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 195a7c5040853..3e17175167f25 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2400,7 +2400,9 @@ def to_sql( raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) self.con.commit() return total_inserted diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 791b6da3deeca..4f1f965f26aa9 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1373,6 +1373,30 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_on_public_schema(conn, request): + if "sqlite" in conn or "mysql" in conn: + request.applymarker( + pytest.mark.xfail( + reason="test for public schema only specific to postgresql" + ) + ) + + conn = request.getfixturevalue(conn) + + test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc")) + test_data.to_sql( + name="test_public_schema", + con=conn, + if_exists="append", + index=False, + schema="public", + ) + + df_out = sql.read_sql_table("test_public_schema", conn, schema="public") + tm.assert_frame_equal(test_data, df_out) + + @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring From 810b2d032f1ba09da244ef913a49283c20d5c510 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 1 Apr 2024 11:22:03 -0700 Subject: [PATCH 146/152] Backport PR #57553 on branch 2.2.x (API: avoid passing Manager to subclass init) (#58008) * Backport PR #57553: API: avoid passing Manager to subclass __init__ * whatsnew, type ignores * merge 2.2.2 file from main * rebase on 2.2.x whatsnew --- pandas/core/frame.py | 45 +++++++++++++++++++---------- pandas/core/generic.py | 1 + pandas/core/resample.py | 3 +- pandas/core/series.py | 34 ++++++++++++---------- pandas/tests/frame/test_subclass.py | 11 +++++++ 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5c510d98596df..afcd4d014316e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -656,26 +656,37 @@ class DataFrame(NDFrame, OpsMixin): def _constructor(self) -> Callable[..., DataFrame]: return DataFrame - def _constructor_from_mgr(self, mgr, axes): - if self._constructor is DataFrame: - # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return DataFrame._from_mgr(mgr, axes=axes) - else: - assert axes is mgr.axes + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + df = DataFrame._from_mgr(mgr, axes=axes) + + if type(self) is DataFrame: + # This would also work `if self._constructor is DataFrame`, but + # this check is slightly faster, benefiting the most-common case. + return df + + elif type(self).__name__ == "GeoDataFrame": + # Shim until geopandas can override their _constructor_from_mgr + # bc they have different behavior for Managers than for DataFrames return self._constructor(mgr) + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. 
+ return self._constructor(df) + _constructor_sliced: Callable[..., Series] = Series - def _sliced_from_mgr(self, mgr, axes) -> Series: - return Series._from_mgr(mgr, axes) + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + ser = Series._from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name - def _constructor_sliced_from_mgr(self, mgr, axes): - if self._constructor_sliced is Series: - ser = self._sliced_from_mgr(mgr, axes) - ser._name = None # caller is responsible for setting real name + if type(self) is DataFrame: + # This would also work `if self._constructor_sliced is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - assert axes is mgr.axes - return self._constructor_sliced(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor_sliced(ser) # ---------------------------------------------------------------------- # Constructors @@ -1403,7 +1414,8 @@ def _get_values_for_csv( na_rep=na_rep, quoting=quoting, ) - return self._constructor_from_mgr(mgr, axes=mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] # ---------------------------------------------------------------------- @@ -5077,7 +5089,8 @@ def predicate(arr: ArrayLike) -> bool: return True mgr = self._mgr._get_data_subset(predicate).copy(deep=None) - return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] def insert( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2a86f75badecd..796357355fef4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -336,6 +336,7 @@ def _as_manager(self, typ: str, copy: bool_t = True) -> Self: # fastpath of passing a manager doesn't check the option/manager class return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + @final @classmethod def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2d430ef4dcff6..0dd808a0ab296 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2548,7 +2548,8 @@ def _take_new_index( if axis == 1: raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) - return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "NDFrameT") + return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") diff --git a/pandas/core/series.py b/pandas/core/series.py index c1782206d4b67..6fd019656d207 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -662,14 +662,17 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - if self._constructor is Series: - # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = Series._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is 
Series: + # This would also work `if self._constructor is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - else: - assert axes is mgr.axes - return self._constructor(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor(ser) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -681,18 +684,19 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]: return DataFrame - def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: + def _constructor_expanddim_from_mgr(self, mgr, axes): from pandas.core.frame import DataFrame - return DataFrame._from_mgr(mgr, axes=mgr.axes) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) - def _constructor_expanddim_from_mgr(self, mgr, axes): - from pandas.core.frame import DataFrame + if type(self) is Series: + # This would also work `if self._constructor_expanddim is DataFrame`, + # but this check is slightly faster, benefiting the most-common case. + return df - if self._constructor_expanddim is DataFrame: - return self._expanddim_from_mgr(mgr, axes) - assert axes is mgr.axes - return self._constructor_expanddim(mgr) + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor_expanddim(df) # types @property diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index ef78ae62cb4d6..855b58229cbdb 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -26,6 +26,17 @@ def _constructor(self): class TestDataFrameSubclassing: + def test_no_warning_on_mgr(self): + # GH#57032 + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + with tm.assert_produces_warning(None): + # df.isna() goes through _constructor_from_mgr, which we want to + # *not* pass a Manager to __init__ + df.isna() + df["X"].isna() + def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 From 822d285a5e901bff7aea6406ee46131753dbd6ee Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 1 Apr 2024 23:23:48 +0200 Subject: [PATCH 147/152] Backport PR #58075 on branch 2.2.x (DOC: whatsnew note for #57553) (#58080) Backport PR #58075: DOC: whatsnew note for #57553 Co-authored-by: jbrockmendel --- doc/source/whatsnew/v2.2.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 9e1a883d47cf8..0dac3660c76b2 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`) - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable one with missing values (:issue:`57664`) +- Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`) - Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) ..
--------------------------------------------------------------------------- From e9b81ee3ecca2720f59e21091528e1e1e7eafe9a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 2 Apr 2024 22:57:16 -0400 Subject: [PATCH 148/152] Backport PR #58126: BLD: Build wheels with numpy 2.0rc1 (#58127) --- pyproject.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c225ed80dcb10..b2764b137a1f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,12 +6,9 @@ requires = [ "meson==1.2.1", "wheel", "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json - # Any NumPy version should be fine for compiling. Users are unlikely - # to get a NumPy<1.25 so the result will be compatible with all relevant - # NumPy versions (if not it is presumably compatible with their version). - # Pin <2.0 for releases until tested against an RC. But explicitly allow - # testing the `.dev0` nightlies (which require the extra index). - "numpy>1.22.4,<=2.0.0.dev0", + # Force numpy higher than 2.0rc1, so that built wheels are compatible + # with both numpy 1 and 2 + "numpy>=2.0.0rc1", "versioneer[toml]" ] From 0f83d50c477f8aafb61a3db1d310d0b5fd261adc Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:28:27 -0400 Subject: [PATCH 149/152] Revert "BLD: Pin numpy on 2.2.x" (#58093) Revert "BLD: Pin numpy on 2.2.x (#56812)" This reverts commit 24ea67fcf0cf982d011d249f2a711ef178e13065. --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b2764b137a1f8..778146bbcd909 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4,<2; python_version<'3.11'", - "numpy>=1.23.2,<2; python_version=='3.11'", - "numpy>=1.26.0,<2; python_version>='3.12'", + "numpy>=1.22.4; python_version<'3.11'", + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.7" From b56842d93dc76dc2a83b7ab640af6a419697decb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:29:53 +0200 Subject: [PATCH 150/152] Backport PR #58100 on branch 2.2.x (MNT: fix compatibility with beautifulsoup4 4.13.0b2) (#58137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #58100: MNT: fix compatibility with beautifulsoup4 4.13.0b2 Co-authored-by: Clément Robert --- pandas/io/html.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 26e71c9546ffd..4eeeb1b655f8a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -591,14 +591,8 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): :class:`pandas.io.html._HtmlFrameParser`. 
""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") From a947587c8ad419e9cf8f6cce1fd5ad80c32d759f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Apr 2024 00:02:48 +0200 Subject: [PATCH 151/152] Backport PR #58138 on branch 2.2.x (BLD: Fix nightlies not building) (#58140) Backport PR #58138: BLD: Fix nightlies not building Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/wheels.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 470c044d2e99e..b9bfc766fb45c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,8 +139,7 @@ jobs: shell: bash -el {0} run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - - name: Build normal wheels - if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} + - name: Build wheels uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} From f2eb32d756920e0a4de08a2fd9b8abacf21731bb Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Mon, 8 Apr 2024 22:00:48 +0200 Subject: [PATCH 152/152] Backport PR #58181: CI: correct error msg in `test_view_index` --- pandas/tests/indexes/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1fa48f98942c2..b7204d7af1cbb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -358,7 +358,10 @@ def test_view_with_args_object_array_raises(self, index): with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references|" + "Cannot change data-type for object array|" + ) with pytest.raises(TypeError, match=msg): index.view("i8")