diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 060de8ff8ef09..87268780b0def 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -41,7 +41,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) -- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`) +- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`) - Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de226fcd19084..4151ba927adf0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2370,7 +2370,7 @@ def maybe_convert_numeric( # This occurs since we disabled float nulls showing as null in anticipation # of seeing ints that were never seen. So then, we return float - if allow_null_in_int and seen.null_ and not seen.int_: + if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_: seen.float_ = True if seen.complex_: @@ -2390,6 +2390,8 @@ def maybe_convert_numeric( else: return (ints, None) elif seen.bool_: + if allow_null_in_int: + return (bools.view(np.bool_), mask.view(np.bool_)) return (bools.view(np.bool_), None) elif seen.uint_: return (uints, None) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 994887f487473..5698c1a5af0e9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -271,6 +271,13 @@ .. versionadded:: 1.2.0 +use_nullable_dtypes : bool, default False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. Dtype takes precedence if given. + + .. versionadded:: 2.0 + Returns ------- DataFrame or dict of DataFrames @@ -375,6 +382,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -413,6 +421,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> dict[IntStrT, DataFrame]: ... @@ -451,6 +460,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False @@ -487,6 +497,7 @@ def read_excel( decimal=decimal, comment=comment, skipfooter=skipfooter, + use_nullable_dtypes=use_nullable_dtypes, ) finally: # make sure to close opened file handles @@ -690,6 +701,7 @@ def parse( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, + use_nullable_dtypes: bool = False, **kwds, ): @@ -848,6 +860,7 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, + use_nullable_dtypes=use_nullable_dtypes, **kwds, ) @@ -1680,6 +1693,7 @@ def parse( thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, + use_nullable_dtypes: bool = False, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1711,6 +1725,7 @@ def parse( thousands=thousands, comment=comment, skipfooter=skipfooter, + use_nullable_dtypes=use_nullable_dtypes, **kwds, ) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 41016f8f40b9f..44773f13276c0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -774,7 +774,10 @@ def _infer_types( bool_mask = np.zeros(result.shape, dtype=np.bool_) result = BooleanArray(result, bool_mask) elif result.dtype == np.object_ and use_nullable_dtypes: - result = StringDtype().construct_array_type()._from_sequence(values) + # read_excel sends array of datetime objects + inferred_type, _ = lib.infer_datetimelike_array(result) + if inferred_type != "datetime": + result = StringDtype().construct_array_type()._from_sequence(values) return result, na_count diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 16fbf54bbe394..6a9d001e7a596 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -21,6 +21,10 @@ Series, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -532,6 +536,84 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + def test_use_nullable_dtypes(self, read_ext): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + df = DataFrame( + { + "a": Series([1, 3], dtype="Int64"), + "b": Series([2.5, 4.5], dtype="Float64"), + "c": Series([True, False], dtype="boolean"), + "d": Series(["a", "b"], dtype="string"), + "e": Series([pd.NA, 6], dtype="Int64"), + "f": Series([pd.NA, 7.5], dtype="Float64"), + "g": Series([pd.NA, True], dtype="boolean"), + "h": Series([pd.NA, "a"], dtype="string"), + "i": Series([pd.Timestamp("2019-12-31")] * 2), + "j": Series([pd.NA, pd.NA], dtype="Int64"), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + tm.assert_frame_equal(result, df) + + def test_use_nullabla_dtypes_and_dtype(self, read_ext): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True, dtype="float64" + ) + tm.assert_frame_equal(result, df) + + @td.skip_if_no("pyarrow") + @pytest.mark.parametrize("storage", ["pyarrow", "python"]) + def test_use_nullabla_dtypes_string(self, read_ext, storage): + # GH#36712 + if read_ext == ".xlsb": + pytest.skip("No engine for filetype: 'xlsb'") + + import pyarrow as pa + + with pd.option_context("mode.string_storage", storage): + + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, "test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + + if storage == "python": + expected = DataFrame( + { + "a": StringArray(np.array(["a", "b"], dtype=np.object_)), + "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), + } + ) + else: + expected = DataFrame( + { + "a": ArrowStringArray(pa.array(["a", "b"])), + "b": ArrowStringArray(pa.array(["x", None])), + } + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211