From 62d2e153265ee2b7abb845be925070fa77c69c48 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Nov 2020 09:44:41 +0100 Subject: [PATCH 1/5] BUG: preserve nullable dtype for float result in IntegerArray arithmetic ops --- pandas/core/arrays/integer.py | 5 +- .../tests/arrays/integer/test_arithmetic.py | 46 +++++++++++++------ 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2897c18acfb09..2738c58e06fd9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -636,8 +636,9 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - result[mask] = np.nan - return result + from pandas.core.arrays import FloatingArray + + return FloatingArray(result, mask, copy=False) if result.dtype == "timedelta64[ns]": from pandas.core.arrays import TimedeltaArray diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index cf382dd5e37e0..91ab2a8af6fba 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -5,7 +5,7 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays import integer_array +from pandas.core.arrays import FloatingArray, integer_array import pandas.core.ops as ops # Basic test for the arithmetic array ops @@ -43,13 +43,12 @@ def test_sub(dtype): def test_div(dtype): - # for now division gives a float numpy array a = pd.array([1, 2, 3, None, 5], dtype=dtype) b = pd.array([0, 1, None, 3, 4], dtype=dtype) result = a / b - expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64") - tm.assert_numpy_array_equal(result, expected) + expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64") + tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) @@ -57,10 +56,13 @@ def test_divide_by_zero(zero, negative): # https://github.com/pandas-dev/pandas/issues/27398 a = pd.array([0, 1, -1, None], dtype="Int64") result = a / zero - expected = np.array([np.nan, np.inf, -np.inf, np.nan]) + expected = FloatingArray( + np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), + np.array([False, False, False, True]), + ) if negative: expected *= -1 - tm.assert_numpy_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_floordiv(dtype): @@ -97,8 +99,11 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = a ** np.nan - expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), + np.array([False, False, False, True, False]), + ) + tm.assert_extension_array_equal(result, expected) # reversed a = a[1:] # Can't raise integers to negative powers. @@ -116,8 +121,11 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = np.nan ** a - expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), + np.array([False, False, True, False]), + ) + tm.assert_extension_array_equal(result, expected) def test_pow_array(): @@ -131,10 +139,10 @@ def test_pow_array(): def test_rpow_one_to_na(): # https://github.com/pandas-dev/pandas/issues/22022 # https://github.com/pandas-dev/pandas/issues/29997 - arr = integer_array([np.nan, np.nan]) + arr = pd.array([np.nan, np.nan], dtype="Int64") result = np.array([1.0, 2.0]) ** arr - expected = np.array([1.0, np.nan]) - tm.assert_numpy_array_equal(result, expected) + expected = pd.array([1.0, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("other", [0, 0.5]) @@ -196,9 +204,17 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): result = op(s, other) expected = op(s.astype(float), other) + expected = expected.astype("Float64") # rfloordiv results in nan instead of inf if all_arithmetic_operators == "__rfloordiv__": - expected[(expected == np.inf) | (expected == -np.inf)] = np.nan + mask = ( + ((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool) + ) + expected.array._data[mask] = np.nan + # rmod results in NaN that wasn't NA in original nullable Series -> unmask it + elif all_arithmetic_operators == "__rmod__": + mask = (s == 0).fillna(False).to_numpy(bool) + expected.array._mask[mask] = False tm.assert_series_equal(result, expected) @@ -211,7 +227,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other): s = pd.Series([1, 2, 3], dtype="Int64") result = op(s, other) - assert result.dtype is np.dtype("float") + assert result.dtype == "Float64" def test_cross_type_arithmetic(): From d8a0c03a9d84ceda2863507bc1f0167cd1e6985a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Nov 2020 10:03:11 +0100 Subject: [PATCH 2/5] same change for boolean --- pandas/core/arrays/boolean.py | 7 ++++--- pandas/tests/arrays/boolean/test_arithmetic.py | 13 ++++++++----- pandas/tests/arrays/masked/test_arithmetic.py | 6 +----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c6c7396a980b0..44cc108ed9cfd 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -706,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - result[mask] = np.nan - return result + from pandas.core.arrays import FloatingArray + + return FloatingArray(result, mask, copy=False) - if is_bool_dtype(result): + elif is_bool_dtype(result): return BooleanArray(result, mask, copy=False) elif is_integer_dtype(result): diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 1a4ab9799e8e5..01de64568a011 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.arrays import FloatingArray @pytest.fixture @@ -51,13 +52,15 @@ def test_sub(left_array, right_array): def test_div(left_array, right_array): - # for now division gives a float numpy array result = left_array / right_array - expected = np.array( - [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan], - dtype="float64", + expected = FloatingArray( + np.array( + [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan], + dtype="float64", + ), + np.array([False, False, True, False, False, True, True, True, True]), ) - tm.assert_numpy_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 6de10fd896878..1d2833c5da276 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -43,11 +43,7 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators): for scalar in [scalar, data.dtype.type(scalar)]: result = op(data, scalar) expected = op(data, scalar_array) - if isinstance(expected, ExtensionArray): - tm.assert_extension_array_equal(result, expected) - else: - # TODO div still gives float ndarray -> remove this once we have Float EA - tm.assert_numpy_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_array_NA(data, all_arithmetic_operators): From e9ecfa76ef9f7207a5fc681f14e29dbec00e792b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Nov 2020 11:50:23 +0100 Subject: [PATCH 3/5] update base extension tests --- pandas/tests/arrays/boolean/test_function.py | 7 +++++++ pandas/tests/arrays/floating/test_function.py | 7 +++++++ pandas/tests/arrays/integer/test_function.py | 8 ++++++++ pandas/tests/arrays/string_/test_string.py | 12 ++++++++++++ pandas/tests/extension/test_boolean.py | 6 +++++- pandas/tests/extension/test_floating.py | 4 ++++ pandas/tests/extension/test_integer.py | 9 +++++---- pandas/tests/extension/test_string.py | 4 ++++ 8 files changed, 52 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 7665c350e3443..0f8743489b412 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -85,6 +85,13 @@ def test_value_counts_na(): tm.assert_series_equal(result, expected) +def test_value_counts_with_normalize(): + s = pd.Series([True, False, pd.NA], dtype="boolean") + result = s.value_counts(normalize=True) + expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2 + tm.assert_series_equal(result, expected) + + def test_diff(): a = pd.array( [True, True, False, False, True, None, True, None, False], dtype="boolean" diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index baf60a363ad29..ef95eac316397 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -113,6 +113,13 @@ def test_value_counts_empty(): tm.assert_series_equal(result, expected) +def test_value_counts_with_normalize(): + s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = s.value_counts(normalize=True) + expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3 + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_floating_array_sum(skipna, min_count, dtype): diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 9cdea1c71f109..521547cc7357d 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -127,6 +127,14 @@ def test_value_counts_empty(): tm.assert_series_equal(result, expected) +def test_value_counts_with_normalize(): + # GH 33172 + s = pd.Series([1, 2, 1, pd.NA], dtype="Int64") + result = s.value_counts(normalize=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3 + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e35a632734779..c70d55b07661d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -495,6 +495,18 @@ def test_value_counts_na(dtype, request): tm.assert_series_equal(result, expected) +def test_value_counts_with_normalize(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: boolean value of NA is ambiguous" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) + result = s.value_counts(normalize=True) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "values, expected", [ diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 8acbeaf0b8170..ced7ea9261310 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -130,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): elif op_name in ("__truediv__", "__rtruediv__"): # combine with bools does not generate the correct result # (numpy behaviour for div is to regard the bools as numeric) - expected = s.astype(float).combine(other, op) + expected = s.astype(float).combine(other, op).astype("Float64") if op_name == "__rpow__": # for rpow, combine does not propagate NaN expected[result.isna()] = np.nan @@ -235,6 +235,10 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts_with_normalize(self, data): + pass + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): # override because there are only 2 unique values diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 00881178de1b4..c08c31e90fecc 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -184,6 +184,10 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts_with_normalize(self, data): + pass + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 725533765ca2c..b1461dcbd9e53 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -130,10 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype(float) - if op_name == "__rtruediv__": - # TODO reverse operators result in object dtype - result = result.astype(float) + expected = expected.fillna(np.nan).astype("Float64") elif op_name.startswith("__r"): # TODO reverse operators result in object dtype # see https://github.com/pandas-dev/pandas/issues/22024 @@ -224,6 +221,10 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts_with_normalize(self, data): + pass + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index db1940226e04e..d49c4c5cf4889 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -118,6 +118,10 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) + @pytest.mark.skip(reason="returns nullable") + def test_value_counts_with_normalize(self, data): + pass + class TestCasting(base.BaseCastingTests): pass From ac687e5ecdec103e23aff3d987d2089aa8ba54b5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Nov 2020 14:06:29 +0100 Subject: [PATCH 4/5] fix series arithmetic tests --- pandas/core/arrays/integer.py | 4 +++- pandas/tests/series/test_arithmetic.py | 10 ++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2738c58e06fd9..66d92238a9b08 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -539,13 +539,15 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) def _arith_method(self, other, op): + from pandas.core.arrays import FloatingArray + op_name = op.__name__ omask = None if getattr(other, "ndim", 0) > 1: raise NotImplementedError("can only perform ops with 1-d structures") - if isinstance(other, IntegerArray): + if isinstance(other, (IntegerArray, FloatingArray)): other, omask = other._data, other._mask elif is_list_like(other): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index c5196cea5d3bb..09109447e799a 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -838,14 +838,8 @@ class TestInplaceOperations: ( ("Int64", "Int64", "Int64", "Int64"), ("float", "float", "float", "float"), - ("Int64", "float", "float", "float"), - pytest.param( - "Int64", - "Float64", - "Float64", - "Float64", - marks=pytest.mark.xfail(reason="Not implemented yet"), - ), + ("Int64", "float", "Float64", "Float64"), + ("Int64", "Float64", "Float64", "Float64"), ), ) def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul): From 2649eb9029933da29fbc0d1cbbeafa78f745445d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Dec 2020 08:27:02 +0100 Subject: [PATCH 5/5] add sentence to whatsnew section of float dtype --- doc/source/whatsnew/v1.2.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9168041a4f474..84eb3b3f15780 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -196,6 +196,9 @@ Alternatively, you can also use the dtype object: pd.Series([1.5, None], dtype=pd.Float32Dtype()) +Operations with the existing integer or boolean nullable data types that +give float results will now also use the nullable floating data types (:issue:`38178`). + .. warning:: Experimental: the new floating data types are currently experimental, and their