From bf05e4c9ffe27459fffcc7471dce2e7b51e89b81 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 03:41:01 +0100 Subject: [PATCH 01/27] TST: add test coverage for maybe_promote --- pandas/conftest.py | 65 +- pandas/tests/dtypes/cast/test_promote.py | 959 +++++++++++++++++++++++ 2 files changed, 1017 insertions(+), 7 deletions(-) create mode 100644 pandas/tests/dtypes/cast/test_promote.py diff --git a/pandas/conftest.py b/pandas/conftest.py index 35a6b5df35ddc..debc9734730f3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -367,10 +367,15 @@ def unique_nulls_fixture(request): TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific', 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300), FixedOffset(0), FixedOffset(-300)] +TIMEZONE_IDS = ['None', 'UTC', 'US/Eastern', 'Asia/Tokyo', + 'dateutil/US/Pacific', 'dateutil/Asia/Singapore', + 'dateutil.tz.tzutc()', 'dateutil.tz.tzlocal()', + 'pytz.FixedOffset(300)', 'pytz.FixedOffset(0)', + 'pytz.FixedOffset(-300)'] -@td.parametrize_fixture_doc(str(TIMEZONES)) -@pytest.fixture(params=TIMEZONES) +@td.parametrize_fixture_doc(str(TIMEZONE_IDS)) +@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS) def tz_naive_fixture(request): """ Fixture for trying timezones including default (None): {0} @@ -378,8 +383,8 @@ def tz_naive_fixture(request): return request.param -@td.parametrize_fixture_doc(str(TIMEZONES[1:])) -@pytest.fixture(params=TIMEZONES[1:]) +@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) +@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) def tz_aware_fixture(request): """ Fixture for trying explicit timezones: {0} @@ -387,8 +392,14 @@ def tz_aware_fixture(request): return request.param +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + # ---------------------------------------------------------------- # Dtypes + + UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_EA_INT_DTYPES = ["UInt8", 
"UInt16", "UInt32", "UInt64"] SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] @@ -400,8 +411,8 @@ def tz_aware_fixture(request): COMPLEX_DTYPES = [complex, "complex64", "complex128"] STRING_DTYPES = [str, 'str', 'U'] -DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]'] -TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]'] +DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]'] +TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]'] BOOL_DTYPES = [bool, 'bool'] BYTES_DTYPES = [bytes, 'bytes'] @@ -409,7 +420,7 @@ def tz_aware_fixture(request): ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES - + DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES + + DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES + OBJECT_DTYPES + BYTES_DTYPES * PY3) # bytes only for PY3 @@ -424,6 +435,46 @@ def string_dtype(request): return request.param +@pytest.fixture(params=BYTES_DTYPES) +def bytes_dtype(request): + """Parametrized fixture for bytes dtypes. + + * bytes + * 'bytes' + """ + return request.param + + +@pytest.fixture(params=OBJECT_DTYPES) +def object_dtype(request): + """Parametrized fixture for object dtypes. + + * object + * 'object' + """ + return request.param + + +@pytest.fixture(params=DATETIME64_DTYPES) +def datetime64_dtype(request): + """Parametrized fixture for datetime/timedelta dtypes. + + * 'datetime64[ns]' + * 'M8[ns]' + """ + return request.param + + +@pytest.fixture(params=TIMEDELTA64_DTYPES) +def timedelta64_dtype(request): + """Parametrized fixture for datetime/timedelta dtypes. 
+ + * 'timedelta64[ns]' + * 'm8[ns]' + """ + return request.param + + @pytest.fixture(params=FLOAT_DTYPES) def float_dtype(request): """ diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py new file mode 100644 index 0000000000000..7cf553865942f --- /dev/null +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -0,0 +1,959 @@ +# -*- coding: utf-8 -*- + +""" +These test the method maybe_promote from core/dtypes/cast.py +""" + +import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import NaT, iNaT +from pandas.compat import PY2, is_platform_windows + +from pandas.core.dtypes.cast import ( + maybe_promote, maybe_promote_with_array, maybe_promote_with_scalar) +from pandas.core.dtypes.common import ( + is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, + is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd + + +def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar=None, exp_val_for_array=None): + assert is_scalar(fill_value) + + if boxed: + fill_array = np.array([fill_value], dtype=box_dtype) + result_dtype, result_fill_value = maybe_promote(dtype, fill_array) + expected_fill_value = exp_val_for_array + else: + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) + expected_fill_value = exp_val_for_scalar + + # try/except as numpy dtypes (i.e. if result_dtype is np.object_) do not + # know some expected dtypes like DatetimeTZDtype, and hence raise TypeError + try: + assert result_dtype == expected_dtype + except TypeError: + assert expected_dtype == result_dtype + + # for equal values, also check type (relevant e.g. for int vs float, resp. 
+ # for different datetimes and timedeltas) + # for missing values, None == None and iNaT == iNaT, but np.nan != np.nan + assert ((result_fill_value == expected_fill_value + and type(result_fill_value) == type(expected_fill_value)) + or (result_fill_value is np.nan and expected_fill_value is np.nan) + or (result_fill_value is NaT and expected_fill_value is NaT)) + + +@pytest.mark.parametrize('dtype, fill_value, expected_dtype', [ + # size 8 + ('int8', 1, 'int8'), + ('int8', np.iinfo('int8').max + 1, 'int16'), + ('int8', np.iinfo('int16').max + 1, 'int32'), + ('int8', np.iinfo('int32').max + 1, 'int64'), + ('int8', np.iinfo('int64').max + 1, 'object'), + ('int8', -1, 'int8'), + ('int8', np.iinfo('int8').min - 1, 'int16'), + ('int8', np.iinfo('int16').min - 1, 'int32'), + ('int8', np.iinfo('int32').min - 1, 'int64'), + ('int8', np.iinfo('int64').min - 1, 'object'), + # keep signed-ness as long as possible + ('uint8', 1, 'uint8'), + ('uint8', np.iinfo('int8').max + 1, 'uint8'), + ('uint8', np.iinfo('uint8').max + 1, 'uint16'), + ('uint8', np.iinfo('int16').max + 1, 'uint16'), + ('uint8', np.iinfo('uint16').max + 1, 'uint32'), + ('uint8', np.iinfo('int32').max + 1, 'uint32'), + ('uint8', np.iinfo('uint32').max + 1, 'uint64'), + ('uint8', np.iinfo('int64').max + 1, 'uint64'), + ('uint8', np.iinfo('uint64').max + 1, 'object'), + # max of uint8 cannot be contained in int8 + ('uint8', -1, 'int16'), + ('uint8', np.iinfo('int8').min - 1, 'int16'), + ('uint8', np.iinfo('int16').min - 1, 'int32'), + ('uint8', np.iinfo('int32').min - 1, 'int64'), + ('uint8', np.iinfo('int64').min - 1, 'object'), + # size 16 + ('int16', 1, 'int16'), + ('int16', np.iinfo('int8').max + 1, 'int16'), + ('int16', np.iinfo('int16').max + 1, 'int32'), + ('int16', np.iinfo('int32').max + 1, 'int64'), + ('int16', np.iinfo('int64').max + 1, 'object'), + ('int16', -1, 'int16'), + ('int16', np.iinfo('int8').min - 1, 'int16'), + ('int16', np.iinfo('int16').min - 1, 'int32'), + ('int16', np.iinfo('int32').min 
- 1, 'int64'), + ('int16', np.iinfo('int64').min - 1, 'object'), + ('uint16', 1, 'uint16'), + ('uint16', np.iinfo('int8').max + 1, 'uint16'), + ('uint16', np.iinfo('uint8').max + 1, 'uint16'), + ('uint16', np.iinfo('int16').max + 1, 'uint16'), + ('uint16', np.iinfo('uint16').max + 1, 'uint32'), + ('uint16', np.iinfo('int32').max + 1, 'uint32'), + ('uint16', np.iinfo('uint32').max + 1, 'uint64'), + ('uint16', np.iinfo('int64').max + 1, 'uint64'), + ('uint16', np.iinfo('uint64').max + 1, 'object'), + ('uint16', -1, 'int32'), + ('uint16', np.iinfo('int8').min - 1, 'int32'), + ('uint16', np.iinfo('int16').min - 1, 'int32'), + ('uint16', np.iinfo('int32').min - 1, 'int64'), + ('uint16', np.iinfo('int64').min - 1, 'object'), + # size 32 + ('int32', 1, 'int32'), + ('int32', np.iinfo('int8').max + 1, 'int32'), + ('int32', np.iinfo('int16').max + 1, 'int32'), + ('int32', np.iinfo('int32').max + 1, 'int64'), + ('int32', np.iinfo('int64').max + 1, 'object'), + ('int32', -1, 'int32'), + ('int32', np.iinfo('int8').min - 1, 'int32'), + ('int32', np.iinfo('int16').min - 1, 'int32'), + ('int32', np.iinfo('int32').min - 1, 'int64'), + ('int32', np.iinfo('int64').min - 1, 'object'), + ('uint32', 1, 'uint32'), + ('uint32', np.iinfo('int8').max + 1, 'uint32'), + ('uint32', np.iinfo('uint8').max + 1, 'uint32'), + ('uint32', np.iinfo('int16').max + 1, 'uint32'), + ('uint32', np.iinfo('uint16').max + 1, 'uint32'), + ('uint32', np.iinfo('int32').max + 1, 'uint32'), + ('uint32', np.iinfo('uint32').max + 1, 'uint64'), + ('uint32', np.iinfo('int64').max + 1, 'uint64'), + ('uint32', np.iinfo('uint64').max + 1, 'object'), + ('uint32', -1, 'int64'), + ('uint32', np.iinfo('int8').min - 1, 'int64'), + ('uint32', np.iinfo('int16').min - 1, 'int64'), + ('uint32', np.iinfo('int32').min - 1, 'int64'), + ('uint32', np.iinfo('int64').min - 1, 'object'), + # size 64 + ('int64', 1, 'int64'), + ('int64', np.iinfo('int8').max + 1, 'int64'), + ('int64', np.iinfo('int16').max + 1, 'int64'), + ('int64', 
np.iinfo('int32').max + 1, 'int64'), + ('int64', np.iinfo('int64').max + 1, 'object'), + ('int64', -1, 'int64'), + ('int64', np.iinfo('int8').min - 1, 'int64'), + ('int64', np.iinfo('int16').min - 1, 'int64'), + ('int64', np.iinfo('int32').min - 1, 'int64'), + ('int64', np.iinfo('int64').min - 1, 'object'), + ('uint64', 1, 'uint64'), + ('uint64', np.iinfo('int8').max + 1, 'uint64'), + ('uint64', np.iinfo('uint8').max + 1, 'uint64'), + ('uint64', np.iinfo('int16').max + 1, 'uint64'), + ('uint64', np.iinfo('uint16').max + 1, 'uint64'), + ('uint64', np.iinfo('int32').max + 1, 'uint64'), + ('uint64', np.iinfo('uint32').max + 1, 'uint64'), + ('uint64', np.iinfo('int64').max + 1, 'uint64'), + ('uint64', np.iinfo('uint64').max + 1, 'object'), + ('uint64', -1, 'object'), + ('uint64', np.iinfo('int8').min - 1, 'object'), + ('uint64', np.iinfo('int16').min - 1, 'object'), + ('uint64', np.iinfo('int32').min - 1, 'object'), + ('uint64', np.iinfo('int64').min - 1, 'object') +]) +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, + boxed, box_dtype): + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + if not boxed: + if expected_dtype == object: + pytest.xfail('overflow error') + if expected_dtype == 'int32': + pytest.xfail('always upcasts to platform int') + if dtype == 'int8' and expected_dtype == 'int16': + pytest.xfail('casts to int32 instead of int16') + if (issubclass(dtype.type, np.unsignedinteger) + and np.iinfo(dtype).max < fill_value <= np.iinfo('int64').max): + pytest.xfail('falsely casts to signed') + if ((dtype, expected_dtype) in [('uint8', 'int16'), + ('uint32', 'int64')] + and fill_value != np.iinfo('int32').min - 1): + pytest.xfail('casts to int32 instead of int8/int16') + # this following xfail is "only" a 
consequence of the - now strictly + # enforced - principle that maybe_promote_with_scalar always casts + pytest.xfail('wrong return type of fill_value') + if boxed: + if expected_dtype != object: + pytest.xfail('falsely casts to object') + if box_dtype is None and (fill_value > np.iinfo('int64').max + or np.iinfo('int64').min < fill_value < 0): + pytest.xfail('falsely casts to float instead of object') + + # output is not a generic int, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + # no missing value marker for integers + exp_val_for_array = None if expected_dtype != 'object' else np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, + boxed, box_dtype): + dtype = np.dtype(any_int_dtype) + fill_dtype = np.dtype(float_dtype) + + if float_dtype == 'float32' and not boxed: + pytest.xfail('falsely upcasts to float64') + if box_dtype == object: + pytest.xfail('falsely upcasts to object') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling int with float always upcasts to float64 + expected_dtype = np.float64 + # fill_value can be different float type + exp_val_for_scalar = np.float64(fill_value) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def 
test_maybe_promote_float_with_int(float_dtype, any_int_dtype, + boxed, box_dtype): + + dtype = np.dtype(float_dtype) + fill_dtype = np.dtype(any_int_dtype) + + if box_dtype == object: + pytest.xfail('falsely upcasts to object') + # this following xfail is "only" a consequence of the - now strictly + # enforced - principle that maybe_promote_with_scalar always casts + if not boxed: + pytest.xfail('wrong return type of fill_value') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling float with int always keeps float dtype + # because: np.finfo('float32').max > np.iinfo('uint64').max + expected_dtype = dtype + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('dtype, fill_value, expected_dtype', [ + # float filled with float + ('float32', 1, 'float32'), + ('float32', np.finfo('float32').max * 1.1, 'float64'), + ('float64', 1, 'float64'), + ('float64', np.finfo('float32').max * 1.1, 'float64'), + # complex filled with float + ('complex64', 1, 'complex64'), + ('complex64', np.finfo('float32').max * 1.1, 'complex128'), + ('complex128', 1, 'complex128'), + ('complex128', np.finfo('float32').max * 1.1, 'complex128'), + # float filled with complex + ('float32', 1 + 1j, 'complex64'), + ('float32', np.finfo('float32').max * (1.1 + 1j), 'complex128'), + ('float64', 1 + 1j, 'complex128'), + ('float64', np.finfo('float32').max * (1.1 + 1j), 'complex128'), + # complex filled with complex + ('complex64', 1 + 1j, 'complex64'), + ('complex64', np.finfo('float32').max * (1.1 + 1j), 'complex128'), + ('complex128', 1 + 1j, 'complex128'), + ('complex128', np.finfo('float32').max * (1.1 + 1j), 'complex128') +]) +@pytest.mark.parametrize('boxed, box_dtype', [ + 
(True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, + boxed, box_dtype): + + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + if box_dtype == object: + pytest.xfail('falsely upcasts to object') + if boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype): + pytest.xfail('does not upcast to complex') + if (dtype, expected_dtype) in [('float32', 'float64'), + ('float32', 'complex64'), + ('complex64', 'complex128')]: + pytest.xfail('does not upcast correctly depending on value') + # this following xfails are "only" a consequence of the - now strictly + # enforced - principle that maybe_promote_with_scalar always casts + if not boxed and abs(fill_value) < 2: + pytest.xfail('wrong return type of fill_value') + if (not boxed and dtype == 'complex128' and expected_dtype == 'complex128' + and is_float_dtype(type(fill_value))): + pytest.xfail('wrong return type of fill_value') + + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_bool_with_any(any_numpy_dtype, boxed, box_dtype): + dtype = np.dtype(bool) + fill_dtype = np.dtype(any_numpy_dtype) + + if boxed and fill_dtype == bool: + pytest.xfail('falsely upcasts to object') + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrongly casts fill_value') + + # create array of given dtype; 
casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bool with anything but bool casts to object + expected_dtype = np.dtype(object) if fill_dtype != bool else fill_dtype + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan if fill_dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_with_bool(any_numpy_dtype, boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + fill_value = True + + if boxed: + if dtype == bool: + pytest.xfail('falsely upcasts to object') + if dtype not in (str, object) and box_dtype is None: + pytest.xfail('falsely upcasts to object') + if not boxed: + if is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + # this following xfail is "only" a consequence of the - now strictly + # enforced - principle that maybe_promote_with_scalar always casts + if dtype == bool: + pytest.xfail('wrong return type of fill_value') + + # filling anything but bool with bool casts to object + expected_dtype = np.dtype(object) if dtype != bool else dtype + # output is not a generic bool, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan if dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.skipif(PY2, reason='no bytes in PY2') +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def 
test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype, + boxed, box_dtype): + dtype = np.dtype(bytes_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + if issubclass(fill_dtype.type, np.bytes_): + if not boxed or box_dtype == object: + pytest.xfail('falsely upcasts to object') + # takes the opinion that bytes dtype has no missing value marker + else: + pytest.xfail('wrong missing value marker') + else: + if boxed and box_dtype is None: + pytest.xfail('does not upcast to object') + if ((is_integer_dtype(fill_dtype) or is_float_dtype(fill_dtype) + or is_complex_dtype(fill_dtype) or is_object_dtype(fill_dtype) + or is_timedelta64_dtype(fill_dtype)) and not boxed): + pytest.xfail('does not upcast to object') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bytes with anything but bytes casts to object + expected_dtype = (dtype if issubclass(fill_dtype.type, np.bytes_) + else np.dtype(object)) + exp_val_for_scalar = fill_value + exp_val_for_array = (None if issubclass(fill_dtype.type, np.bytes_) + else np.nan) + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.skipif(PY2, reason='no bytes in PY2') +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype (fixed len) + (True, 'bytes'), # fill_value wrapped in array with generic bytes-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_with_bytes(any_numpy_dtype, bytes_dtype, + boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + fill_dtype = np.dtype(bytes_dtype) + + if issubclass(dtype.type, np.bytes_): + if not boxed or box_dtype == object: + pytest.xfail('falsely upcasts to object') + # takes the opinion that bytes dtype has no missing value marker + else: + pytest.xfail('wrong missing value marker') + else: + 
pass + if (boxed and (box_dtype == 'bytes' or box_dtype is None) + and not (is_string_dtype(dtype) or dtype == bool)): + pytest.xfail('does not upcast to object') + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # use a fixed bytes value as fill_value + fill_value = b'abc' + + # special case for box_dtype (cannot use fixture in parametrization) + box_dtype = fill_dtype if box_dtype == 'bytes' else box_dtype + + # filling anything but bytes with bytes casts to object + expected_dtype = (dtype if issubclass(dtype.type, np.bytes_) + else np.dtype(object)) + # output is not a generic bytes, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = None if issubclass(dtype.type, np.bytes_) else np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype, + boxed, box_dtype): + dtype = np.dtype(datetime64_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + if is_datetime64_dtype(fill_dtype): + if box_dtype == object: + pytest.xfail('falsely upcasts to object') + else: + if boxed and box_dtype is None: + pytest.xfail('does not upcast to object') + if not boxed: + pytest.xfail('does not upcast to object or raises') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(fill_dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).value + exp_val_for_array = iNaT + else: + 
expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value array with auto-dtype + (True, 'dt_dtype'), # fill_value array with explicit datetime dtype + (True, object), # fill_value array with object dtype + (False, None) # fill_value directly +]) +@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +def test_maybe_promote_any_with_datetime64(any_numpy_dtype, datetime64_dtype, + fill_value, boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + + if is_datetime64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_datetime64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and (box_dtype == 'dt_dtype' + or (box_dtype is None + and is_datetime64_dtype(type(fill_value))))): + pytest.xfail('mix of lack of upcasting, resp. 
wrong missing value') + if not boxed and is_timedelta64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' + else box_dtype) + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_datetimetz_with_any_numpy_dtype( + tz_aware_fixture, any_numpy_dtype, boxed, box_dtype): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = np.dtype(any_numpy_dtype) + + if box_dtype != object: + pytest.xfail('does not upcast correctly') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetimetz with any numpy dtype casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, + tz_aware_fixture2, + boxed, box_dtype): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = 
DatetimeTZDtype(tz=tz_aware_fixture2) + + from dateutil.tz import tzlocal + if is_platform_windows() and tz_aware_fixture2 == tzlocal(): + pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + if dtype.tz == fill_dtype.tz and boxed: + pytest.xfail('falsely upcasts') + if dtype.tz != fill_dtype.tz and not boxed: + pytest.xfail('falsely upcasts') + + # create array of given dtype; casts "1" to correct dtype + fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] + + # filling datetimetz with datetimetz casts to object, unless tz matches + exp_val_for_scalar = fill_value + if dtype.tz == fill_dtype.tz: + expected_dtype = dtype + exp_val_for_array = NaT + else: + expected_dtype = np.dtype(object) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, + boxed, box_dtype): + + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + + if (boxed and (box_dtype == object + or (box_dtype is None + and (fill_value is None or fill_value is NaT)))): + pytest.xfail('false upcasts to object') + # takes the opinion that DatetimeTZ should have single na-marker + # using iNaT would lead to errors elsewhere -> NaT + if not boxed and fill_value == iNaT: + pytest.xfail('wrong missing value marker') + + expected_dtype = dtype + # DatetimeTZDtype does not use iNaT as missing value marker + exp_val_for_scalar = NaT + exp_val_for_array = NaT + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + 
+@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_numpy_dtype_with_datetimetz( + any_numpy_dtype, tz_aware_fixture, fill_value, boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) + + if is_datetime_or_timedelta_dtype(dtype) and not boxed: + pytest.xfail('raises error') + + fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] + + # filling any numpy dtype with datetimetz casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype, + boxed, box_dtype): + dtype = np.dtype(timedelta64_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + if is_timedelta64_dtype(fill_dtype): + if box_dtype == object: + pytest.xfail('falsely upcasts to object') + else: + if boxed and box_dtype is None: + pytest.xfail('does not upcast to object') + if not boxed: + pytest.xfail('does not upcast to object or raises') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling timedelta with anything but timedelta casts to object + if is_timedelta64_dtype(fill_dtype): + expected_dtype = dtype + # for 
timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [ + pd.Timedelta(days=1), np.timedelta64(24, 'h'), datetime.timedelta(1) +], ids=['pd.Timedelta', 'np.timedelta64', 'datetime.timedelta']) +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value array with auto-dtype + (True, 'td_dtype'), # fill_value array with explicit timedelta dtype + (True, object), # fill_value array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_with_timedelta64(any_numpy_dtype, timedelta64_dtype, + fill_value, boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + + if is_timedelta64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_timedelta64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and box_dtype is None + and is_timedelta64_dtype(type(fill_value))): + pytest.xfail('does not upcast correctly') + if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + if box_dtype == 'td_dtype': + pytest.xfail('falsely upcasts') + if not boxed and is_datetime64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' + else box_dtype) + + # filling anything but timedelta with timedelta casts to object + if is_timedelta64_dtype(dtype): + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + 
exp_val_for_scalar = pd.Timedelta(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype, + boxed, box_dtype): + dtype = np.dtype(string_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype (fixed len) + (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_with_string(any_numpy_dtype, string_dtype, + boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + fill_dtype = np.dtype(string_dtype) + + if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: + pytest.xfail('does not upcast or raises') + if (boxed and box_dtype in (None, 'str') and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + + 
# create array of given dtype + fill_value = 'abc' + + # special case for box_dtype (cannot use fixture in parametrization) + box_dtype = fill_dtype if box_dtype == 'str' else box_dtype + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype, + boxed, box_dtype): + dtype = np.dtype(object_dtype) + fill_dtype = np.dtype(any_numpy_dtype) + + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, None), # fill_value wrapped in array with auto-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_with_object(any_numpy_dtype, object_dtype, + boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # create array of object dtype from a scalar value (i.e. passing + # dtypes.common.is_scalar), which can however not be cast to int/float etc. 
+ fill_value = pd.DateOffset(1) + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize('boxed, box_dtype', [ + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value directly +]) +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype, fill_value, + boxed, box_dtype): + dtype = np.dtype(any_numpy_dtype) + + if (dtype == bytes and not boxed + and fill_value is not None and fill_value is not NaT): + pytest.xfail('does not upcast to object') + elif dtype == 'uint64' and not boxed and fill_value == iNaT: + pytest.xfail('does not upcast correctly') + elif is_datetime_or_timedelta_dtype(dtype) and boxed: + pytest.xfail('falsely upcasts to object') + elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype)) + and fill_value is not NaT and dtype != 'uint64'): + pytest.xfail('falsely upcasts to object') + elif (boxed and dtype == 'uint64' + and (fill_value is np.nan or fill_value is None)): + pytest.xfail('falsely upcasts to object') + # below: opinionated that iNaT should be interpreted as missing value + elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) + and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + elif ((is_string_dtype(dtype) or dtype == bool) + and not boxed and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + + if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + # uint64 + negative int casts to object; iNaT is considered as missing + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + elif 
is_integer_dtype(dtype) and fill_value == iNaT: + # other integer + iNaT casts to int64 + expected_dtype = np.int64 + exp_val_for_scalar = iNaT + elif is_integer_dtype(dtype) and fill_value is not NaT: + # integer + other missing value (np.nan / None) casts to float + expected_dtype = np.float64 + exp_val_for_scalar = np.nan + elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT): + # inserting into object does not cast the value + # but *does* cast None to np.nan + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif is_datetime_or_timedelta_dtype(dtype): + # datetime / timedelta cast all missing values to iNaT + expected_dtype = dtype + exp_val_for_scalar = iNaT + elif fill_value is NaT: + # NaT upcasts everything that's not datetime/timedelta to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = NaT + elif is_float_dtype(dtype) or is_complex_dtype(dtype): + # float / complex + missing value (!= NaT) stays the same + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + # all other cases cast to object, and use np.nan as missing value + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + + # array case has same expected_dtype; but returns corresponding na-marker + if is_integer_dtype(expected_dtype): + # integers cannot hold NaNs; maybe_promote_with_array returns None + exp_val_for_array = None + elif is_datetime_or_timedelta_dtype(expected_dtype): + exp_val_for_array = iNaT + else: # expected_dtype = float / complex / object + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('dim', [0, 2, 3]) +def test_maybe_promote_dimensions(any_numpy_dtype, dim): + dtype = np.dtype(any_numpy_dtype) + + # create 0-dim array of given dtype; casts "1" to correct dtype + fill_array = np.array(1, dtype=dtype) + + # expand to desired dimension: + for _ in range(dim): + fill_array = 
np.expand_dims(fill_array, 0) + + # test against 1-dimensional case + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype)) + + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert ((result_missing_value == expected_missing_value) + or (result_missing_value is np.nan + and expected_missing_value is np.nan)) From 60889eaa0a7e9a6bba5f95832e2cc52b15048039 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 03:49:52 +0100 Subject: [PATCH 02/27] REF: refactor and fix maybe_promote --- pandas/core/dtypes/cast.py | 451 +++++++++++++++++++---- pandas/tests/dtypes/cast/test_promote.py | 244 ++---------- 2 files changed, 425 insertions(+), 270 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f6561948df99a..9676ec8e9f247 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1,6 +1,7 @@ """ routings for casting """ from datetime import datetime, timedelta +import warnings import numpy as np @@ -21,8 +22,8 @@ from .dtypes import ( DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype, PeriodDtype) from .generic import ( - ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex, - ABCSeries) + ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, + ABCPeriodIndex, ABCSeries) from .inference import is_list_like from .missing import isna, notna @@ -30,6 +31,19 @@ _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max _int64_max = np.iinfo(np.int64).max +_int64_min = np.iinfo(np.int64).min +_uint64_max = np.iinfo(np.uint64).max +_float32_max = np.finfo(np.float32).max + + +def _is_iNaT(x): + if not is_scalar(x): + return False + with warnings.catch_warnings(): + # bug in numpy warnings for timedelta, see numpy/numpy#10095 + warnings.filterwarnings('ignore', category=DeprecationWarning) + result = x == iNaT + 
return result def maybe_convert_platform(values): @@ -253,72 +267,385 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): - # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): - if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): - fill_value = iNaT - else: + """ + Determine minimal dtype to hold fill_value, when starting from dtype - # we need to change to object type as our - # fill_value is of object type - if fill_value.dtype == np.object_: - dtype = np.dtype(np.object_) - fill_value = np.nan - - # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, np.datetime64): - fill_value = tslibs.Timestamp(fill_value).value - elif issubclass(dtype.type, np.timedelta64): - fill_value = tslibs.Timedelta(fill_value).value - elif is_datetime64tz_dtype(dtype): - if isna(fill_value): - fill_value = NaT - elif is_extension_array_dtype(dtype) and isna(fill_value): - fill_value = dtype.na_value - elif is_float(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - dtype = np.float64 - elif is_bool(fill_value): - if not issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif is_integer(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - # upcast to prevent overflow - arr = np.asarray(fill_value) - if arr != arr.astype(dtype): - dtype = arr.dtype - elif is_complex(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, (np.integer, np.floating)): - dtype = np.complex128 - elif fill_value is None: - if is_float_dtype(dtype) or is_complex_dtype(dtype): - fill_value = np.nan - elif is_integer_dtype(dtype): - dtype = np.float64 - fill_value = np.nan - elif is_datetime_or_timedelta_dtype(dtype): + Parameters + ---------- + dtype : DType + The dtype to start from. 
+ fill_value : scalar or np.ndarray / Series / Index + The value that the output dtype needs to be able to hold. + + Returns + ------- + dtype : DType + The updated dtype. + fill_value : scalar + The type of this value depends on the type of the passed fill_value + + * If fill_value is a scalar, the method returns that scalar, but + modified to fit the updated dtype. For example, a datetime fill_value + will be returned as an integer (representing ns) for M8[ns], and + values considered missing (see pd.isna) will be returned as the + corresponding missing value marker for the updated dtype. + * If fill_value is an ndarray/Series/Index, this method will always + return the missing value marker for the updated dtype. This value + will be None for dtypes that cannot hold missing values (integers, + booleans, bytes). + + See Also + -------- + maybe_promote_with_scalar : underlying method for scalar case + maybe_promote_with_array : underlying method for array case + """ + if is_scalar(fill_value): + return maybe_promote_with_scalar(dtype, fill_value) + elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): + return maybe_promote_with_array(dtype, fill_value) + else: + fill_type = type(fill_value) + raise ValueError('fill_value must either be scalar, or a Series / ' + 'Index / np.ndarray; received {}'.format(fill_type)) + + +def maybe_promote_with_scalar(dtype, fill_value=np.nan): + """ + Determine minimal dtype to hold fill_value, when starting from dtype + + Parameters + ---------- + dtype : DType + The dtype to start from. + fill_value : scalar + The value that the output dtype needs to be able to hold. + + Returns + ------- + dtype : DType + The updated dtype. + fill_value : scalar + The passed fill_value, potentially modified to fit the updated dtype. + For example, assuming a datetime dtype, a datetime.datetime fill_value + will be returned as an integer (representing ns) for M8[ns].
Similarly, + values considered missing (see pd.isna) will be returned as the + corresponding missing value marker for the updated dtype. + + See Also + -------- + maybe_promote_with_array : similar method for array case + This method contains the actual promotion logic for both cases. + + Examples + -------- + >>> maybe_promote(np.dtype('int'), fill_value=np.nan) + (dtype('float64'), nan) + >>> maybe_promote(np.dtype('float'), fill_value='abcd') + (dtype('O'), 'abcd') + + For datetimes, timedeltas and datetimes with a timezone, the missing value + marker is pandas._libs.tslibs.iNaT (== np.iinfo('int64').min): + + >>> maybe_promote(np.dtype('datetime64[ns]'), fill_value=np.nan) + (dtype('<M8[ns]'), -9223372036854775808) + + The method will infer as conservatively as possible for integer types: + + >>> maybe_promote(np.dtype('uint8'), fill_value=np.iinfo('uint8').max + 1) + (dtype('uint16'), 256) + >>> maybe_promote(np.dtype('uint8'), fill_value=-1) + (dtype('int16'), -1) + """ + from pandas import Series + + if is_scalar(fill_value): + # unify handling of scalar and array values to simplify actual + # promotion logic in maybe_promote_with_array; + if is_object_dtype(dtype) and fill_value is not None: + # inserting into object does not cast (except for None -> np.nan) + return np.dtype(object), fill_value + + # use Series to construct, since np.array cannot deal with + # pandas-internal dtypes (e.g. DatetimeTZDtype) + fill_array = Series([fill_value], dtype=object) + dtype, na_value = maybe_promote_with_array(dtype, fill_array) + + # maybe_promote_with_array returns the na-marker for the new dtype; + # maybe_promote_with_scalar always casts fill_value to the new dtype + if is_integer_dtype(dtype) and _is_iNaT(fill_value): + # maybe_promote_with_array considers iNaT a missing value, and + # since int dtypes cannot hold missing values, that method returns + # None as the na_value. For scalars, we need to keep it however, + # to ensure correct operations for datetime/timedelta code.
fill_value = iNaT + elif fill_value is NaT and is_object_dtype(dtype): + # the presence of pd.NaT forced upcasting to object, and therefore + # fill_value does not get cast to na-marker of object (cf. below) + pass + elif isna(fill_value) or _is_iNaT(fill_value): + # cast missing values (incl. iNaT) to correct missing value marker + # for the updated dtype + fill_value = na_value + # otherwise casts fill_value (= only entry of fill_array) to new dtype + elif is_datetime_or_timedelta_dtype(dtype): + # for datetime/timedelta, we need to return the underlying ints + fill_value = fill_array.astype(dtype)[0].value else: - dtype = np.object_ - fill_value = np.nan + fill_value = fill_array.astype(dtype)[0] + + return dtype, fill_value else: - dtype = np.object_ + raise ValueError('fill_value must be a scalar, received ' + '{}'.format(type(fill_value))) - # in case we have a string that looked like a number - if is_extension_array_dtype(dtype): - pass - elif is_datetime64tz_dtype(dtype): - pass - elif issubclass(np.dtype(dtype).type, string_types): - dtype = np.object_ - return dtype, fill_value +def maybe_promote_with_array(dtype, fill_value=np.nan): + """ + Determine minimal dtype to hold fill_value, when starting from dtype + + This will also return the default missing value for the resulting dtype, if + necessary (e.g. for datetime / timedelta, the missing value will be `iNaT`) + + Parameters + ---------- + dtype : DType + The dtype to start from. + fill_value : np.ndarray / Series / Index + Array-like of values that the output dtype needs to be able to hold. + + Returns + ------- + dtype : DType + The updated dtype. + na_value : scalar + The missing value for the new dtype. Returns None for dtypes that + cannot hold missing values (integers, booleans, bytes).
+ + See Also + -------- + maybe_promote_with_scalar : similar method for scalar case + + Examples + -------- + >>> maybe_promote(np.dtype('int'), fill_value=np.array([None])) + (dtype('float64'), nan) + >>> maybe_promote(np.dtype('float'), fill_value=np.array(['abcd'])) + (dtype('O'), nan) + + For datetimes, timedeltas and datetimes with a timezone, the missing value + marker is pandas._libs.tslibs.iNaT (== np.iinfo('int64').min): + + >>> maybe_promote(np.dtype('datetime64[ns]'), + ... fill_value=np.array(['2018-01-01'])) + (dtype('<M8[ns]'), -9223372036854775808) + + The method will infer as conservatively as possible for integer types: + + >>> maybe_promote(np.dtype('uint8'), + ... fill_value=np.array([np.iinfo('uint8').max + 1])) + (dtype('uint16'), None) + >>> maybe_promote(np.dtype('uint8'), fill_value=np.array([-1])) + (dtype('int16'), None) + """ + + if isinstance(fill_value, np.ndarray): + if fill_value.ndim == 0: + # zero-dimensional arrays cannot be iterated over + fill_value = np.expand_dims(fill_value, 0) + elif fill_value.ndim > 1: + # ndarray, but too high-dimensional + fill_value = fill_value.ravel() + elif not isinstance(fill_value, (ABCSeries, ABCIndexClass)): + fill_type = type(fill_value) + raise ValueError('fill_value must either be a Series / Index / ' + 'np.ndarray, received {}'.format(fill_type)) + + if all(isna(x) or _is_iNaT(x) for x in fill_value): + # only missing values (or no values at all) + + if is_datetime_or_timedelta_dtype(dtype): + return dtype, iNaT + elif is_datetime64tz_dtype(dtype): + # DatetimeTZDtype does not use iNaT as missing value marker + return dtype, NaT + + na_value = np.nan + if len(fill_value) == 0: + # empty array; no values to force change + if is_integer_dtype(dtype) or dtype in (bool, bytes): + # these types do not have a missing value marker + na_value = None + # otherwise nothing changes + elif any(x is NaT for x in fill_value): + # presence of pd.NaT upcasts everything that's not + # datetime/timedelta (see above) to object + dtype = np.dtype(object) + elif (is_integer_dtype(dtype) and dtype == 'uint64' + and all(x ==
iNaT for x in fill_value)): + # uint64 + negative int casts to object + dtype = np.dtype(object) + elif is_integer_dtype(dtype) and all(x == iNaT for x in fill_value): + # integer + iNaT casts to int64 + dtype = np.dtype('int64') + na_value = None + elif is_integer_dtype(dtype): + # integer + other missing value (np.nan / None) casts to float + dtype = np.dtype('float64') + elif is_extension_array_dtype(dtype): + na_value = dtype.na_value + elif is_string_dtype(dtype) or dtype in (bool, bytes): + # original dtype cannot hold nans + dtype = np.dtype(object) + + return dtype, na_value + + fill_dtype = fill_value.dtype + if fill_dtype == object: + # for object dtype, we determine if we actually need to upcast + # by inferring the dtype of fill_value + inferred_dtype = lib.infer_dtype(fill_value, skipna=True) + + # cases that would yield 'empty' have been treated in branch above + if inferred_dtype in ['period', 'interval', 'datetime64tz']: + # TODO: handle & test pandas-dtypes + # TODO: lib.infer_dtype does not support datetime64tz yet + pass + else: + # rest can be mapped to numpy dtypes + map_inferred_to_numpy = { + 'floating': float, 'mixed-integer-float': float, + 'decimal': float, 'integer': int, 'boolean': bool, + 'complex': complex, 'bytes': bytes, + 'datetime64': 'datetime64[ns]', 'datetime': 'datetime64[ns]', + 'date': 'datetime64[ns]', 'timedelta64': 'timedelta64[ns]', + 'timedelta': 'timedelta64[ns]', + 'time': object, # time cannot be cast to datetime/timedelta + 'string': object, 'unicode': object, + 'mixed-integer': object, 'mixed': object, + } + fill_dtype = np.dtype(map_inferred_to_numpy[inferred_dtype]) + + # now that we have the correct dtype; check how we must upcast + # * extension arrays + # * int vs int + # * int vs float / complex + # * float vs float + # * float vs complex (and vice versa) + # * bool + # * bytes + # * datetimetz + # * datetime + # * timedelta + # * string/object + + # if (is_extension_array_dtype(dtype) + # or 
is_extension_array_dtype(fill_dtype)): + # # TODO: dispatch to ExtensionDType.maybe_promote? GH 24246 + if is_integer_dtype(dtype) and is_integer_dtype(fill_dtype): + if is_unsigned_integer_dtype(dtype) and all(fill_value >= 0): + # can stay unsigned + fill_max = fill_value.max() + if fill_max > _uint64_max: + return np.dtype(object), np.nan + + while fill_max > np.iinfo(dtype).max: + # itemsize is the number of bytes; times eight is number of + # bits, which is used in the string identifier of the dtype; + # if fill_max is above the max for that dtype, + # we double the number of bytes/bits. + dtype = np.dtype('uint{}'.format(dtype.itemsize * 8 * 2)) + return dtype, None + else: + # cannot stay unsigned + if dtype == 'uint64': + # need to hold negative values, but int64 cannot hold + # maximum of uint64 -> needs object + return np.dtype(object), np.nan + elif is_unsigned_integer_dtype(dtype): + # need to turn into signed integers to hold negative values + # int8 cannot hold maximum of uint8; similar for 16/32 + # therefore, upcast at least to next higher int-type + dtype = np.dtype('int{}'.format(dtype.itemsize * 8 * 2)) + + fill_max = fill_value.max() + fill_min = fill_value.min() + if isinstance(fill_max, np.uint64): + # numpy comparator is broken for uint64; + # see https://github.com/numpy/numpy/issues/12525 + # use .item to get int object + fill_max = fill_max.item() + + if fill_max > _int64_max or fill_min < _int64_min: + return np.dtype(object), np.nan + + while (fill_max > np.iinfo(dtype).max + or fill_min < np.iinfo(dtype).min): + # same mechanism as above, but for int instead of uint + dtype = np.dtype('int{}'.format(dtype.itemsize * 8 * 2)) + return dtype, None + elif is_integer_dtype(dtype) and is_float_dtype(fill_dtype): + # int with float: always upcasts to float64 + return np.dtype('float64'), np.nan + elif is_integer_dtype(dtype) and is_complex_dtype(fill_dtype): + # int with complex: always upcasts to complex128 + return np.dtype('complex128'), 
np.nan + elif ((is_float_dtype(dtype) or is_complex_dtype(dtype)) + and is_integer_dtype(fill_dtype)): + # float/complex with int: always stays original float/complex dtype + return dtype, np.nan + elif is_float_dtype(dtype) and is_float_dtype(fill_dtype): + # float with float; upcasts depending on absolute max of fill_value + fill_max = np.abs(fill_value).max() + if dtype == 'float32' and fill_max <= _float32_max: + return dtype, np.nan + # all other cases return float64 + return np.dtype('float64'), np.nan + elif ((is_float_dtype(dtype) or is_complex_dtype(dtype)) + and (is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype))): + # at least one is complex; otherwise we'd have hit float/float above + fill_max = max(np.abs(fill_value.real).max(), # also works for float + np.abs(fill_value.imag).max()) + if dtype in ['float32', 'complex64'] and fill_max <= _float32_max: + return np.complex64, np.nan + # all other cases return complex128 + return np.dtype('complex128'), np.nan + elif is_bool_dtype(dtype) and is_bool_dtype(fill_dtype): + # bool with bool is the only combination that stays bool; any other + # combination involving bool upcasts to object, see else-clause below + return dtype, None + elif (issubclass(dtype.type, np.bytes_) + and issubclass(fill_dtype.type, np.bytes_)): + # bytes with bytes is the only combination that stays bytes; any other + # combination involving bytes upcasts to object, see else-clause below + return dtype, None + elif (is_datetime64tz_dtype(dtype) and is_datetime64tz_dtype(fill_dtype) + and (dtype.tz == fill_dtype.tz)): + # datetimetz with datetimetz with the same timezone is the only + # combination that stays datetimetz (in particular, mixing timezones or + # tz-aware and tz-naive datetimes will cast to object); any other + # combination involving datetimetz upcasts to object, see below + return dtype, iNaT + elif ((is_timedelta64_dtype(dtype) and is_timedelta64_dtype(fill_dtype)) + or (is_datetime64_dtype(dtype) and 
is_datetime64_dtype(fill_dtype))): + # datetime and timedelta try to cast; if successful, keep dtype, + # otherwise upcast to object + try: + with warnings.catch_warnings(): + msg = ('parsing timezone aware datetimes is deprecated; ' + 'this will raise an error in the future') + warnings.filterwarnings('ignore', message=msg, + category=DeprecationWarning) + fill_value.astype(dtype) + na_value = iNaT + except (ValueError, TypeError): + dtype = np.dtype(object) + na_value = np.nan + return dtype, na_value + else: + # anything else (e.g. strings, objects, or unmatched + # bool / bytes / datetime / datetimetz / timedelta) + return np.dtype(object), np.nan def infer_dtype_from(val, pandas_dtype=False): diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 7cf553865942f..293cccdae7f11 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -165,30 +165,6 @@ def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) - if not boxed: - if expected_dtype == object: - pytest.xfail('overflow error') - if expected_dtype == 'int32': - pytest.xfail('always upcasts to platform int') - if dtype == 'int8' and expected_dtype == 'int16': - pytest.xfail('casts to int32 instead of int16') - if (issubclass(dtype.type, np.unsignedinteger) - and np.iinfo(dtype).max < fill_value <= np.iinfo('int64').max): - pytest.xfail('falsely casts to signed') - if ((dtype, expected_dtype) in [('uint8', 'int16'), - ('uint32', 'int64')] - and fill_value != np.iinfo('int32').min - 1): - pytest.xfail('casts to int32 instead of int8/int16') - # this following xfail is "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - pytest.xfail('wrong return type of fill_value') - if boxed: - if expected_dtype != object: - pytest.xfail('falsely casts to object') - if box_dtype is None and 
(fill_value > np.iinfo('int64').max - or np.iinfo('int64').min < fill_value < 0): - pytest.xfail('falsely casts to float instead of object') - # output is not a generic int, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] # no missing value marker for integers @@ -208,11 +184,6 @@ def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, dtype = np.dtype(any_int_dtype) fill_dtype = np.dtype(float_dtype) - if float_dtype == 'float32' and not boxed: - pytest.xfail('falsely upcasts to float64') - if box_dtype == object: - pytest.xfail('falsely upcasts to object') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -237,13 +208,6 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, dtype = np.dtype(float_dtype) fill_dtype = np.dtype(any_int_dtype) - if box_dtype == object: - pytest.xfail('falsely upcasts to object') - # this following xfail is "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - if not boxed: - pytest.xfail('wrong return type of fill_value') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -291,22 +255,6 @@ def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) - if box_dtype == object: - pytest.xfail('falsely upcasts to object') - if boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype): - pytest.xfail('does not upcast to complex') - if (dtype, expected_dtype) in [('float32', 'float64'), - ('float32', 'complex64'), - ('complex64', 'complex128')]: - pytest.xfail('does not upcast correctly depending on value') - # this following xfails are "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - if not boxed and abs(fill_value) < 2: - 
pytest.xfail('wrong return type of fill_value') - if (not boxed and dtype == 'complex128' and expected_dtype == 'complex128' - and is_float_dtype(type(fill_value))): - pytest.xfail('wrong return type of fill_value') - # output is not a generic float, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan @@ -324,12 +272,6 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype, boxed, box_dtype): dtype = np.dtype(bool) fill_dtype = np.dtype(any_numpy_dtype) - if boxed and fill_dtype == bool: - pytest.xfail('falsely upcasts to object') - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrongly casts fill_value') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -351,19 +293,6 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype, boxed, box_dtype): dtype = np.dtype(any_numpy_dtype) fill_value = True - if boxed: - if dtype == bool: - pytest.xfail('falsely upcasts to object') - if dtype not in (str, object) and box_dtype is None: - pytest.xfail('falsely upcasts to object') - if not boxed: - if is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') - # this following xfail is "only" a consequence of the - now strictly - # enforced - principle that maybe_promote_with_scalar always casts - if dtype == bool: - pytest.xfail('wrong return type of fill_value') - # filling anything but bool with bool casts to object expected_dtype = np.dtype(object) if dtype != bool else dtype # output is not a generic bool, but corresponds to expected_dtype @@ -385,20 +314,6 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype, dtype = np.dtype(bytes_dtype) fill_dtype = np.dtype(any_numpy_dtype) - if issubclass(fill_dtype.type, np.bytes_): - if not boxed or box_dtype == object: - pytest.xfail('falsely upcasts to object') - # takes the opinion that bool dtype has no missing 
value marker - else: - pytest.xfail('wrong missing value marker') - else: - if boxed and box_dtype is None: - pytest.xfail('does not upcast to object') - if ((is_integer_dtype(fill_dtype) or is_float_dtype(fill_dtype) - or is_complex_dtype(fill_dtype) or is_object_dtype(fill_dtype) - or is_timedelta64_dtype(fill_dtype)) and not boxed): - pytest.xfail('does not upcast to object') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -425,20 +340,6 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype, bytes_dtype, dtype = np.dtype(any_numpy_dtype) fill_dtype = np.dtype(bytes_dtype) - if issubclass(dtype.type, np.bytes_): - if not boxed or box_dtype == object: - pytest.xfail('falsely upcasts to object') - # takes the opinion that bool dtype has no missing value marker - else: - pytest.xfail('wrong missing value marker') - else: - pass - if (boxed and (box_dtype == 'bytes' or box_dtype is None) - and not (is_string_dtype(dtype) or dtype == bool)): - pytest.xfail('does not upcast to object') - if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') - # create array of given dtype fill_value = b'abc' @@ -466,15 +367,6 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype, dtype = np.dtype(datetime64_dtype) fill_dtype = np.dtype(any_numpy_dtype) - if is_datetime64_dtype(fill_dtype): - if box_dtype == object: - pytest.xfail('falsely upcasts to object') - else: - if boxed and box_dtype is None: - pytest.xfail('does not upcast to object') - if not boxed: - pytest.xfail('does not upcast to object or raises') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -507,19 +399,6 @@ def test_maybe_promote_any_with_datetime64(any_numpy_dtype, datetime64_dtype, fill_value, boxed, box_dtype): dtype = np.dtype(any_numpy_dtype) - if is_datetime64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype 
is None - and not is_datetime64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') - else: - if (boxed and (box_dtype == 'dt_dtype' - or (box_dtype is None - and is_datetime64_dtype(type(fill_value))))): - pytest.xfail('mix of lack of upcasting, resp. wrong missing value') - if not boxed and is_timedelta64_dtype(dtype): - pytest.xfail('raises error') - # special case for box_dtype box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' else box_dtype) @@ -549,9 +428,6 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = np.dtype(any_numpy_dtype) - if box_dtype != object: - pytest.xfail('does not upcast correctly') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -578,10 +454,11 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, from dateutil.tz import tzlocal if is_platform_windows() and tz_aware_fixture2 == tzlocal(): pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') - if dtype.tz == fill_dtype.tz and boxed: - pytest.xfail('falsely upcasts') - if dtype.tz != fill_dtype.tz and not boxed: - pytest.xfail('falsely upcasts') + if dtype.tz == fill_dtype.tz: + # here we should keep the datetime64tz dtype, but since that cannot be + # inferred correctly for fill_value, the calling dtype ends up being + # compared to a tz-naive datetime64-dtype, and must therefore upcast + pytest.xfail('cannot infer datetime64tz dtype, see GH 23554') # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -611,15 +488,6 @@ def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, dtype = DatetimeTZDtype(tz=tz_aware_fixture) - if (boxed and (box_dtype == object - or (box_dtype is None - and (fill_value is None or fill_value is NaT)))): - pytest.xfail('false upcasts to object') - # takes the opinion that DatetimeTZ should 
have single na-marker - # using iNaT would lead to errors elsewhere -> NaT - if not boxed and fill_value == iNaT: - pytest.xfail('wrong missing value marker') - expected_dtype = dtype # DatetimeTZDtype does not use iNaT as missing value marker exp_val_for_scalar = NaT @@ -643,8 +511,10 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( dtype = np.dtype(any_numpy_dtype) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) - if is_datetime_or_timedelta_dtype(dtype) and not boxed: - pytest.xfail('raises error') + if is_datetime64_dtype(dtype): + # fill_dtype does not get inferred correctly to datetime64tz but to + # datetime64, which then falsely matches with datetime64 dtypes. + pytest.xfail('cannot infer datetime64tz dtype, see GH 23554') fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] @@ -667,15 +537,6 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype, dtype = np.dtype(timedelta64_dtype) fill_dtype = np.dtype(any_numpy_dtype) - if is_timedelta64_dtype(fill_dtype): - if box_dtype == object: - pytest.xfail('falsely upcasts to object') - else: - if boxed and box_dtype is None: - pytest.xfail('does not upcast to object') - if not boxed: - pytest.xfail('does not upcast to object or raises') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -707,25 +568,6 @@ def test_maybe_promote_any_with_timedelta64(any_numpy_dtype, timedelta64_dtype, fill_value, boxed, box_dtype): dtype = np.dtype(any_numpy_dtype) - if is_timedelta64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype is None - and not is_timedelta64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') - else: - if (boxed and box_dtype is None - and is_timedelta64_dtype(type(fill_value))): - pytest.xfail('does not upcast correctly') - if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) - or is_complex_dtype(dtype) - 
or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') - if box_dtype == 'td_dtype': - pytest.xfail('falsely upcasts') - if not boxed and is_datetime64_dtype(dtype): - pytest.xfail('raises error') - # special case for box_dtype box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' else box_dtype) @@ -755,10 +597,6 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype, dtype = np.dtype(string_dtype) fill_dtype = np.dtype(any_numpy_dtype) - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -782,14 +620,6 @@ def test_maybe_promote_any_with_string(any_numpy_dtype, string_dtype, dtype = np.dtype(any_numpy_dtype) fill_dtype = np.dtype(string_dtype) - if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: - pytest.xfail('does not upcast or raises') - if (boxed and box_dtype in (None, 'str') and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) - or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') - # create array of given dtype fill_value = 'abc' @@ -815,10 +645,6 @@ def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype, dtype = np.dtype(object_dtype) fill_dtype = np.dtype(any_numpy_dtype) - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -840,9 +666,6 @@ def test_maybe_promote_any_with_object(any_numpy_dtype, object_dtype, boxed, box_dtype): dtype = np.dtype(any_numpy_dtype) - if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') - # create array of object dtype from a scalar value (i.e. 
passing # dtypes.common.is_scalar), which can however not be cast to int/float etc. fill_value = pd.DateOffset(1) @@ -866,28 +689,6 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype, fill_value, boxed, box_dtype): dtype = np.dtype(any_numpy_dtype) - if (dtype == bytes and not boxed - and fill_value is not None and fill_value is not NaT): - pytest.xfail('does not upcast to object') - elif dtype == 'uint64' and not boxed and fill_value == iNaT: - pytest.xfail('does not upcast correctly') - elif is_datetime_or_timedelta_dtype(dtype) and boxed: - pytest.xfail('falsely upcasts to object') - elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) - or is_complex_dtype(dtype)) - and fill_value is not NaT and dtype != 'uint64'): - pytest.xfail('falsely upcasts to object') - elif (boxed and dtype == 'uint64' - and (fill_value is np.nan or fill_value is None)): - pytest.xfail('falsely upcasts to object') - # below: opinionated that iNaT should be interpreted as missing value - elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) - and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - elif ((is_string_dtype(dtype) or dtype == bool) - and not boxed and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: # uint64 + negative int casts to object; iNaT is considered as missing expected_dtype = np.dtype(object) @@ -957,3 +758,30 @@ def test_maybe_promote_dimensions(any_numpy_dtype, dim): assert ((result_missing_value == expected_missing_value) or (result_missing_value is np.nan and expected_missing_value is np.nan)) + + # same again for maybe_promote_with_array (for coverage) + result_dtype, result_missing_value = maybe_promote_with_array( + dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert ((result_missing_value == 
expected_missing_value) + or (result_missing_value is np.nan + and expected_missing_value is np.nan)) + + +def test_maybe_promote_raises(any_numpy_dtype): + msg = 'fill_value must either be scalar, or a Series / Index / np.ndarra.*' + with pytest.raises(ValueError, match=msg): + # something that's neither scalar, nor Series / Index / np.ndarray + maybe_promote(any_numpy_dtype, [1, 2, 3]) + + msg = 'fill_value must either be a Series / Index / np.ndarray, received.*' + with pytest.raises(ValueError, match=msg): + # something that's not a Series / Index / np.ndarray + maybe_promote_with_array(any_numpy_dtype, 1) + + msg = 'fill_value must be a scalar, received .*' + with pytest.raises(ValueError, match=msg): + # something that's not scalar + maybe_promote_with_scalar(any_numpy_dtype, pd.Series([1, 2, 3])) From 6792a54a3001e2df581d758b705e6e2a23ad81d9 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 12:52:47 +0100 Subject: [PATCH 03/27] Fix remaining failures --- pandas/core/dtypes/cast.py | 7 ++++--- pandas/tests/dtypes/cast/test_promote.py | 6 ++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9676ec8e9f247..88ed8351a2de8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -299,7 +299,7 @@ def maybe_promote(dtype, fill_value=np.nan): maybe_promote_with_scalar : underlying method for scalar case maybe_promote_with_array : underlying method for array case """ - if is_scalar(fill_value): + if is_scalar(fill_value) or isinstance(fill_value, tuple): return maybe_promote_with_scalar(dtype, fill_value) elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): return maybe_promote_with_array(dtype, fill_value) @@ -367,7 +367,8 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): return np.dtype(object), fill_value # use Series to construct, since np.array cannot deal with - # pandas-internal dtypes (e.g. 
DatetimeTZDtype) + # pandas-internal dtypes (e.g. DatetimeTZDtype); furthermore, we want + # to treat tuples as scalar, but numpy casts those to a new dimension fill_array = Series([fill_value], dtype=object) dtype, na_value = maybe_promote_with_array(dtype, fill_array) @@ -570,7 +571,7 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): fill_max = fill_value.max() fill_min = fill_value.min() - if isinstance(fill_max, np.uint64): + if isinstance(fill_max, (np.int64, np.uint64)): # numpy comparator is broken for uint64; # see https://github.com/numpy/numpy/issues/12525 # use .item to get int object diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 293cccdae7f11..77bfe7454b3f1 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -597,6 +597,9 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype, dtype = np.dtype(string_dtype) fill_dtype = np.dtype(any_numpy_dtype) + if PY2 and is_string_dtype(fill_dtype): + pytest.xfail('does not upcast to object on PY2') + # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -620,6 +623,9 @@ def test_maybe_promote_any_with_string(any_numpy_dtype, string_dtype, dtype = np.dtype(any_numpy_dtype) fill_dtype = np.dtype(string_dtype) + if PY2 and is_string_dtype(dtype): + pytest.xfail('does not upcast to object on PY2') + # create array of given dtype fill_value = 'abc' From 8d9a3b7780acd1f78d41cf8666adf68fdfc01388 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 24 Feb 2019 17:42:04 +0100 Subject: [PATCH 04/27] Forgot to flake --- pandas/tests/dtypes/cast/test_promote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 77bfe7454b3f1..49a0e9865c49d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -17,7 +17,7 @@ from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, - is_timedelta64_dtype) + is_string_dtype, is_timedelta64_dtype) from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd From b903d2e27e16a6d174a6e7d39e8fe93caf36c51e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 17:43:09 +0100 Subject: [PATCH 05/27] Another try at int2int... --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 88ed8351a2de8..ee794b2d2cb45 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -571,13 +571,13 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): fill_max = fill_value.max() fill_min = fill_value.min() - if isinstance(fill_max, (np.int64, np.uint64)): + if isinstance(fill_max, np.uint64): # numpy comparator is broken for uint64; # see https://github.com/numpy/numpy/issues/12525 # use .item to get int object fill_max = fill_max.item() - if fill_max > _int64_max or fill_min < _int64_min: + if fill_max >= _int64_max + 1 or fill_min < _int64_min: return np.dtype(object), np.nan while (fill_max > np.iinfo(dtype).max From 91e66734ba682743f0f8d0f2f58ed3d214a5ecb2 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 18:21:35 +0100 Subject: [PATCH 06/27] Last one? 
--- pandas/core/dtypes/cast.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ee794b2d2cb45..a14b6a50f0b77 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -359,7 +359,7 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): """ from pandas import Series - if is_scalar(fill_value): + if is_scalar(fill_value) or isinstance(fill_value, tuple): # unify handling of scalar and array values to simplify actual # promotion logic in maybe_promote_with_array; if is_object_dtype(dtype) and fill_value is not None: @@ -577,7 +577,9 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # use .item to get int object fill_max = fill_max.item() - if fill_max >= _int64_max + 1 or fill_min < _int64_min: + # comparison mechanics are broken above _int64_max; + # use greater equal instead of equal + if fill_max >= _int64_max + 1 or fill_min <= _int64_min - 1: return np.dtype(object), np.nan while (fill_max > np.iinfo(dtype).max From c0a3a4e1ccc7ae74f1988944c6d5964ae0e6c2b6 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 10 Mar 2019 13:49:58 +0100 Subject: [PATCH 07/27] Review (jbrockmendel) --- pandas/core/dtypes/cast.py | 80 +++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a14b6a50f0b77..d4836eca4aca4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -304,7 +304,7 @@ def maybe_promote(dtype, fill_value=np.nan): elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): return maybe_promote_with_array(dtype, fill_value) else: - fill_type = type(fill_value) + fill_type = type(fill_value).__name__ raise ValueError('fill_value must either be scalar, or a Series / ' 'Index / np.ndarray; received {}'.format(fill_type)) @@ -359,47 +359,47 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): """ from pandas import Series - if is_scalar(fill_value) or isinstance(fill_value, tuple): - # unify handling of scalar and array values to simplify actual - # promotion logic in maybe_promote_with_array; - if is_object_dtype(dtype) and fill_value is not None: - # inserting into object does not cast (except for None -> np.nan) - return np.dtype(object), fill_value - - # use Series to construct, since np.array cannot deal with - # pandas-internal dtypes (e.g. DatetimeTZDtype); furthermore, we want - # to treat tuples as scalar, but numpy casts those to a new dimension - fill_array = Series([fill_value], dtype=object) - dtype, na_value = maybe_promote_with_array(dtype, fill_array) - - # maybe_promote_with_array returns the na-marker for the new dtype; - # maybe_promote_with_scalar always casts fill_value to the new dtype - if is_integer_dtype(dtype) and _is_iNaT(fill_value): - # maybe_promote_with_array considers iNaT a missing value, and - # since int dtypes cannot hold missing values, that method returns - # None as the na_value. 
For scalars, we need to keep it however, - # to ensure correct operations for datetime/timedelta code. - fill_value = iNaT - elif fill_value is NaT and is_object_dtype(dtype): - # the presence of pd.NaT forced upcasting to object, and therefore - # fill_value does not get cast to na-marker of object (cf. below) - pass - elif isna(fill_value) or _is_iNaT(fill_value): - # cast missing values (incl. iNaT) to correct missing value marker - # for the updated dtype - fill_value = na_value - # otherwise casts fill_value (= only entry of fill_array) to new dtype - elif is_datetime_or_timedelta_dtype(dtype): - # for datetime/timedelta, we need to return the underlying ints - fill_value = fill_array.astype(dtype)[0].value - else: - fill_value = fill_array.astype(dtype)[0] - - return dtype, fill_value - else: + if not (is_scalar(fill_value) or isinstance(fill_value, tuple)): raise ValueError('fill_value must be a scalar, received ' '{}'.format(type(fill_value))) + # unify handling of scalar and array values to simplify actual + # promotion logic in maybe_promote_with_array; + if is_object_dtype(dtype) and fill_value is not None: + # inserting into object does not cast (except for None -> np.nan) + return np.dtype(object), fill_value + + # use Series to construct, since np.array cannot deal with pandas-internal + # dtypes (e.g. DatetimeTZDtype); furthermore, we want to treat tuples as + # scalar, but numpy casts those to a new dimension + fill_array = Series([fill_value], dtype=object) + dtype, na_value = maybe_promote_with_array(dtype, fill_array) + + # maybe_promote_with_array returns the na-marker for the new dtype; + # maybe_promote_with_scalar always casts fill_value to the new dtype + if is_integer_dtype(dtype) and _is_iNaT(fill_value): + # maybe_promote_with_array considers iNaT a missing value, and since + # int dtypes cannot hold missing values, that method returns None as + # the na_value. 
For scalars, we need to keep it however, to ensure + # correct operations for datetime/timedelta code. + fill_value = iNaT + elif fill_value is NaT and is_object_dtype(dtype): + # the presence of pd.NaT forced upcasting to object, and therefore + # fill_value does not get cast to na-marker of object (cf. below) + pass + elif isna(fill_value) or _is_iNaT(fill_value): + # cast missing values (incl. iNaT) to correct missing value marker for + # the updated dtype + fill_value = na_value + # otherwise casts fill_value (= only entry of fill_array) to new dtype + elif is_datetime_or_timedelta_dtype(dtype): + # for datetime/timedelta, we need to return the underlying ints + fill_value = fill_array.astype(dtype)[0].value + else: + fill_value = fill_array.astype(dtype)[0] + + return dtype, fill_value + def maybe_promote_with_array(dtype, fill_value=np.nan): """ @@ -458,7 +458,7 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # ndarray, but too high-dimensional fill_value = fill_value.ravel() elif not isinstance(fill_value, (ABCSeries, ABCIndexClass)): - fill_type = type(fill_value) + fill_type = type(fill_value).__name__ raise ValueError('fill_value must either be a Series / Index / ' 'np.ndarray, received {}'.format(fill_type)) From ce0efda4463c0c72d586e4134d6ab40a2096f9a1 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 23 Jun 2019 21:33:40 +0200 Subject: [PATCH 08/27] lint --- pandas/conftest.py | 1 - pandas/tests/dtypes/cast/test_promote.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 56e039ecc0f23..4bcd0ea8442e6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -407,7 +407,6 @@ def tz_aware_fixture(request): # Dtypes # ---------------------------------------------------------------- - UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index df276703cacc5..c9275dcb08bdc 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -12,15 +12,12 @@ from pandas.core.dtypes.cast import ( maybe_promote, maybe_promote_with_array, maybe_promote_with_scalar) - from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, is_timedelta64_dtype) - from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype - import pandas as pd From e67dd994804e3bf1572e0b17586e187eee9df449 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 23 Jun 2019 22:03:53 +0200 Subject: [PATCH 09/27] reduce diff --- pandas/tests/dtypes/cast/test_promote.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index c9275dcb08bdc..1ce4eebdac778 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -117,12 +117,13 @@ def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, # box_dtype; the expected value returned from maybe_promote is the # missing value marker for the returned dtype. fill_array = np.array([fill_value], dtype=box_dtype) - result_dtype, result_fill_value = maybe_promote_with_array(dtype, - fill_array) + result_dtype, result_fill_value = maybe_promote(dtype, fill_array) expected_fill_value = exp_val_for_array else: - result_dtype, result_fill_value = maybe_promote_with_scalar(dtype, - fill_value) + # here, we pass on fill_value as a scalar directly; the expected value + # returned from maybe_promote is fill_value, potentially upcast to the + # returned dtype. + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) expected_fill_value = exp_val_for_scalar _safe_dtype_assert(result_dtype, expected_dtype) From 321f08d2f0ab11b842066baa26279b783ce45d0d Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 28 Jun 2019 19:35:56 +0200 Subject: [PATCH 10/27] review (jbrockmendel) --- pandas/core/dtypes/cast.py | 33 ++++++++++++++---------- pandas/tests/dtypes/cast/test_promote.py | 12 ++++----- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 65d6941335756..8066bf9d80d8a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -35,10 +35,15 @@ def _is_iNaT(x): + """ + Helper function to circumvent numpy bug for timedeltas + + Specifically, comparing a scalar timedelta against another scalar value may + raise a spurious DeprecationWarning, see numpy/numpy#10095 + """ if not is_scalar(x): return False with warnings.catch_warnings(): - # bug in numpy warnings for timedelta, see numpy/numpy#10095 warnings.filterwarnings('ignore', category=DeprecationWarning) result = x == iNaT return result @@ -307,20 +312,20 @@ def maybe_promote(dtype, fill_value=np.nan): See Also -------- - maybe_promote_with_scalar : underlying method for scalar case - maybe_promote_with_array : underlying method for array case + _maybe_promote_with_scalar : underlying method for scalar case + _maybe_promote_with_array : underlying method for array case """ if is_scalar(fill_value) or isinstance(fill_value, tuple): - return maybe_promote_with_scalar(dtype, fill_value) + return _maybe_promote_with_scalar(dtype, fill_value) elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): - return maybe_promote_with_array(dtype, fill_value) + return _maybe_promote_with_array(dtype, fill_value) else: fill_type = type(fill_value).__name__ raise ValueError('fill_value must either be scalar, or a Series / ' 'Index / np.ndarray; received {}'.format(fill_type)) -def maybe_promote_with_scalar(dtype, fill_value=np.nan): +def _maybe_promote_with_scalar(dtype, fill_value=np.nan): """ Determine minimal dtype to hold fill_value, when starting from dtype @@ -344,7 +349,7 @@ def 
maybe_promote_with_scalar(dtype, fill_value=np.nan): See Also -------- - maybe_promote_with_array : similar method for array case + _maybe_promote_with_array : similar method for array case This method contains the actual promotion logic for both cases. Examples @@ -375,7 +380,7 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): '{}'.format(type(fill_value))) # unify handling of scalar and array values to simplify actual - # promotion logic in maybe_promote_with_array; + # promotion logic in _maybe_promote_with_array; if is_object_dtype(dtype) and fill_value is not None: # inserting into object does not cast (except for None -> np.nan) return np.dtype(object), fill_value @@ -384,12 +389,12 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): # dtypes (e.g. DatetimeTZDtype); furthermore, we want to treat tuples as # scalar, but numpy casts those to a new dimension fill_array = Series([fill_value], dtype=object) - dtype, na_value = maybe_promote_with_array(dtype, fill_array) + dtype, na_value = _maybe_promote_with_array(dtype, fill_array) - # maybe_promote_with_array returns the na-marker for the new dtype; - # maybe_promote_with_scalar always casts fill_value to the new dtype + # _maybe_promote_with_array returns the na-marker for the new dtype; + # _maybe_promote_with_scalar always casts fill_value to the new dtype if is_integer_dtype(dtype) and _is_iNaT(fill_value): - # maybe_promote_with_array considers iNaT a missing value, and since + # _maybe_promote_with_array considers iNaT a missing value, and since # int dtypes cannot hold missing values, that method returns None as # the na_value. For scalars, we need to keep it however, to ensure # correct operations for datetime/timedelta code. 
@@ -412,7 +417,7 @@ def maybe_promote_with_scalar(dtype, fill_value=np.nan): return dtype, fill_value -def maybe_promote_with_array(dtype, fill_value=np.nan): +def _maybe_promote_with_array(dtype, fill_value=np.nan): """ Determine minimal dtype to hold fill_value, when starting from dtype @@ -436,7 +441,7 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): See Also -------- - maybe_promote_with_scalar : similar method for scalar case + _maybe_promote_with_scalar : similar method for scalar case Examples -------- diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 1ce4eebdac778..8fd6566025998 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -11,7 +11,7 @@ from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import ( - maybe_promote, maybe_promote_with_array, maybe_promote_with_scalar) + maybe_promote, _maybe_promote_with_array, _maybe_promote_with_scalar) from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, @@ -757,7 +757,7 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, # array case has same expected_dtype; but returns corresponding na-marker if is_integer_dtype(expected_dtype): - # integers cannot hold NaNs; maybe_promote_with_array returns None + # integers cannot hold NaNs; _maybe_promote_with_array returns None exp_val_for_array = None elif is_datetime_or_timedelta_dtype(expected_dtype): exp_val_for_array = iNaT @@ -791,8 +791,8 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): or (result_missing_value is np.nan and expected_missing_value is np.nan)) - # same again for maybe_promote_with_array (for coverage) - result_dtype, result_missing_value = maybe_promote_with_array( + # same again for _maybe_promote_with_array (for coverage) + result_dtype, result_missing_value 
= _maybe_promote_with_array( dtype, fill_array) assert result_dtype == expected_dtype @@ -811,9 +811,9 @@ def test_maybe_promote_raises(any_numpy_dtype): msg = 'fill_value must either be a Series / Index / np.ndarray, received.*' with pytest.raises(ValueError, match=msg): # something that's not a Series / Index / np.ndarray - maybe_promote_with_array(any_numpy_dtype, 1) + _maybe_promote_with_array(any_numpy_dtype, 1) msg = 'fill_value must be a scalar, received .*' with pytest.raises(ValueError, match=msg): # something that's not scalar - maybe_promote_with_scalar(any_numpy_dtype, pd.Series([1, 2, 3])) + _maybe_promote_with_scalar(any_numpy_dtype, pd.Series([1, 2, 3])) From 5c5e0f1384e3b6099f5a6b1f3879912cac6dd5f8 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 28 Jun 2019 20:23:27 +0200 Subject: [PATCH 11/27] fix isort --- pandas/tests/dtypes/cast/test_promote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 8fd6566025998..e5f23ef6ec7ca 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -11,7 +11,7 @@ from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import ( - maybe_promote, _maybe_promote_with_array, _maybe_promote_with_scalar) + _maybe_promote_with_array, _maybe_promote_with_scalar, maybe_promote) from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, From bfb8a2f144b30ebeadac2e184a572cb0ecae4873 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 28 Jun 2019 20:33:36 +0200 Subject: [PATCH 12/27] fix docstring example --- pandas/core/dtypes/cast.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8066bf9d80d8a..51150d5525c67 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -453,9 +453,15 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): For datetimes, timedeltas and datetimes with a timezone, the missing value marker is pandas._libs.tslibs.iNaT (== np.iinfo('int64').min): + >>> maybe_promote(np.dtype('datetime64[ns]'), fill_value=np.array([None])) + (dtype('>> maybe_promote(np.dtype('datetime64[ns]'), ... fill_value=np.array(['2018-01-01'])) - (dtype(' Date: Fri, 28 Jun 2019 20:38:01 +0200 Subject: [PATCH 13/27] remove unicode from possible returns of infer_dtype --- pandas/core/dtypes/cast.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 51150d5525c67..6d9bdfe2e5d4c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -544,8 +544,7 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): 'date': 'datetime64[ns]', 'timedelta64': 'timedelta64[ns]', 'timedelta': 'timedelta64[ns]', 'time': object, # time cannot be cast to datetime/timedelta - 'string': object, 'unicode': object, - 'mixed-integer': object, 'mixed': object, + 'string': object, 'mixed-integer': object, 'mixed': object } fill_dtype = np.dtype(map_inferred_to_numpy[inferred_dtype]) From bf12d67af6a9154e0e41f9ee29918547cc19cbcf Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 28 Jun 2019 20:41:18 +0200 Subject: [PATCH 14/27] perf: only compute max if necessary --- pandas/core/dtypes/cast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6d9bdfe2e5d4c..c8bcb582412ba 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -620,17 +620,16 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): return dtype, np.nan elif is_float_dtype(dtype) and is_float_dtype(fill_dtype): # float with float; upcasts depending on absolute max of fill_value - fill_max = np.abs(fill_value).max() - if dtype == 'float32' and fill_max <= _float32_max: + if dtype == 'float32' and np.abs(fill_value).max() <= _float32_max: return dtype, np.nan # all other cases return float64 return np.dtype('float64'), np.nan elif ((is_float_dtype(dtype) or is_complex_dtype(dtype)) and (is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype))): # at least one is complex; otherwise we'd have hit float/float above - fill_max = max(np.abs(fill_value.real).max(), # also works for float - np.abs(fill_value.imag).max()) - if dtype in ['float32', 'complex64'] and fill_max <= _float32_max: + if (dtype in ['float32', 'complex64'] + and max(np.abs(fill_value.real).max(), # also works for float + np.abs(fill_value.imag).max()) <= _float32_max): return np.complex64, np.nan # all other cases return complex128 return np.dtype('complex128'), np.nan From d9bbf0792760354706c7e29c6c6b53c3cc25f5ce Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 29 Jun 2019 01:32:49 +0200 Subject: [PATCH 15/27] update comment --- pandas/tests/dtypes/cast/test_promote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index e5f23ef6ec7ca..7761dbd4ccfa7 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -253,7 +253,7 @@ def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, box): expected_dtype = np.dtype(expected_dtype) boxed, box_dtype = box # read from parametrized fixture - # output is not a generic int, but corresponds to expected_dtype + # output is not a python int, but a numpy int of expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] # no missing value marker for integers exp_val_for_array = None if expected_dtype != 'object' else np.nan From 076926e87cbeacacb7ea075031be07982da3d40d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 26 Jul 2019 12:58:44 +0200 Subject: [PATCH 16/27] format with black --- pandas/core/dtypes/cast.py | 124 ++++--- pandas/tests/dtypes/cast/test_promote.py | 444 +++++++++++++---------- 2 files changed, 328 insertions(+), 240 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5775614b90caa..89bcadf9c3398 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -76,7 +76,7 @@ def _is_iNaT(x): if not is_scalar(x): return False with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=DeprecationWarning) + warnings.filterwarnings("ignore", category=DeprecationWarning) result = x == iNaT return result @@ -355,8 +355,10 @@ def maybe_promote(dtype, fill_value=np.nan): return _maybe_promote_with_array(dtype, fill_value) else: fill_type = type(fill_value).__name__ - raise ValueError('fill_value must either be scalar, or a Series / ' - 'Index / np.ndarray; received {}'.format(fill_type)) + raise 
ValueError( + "fill_value must either be scalar, or a Series / " + "Index / np.ndarray; received {}".format(fill_type) + ) def _maybe_promote_with_scalar(dtype, fill_value=np.nan): @@ -410,8 +412,9 @@ def _maybe_promote_with_scalar(dtype, fill_value=np.nan): from pandas import Series if not (is_scalar(fill_value) or isinstance(fill_value, tuple)): - raise ValueError('fill_value must be a scalar, received ' - '{}'.format(type(fill_value))) + raise ValueError( + "fill_value must be a scalar, received " "{}".format(type(fill_value)) + ) # unify handling of scalar and array values to simplify actual # promotion logic in _maybe_promote_with_array; @@ -515,8 +518,10 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): fill_value = fill_value.ravel() elif not isinstance(fill_value, (ABCSeries, ABCIndexClass)): fill_type = type(fill_value).__name__ - raise ValueError('fill_value must either be a Series / Index / ' - 'np.ndarray, received {}'.format(fill_type)) + raise ValueError( + "fill_value must either be a Series / Index / " + "np.ndarray, received {}".format(fill_type) + ) if all(isna(x) or _is_iNaT(x) for x in fill_value): # only missing values (or no values at all) @@ -538,17 +543,20 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): # presence of pd.NaT upcasts everything that's not # datetime/timedelta (see above) to object dtype = np.dtype(object) - elif (is_integer_dtype(dtype) and dtype == 'uint64' - and all(x == iNaT for x in fill_value)): + elif ( + is_integer_dtype(dtype) + and dtype == "uint64" + and all(x == iNaT for x in fill_value) + ): # uint64 + negative int casts to object dtype = np.dtype(object) elif is_integer_dtype(dtype) and all(x == iNaT for x in fill_value): # integer + iNaT casts to int64 - dtype = np.dtype('int64') + dtype = np.dtype("int64") na_value = None elif is_integer_dtype(dtype): # integer + other missing value (np.nan / None) casts to float - dtype = np.dtype('float64') + dtype = np.dtype("float64") elif 
is_extension_array_dtype(dtype): na_value = dtype.na_value elif is_string_dtype(dtype) or dtype in (bool, bytes): @@ -564,21 +572,29 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): inferred_dtype = lib.infer_dtype(fill_value, skipna=True) # cases that would yield 'empty' have been treated in branch above - if inferred_dtype in ['period', 'interval', 'datetime64tz']: + if inferred_dtype in ["period", "interval", "datetime64tz"]: # TODO: handle & test pandas-dtypes # TODO: lib.infer_dtype does not support datetime64tz yet pass else: # rest can be mapped to numpy dtypes map_inferred_to_numpy = { - 'floating': float, 'mixed-integer-float': float, - 'decimal': float, 'integer': int, 'boolean': bool, - 'complex': complex, 'bytes': bytes, - 'datetime64': 'datetime64[ns]', 'datetime': 'datetime64[ns]', - 'date': 'datetime64[ns]', 'timedelta64': 'timedelta64[ns]', - 'timedelta': 'timedelta64[ns]', - 'time': object, # time cannot be cast to datetime/timedelta - 'string': object, 'mixed-integer': object, 'mixed': object + "floating": float, + "mixed-integer-float": float, + "decimal": float, + "integer": int, + "boolean": bool, + "complex": complex, + "bytes": bytes, + "datetime64": "datetime64[ns]", + "datetime": "datetime64[ns]", + "date": "datetime64[ns]", + "timedelta64": "timedelta64[ns]", + "timedelta": "timedelta64[ns]", + "time": object, # time cannot be cast to datetime/timedelta + "string": object, + "mixed-integer": object, + "mixed": object, } fill_dtype = np.dtype(map_inferred_to_numpy[inferred_dtype]) @@ -610,11 +626,11 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): # bits, which is used in the string identifier of the dtype; # if fill_max is above the max for that dtype, # we double the number of bytes/bits. 
- dtype = np.dtype('uint{}'.format(dtype.itemsize * 8 * 2)) + dtype = np.dtype("uint{}".format(dtype.itemsize * 8 * 2)) return dtype, None else: # cannot stay unsigned - if dtype == 'uint64': + if dtype == "uint64": # need to hold negative values, but int64 cannot hold # maximum of uint64 -> needs object return np.dtype(object), np.nan @@ -622,7 +638,7 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): # need to turn into signed integers to hold negative values # int8 cannot hold maximum of uint8; similar for 16/32 # therefore, upcast at least to next higher int-type - dtype = np.dtype('int{}'.format(dtype.itemsize * 8 * 2)) + dtype = np.dtype("int{}".format(dtype.itemsize * 8 * 2)) fill_max = fill_value.max() fill_min = fill_value.min() @@ -637,62 +653,74 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): if fill_max >= _int64_max + 1 or fill_min <= _int64_min - 1: return np.dtype(object), np.nan - while (fill_max > np.iinfo(dtype).max - or fill_min < np.iinfo(dtype).min): + while fill_max > np.iinfo(dtype).max or fill_min < np.iinfo(dtype).min: # same mechanism as above, but for int instead of uint - dtype = np.dtype('int{}'.format(dtype.itemsize * 8 * 2)) + dtype = np.dtype("int{}".format(dtype.itemsize * 8 * 2)) return dtype, None elif is_integer_dtype(dtype) and is_float_dtype(fill_dtype): # int with float: always upcasts to float64 - return np.dtype('float64'), np.nan + return np.dtype("float64"), np.nan elif is_integer_dtype(dtype) and is_complex_dtype(fill_dtype): # int with complex: always upcasts to complex128 - return np.dtype('complex128'), np.nan - elif ((is_float_dtype(dtype) or is_complex_dtype(dtype)) - and is_integer_dtype(fill_dtype)): + return np.dtype("complex128"), np.nan + elif (is_float_dtype(dtype) or is_complex_dtype(dtype)) and is_integer_dtype( + fill_dtype + ): # float/complex with int: always stays original float/complex dtype return dtype, np.nan elif is_float_dtype(dtype) and is_float_dtype(fill_dtype): # float with 
float; upcasts depending on absolute max of fill_value - if dtype == 'float32' and np.abs(fill_value).max() <= _float32_max: + if dtype == "float32" and np.abs(fill_value).max() <= _float32_max: return dtype, np.nan # all other cases return float64 - return np.dtype('float64'), np.nan - elif ((is_float_dtype(dtype) or is_complex_dtype(dtype)) - and (is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype))): + return np.dtype("float64"), np.nan + elif (is_float_dtype(dtype) or is_complex_dtype(dtype)) and ( + is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype) + ): # at least one is complex; otherwise we'd have hit float/float above - if (dtype in ['float32', 'complex64'] - and max(np.abs(fill_value.real).max(), # also works for float - np.abs(fill_value.imag).max()) <= _float32_max): + if ( + dtype in ["float32", "complex64"] + and max( + np.abs(fill_value.real).max(), # also works for float + np.abs(fill_value.imag).max(), + ) + <= _float32_max + ): return np.complex64, np.nan # all other cases return complex128 - return np.dtype('complex128'), np.nan + return np.dtype("complex128"), np.nan elif is_bool_dtype(dtype) and is_bool_dtype(fill_dtype): # bool with bool is the only combination that stays bool; any other # combination involving bool upcasts to object, see else-clause below return dtype, None - elif (issubclass(dtype.type, np.bytes_) - and issubclass(fill_dtype.type, np.bytes_)): + elif issubclass(dtype.type, np.bytes_) and issubclass(fill_dtype.type, np.bytes_): # bytes with bytes is the only combination that stays bytes; any other # combination involving bytes upcasts to object, see else-clause below return dtype, None - elif (is_datetime64tz_dtype(dtype) and is_datetime64tz_dtype(fill_dtype) - and (dtype.tz == fill_dtype.tz)): + elif ( + is_datetime64tz_dtype(dtype) + and is_datetime64tz_dtype(fill_dtype) + and (dtype.tz == fill_dtype.tz) + ): # datetimetz with datetimetz with the same timezone is the only # combination that stays datetimetz 
(in particular, mixing timezones or # tz-aware and tz-naive datetimes will cast to object); any other # combination involving datetimetz upcasts to object, see below return dtype, iNaT - elif ((is_timedelta64_dtype(dtype) and is_timedelta64_dtype(fill_dtype)) - or (is_datetime64_dtype(dtype) and is_datetime64_dtype(fill_dtype))): + elif (is_timedelta64_dtype(dtype) and is_timedelta64_dtype(fill_dtype)) or ( + is_datetime64_dtype(dtype) and is_datetime64_dtype(fill_dtype) + ): # datetime and timedelta try to cast; if successful, keep dtype, # otherwise upcast to object try: with warnings.catch_warnings(): - msg = ('parsing timezone aware datetimes is deprecated; ' - 'this will raise an error in the future') - warnings.filterwarnings('ignore', message=msg, - category=DeprecationWarning) + msg = ( + "parsing timezone aware datetimes is deprecated; " + "this will raise an error in the future" + ) + warnings.filterwarnings( + "ignore", message=msg, category=DeprecationWarning + ) fill_value.astype(dtype) na_value = iNaT except (ValueError, TypeError): diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 408c94eed098a..d5e7ffb3e33fb 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -11,7 +11,10 @@ from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import ( - _maybe_promote_with_array, _maybe_promote_with_scalar, maybe_promote) + _maybe_promote_with_array, + _maybe_promote_with_scalar, + maybe_promote, +) from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, @@ -160,9 +163,9 @@ def _check_promote( # for equal values, also check type (relevant e.g. for int vs float, resp. 
# for different datetimes and timedeltas) - match_value = (result_fill_value == expected_fill_value - and type(result_fill_value) == type(expected_fill_value) - ) + match_value = result_fill_value == expected_fill_value and type( + result_fill_value + ) == type(expected_fill_value) # for missing values, None == None and iNaT == iNaT (which is checked # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT @@ -173,110 +176,113 @@ def _check_promote( assert match_value or match_missing -@pytest.mark.parametrize('dtype, fill_value, expected_dtype', [ - # size 8 - ('int8', 1, 'int8'), - ('int8', np.iinfo('int8').max + 1, 'int16'), - ('int8', np.iinfo('int16').max + 1, 'int32'), - ('int8', np.iinfo('int32').max + 1, 'int64'), - ('int8', np.iinfo('int64').max + 1, 'object'), - ('int8', -1, 'int8'), - ('int8', np.iinfo('int8').min - 1, 'int16'), - ('int8', np.iinfo('int16').min - 1, 'int32'), - ('int8', np.iinfo('int32').min - 1, 'int64'), - ('int8', np.iinfo('int64').min - 1, 'object'), - # keep signed-ness as long as possible - ('uint8', 1, 'uint8'), - ('uint8', np.iinfo('int8').max + 1, 'uint8'), - ('uint8', np.iinfo('uint8').max + 1, 'uint16'), - ('uint8', np.iinfo('int16').max + 1, 'uint16'), - ('uint8', np.iinfo('uint16').max + 1, 'uint32'), - ('uint8', np.iinfo('int32').max + 1, 'uint32'), - ('uint8', np.iinfo('uint32').max + 1, 'uint64'), - ('uint8', np.iinfo('int64').max + 1, 'uint64'), - ('uint8', np.iinfo('uint64').max + 1, 'object'), - # max of uint8 cannot be contained in int8 - ('uint8', -1, 'int16'), - ('uint8', np.iinfo('int8').min - 1, 'int16'), - ('uint8', np.iinfo('int16').min - 1, 'int32'), - ('uint8', np.iinfo('int32').min - 1, 'int64'), - ('uint8', np.iinfo('int64').min - 1, 'object'), - # size 16 - ('int16', 1, 'int16'), - ('int16', np.iinfo('int8').max + 1, 'int16'), - ('int16', np.iinfo('int16').max + 1, 'int32'), - ('int16', np.iinfo('int32').max + 1, 'int64'), - ('int16', np.iinfo('int64').max + 1, 'object'), - ('int16', -1, 
'int16'), - ('int16', np.iinfo('int8').min - 1, 'int16'), - ('int16', np.iinfo('int16').min - 1, 'int32'), - ('int16', np.iinfo('int32').min - 1, 'int64'), - ('int16', np.iinfo('int64').min - 1, 'object'), - ('uint16', 1, 'uint16'), - ('uint16', np.iinfo('int8').max + 1, 'uint16'), - ('uint16', np.iinfo('uint8').max + 1, 'uint16'), - ('uint16', np.iinfo('int16').max + 1, 'uint16'), - ('uint16', np.iinfo('uint16').max + 1, 'uint32'), - ('uint16', np.iinfo('int32').max + 1, 'uint32'), - ('uint16', np.iinfo('uint32').max + 1, 'uint64'), - ('uint16', np.iinfo('int64').max + 1, 'uint64'), - ('uint16', np.iinfo('uint64').max + 1, 'object'), - ('uint16', -1, 'int32'), - ('uint16', np.iinfo('int8').min - 1, 'int32'), - ('uint16', np.iinfo('int16').min - 1, 'int32'), - ('uint16', np.iinfo('int32').min - 1, 'int64'), - ('uint16', np.iinfo('int64').min - 1, 'object'), - # size 32 - ('int32', 1, 'int32'), - ('int32', np.iinfo('int8').max + 1, 'int32'), - ('int32', np.iinfo('int16').max + 1, 'int32'), - ('int32', np.iinfo('int32').max + 1, 'int64'), - ('int32', np.iinfo('int64').max + 1, 'object'), - ('int32', -1, 'int32'), - ('int32', np.iinfo('int8').min - 1, 'int32'), - ('int32', np.iinfo('int16').min - 1, 'int32'), - ('int32', np.iinfo('int32').min - 1, 'int64'), - ('int32', np.iinfo('int64').min - 1, 'object'), - ('uint32', 1, 'uint32'), - ('uint32', np.iinfo('int8').max + 1, 'uint32'), - ('uint32', np.iinfo('uint8').max + 1, 'uint32'), - ('uint32', np.iinfo('int16').max + 1, 'uint32'), - ('uint32', np.iinfo('uint16').max + 1, 'uint32'), - ('uint32', np.iinfo('int32').max + 1, 'uint32'), - ('uint32', np.iinfo('uint32').max + 1, 'uint64'), - ('uint32', np.iinfo('int64').max + 1, 'uint64'), - ('uint32', np.iinfo('uint64').max + 1, 'object'), - ('uint32', -1, 'int64'), - ('uint32', np.iinfo('int8').min - 1, 'int64'), - ('uint32', np.iinfo('int16').min - 1, 'int64'), - ('uint32', np.iinfo('int32').min - 1, 'int64'), - ('uint32', np.iinfo('int64').min - 1, 'object'), - # size 
64 - ('int64', 1, 'int64'), - ('int64', np.iinfo('int8').max + 1, 'int64'), - ('int64', np.iinfo('int16').max + 1, 'int64'), - ('int64', np.iinfo('int32').max + 1, 'int64'), - ('int64', np.iinfo('int64').max + 1, 'object'), - ('int64', -1, 'int64'), - ('int64', np.iinfo('int8').min - 1, 'int64'), - ('int64', np.iinfo('int16').min - 1, 'int64'), - ('int64', np.iinfo('int32').min - 1, 'int64'), - ('int64', np.iinfo('int64').min - 1, 'object'), - ('uint64', 1, 'uint64'), - ('uint64', np.iinfo('int8').max + 1, 'uint64'), - ('uint64', np.iinfo('uint8').max + 1, 'uint64'), - ('uint64', np.iinfo('int16').max + 1, 'uint64'), - ('uint64', np.iinfo('uint16').max + 1, 'uint64'), - ('uint64', np.iinfo('int32').max + 1, 'uint64'), - ('uint64', np.iinfo('uint32').max + 1, 'uint64'), - ('uint64', np.iinfo('int64').max + 1, 'uint64'), - ('uint64', np.iinfo('uint64').max + 1, 'object'), - ('uint64', -1, 'object'), - ('uint64', np.iinfo('int8').min - 1, 'object'), - ('uint64', np.iinfo('int16').min - 1, 'object'), - ('uint64', np.iinfo('int32').min - 1, 'object'), - ('uint64', np.iinfo('int64').min - 1, 'object') -]) +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # size 8 + ("int8", 1, "int8"), + ("int8", np.iinfo("int8").max + 1, "int16"), + ("int8", np.iinfo("int16").max + 1, "int32"), + ("int8", np.iinfo("int32").max + 1, "int64"), + ("int8", np.iinfo("int64").max + 1, "object"), + ("int8", -1, "int8"), + ("int8", np.iinfo("int8").min - 1, "int16"), + ("int8", np.iinfo("int16").min - 1, "int32"), + ("int8", np.iinfo("int32").min - 1, "int64"), + ("int8", np.iinfo("int64").min - 1, "object"), + # keep signed-ness as long as possible + ("uint8", 1, "uint8"), + ("uint8", np.iinfo("int8").max + 1, "uint8"), + ("uint8", np.iinfo("uint8").max + 1, "uint16"), + ("uint8", np.iinfo("int16").max + 1, "uint16"), + ("uint8", np.iinfo("uint16").max + 1, "uint32"), + ("uint8", np.iinfo("int32").max + 1, "uint32"), + ("uint8", np.iinfo("uint32").max + 1, "uint64"), + 
("uint8", np.iinfo("int64").max + 1, "uint64"), + ("uint8", np.iinfo("uint64").max + 1, "object"), + # max of uint8 cannot be contained in int8 + ("uint8", -1, "int16"), + ("uint8", np.iinfo("int8").min - 1, "int16"), + ("uint8", np.iinfo("int16").min - 1, "int32"), + ("uint8", np.iinfo("int32").min - 1, "int64"), + ("uint8", np.iinfo("int64").min - 1, "object"), + # size 16 + ("int16", 1, "int16"), + ("int16", np.iinfo("int8").max + 1, "int16"), + ("int16", np.iinfo("int16").max + 1, "int32"), + ("int16", np.iinfo("int32").max + 1, "int64"), + ("int16", np.iinfo("int64").max + 1, "object"), + ("int16", -1, "int16"), + ("int16", np.iinfo("int8").min - 1, "int16"), + ("int16", np.iinfo("int16").min - 1, "int32"), + ("int16", np.iinfo("int32").min - 1, "int64"), + ("int16", np.iinfo("int64").min - 1, "object"), + ("uint16", 1, "uint16"), + ("uint16", np.iinfo("int8").max + 1, "uint16"), + ("uint16", np.iinfo("uint8").max + 1, "uint16"), + ("uint16", np.iinfo("int16").max + 1, "uint16"), + ("uint16", np.iinfo("uint16").max + 1, "uint32"), + ("uint16", np.iinfo("int32").max + 1, "uint32"), + ("uint16", np.iinfo("uint32").max + 1, "uint64"), + ("uint16", np.iinfo("int64").max + 1, "uint64"), + ("uint16", np.iinfo("uint64").max + 1, "object"), + ("uint16", -1, "int32"), + ("uint16", np.iinfo("int8").min - 1, "int32"), + ("uint16", np.iinfo("int16").min - 1, "int32"), + ("uint16", np.iinfo("int32").min - 1, "int64"), + ("uint16", np.iinfo("int64").min - 1, "object"), + # size 32 + ("int32", 1, "int32"), + ("int32", np.iinfo("int8").max + 1, "int32"), + ("int32", np.iinfo("int16").max + 1, "int32"), + ("int32", np.iinfo("int32").max + 1, "int64"), + ("int32", np.iinfo("int64").max + 1, "object"), + ("int32", -1, "int32"), + ("int32", np.iinfo("int8").min - 1, "int32"), + ("int32", np.iinfo("int16").min - 1, "int32"), + ("int32", np.iinfo("int32").min - 1, "int64"), + ("int32", np.iinfo("int64").min - 1, "object"), + ("uint32", 1, "uint32"), + ("uint32", 
np.iinfo("int8").max + 1, "uint32"), + ("uint32", np.iinfo("uint8").max + 1, "uint32"), + ("uint32", np.iinfo("int16").max + 1, "uint32"), + ("uint32", np.iinfo("uint16").max + 1, "uint32"), + ("uint32", np.iinfo("int32").max + 1, "uint32"), + ("uint32", np.iinfo("uint32").max + 1, "uint64"), + ("uint32", np.iinfo("int64").max + 1, "uint64"), + ("uint32", np.iinfo("uint64").max + 1, "object"), + ("uint32", -1, "int64"), + ("uint32", np.iinfo("int8").min - 1, "int64"), + ("uint32", np.iinfo("int16").min - 1, "int64"), + ("uint32", np.iinfo("int32").min - 1, "int64"), + ("uint32", np.iinfo("int64").min - 1, "object"), + # size 64 + ("int64", 1, "int64"), + ("int64", np.iinfo("int8").max + 1, "int64"), + ("int64", np.iinfo("int16").max + 1, "int64"), + ("int64", np.iinfo("int32").max + 1, "int64"), + ("int64", np.iinfo("int64").max + 1, "object"), + ("int64", -1, "int64"), + ("int64", np.iinfo("int8").min - 1, "int64"), + ("int64", np.iinfo("int16").min - 1, "int64"), + ("int64", np.iinfo("int32").min - 1, "int64"), + ("int64", np.iinfo("int64").min - 1, "object"), + ("uint64", 1, "uint64"), + ("uint64", np.iinfo("int8").max + 1, "uint64"), + ("uint64", np.iinfo("uint8").max + 1, "uint64"), + ("uint64", np.iinfo("int16").max + 1, "uint64"), + ("uint64", np.iinfo("uint16").max + 1, "uint64"), + ("uint64", np.iinfo("int32").max + 1, "uint64"), + ("uint64", np.iinfo("uint32").max + 1, "uint64"), + ("uint64", np.iinfo("int64").max + 1, "uint64"), + ("uint64", np.iinfo("uint64").max + 1, "object"), + ("uint64", -1, "object"), + ("uint64", np.iinfo("int8").min - 1, "object"), + ("uint64", np.iinfo("int16").min - 1, "object"), + ("uint64", np.iinfo("int32").min - 1, "object"), + ("uint64", np.iinfo("int64").min - 1, "object"), + ], +) def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, box): dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) @@ -285,10 +291,17 @@ def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, box): # 
output is not a python int, but a numpy int of expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] # no missing value marker for integers - exp_val_for_array = None if expected_dtype != 'object' else np.nan + exp_val_for_array = None if expected_dtype != "object" else np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): @@ -343,30 +356,32 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): ) -@pytest.mark.parametrize('dtype, fill_value, expected_dtype', [ - # float filled with float - ('float32', 1, 'float32'), - ('float32', np.finfo('float32').max * 1.1, 'float64'), - ('float64', 1, 'float64'), - ('float64', np.finfo('float32').max * 1.1, 'float64'), - # complex filled with float - ('complex64', 1, 'complex64'), - ('complex64', np.finfo('float32').max * 1.1, 'complex128'), - ('complex128', 1, 'complex128'), - ('complex128', np.finfo('float32').max * 1.1, 'complex128'), - # float filled with complex - ('float32', 1 + 1j, 'complex64'), - ('float32', np.finfo('float32').max * (1.1 + 1j), 'complex128'), - ('float64', 1 + 1j, 'complex128'), - ('float64', np.finfo('float32').max * (1.1 + 1j), 'complex128'), - # complex filled with complex - ('complex64', 1 + 1j, 'complex64'), - ('complex64', np.finfo('float32').max * (1.1 + 1j), 'complex128'), - ('complex128', 1 + 1j, 'complex128'), - ('complex128', np.finfo('float32').max * (1.1 + 1j), 'complex128') -]) -def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, - box): +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # float filled with float + ("float32", 1, "float32"), + ("float32", np.finfo("float32").max * 1.1, "float64"), + ("float64", 1, "float64"), + 
("float64", np.finfo("float32").max * 1.1, "float64"), + # complex filled with float + ("complex64", 1, "complex64"), + ("complex64", np.finfo("float32").max * 1.1, "complex128"), + ("complex128", 1, "complex128"), + ("complex128", np.finfo("float32").max * 1.1, "complex128"), + # float filled with complex + ("float32", 1 + 1j, "complex64"), + ("float32", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float64", 1 + 1j, "complex128"), + ("float64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + # complex filled with complex + ("complex64", 1 + 1j, "complex64"), + ("complex64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex128", 1 + 1j, "complex128"), + ("complex128", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ], +) +def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, box): dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) @@ -376,8 +391,15 @@ def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): @@ -426,8 +448,7 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): ) -def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, - box): +def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box): dtype = np.dtype(bytes_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture @@ -436,48 +457,65 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, fill_value = np.array([1], dtype=fill_dtype)[0] # filling bytes with anything but bytes 
casts to object - expected_dtype = (dtype if issubclass(fill_dtype.type, np.bytes_) - else np.dtype(object)) + expected_dtype = ( + dtype if issubclass(fill_dtype.type, np.bytes_) else np.dtype(object) + ) exp_val_for_scalar = fill_value - exp_val_for_array = (None if issubclass(fill_dtype.type, np.bytes_) - else np.nan) + exp_val_for_array = None if issubclass(fill_dtype.type, np.bytes_) else np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization of box to add special case for bytes -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - (True, 'bytes'), # fill_value in array with generic bytes dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -], ids=['True-None', 'True-bytes', 'True-object', 'False-None']) -def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, - box): +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + (True, "bytes"), # fill_value in array with generic bytes dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", "True-bytes", "True-object", "False-None"], +) +def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(bytes_dtype) boxed, box_dtype = box # read from parametrized fixture # create array of given dtype - fill_value = b'abc' + fill_value = b"abc" # special case for box_dtype (cannot use fixture in parametrization) - box_dtype = fill_dtype if box_dtype == 'bytes' else box_dtype + box_dtype = fill_dtype if box_dtype == "bytes" else box_dtype # filling 
bytes with anything but bytes casts to object - expected_dtype = (dtype if issubclass(dtype.type, np.bytes_) - else np.dtype(object)) + expected_dtype = dtype if issubclass(dtype.type, np.bytes_) else np.dtype(object) # output is not a generic bytes, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = None if issubclass(dtype.type, np.bytes_) else np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def test_maybe_promote_datetime64_with_any(datetime64_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_datetime64_with_any( + datetime64_dtype, any_numpy_dtype_reduced, box +): dtype = np.dtype(datetime64_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture @@ -496,17 +534,28 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -], ids=['True-None', 'True-dt_dtype', 'True-object', 'False-None']) +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + (True, "dt_dtype"), # fill_value in array with explicit datetime dtype + (True, object), 
# fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", "True-dt_dtype", "True-object", "False-None"], +) @pytest.mark.parametrize( "fill_value", [ @@ -574,8 +623,9 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( ) -def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, - tz_aware_fixture2, box): +def test_maybe_promote_datetimetz_with_datetimetz( + tz_aware_fixture, tz_aware_fixture2, box +): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) boxed, box_dtype = box # read from parametrized fixture @@ -583,12 +633,12 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, from dateutil.tz import tzlocal if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") if dtype.tz == fill_dtype.tz: # here we should keep the datetime64tz dtype, but since that cannot be # inferred correctly for fill_value, the calling dtype ends up being # compared to a tz-naive datetime64-dtype, and must therefore upcast - pytest.xfail('cannot infer datetime64tz dtype, see GH 23554') + pytest.xfail("cannot infer datetime64tz dtype, see GH 23554") # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -657,7 +707,7 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( if is_datetime64_dtype(dtype): # fill_dtype does not get inferred correctly to datetime64tz but to # datetime64, which then falsely matches with datetime64 dtypes. 
- pytest.xfail('cannot infer datetime64tz dtype, see GH 23554') + pytest.xfail("cannot infer datetime64tz dtype, see GH 23554") fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] @@ -677,8 +727,9 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( ) -def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_timedelta64_with_any( + timedelta64_dtype, any_numpy_dtype_reduced, box +): dtype = np.dtype(timedelta64_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture @@ -697,8 +748,15 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) @pytest.mark.parametrize( @@ -711,10 +769,11 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, "box", [ (True, None), # fill_value wrapped in array with default dtype - (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype + (True, "td_dtype"), # fill_value in array with explicit timedelta dtype (True, object), # fill_value wrapped in array with object dtype (False, None), # fill_value passed on as scalar - ], ids=['True-None', 'True-td_dtype', 'True-object', 'False-None'] + ], + ids=["True-None", "True-td_dtype", "True-object", "False-None"], ) def test_maybe_promote_any_with_timedelta64( any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box @@ -776,11 +835,12 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, bo "box", [ # disabled due to too many xfails; see GH 23982 / 25425 - (True, None), # fill_value wrapped in array with default dtype - (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, None), # 
fill_value wrapped in array with default dtype + (True, "str"), # fill_value wrapped in array with generic string-dtype (True, object), # fill_value wrapped in array with object dtype (False, None), # fill_value passed on as scalar - ], ids=['True-None', 'True-str', 'True-object', 'False-None'] + ], + ids=["True-None", "True-str", "True-object", "False-None"], ) def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) @@ -862,11 +922,12 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, bo ) # override parametrization of box, because default dtype for na is always float @pytest.mark.parametrize( - 'box', + "box", [ (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar - ], ids=['True-object', 'False-None'] + (False, None), # fill_value passed on as scalar + ], + ids=["True-object", "False-None"], ) def test_maybe_promote_any_numpy_dtype_with_na( any_numpy_dtype_reduced, fill_value, box @@ -874,7 +935,7 @@ def test_maybe_promote_any_numpy_dtype_with_na( dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT: # uint64 + negative int casts to object; iNaT is considered as missing expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan @@ -953,8 +1014,7 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): ) # same again for _maybe_promote_with_array (for coverage) - result_dtype, result_missing_value = _maybe_promote_with_array( - dtype, fill_array) + result_dtype, result_missing_value = _maybe_promote_with_array(dtype, fill_array) assert result_dtype == expected_dtype # None == None, iNaT == iNaT, but np.nan != np.nan @@ -964,17 +1024,17 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): 
def test_maybe_promote_raises(any_numpy_dtype): - msg = 'fill_value must either be scalar, or a Series / Index / np.ndarra.*' + msg = "fill_value must either be scalar, or a Series / Index / np.ndarra.*" with pytest.raises(ValueError, match=msg): # something that's neither scalar, nor Series / Index / np.ndarray maybe_promote(any_numpy_dtype, [1, 2, 3]) - msg = 'fill_value must either be a Series / Index / np.ndarray, received.*' + msg = "fill_value must either be a Series / Index / np.ndarray, received.*" with pytest.raises(ValueError, match=msg): # something that's not a Series / Index / np.ndarray _maybe_promote_with_array(any_numpy_dtype, 1) - msg = 'fill_value must be a scalar, received .*' + msg = "fill_value must be a scalar, received .*" with pytest.raises(ValueError, match=msg): # something that's not scalar _maybe_promote_with_scalar(any_numpy_dtype, pd.Series([1, 2, 3])) From 06e22a3110f015d34b71d3d97e47352ead009901 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 26 Jul 2019 13:41:44 +0200 Subject: [PATCH 17/27] remove usage of deprecated .real/.imag --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 89bcadf9c3398..c6eabe1f41fd7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -681,8 +681,8 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): if ( dtype in ["float32", "complex64"] and max( - np.abs(fill_value.real).max(), # also works for float - np.abs(fill_value.imag).max(), + np.abs(np.real(fill_value)).max(), # also works for float + np.abs(np.imag(fill_value)).max(), ) <= _float32_max ): From 0f8f064689905ee0ec0956ba26867807e6ac9913 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 26 Jul 2019 15:46:31 +0200 Subject: [PATCH 18/27] lint --- pandas/tests/dtypes/cast/test_promote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index d5e7ffb3e33fb..ddd739e4b0f75 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -23,7 +23,6 @@ is_integer_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype From 98c584eab10fb0cbde92b494efe50fcf3d170662 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 26 Jul 2019 15:47:47 +0200 Subject: [PATCH 19/27] work around GH 27610 --- pandas/core/dtypes/cast.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c6eabe1f41fd7..ffc2fcf3a0f03 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -678,17 +678,20 @@ def _maybe_promote_with_array(dtype, fill_value=np.nan): is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype) ): # at least one is complex; otherwise we'd have hit float/float above - if ( - dtype in ["float32", "complex64"] - and max( - np.abs(np.real(fill_value)).max(), # also works for float - np.abs(np.imag(fill_value)).max(), - ) - <= _float32_max - ): - return np.complex64, np.nan - # all other cases return complex128 - return np.dtype("complex128"), np.nan + with warnings.catch_warnings(): + # work around GH 27610 + warnings.filterwarnings("ignore", category=FutureWarning) + if ( + dtype in ["float32", "complex64"] + and max( + np.abs(np.real(fill_value)).max(), # also works for float + np.abs(np.imag(fill_value)).max(), + ) + <= _float32_max + ): + return np.complex64, np.nan + # all other cases return complex128 + return np.dtype("complex128"), np.nan elif is_bool_dtype(dtype) and is_bool_dtype(fill_dtype): # bool 
with bool is the only combination that stays bool; any other # combination involving bool upcasts to object, see else-clause below From 32d8a2b7eeb811fa785aac42c4afa5db4a900b98 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 21 Sep 2019 11:21:00 +0200 Subject: [PATCH 20/27] Review (jbrockmendel) --- pandas/core/dtypes/cast.py | 7 ++----- pandas/tests/dtypes/cast/test_promote.py | 8 +++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ac0f6f424cd3d..105147b02a772 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -74,12 +74,9 @@ def _is_iNaT(x): Specifically, comparing a scalar timedelta against another scalar value may raise a spurious DeprecationWarning, see numpy/numpy#10095 """ - if not is_scalar(x): + if not is_integer(x): return False - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - result = x == iNaT - return result + return x == iNaT def maybe_convert_platform(values): diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index ddd739e4b0f75..98fdcdb3ade5d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -162,9 +162,11 @@ def _check_promote( # for equal values, also check type (relevant e.g. for int vs float, resp. # for different datetimes and timedeltas) - match_value = result_fill_value == expected_fill_value and type( - result_fill_value - ) == type(expected_fill_value) + result_type = type(result_fill_value) + expected_type = type(expected_fill_value) + match_value = ( + result_fill_value == expected_fill_value and result_type == expected_type + ) # for missing values, None == None and iNaT == iNaT (which is checked # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT From 2a8691a40f774323fe3aa4f2ff558531c202570e Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 21 Sep 2019 13:10:58 +0200 Subject: [PATCH 21/27] minor comment improvements/cleanups --- pandas/core/dtypes/cast.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 105147b02a772..fa3c2cae25f86 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -452,20 +452,24 @@ def _maybe_promote_with_scalar(dtype, fill_value=np.nan): "fill_value must be a scalar, received " "{}".format(type(fill_value)) ) - # unify handling of scalar and array values to simplify actual - # promotion logic in _maybe_promote_with_array; + # shortcut: inserting into object does not cast (except for None -> np.nan) if is_object_dtype(dtype) and fill_value is not None: - # inserting into object does not cast (except for None -> np.nan) return np.dtype(object), fill_value + # unify handling of scalar and array values to simplify actual promotion + # logic in _maybe_promote_with_array; therefore, we pack fill_value into an + # array, use the array-promotion, and then handle the differences between + # the array/scalar case. + # use Series to construct, since np.array cannot deal with pandas-internal # dtypes (e.g. DatetimeTZDtype); furthermore, we want to treat tuples as # scalar, but numpy casts those to a new dimension fill_array = Series([fill_value], dtype=object) dtype, na_value = _maybe_promote_with_array(dtype, fill_array) - # _maybe_promote_with_array returns the na-marker for the new dtype; - # _maybe_promote_with_scalar always casts fill_value to the new dtype + # main difference between the array and scalar case: + # _maybe_promote_with_array returns the na-marker for the new dtype, but + # _maybe_promote_with_scalar always casts fill_value to the new dtype! 
if is_integer_dtype(dtype) and _is_iNaT(fill_value): # _maybe_promote_with_array considers iNaT a missing value, and since # int dtypes cannot hold missing values, that method returns None as @@ -480,7 +484,7 @@ def _maybe_promote_with_scalar(dtype, fill_value=np.nan): # cast missing values (incl. iNaT) to correct missing value marker for # the updated dtype fill_value = na_value - # otherwise casts fill_value (= only entry of fill_array) to new dtype + # otherwise cast fill_value (= only entry of fill_array) to new dtype elif is_datetime_or_timedelta_dtype(dtype): # for datetime/timedelta, we need to return the underlying ints fill_value = fill_array.astype(dtype)[0].value From d5aa77b487b44d3b3d195a4c61716314a0d7c386 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 22 Sep 2019 11:11:55 +0200 Subject: [PATCH 22/27] exploration: check if array-case is required --- pandas/core/dtypes/cast.py | 2 - pandas/tests/dtypes/cast/test_promote.py | 93 ++---------------------- 2 files changed, 6 insertions(+), 89 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fa3c2cae25f86..d627f61c5ed95 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -387,8 +387,6 @@ def maybe_promote(dtype, fill_value=np.nan): """ if is_scalar(fill_value) or isinstance(fill_value, tuple): return _maybe_promote_with_scalar(dtype, fill_value) - elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): - return _maybe_promote_with_array(dtype, fill_value) else: fill_type = type(fill_value).__name__ raise ValueError( diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 98fdcdb3ade5d..626c06b506c39 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -67,10 +67,7 @@ def any_numpy_dtype_reduced(request): return request.param -@pytest.fixture( - params=[(True, None), (True, object), (False, None)], - ids=["True-None", 
"True-object", "False-None"], -) +@pytest.fixture(params=[(False, None)], ids=["False-None"]) def box(request): """ Parametrized fixture determining whether/how to transform fill_value. @@ -476,16 +473,7 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) # override parametrization of box to add special case for bytes -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with default dtype - (True, "bytes"), # fill_value in array with generic bytes dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], - ids=["True-None", "True-bytes", "True-object", "False-None"], -) +@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(bytes_dtype) @@ -547,16 +535,7 @@ def test_maybe_promote_datetime64_with_any( # override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with default dtype - (True, "dt_dtype"), # fill_value in array with explicit datetime dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], - ids=["True-None", "True-dt_dtype", "True-object", "False-None"], -) +@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar @pytest.mark.parametrize( "fill_value", [ @@ -766,16 +745,7 @@ def test_maybe_promote_timedelta64_with_any( ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], ) # override parametrization of box to add special case for td_dtype -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with default dtype - (True, "td_dtype"), # fill_value in array with explicit timedelta dtype - (True, object), # fill_value wrapped in 
array with object dtype - (False, None), # fill_value passed on as scalar - ], - ids=["True-None", "True-td_dtype", "True-object", "False-None"], -) +@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar def test_maybe_promote_any_with_timedelta64( any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box ): @@ -832,17 +802,7 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, bo # override parametrization of box to add special case for str -@pytest.mark.parametrize( - "box", - [ - # disabled due to too many xfails; see GH 23982 / 25425 - (True, None), # fill_value wrapped in array with default dtype - (True, "str"), # fill_value wrapped in array with generic string-dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], - ids=["True-None", "True-str", "True-object", "False-None"], -) +@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(string_dtype) @@ -922,14 +882,7 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, bo "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] ) # override parametrization of box, because default dtype for na is always float -@pytest.mark.parametrize( - "box", - [ - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], - ids=["True-object", "False-None"], -) +@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar def test_maybe_promote_any_numpy_dtype_with_na( any_numpy_dtype_reduced, fill_value, box ): @@ -990,40 +943,6 @@ def test_maybe_promote_any_numpy_dtype_with_na( ) -@pytest.mark.parametrize("dim", [0, 2, 3]) -def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): - 
dtype = np.dtype(any_numpy_dtype_reduced) - - # create 0-dim array of given dtype; casts "1" to correct dtype - fill_array = np.array(1, dtype=dtype) - - # expand to desired dimension: - for _ in range(dim): - fill_array = np.expand_dims(fill_array, 0) - - # test against 1-dimensional case - expected_dtype, expected_missing_value = maybe_promote( - dtype, np.array([1], dtype=dtype) - ) - - result_dtype, result_missing_value = maybe_promote(dtype, fill_array) - - assert result_dtype == expected_dtype - # None == None, iNaT == iNaT, but np.nan != np.nan - assert (result_missing_value == expected_missing_value) or ( - result_missing_value is np.nan and expected_missing_value is np.nan - ) - - # same again for _maybe_promote_with_array (for coverage) - result_dtype, result_missing_value = _maybe_promote_with_array(dtype, fill_array) - - assert result_dtype == expected_dtype - # None == None, iNaT == iNaT, but np.nan != np.nan - assert (result_missing_value == expected_missing_value) or ( - result_missing_value is np.nan and expected_missing_value is np.nan - ) - - def test_maybe_promote_raises(any_numpy_dtype): msg = "fill_value must either be scalar, or a Series / Index / np.ndarra.*" with pytest.raises(ValueError, match=msg): From fa347b68c6a06ff9680fc08f5c0ceb1f112ab23c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 22 Sep 2019 22:19:02 +0200 Subject: [PATCH 23/27] Revert "exploration: check if array-case is required" This reverts commit d5aa77b487b44d3b3d195a4c61716314a0d7c386. 
--- pandas/core/dtypes/cast.py | 2 + pandas/tests/dtypes/cast/test_promote.py | 93 ++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d627f61c5ed95..fa3c2cae25f86 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -387,6 +387,8 @@ def maybe_promote(dtype, fill_value=np.nan): """ if is_scalar(fill_value) or isinstance(fill_value, tuple): return _maybe_promote_with_scalar(dtype, fill_value) + elif isinstance(fill_value, (np.ndarray, ABCSeries, ABCIndexClass)): + return _maybe_promote_with_array(dtype, fill_value) else: fill_type = type(fill_value).__name__ raise ValueError( diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 626c06b506c39..98fdcdb3ade5d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -67,7 +67,10 @@ def any_numpy_dtype_reduced(request): return request.param -@pytest.fixture(params=[(False, None)], ids=["False-None"]) +@pytest.fixture( + params=[(True, None), (True, object), (False, None)], + ids=["True-None", "True-object", "False-None"], +) def box(request): """ Parametrized fixture determining whether/how to transform fill_value. 
@@ -473,7 +476,16 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) # override parametrization of box to add special case for bytes -@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + (True, "bytes"), # fill_value in array with generic bytes dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", "True-bytes", "True-object", "False-None"], +) def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(bytes_dtype) @@ -535,7 +547,16 @@ def test_maybe_promote_datetime64_with_any( # override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + (True, "dt_dtype"), # fill_value in array with explicit datetime dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", "True-dt_dtype", "True-object", "False-None"], +) @pytest.mark.parametrize( "fill_value", [ @@ -745,7 +766,16 @@ def test_maybe_promote_timedelta64_with_any( ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], ) # override parametrization of box to add special case for td_dtype -@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + (True, "td_dtype"), # fill_value in array with explicit timedelta dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", 
"True-td_dtype", "True-object", "False-None"], +) def test_maybe_promote_any_with_timedelta64( any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box ): @@ -802,7 +832,17 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, bo # override parametrization of box to add special case for str -@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar +@pytest.mark.parametrize( + "box", + [ + # disabled due to too many xfails; see GH 23982 / 25425 + (True, None), # fill_value wrapped in array with default dtype + (True, "str"), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-None", "True-str", "True-object", "False-None"], +) def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(string_dtype) @@ -882,7 +922,14 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, bo "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] ) # override parametrization of box, because default dtype for na is always float -@pytest.mark.parametrize("box", [(False, None)]) # fill_value passed on as scalar +@pytest.mark.parametrize( + "box", + [ + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], + ids=["True-object", "False-None"], +) def test_maybe_promote_any_numpy_dtype_with_na( any_numpy_dtype_reduced, fill_value, box ): @@ -943,6 +990,40 @@ def test_maybe_promote_any_numpy_dtype_with_na( ) +@pytest.mark.parametrize("dim", [0, 2, 3]) +def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create 0-dim array of given dtype; casts "1" to correct dtype + fill_array = np.array(1, dtype=dtype) + + # expand to desired dimension: 
+ for _ in range(dim): + fill_array = np.expand_dims(fill_array, 0) + + # test against 1-dimensional case + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype) + ) + + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert (result_missing_value == expected_missing_value) or ( + result_missing_value is np.nan and expected_missing_value is np.nan + ) + + # same again for _maybe_promote_with_array (for coverage) + result_dtype, result_missing_value = _maybe_promote_with_array(dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert (result_missing_value == expected_missing_value) or ( + result_missing_value is np.nan and expected_missing_value is np.nan + ) + + def test_maybe_promote_raises(any_numpy_dtype): msg = "fill_value must either be scalar, or a Series / Index / np.ndarra.*" with pytest.raises(ValueError, match=msg): From 82ec973a1d41290449c51af542ca702502249771 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Tue, 29 Oct 2019 00:12:52 +0100 Subject: [PATCH 24/27] adapt array-path to new test behaviour --- pandas/core/dtypes/cast.py | 50 +++++++----------------- pandas/tests/dtypes/cast/test_promote.py | 45 ++++++++------------- 2 files changed, 31 insertions(+), 64 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 839592e59882b..0a3e5115bee98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -66,18 +66,6 @@ _float32_max = np.finfo(np.float32).max -def _is_iNaT(x): - """ - Helper function to circumvent numpy bug for timedeltas - - Specifically, comparing a scalar timedelta against another scalar value may - raise a spurious DeprecationWarning, see numpy/numpy#10095 - """ - if not is_integer(x): - return False - return x == iNaT - - def maybe_convert_platform(values): """ try to do platform conversion, allow ndarray or list here """ @@ -585,13 +573,14 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): "np.ndarray, received {}".format(fill_type) ) - if all(isna(x) or _is_iNaT(x) for x in fill_value): + if all(isna(x) for x in fill_value): # only missing values (or no values at all) - if is_datetime_or_timedelta_dtype(dtype): - return dtype, iNaT + if is_datetime64_dtype(dtype): + return dtype, np.datetime64("NaT", "ns") + elif is_timedelta64_dtype(dtype): + return dtype, np.timedelta64("NaT", "ns") elif is_datetime64tz_dtype(dtype): - # DatetimeTZDtype does not use iNaT as missing value marker return dtype, NaT na_value = np.nan @@ -605,17 +594,6 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # presence of pd.NaT upcasts everything that's not # datetime/timedelta (see above) to object dtype = np.dtype(object) - elif ( - is_integer_dtype(dtype) - and dtype == "uint64" - and all(x == iNaT for x in fill_value) - ): - # uint64 + negative int casts to object - dtype = np.dtype(object) - elif is_integer_dtype(dtype) and all(x == iNaT for x in fill_value): - # integer + iNaT casts 
to int64 - dtype = np.dtype("int64") - na_value = None elif is_integer_dtype(dtype): # integer + other missing value (np.nan / None) casts to float dtype = np.dtype("float64") @@ -667,7 +645,6 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # * float vs float # * float vs complex (and vice versa) # * bool - # * bytes # * datetimetz # * datetime # * timedelta @@ -758,10 +735,6 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # bool with bool is the only combination that stays bool; any other # combination involving bool upcasts to object, see else-clause below return dtype, None - elif issubclass(dtype.type, np.bytes_) and issubclass(fill_dtype.type, np.bytes_): - # bytes with bytes is the only combination that stays bytes; any other - # combination involving bytes upcasts to object, see else-clause below - return dtype, None elif ( is_datetime64tz_dtype(dtype) and is_datetime64tz_dtype(fill_dtype) @@ -771,7 +744,7 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): # combination that stays datetimetz (in particular, mixing timezones or # tz-aware and tz-naive datetimes will cast to object); any other # combination involving datetimetz upcasts to object, see below - return dtype, iNaT + return dtype, NaT elif (is_timedelta64_dtype(dtype) and is_timedelta64_dtype(fill_dtype)) or ( is_datetime64_dtype(dtype) and is_datetime64_dtype(fill_dtype) ): @@ -787,14 +760,19 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): "ignore", message=msg, category=DeprecationWarning ) fill_value.astype(dtype) - na_value = iNaT + + # can simplify if-cond. compared to cond. for entering this branch + if is_datetime64_dtype(dtype): + na_value = np.datetime64("NaT", "ns") + else: + na_value = np.timedelta64("NaT", "ns") except (ValueError, TypeError): dtype = np.dtype(object) na_value = np.nan return dtype, na_value else: - # anything else (e.g. strings, objects, or unmatched - # bool / bytes / datetime / datetimetz / timedelta) + # anything else (e.g. 
strings, objects, bytes, or unmatched + # bool / datetime / datetimetz / timedelta) return np.dtype(object), np.nan diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index f01e51c066337..9b1d3e9c279ce 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -10,11 +10,7 @@ from pandas._libs.tslibs import NaT from pandas.compat import is_platform_windows -from pandas.core.dtypes.cast import ( - maybe_promote_with_array, - _maybe_promote_with_scalar, - maybe_promote, -) +from pandas.core.dtypes.cast import maybe_promote_with_array, maybe_promote from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, @@ -137,7 +133,7 @@ def _check_promote( # box_dtype; the expected value returned from maybe_promote is the # missing value marker for the returned dtype. fill_array = np.array([fill_value], dtype=box_dtype) - result_dtype, result_fill_value = maybe_promote(dtype, fill_array) + result_dtype, result_fill_value = maybe_promote_with_array(dtype, fill_array) expected_fill_value = exp_val_for_array else: # here, we pass on fill_value as a scalar directly; the expected value @@ -456,12 +452,10 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] - # filling bytes with anything but bytes casts to object - expected_dtype = ( - dtype if issubclass(fill_dtype.type, np.bytes_) else np.dtype(object) - ) + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) exp_val_for_scalar = fill_value - exp_val_for_array = None if issubclass(fill_dtype.type, np.bytes_) else np.nan + exp_val_for_array = np.nan _check_promote( dtype, @@ -496,11 +490,11 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box) # special case for box_dtype (cannot use fixture in 
parametrization) box_dtype = fill_dtype if box_dtype == "bytes" else box_dtype - # filling bytes with anything but bytes casts to object - expected_dtype = dtype if issubclass(dtype.type, np.bytes_) else np.dtype(object) + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) # output is not a generic bytes, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = None if issubclass(dtype.type, np.bytes_) else np.nan + exp_val_for_array = np.nan _check_promote( dtype, @@ -575,7 +569,7 @@ def test_maybe_promote_any_with_datetime64( # special case for box_dtype box_dtype = np.dtype(datetime64_dtype) if box_dtype == "dt_dtype" else box_dtype - # filling datetime with anything but datetime casts to object + # filling anything but datetime with datetime casts to object if is_datetime64_dtype(dtype): expected_dtype = dtype # for datetime dtypes, scalar values get cast to pd.Timestamp.value @@ -604,6 +598,9 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture + if not boxed: + pytest.xfail("unfixed error: does not upcast correctly") + # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -632,6 +629,8 @@ def test_maybe_promote_datetimetz_with_datetimetz( from dateutil.tz import tzlocal + if not boxed: + pytest.xfail("unfixed error: does not upcast for unmatched timezones") if is_platform_windows() and tz_aware_fixture2 == tzlocal(): pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") if dtype.tz == fill_dtype.tz: @@ -640,7 +639,7 @@ def test_maybe_promote_datetimetz_with_datetimetz( # compared to a tz-naive datetime64-dtype, and must therefore upcast pytest.xfail("cannot infer datetime64tz dtype, see GH 23554") - # create array of given dtype; casts "1" to correct dtype + # 
create array of given dtype; casts "10 ** 9" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] # filling datetimetz with datetimetz casts to object, unless tz matches @@ -988,28 +987,18 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): fill_array = np.expand_dims(fill_array, 0) # test against 1-dimensional case - expected_dtype, expected_missing_value = maybe_promote( + expected_dtype, expected_missing_value = maybe_promote_with_array( dtype, np.array([1], dtype=dtype) ) - result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + result_dtype, result_missing_value = maybe_promote_with_array(dtype, fill_array) assert result_dtype == expected_dtype _assert_match(result_missing_value, expected_missing_value) def test_maybe_promote_raises(any_numpy_dtype): - msg = "fill_value must either be scalar, or a Series / Index / np.ndarra.*" - with pytest.raises(ValueError, match=msg): - # something that's neither scalar, nor Series / Index / np.ndarray - maybe_promote(any_numpy_dtype, [1, 2, 3]) - msg = "fill_value must either be a Series / Index / np.ndarray, received.*" with pytest.raises(ValueError, match=msg): # something that's not a Series / Index / np.ndarray maybe_promote_with_array(any_numpy_dtype, 1) - - msg = "fill_value must be a scalar, received .*" - with pytest.raises(ValueError, match=msg): - # something that's not scalar - _maybe_promote_with_scalar(any_numpy_dtype, pd.Series([1, 2, 3])) From b5eb1c49e28ce76b9d1e9a7a5f26d8160689b6b4 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Tue, 29 Oct 2019 09:33:48 +0100 Subject: [PATCH 25/27] lint: isort --- pandas/tests/dtypes/cast/test_promote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 9b1d3e9c279ce..bd17259b73a00 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -10,7 +10,7 @@ from pandas._libs.tslibs import NaT from pandas.compat import is_platform_windows -from pandas.core.dtypes.cast import maybe_promote_with_array, maybe_promote +from pandas.core.dtypes.cast import maybe_promote, maybe_promote_with_array from pandas.core.dtypes.common import ( is_complex_dtype, is_datetime64_dtype, From 3976220905fae8e162d462b4f990e6e55d81613d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 29 Oct 2019 10:15:44 +0100 Subject: [PATCH 26/27] catch irrelevant warning --- pandas/tests/dtypes/cast/test_promote.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index bd17259b73a00..34aaa0605b15f 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -3,6 +3,7 @@ """ import datetime +import warnings import numpy as np import pytest @@ -162,7 +163,11 @@ def _assert_match(result_fill_value, expected_fill_value): # On some builds, type comparison fails, e.g. 
np.int32 != np.int32 assert res_type == ex_type or res_type.__name__ == ex_type.__name__ - match_value = result_fill_value == expected_fill_value + with warnings.catch_warnings(): + # we do not care about this warning, NaT is handled below anyway + msg = "In the future, 'NAT == x' and 'x == NAT' will always be False" + warnings.filterwarnings("ignore", message=msg, category=FutureWarning) + match_value = result_fill_value == expected_fill_value # Note: type check above ensures that we have the _same_ NA value # for missing values, None == None (which is checked From b8cd4f03d770155e61e0436e2b28adf191d86abd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 29 Oct 2019 18:38:56 +0100 Subject: [PATCH 27/27] fix outdated iNaT-documentation --- pandas/core/dtypes/cast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0a3e5115bee98..7720dfa713c03 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -506,7 +506,7 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): Determine minimal dtype to hold fill_value, when starting from dtype This will also return the default missing value for the resulting dtype, if - necessary (e.g. for datetime / timedelta, the missing value will be `iNaT`) + necessary (e.g. for datetime / timedelta, the missing value will be `NaT`) Parameters ---------- @@ -535,12 +535,12 @@ def maybe_promote_with_array(dtype, fill_value=np.nan): ... fill_value=np.array(['abcd'])) (dtype('O'), nan) - For datetimes, timedeltas and datetimes with a timezone, the missing value - marker is pandas._libs.tslibs.iNaT (== np.iinfo('int64').min): + For datetimes without timezones, the missing value marker is + numpy.datetime64('NaT'), and similarly for timedelta values. >>> maybe_promote_with_array(np.dtype('datetime64[ns]'), ... fill_value=np.array([None])) - (dtype('