From e377b6967f3f9b42557757397b41ab22a103ff33 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 29 Nov 2018 23:51:20 +0100 Subject: [PATCH 01/16] API: Series.str-accessor infers dtype --- doc/source/whatsnew/v0.24.0.rst | 3 + pandas/core/strings.py | 207 +++++++++++++++++++++++++------- pandas/tests/test_strings.py | 56 +++------ 3 files changed, 182 insertions(+), 84 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index f888648a9363e..ac869b522502b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -374,6 +374,8 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) +- The `.str`-accessor will perform more rigorous type checking for inputs. Previously, some types that were never intended to be used + "worked" purely due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `encode`, `decode` and `len` (:issue:`23011`, :issue:`23163`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) @@ -1363,6 +1365,7 @@ Strings - Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). - Bug :func:`Series.str.contains` not respecting the ``na`` argument for a ``Categorical`` dtype ``Series`` (:issue:`22158`) +- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) Interval ^^^^^^^^ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0b791f6f91aa3..164ba0f450420 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -9,19 +9,20 @@ import pandas._libs.lib as lib import pandas._libs.ops as libops import pandas.compat as compat -from pandas.compat import zip +from pandas.compat import wraps, zip from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, - is_list_like, is_object_dtype, is_re, is_scalar, is_string_like) -from pandas.core.dtypes.generic import ABCIndex, ABCSeries + is_list_like, is_re, is_scalar, is_string_like) +from pandas.core.dtypes.generic import ABCIndex, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com + _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" ) @@ -1733,12 +1734,80 @@ def str_encode(arr, encoding, errors="strict"): return _na_map(f, arr) -def _noarg_wrapper(f, docstring=None, **kargs): +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. 
This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'unicode', 'empty'] are allowed for all + methods. For the additional types ['bytes', 'mixed', 'mixed-integer'], each + method then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + + # deal with None + forbidden = [] if forbidden is None else forbidden + # deal with single string instead of list + forbidden = [forbidden] if isinstance(forbidden, str) else forbidden + + allowed_types = {'string', 'unicode', 'empty', + 'bytes', 'mixed', 'mixed-integer'} - set(forbidden) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ('Cannot use .str.{name} with values of inferred dtype ' + '{inf_type!r}.'.format(name=func_name, + inf_type=self._inferred_dtype)) + raise TypeError(msg) + return func(self, *args, **kwargs) + wrapper.__name__ = func_name + return wrapper + return _forbid_nonstring_types + + +def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'], + **kargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) return self._wrap_result(result) - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if docstring is not None: wrapper.__doc__ = docstring else: @@ -1747,22 +1816,26 @@ def wrapper(self): return wrapper -def _pat_wrapper(f, flags=False, na=False, **kwargs): +def _pat_wrapper(f, flags=False, na=False, name=None, + forbidden_types=['bytes'], **kwargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper2(self, pat, flags=0, **kwargs): result = f(self._parent, pat, flags=flags, **kwargs) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper3(self, pat, na=np.nan): result = f(self._parent, pat, na=na) return self._wrap_result(result) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if f.__doc__: wrapper.__doc__ = f.__doc__ @@ -1793,7 +1866,7 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): - self._validate(data) + self._inferred_dtype = self._validate(data) self._is_categorical 
= is_categorical_dtype(data) # .values.categories works for both Series/Index @@ -1804,38 +1877,32 @@ def __init__(self, data): @staticmethod def _validate(data): - from pandas.core.index import Index - - if (isinstance(data, ABCSeries) and - not ((is_categorical_dtype(data.dtype) and - is_object_dtype(data.values.categories)) or - (is_object_dtype(data.dtype)))): - # it's neither a string series not a categorical series with - # strings inside the categories. - # this really should exclude all series with any non-string values - # (instead of test for object dtype), but that isn't practical for - # performance reasons until we have a str dtype (GH 9343) + if isinstance(data, ABCMultiIndex): + raise AttributeError('Can only use .str accessor with Index, ' + 'not MultiIndex') + + # see _libs/lib.pyx for list of inferred types + allowed_types = ['string', 'unicode', 'empty', + 'bytes', 'mixed', 'mixed-integer'] + + values = getattr(data, 'values', data) # Series / Index + values = getattr(values, 'categories', values) # categorical / normal + + # missing values obfuscate type inference -> skip + inferred_dtype = lib.infer_dtype(values, skipna=True) + + if inferred_dtype not in allowed_types: + # this is a "first line of defence" and just checks that the type + # is in the *union* of the allowed types over all methods below; + # this restriction is then refined on a per-method basis using the + # decorator @forbid_nonstring_types + # + # this really should exclude all series/index with any non-string + # values, but that isn't practical for performance reasons until we + # have a str dtype (GH 9343 / 13877) raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - elif isinstance(data, Index): - # can't use ABCIndex to exclude non-str - - # see src/inference.pyx which can contain string values - allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if is_categorical_dtype(data.dtype): - inf_type = data.categories.inferred_type - else: - inf_type = data.inferred_type - if inf_type not in allowed_types: - message = ("Can only use .str accessor with string values " - "(i.e. inferred_type is 'string', 'unicode' or " - "'mixed')") - raise AttributeError(message) - if data.nlevels > 1: - message = ("Can only use .str accessor with Index, not " - "MultiIndex") - raise AttributeError(message) + "values!") + return inferred_dtype def __getitem__(self, key): if isinstance(key, slice): @@ -2037,12 +2104,13 @@ def _get_series_list(self, others, ignore_index=False): warnings.warn('list-likes other than Series, Index, or ' 'np.ndarray WITHIN another list-like are ' 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=3) + 'version.', FutureWarning, stacklevel=4) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) + @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2223,7 +2291,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "Index/DataFrame in `others`. To enable alignment " "and silence this warning, pass `join='left'|" "'outer'|'inner'|'right'`. 
The future default will " - "be `join='left'`.", FutureWarning, stacklevel=2) + "be `join='left'`.", FutureWarning, stacklevel=3) # if join is None, _get_series_list already force-aligned indexes join = 'left' if join is None else join @@ -2385,6 +2453,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): @Appender(_shared_docs['str_split'] % { 'side': 'beginning', 'method': 'split'}) + @forbid_nonstring_types(['bytes']) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2392,6 +2461,7 @@ def split(self, pat=None, n=-1, expand=False): @Appender(_shared_docs['str_split'] % { 'side': 'end', 'method': 'rsplit'}) + @forbid_nonstring_types(['bytes']) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2485,6 +2555,7 @@ def rsplit(self, pat=None, n=-1, expand=False): 'also': 'rpartition : Split the string at the last occurrence of `sep`' }) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def partition(self, sep=' ', expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) @@ -2497,44 +2568,52 @@ def partition(self, sep=' ', expand=True): 'also': 'partition : Split the string at the first occurrence of `sep`' }) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def rpartition(self, sep=' ', expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) @copy(str_get) + @forbid_nonstring_types(['bytes']) def get(self, i): result = str_get(self._parent, i) return self._wrap_result(result) @copy(str_join) + @forbid_nonstring_types(['bytes']) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) + @forbid_nonstring_types(['bytes']) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result, fill_value=na) @copy(str_match) + @forbid_nonstring_types(['bytes']) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) + @forbid_nonstring_types(['bytes']) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) + @forbid_nonstring_types(['bytes']) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) + @forbid_nonstring_types(['bytes']) def pad(self, width, side='left', fillchar=' '): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) @@ -2558,17 +2637,21 @@ def pad(self, width, side='left', fillchar=' '): @Appender(_shared_docs['str_pad'] % dict(side='left and right', method='center')) + @forbid_nonstring_types(['bytes']) def center(self, width, fillchar=' '): return self.pad(width, side='both', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) + @forbid_nonstring_types(['bytes']) def ljust(self, width, fillchar=' '): return self.pad(width, side='right', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='left', 
method='rjust')) + @forbid_nonstring_types(['bytes']) def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) + @forbid_nonstring_types(['bytes']) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. @@ -2633,22 +2716,26 @@ def zfill(self, width): return self._wrap_result(result) @copy(str_slice) + @forbid_nonstring_types(['bytes']) def slice(self, start=None, stop=None, step=None): result = str_slice(self._parent, start, stop, step) return self._wrap_result(result) @copy(str_slice_replace) + @forbid_nonstring_types(['bytes']) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @copy(str_decode) def decode(self, encoding, errors="strict"): + # need to allow bytes here result = str_decode(self._parent, encoding, errors) return self._wrap_result(result) @copy(str_encode) def encode(self, encoding, errors="strict"): + # allowing bytes here for easily dealing with mixed str/bytes Series result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) @@ -2717,28 +2804,33 @@ def encode(self, encoding, errors="strict"): @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', method='strip')) + @forbid_nonstring_types(['bytes']) def strip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='both') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='left side', method='lstrip')) + @forbid_nonstring_types(['bytes']) def lstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='left') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='right side', method='rstrip')) + @forbid_nonstring_types(['bytes']) def rstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) + @forbid_nonstring_types(['bytes']) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) @copy(str_get_dummies) + @forbid_nonstring_types(['bytes']) def get_dummies(self, sep='|'): # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
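(Note: the hunks above and below decorate the individual accessor methods with ``@forbid_nonstring_types(['bytes'])``. As a rough sketch of the resulting behaviour -- assuming Python 3, where a column of ``bytes`` objects is inferred as dtype ``'bytes'`` -- bytes data still passes the accessor's constructor, but most methods now raise, while the bytes-oriented ones keep working::

    import pandas as pd

    s = pd.Series([b'a', b'b', b'c'])
    s.str                    # constructor still accepts inferred dtype 'bytes'

    try:
        s.str.upper()        # decorated with @forbid_nonstring_types(['bytes'])
    except TypeError as err:
        print(err)           # Cannot use .str.upper with values of inferred dtype 'bytes'.

    s.str.decode('ascii')    # bytes-oriented method keeps working
    s.str.len()              # 'len' is wrapped with forbidden_types=None
)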
@@ -2748,20 +2840,23 @@ def get_dummies(self, sep='|'): name=name, expand=True) @copy(str_translate) + @forbid_nonstring_types(['bytes']) def translate(self, table, deletechars=None): result = str_translate(self._parent, table, deletechars) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True) - startswith = _pat_wrapper(str_startswith, na=True) - endswith = _pat_wrapper(str_endswith, na=True) - findall = _pat_wrapper(str_findall, flags=True) + count = _pat_wrapper(str_count, flags=True, name='count') + startswith = _pat_wrapper(str_startswith, na=True, name='startswith') + endswith = _pat_wrapper(str_endswith, na=True, name='endswith') + findall = _pat_wrapper(str_findall, flags=True, name='findall') @copy(str_extract) + @forbid_nonstring_types(['bytes']) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) + @forbid_nonstring_types(['bytes']) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) @@ -2791,6 +2886,7 @@ def extractall(self, pat, flags=0): @Appender(_shared_docs['find'] % dict(side='lowest', method='find', also='rfind : Return highest indexes in each strings')) + @forbid_nonstring_types(['bytes']) def find(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='left') return self._wrap_result(result) @@ -2798,11 +2894,13 @@ def find(self, sub, start=0, end=None): @Appender(_shared_docs['find'] % dict(side='highest', method='rfind', also='find : Return lowest indexes in each strings')) + @forbid_nonstring_types(['bytes']) def rfind(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='right') return self._wrap_result(result) + @forbid_nonstring_types(['bytes']) def normalize(self, form): """Return the Unicode normal form for the strings in the Series/Index. For more information on the forms, see the @@ -2849,6 +2947,7 @@ def normalize(self, form): @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', also='rindex : Return highest indexes in each strings')) + @forbid_nonstring_types(['bytes']) def index(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='left') @@ -2857,6 +2956,7 @@ def index(self, sub, start=0, end=None): @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', also='index : Return lowest indexes in each strings')) + @forbid_nonstring_types(['bytes']) def rindex(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='right') @@ -2906,7 +3006,8 @@ def rindex(self, sub, start=0, end=None): 5 3.0 dtype: float64 """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int) + len = _noarg_wrapper(len, docstring=_shared_docs['len'], + forbidden_types=None, dtype=int) _shared_docs['casemethods'] = (""" Convert strings in the Series/Index to %(type)s. 
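(Note: the ``name=...`` arguments threaded through ``_noarg_wrapper``/``_pat_wrapper`` in the following hunks fix the ``__name__`` attribute of the generated methods (GH 23551), which previously took the name of the wrapped function or lambda (e.g. ``'<lambda>'`` or ``'str_findall'``). A small illustration of the intended effect::

    import pandas as pd

    s = pd.Series(['abc'])
    s.str.lower.__name__      # 'lower'   (was '<lambda>' before this change)
    s.str.isdigit.__name__    # 'isdigit' (was '<lambda>')
    s.str.findall.__name__    # 'findall' (was 'str_findall' via _pat_wrapper)
)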
@@ -2980,18 +3081,23 @@ def rindex(self, sub, start=0, end=None): method='capitalize') _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase') lower = _noarg_wrapper(lambda x: x.lower(), + name='lower', docstring=_shared_docs['casemethods'] % _shared_docs['lower']) upper = _noarg_wrapper(lambda x: x.upper(), + name='upper', docstring=_shared_docs['casemethods'] % _shared_docs['upper']) title = _noarg_wrapper(lambda x: x.title(), + name='title', docstring=_shared_docs['casemethods'] % _shared_docs['title']) capitalize = _noarg_wrapper(lambda x: x.capitalize(), + name='capitalize', docstring=_shared_docs['casemethods'] % _shared_docs['capitalize']) swapcase = _noarg_wrapper(lambda x: x.swapcase(), + name='swapcase', docstring=_shared_docs['casemethods'] % _shared_docs['swapcase']) @@ -3145,30 +3251,39 @@ def rindex(self, sub, start=0, end=None): _shared_docs['isnumeric'] = dict(type='numeric', method='isnumeric') _shared_docs['isdecimal'] = dict(type='decimal', method='isdecimal') isalnum = _noarg_wrapper(lambda x: x.isalnum(), + name='isalnum', docstring=_shared_docs['ismethods'] % _shared_docs['isalnum']) isalpha = _noarg_wrapper(lambda x: x.isalpha(), + name='isalpha', docstring=_shared_docs['ismethods'] % _shared_docs['isalpha']) isdigit = _noarg_wrapper(lambda x: x.isdigit(), + name='isdigit', docstring=_shared_docs['ismethods'] % _shared_docs['isdigit']) isspace = _noarg_wrapper(lambda x: x.isspace(), + name='isspace', docstring=_shared_docs['ismethods'] % _shared_docs['isspace']) islower = _noarg_wrapper(lambda x: x.islower(), + name='islower', docstring=_shared_docs['ismethods'] % _shared_docs['islower']) isupper = _noarg_wrapper(lambda x: x.isupper(), + name='isupper', docstring=_shared_docs['ismethods'] % _shared_docs['isupper']) istitle = _noarg_wrapper(lambda x: x.istitle(), + name='istitle', docstring=_shared_docs['ismethods'] % _shared_docs['istitle']) isnumeric = _noarg_wrapper(lambda x: compat.u_safe(x).isnumeric(), + name='isnumeric', docstring=_shared_docs['ismethods'] % _shared_docs['isnumeric']) isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(), + name='isdecimal', docstring=_shared_docs['ismethods'] % _shared_docs['isdecimal']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 117984ce89743..30cdea554e778 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -135,7 +135,10 @@ def any_allowed_skipna_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - The covered (inferred) types are: + Returns an np.ndarray that will be inferred to have the given dtype (when + skipping missing values). + + The allowed (inferred) types are: * 'string' * 'unicode' (if PY2) * 'empty' @@ -156,9 +159,12 @@ def any_allowed_skipna_inferred_dtype(request): >>> import pandas._libs.lib as lib >>> >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_skipna_inferred_dtype + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
pd.Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -188,20 +194,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): pytest.xfail(reason='Conversion to numpy array fails because ' 'the ._values-attribute is not a numpy array for ' 'PeriodArray/IntervalArray; see GH 23553') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') - if (box == Series - and (dtype == object and inferred_dtype not in [ - 'string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer']) - or (dtype == 'category' - and inferred_dtype in ['decimal', 'boolean', 'time'])): - pytest.xfail(reason='Not raising correctly; solved by GH 23167') types_passing_constructor = ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer'] @@ -229,25 +221,19 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name not in ['encode', 'decode', 'len'] - and inferred_dtype == 'bytes'): - pytest.xfail(reason='Not raising for "bytes", see GH 23011;' - 'Also: malformed method names, see GH 23551; ' - 'solved by GH 23167') - if (method_name == 'cat' - and inferred_dtype in ['mixed', 'mixed-integer']): - pytest.xfail(reason='Bad error message; should raise better; ' - 'solved by GH 23167') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') if box == Index and dtype == 'category': pytest.xfail(reason='Broken methods on CategoricalIndex; ' 'see GH 23556') + if (method_name in ['partition', 'rpartition'] and box == Index + and inferred_dtype == 'empty'): + pytest.xfail(reason='Method cannot deal with empty Index') + if (method_name == 'split' and box == Index and values.size == 0 + and kwargs.get('expand', None) is not None): + pytest.xfail(reason='Split fails on empty Series when expand=True') + if (method_name == 'get_dummies' and box == Index + and inferred_dtype == 'empty' and (dtype == object + or values.size == 0)): + pytest.xfail(reason='Need to fortify get_dummies corner cases') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) @@ -630,12 +616,6 @@ def test_str_cat_align_mixed_inputs(self, join): with pytest.raises(ValueError, match=rgx): s.str.cat([t, z], join=join) - def test_str_cat_raises(self): - # non-strings hiding behind object dtype - s = Series([1, 2, 3, 4], dtype='object') - with pytest.raises(TypeError, match="unsupported operand type.*"): - s.str.cat(s) - def test_str_cat_special_cases(self): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) From 88b7b5336307d9fc92a9dab35e69ca05ba6ae582 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 30 Nov 2018 00:16:28 +0100 Subject: [PATCH 02/16] Forbid encode on pure bytes as well --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/strings.py | 2 +- pandas/tests/test_strings.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ac869b522502b..2c9aecdde2f45 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -375,7 +375,7 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - The `.str`-accessor will perform more rigorous type checking for inputs. Previously, some types that were never intended to be used - "worked" purely due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `encode`, `decode` and `len` (:issue:`23011`, :issue:`23163`) + "worked" purely due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `decode` and `len` (:issue:`23011`, :issue:`23163`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 164ba0f450420..67b88b11c1bef 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2734,8 +2734,8 @@ def decode(self, encoding, errors="strict"): return self._wrap_result(result) @copy(str_encode) + @forbid_nonstring_types(['bytes']) def encode(self, encoding, errors="strict"): - # allowing bytes here for easily dealing with mixed str/bytes Series result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 30cdea554e778..46c44adfd4f6d 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -238,7 +238,7 @@ def test_api_per_method(self, box, dtype, t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['encode', 'decode', 'len'] + bytes_allowed = method_name in ['decode', 'len'] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, From b19a40d4d4bef93e8a6c9839f972b924002e6f1f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 30 Nov 2018 01:05:46 +0100 Subject: [PATCH 03/16] Remove merge artefact --- pandas/tests/test_strings.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 46c44adfd4f6d..d6c6a8652e728 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -135,10 +135,7 @@ def any_allowed_skipna_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - Returns an np.ndarray that will be inferred to have the given dtype (when - skipping missing values). 
- - The allowed (inferred) types are: + The covered (inferred) types are: * 'string' * 'unicode' (if PY2) * 'empty' From fb7da6b5a63f2b5647bad11dff5463b0b778d9e8 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 30 Nov 2018 09:05:58 +0100 Subject: [PATCH 04/16] fix isort --- pandas/core/strings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 67b88b11c1bef..4d9f1567b371a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -22,7 +22,6 @@ from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com - _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" ) From f8ffb0d1337564fcdf1062b854a6125a323f3ced Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 2 Dec 2018 23:32:22 +0100 Subject: [PATCH 05/16] merge in API: fix str-accessor on CategoricalIndex --- .circleci/config.yml | 4 +- .travis.yml | 24 +- asv_bench/benchmarks/categoricals.py | 8 + ci/README.txt | 17 - ci/azure/linux.yml | 10 +- ci/azure/macos.yml | 6 +- ci/azure/windows-py27.yml | 2 +- ci/azure/windows.yml | 2 +- ci/deps/travis-36.yaml | 2 +- ci/print_versions.py | 29 -- ci/run_tests.sh | 52 ++ ci/script_multi.sh | 49 -- ci/script_single.sh | 39 -- ci/upload_coverage.sh | 11 - doc/source/comparison_with_r.rst | 88 ++-- doc/source/comparison_with_sql.rst | 20 +- doc/source/comparison_with_stata.rst | 23 +- doc/source/computation.rst | 82 ++-- doc/source/io.rst | 371 +++++++------- doc/source/timeseries.rst | 518 ++++++++++---------- doc/source/whatsnew/v0.24.0.rst | 8 +- environment.yml | 18 +- pandas/_libs/parsers.pyx | 4 + pandas/_libs/tslib.pyx | 141 +++--- pandas/_libs/tslibs/conversion.pyx | 15 +- pandas/core/arrays/categorical.py | 10 + pandas/core/arrays/datetimelike.py | 224 ++++++++- pandas/core/arrays/datetimes.py | 4 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/frame.py | 22 +- pandas/core/generic.py | 131 +++-- pandas/core/indexes/category.py | 6 +- pandas/core/indexes/datetimelike.py | 222 +-------- pandas/core/indexes/datetimes.py | 6 +- pandas/core/indexes/period.py | 4 +- pandas/core/indexes/timedeltas.py | 5 +- pandas/core/reshape/tile.py | 7 +- pandas/core/strings.py | 9 +- pandas/io/parsers.py | 16 +- pandas/tests/frame/test_convert_to.py | 28 +- pandas/tests/io/parser/test_na_values.py | 18 + pandas/tests/reshape/test_tile.py | 8 + pandas/tests/test_strings.py | 4 +- pandas/tests/util/test_hashing.py | 585 ++++++++++++----------- requirements-dev.txt | 18 +- setup.cfg | 46 +- 46 files changed, 1547 insertions(+), 1371 deletions(-) delete mode 100644 ci/README.txt delete mode 100755 ci/print_versions.py create mode 100755 ci/run_tests.sh delete mode 100755 ci/script_multi.sh delete mode 100755 ci/script_single.sh delete mode 100755 ci/upload_coverage.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index dc4162a0674fd..6b516b21722ac 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -34,5 +34,5 @@ jobs: command: | export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas-dev - echo "pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml --skip-slow --skip-network pandas" - pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml --skip-slow --skip-network pandas + echo "pytest -m "not slow and not network" --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas" + pytest -m "not slow and not network" --strict 
--durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas diff --git a/.travis.yml b/.travis.yml index 3217fc5aa1ed6..6bbc44fba864a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,28 +34,28 @@ matrix: include: - dist: trusty env: - - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="not slow and not network" - dist: trusty env: - - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/deps/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true + - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/deps/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" PATTERN="slow" addons: apt: packages: - language-pack-zh-hans - dist: trusty env: - - JOB="2.7" ENV_FILE="ci/deps/travis-27.yaml" TEST_ARGS="--skip-slow" + - JOB="2.7" ENV_FILE="ci/deps/travis-27.yaml" PATTERN="not slow" addons: apt: packages: - python-gtk2 - dist: trusty env: - - JOB="3.6, lint, coverage" ENV_FILE="ci/deps/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true LINT=true + - JOB="3.6, lint, coverage" ENV_FILE="ci/deps/travis-36.yaml" PATTERN="not slow and not network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true LINT=true - dist: trusty env: - - JOB="3.7, NumPy dev" ENV_FILE="ci/deps/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate" + - JOB="3.7, NumPy dev" ENV_FILE="ci/deps/travis-37-numpydev.yaml" PATTERN="not slow and not network" TEST_ARGS="-W error" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: @@ -64,7 +64,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" # In allow_failures - dist: trusty @@ -73,7 +73,7 @@ matrix: allow_failures: - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true @@ -107,20 +107,16 @@ script: - echo "script start" - source activate pandas-dev - ci/run_build_docs.sh - - ci/script_single.sh - - ci/script_multi.sh + - ci/run_tests.sh - ci/code_checks.sh -after_success: - - ci/upload_coverage.sh - after_script: - echo "after_script start" - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - if [ -e test-data-single.xml ]; then - ci/print_skipped.py test-data-single.xml; + ci/print_skipped.py test-data-single.xml; fi - if [ -e test-data-multiple.xml ]; then - ci/print_skipped.py test-data-multiple.xml; + ci/print_skipped.py test-data-multiple.xml; fi - echo "after_script done" diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8a0fbc48755b5..7318b40efc8fb 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -46,6 +46,8 @@ def setup(self): self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) self.values_all_int8 = np.ones(N, 'int8') + self.categorical = pd.Categorical(self.values, self.categories) + self.series = pd.Series(self.categorical) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -68,6 +70,12 @@ def time_all_nan(self): def time_from_codes_all_int8(self): pd.Categorical.from_codes(self.values_all_int8, 
self.categories) + def time_existing_categorical(self): + pd.Categorical(self.categorical) + + def time_existing_series(self): + pd.Categorical(self.series) + class ValueCounts(object): diff --git a/ci/README.txt b/ci/README.txt deleted file mode 100644 index bb71dc25d6093..0000000000000 --- a/ci/README.txt +++ /dev/null @@ -1,17 +0,0 @@ -Travis is a ci service that's well-integrated with GitHub. -The following types of breakage should be detected -by Travis builds: - -1) Failing tests on any supported version of Python. -2) Pandas should install and the tests should run if no optional deps are installed. -That also means tests which rely on optional deps need to raise SkipTest() -if the dep is missing. -3) unicode related fails when running under exotic locales. - -We tried running the vbench suite for a while, but with varying load -on Travis machines, that wasn't useful. - -Travis currently (4/2013) has a 5-job concurrency limit. Exceeding it -basically doubles the total runtime for a commit through travis, and -since dep+pandas installation is already quite long, this should become -a hard limit on concurrent travis runs. diff --git a/ci/azure/linux.yml b/ci/azure/linux.yml index 7fa8a9a1783f9..fe64307e9d08f 100644 --- a/ci/azure/linux.yml +++ b/ci/azure/linux.yml @@ -12,18 +12,18 @@ jobs: py27_np_120: ENV_FILE: ci/deps/azure-27-compat.yaml CONDA_PY: "27" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" LOCALE_OVERRIDE: "zh_CN.UTF-8" py36_locale_slow: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" - TEST_ARGS: "--only-slow --skip-network" + PATTERN: "not slow and not network" LOCALE_OVERRIDE: "it_IT.UTF-8" steps: @@ -43,9 +43,7 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev - ci/script_single.sh - ci/script_multi.sh - echo "[Test done]" + ci/run_tests.sh displayName: 'Test' - script: | export PATH=$HOME/miniconda3/bin:$PATH diff --git a/ci/azure/macos.yml b/ci/azure/macos.yml index d537f0c70cbec..98409576a5a87 100644 --- a/ci/azure/macos.yml +++ b/ci/azure/macos.yml @@ -12,7 +12,7 @@ jobs: py35_np_120: ENV_FILE: ci/deps/azure-macos-35.yaml CONDA_PY: "35" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" steps: - script: | @@ -31,9 +31,7 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev - ci/script_single.sh - ci/script_multi.sh - echo "[Test done]" + ci/run_tests.sh displayName: 'Test' - script: | export PATH=$HOME/miniconda3/bin:$PATH diff --git a/ci/azure/windows-py27.yml b/ci/azure/windows-py27.yml index ac918f3becd2e..0d9aea816c4ad 100644 --- a/ci/azure/windows-py27.yml +++ b/ci/azure/windows-py27.yml @@ -37,7 +37,7 @@ jobs: displayName: 'Build' - script: | call activate pandas-dev - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %* + pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index f0ebba509e441..b69c210ca27ba 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -28,7 +28,7 @@ jobs: displayName: 'Build' - script: | call activate pandas-dev - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %* + 
pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 1781f67041f44..de76f5d6d763f 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -9,7 +9,7 @@ dependencies: - fastparquet - flake8>=3.5 - flake8-comprehensions - - flake8-rst=0.4.2 + - flake8-rst>=0.6.0 - gcsfs - geopandas - html5lib diff --git a/ci/print_versions.py b/ci/print_versions.py deleted file mode 100755 index a2c93748b0388..0000000000000 --- a/ci/print_versions.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - - -def show_versions(as_json=False): - import imp - import os - fn = __file__ - this_dir = os.path.dirname(fn) - pandas_dir = os.path.abspath(os.path.join(this_dir, "..")) - sv_path = os.path.join(pandas_dir, 'pandas', 'util') - mod = imp.load_module( - 'pvmod', *imp.find_module('print_versions', [sv_path])) - return mod.show_versions(as_json) - - -if __name__ == '__main__': - # optparse is 2.6-safe - from optparse import OptionParser - parser = OptionParser() - parser.add_option("-j", "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, " - "pass in '-' to output to stdout") - - (options, args) = parser.parse_args() - - if options.json == "-": - options.json = True - - show_versions(as_json=options.json) diff --git a/ci/run_tests.sh b/ci/run_tests.sh new file mode 100755 index 0000000000000..77efc60a8cf97 --- /dev/null +++ b/ci/run_tests.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +if [ "$DOC" ]; then + echo "We are not running pytest as this is a doc-build" + exit 0 +fi + +# Workaround for pytest-xdist flaky collection order +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE" + export LANG="$LOCALE_OVERRIDE" + PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` + if [[ "$LOCALE_OVERIDE" != "$PANDAS_LOCALE" ]]; then + echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" + # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed + # exit 1 + fi +fi +if [[ "not network" == *"$PATTERN"* ]]; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + + +if [ -n "$PATTERN" ]; then + PATTERN=" and $PATTERN" +fi + +for TYPE in single multiple +do + if [ "$COVERAGE" ]; then + COVERAGE_FNAME="/tmp/coc-$TYPE.xml" + COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" + fi + + TYPE_PATTERN=$TYPE + NUM_JOBS=1 + if [[ "$TYPE_PATTERN" == "multiple" ]]; then + TYPE_PATTERN="not single" + NUM_JOBS=2 + fi + + pytest -m "$TYPE_PATTERN$PATTERN" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas + + if [[ "$COVERAGE" && $? 
== 0 ]]; then + echo "uploading coverage for $TYPE tests" + bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + fi +done diff --git a/ci/script_multi.sh b/ci/script_multi.sh deleted file mode 100755 index fba0c7ba19dd4..0000000000000 --- a/ci/script_multi.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -e - -echo "[script multi]" - -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE"; - export LANG="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -# Workaround for pytest-xdist flaky collection order -# https://github.com/pytest-dev/pytest/issues/920 -# https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -echo PYTHONHASHSEED=$PYTHONHASHSEED - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - -elif [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" - # The `-m " and slow"` is redundant here, as `--only-slow` is already used (via $TEST_ARGS). But is needed, because with - # `--only-slow` fast tests are skipped, but each of them is printed in the log (which can be avoided with `-q`), - # and also added to `test-data-multiple.xml`, and then printed in the log in the call to `ci/print_skipped.py`. - # Printing them to the log makes the log exceed the maximum size allowed by Travis and makes the build fail. - echo pytest -n 2 -m "not single and slow" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single and slow" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - -else - echo pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas # TODO: doctest - -fi - -RET="$?" 
- -exit "$RET" diff --git a/ci/script_single.sh b/ci/script_single.sh deleted file mode 100755 index cbbb7a49541c2..0000000000000 --- a/ci/script_single.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -echo "[script_single]" - -if [ -n "$LOCALE_OVERRIDE" ]; then - echo "Setting LC_ALL and LANG to $LOCALE_OVERRIDE" - export LC_ALL="$LOCALE_OVERRIDE"; - export LANG="$LOCALE_OVERRIDE"; - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -if [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas - pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas - echo pytest -s --strict scripts - pytest -s --strict scripts -else - echo pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas - pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas - -fi - -RET="$?" - -exit "$RET" diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh deleted file mode 100755 index 88aca20590505..0000000000000 --- a/ci/upload_coverage.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -if [ -z "$COVERAGE" ]; then - echo "coverage is not selected for this build" - exit 0 -fi - - -echo "uploading coverage" -bash <(curl -s https://codecov.io/bash) -Z -c -F single -f /tmp/cov-single.xml -bash <(curl -s https://codecov.io/bash) -Z -c -F multiple -f /tmp/cov-multiple.xml diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index eecacde8ad14e..704b0c4d80537 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 Comparison with R / R libraries ******************************* @@ -165,16 +165,15 @@ function. .. ipython:: python - df = pd.DataFrame({ - 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], - 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan] - }) + df = pd.DataFrame( + {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, + np.nan]}) - g = df.groupby(['by1','by2']) - g[['v1','v2']].mean() + g = df.groupby(['by1', 'by2']) + g[['v1', 'v2']].mean() For more details and examples see :ref:`the groupby documentation `. @@ -195,7 +194,7 @@ The :meth:`~pandas.DataFrame.isin` method is similar to R ``%in%`` operator: .. 
ipython:: python - s = pd.Series(np.arange(5),dtype=np.float32) + s = pd.Series(np.arange(5), dtype=np.float32) s.isin([2, 4]) The ``match`` function returns a vector of the positions of matches @@ -234,11 +233,11 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import random import string - baseball = pd.DataFrame({ - 'team': ["team %d" % (x+1) for x in range(5)]*5, - 'player': random.sample(list(string.ascii_lowercase),25), - 'batting avg': np.random.uniform(.200, .400, 25) - }) + baseball = pd.DataFrame( + {'team': ["team %d" % (x + 1) for x in range(5)] * 5, + 'player': random.sample(list(string.ascii_lowercase), 25), + 'batting avg': np.random.uniform(.200, .400, 25)}) + baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation @@ -341,15 +340,13 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5,6,7,8]*30, - 'week': np.random.randint(1,4, 120) - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), + 'y': np.random.uniform(7., 334., 120), + 'z': np.random.uniform(1.7, 20.7, 120), + 'month': [5, 6, 7, 8] * 30, + 'week': np.random.randint(1, 4, 120)}) - grouped = df.groupby(['month','week']) + grouped = df.groupby(['month', 'week']) grouped['x'].agg([np.mean, np.std]) @@ -374,8 +371,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python - a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) - pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) + pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) |meltlist|_ ~~~~~~~~~~~~ @@ -393,7 +390,7 @@ In Python, this list would be a list of tuples, so .. ipython:: python - a = list(enumerate(list(range(1,5))+[np.NAN])) + a = list(enumerate(list(range(1, 5)) + [np.NAN])) pd.DataFrame(a) For more details and examples see :ref:`the Into to Data Structures @@ -419,12 +416,13 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = pd.DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) + pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + cheese.set_index(['first', 'last']).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. @@ -452,16 +450,15 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. 
ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5,6,7]*4, - 'week': [1,2]*6 - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), + 'y': np.random.uniform(7., 334., 12), + 'z': np.random.uniform(1.7, 20.7, 12), + 'month': [5, 6, 7] * 4, + 'week': [1, 2] * 6}) + mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable','week'], - columns=['month'], aggfunc=np.mean) + pd.pivot_table(mdf, values='value', index=['variable', 'week'], + columns=['month'], aggfunc=np.mean) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -491,13 +488,14 @@ using :meth:`~pandas.pivot_table`: 'Amount': [10, 7, 4, 2, 5, 6, 2], }) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') + df.pivot_table(values='Amount', index='Animal', columns='FeedType', + aggfunc='sum') The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. ipython:: python - df.groupby(['Animal','FeedType'])['Amount'].sum() + df.groupby(['Animal', 'FeedType'])['Amount'].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. @@ -516,8 +514,8 @@ In pandas this is accomplished with ``pd.cut`` and ``astype("category")``: .. ipython:: python - pd.cut(pd.Series([1,2,3,4,5,6]), 3) - pd.Series([1,2,3,2,2,3]).astype("category") + pd.cut(pd.Series([1, 2, 3, 4, 5, 6]), 3) + pd.Series([1, 2, 3, 2, 2, 3]).astype("category") For more details and examples see :ref:`categorical introduction ` and the :ref:`API documentation `. There is also a documentation regarding the diff --git a/doc/source/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst index db143cd586441..021f37eb5c66f 100644 --- a/doc/source/comparison_with_sql.rst +++ b/doc/source/comparison_with_sql.rst @@ -23,7 +23,8 @@ structure. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -387,7 +388,7 @@ Top N rows with offset .. ipython:: python - tips.nlargest(10+5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns='tip').tail(10) Top N rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -411,8 +412,7 @@ Top N rows per group .groupby(['day']) .cumcount() + 1) .query('rn < 3') - .sort_values(['day','rn']) - ) + .sort_values(['day', 'rn'])) the same using `rank(method='first')` function @@ -421,8 +421,7 @@ the same using `rank(method='first')` function (tips.assign(rnk=tips.groupby(['day'])['total_bill'] .rank(method='first', ascending=False)) .query('rnk < 3') - .sort_values(['day','rnk']) - ) + .sort_values(['day', 'rnk'])) .. code-block:: sql @@ -445,11 +444,10 @@ Notice that when using ``rank(method='min')`` function .. 
ipython:: python (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex','rnk_min']) - ) + .assign(rnk_min=tips.groupby(['sex'])['tip'] + .rank(method='min')) + .query('rnk_min < 3') + .sort_values(['sex', 'rnk_min'])) UPDATE diff --git a/doc/source/comparison_with_stata.rst b/doc/source/comparison_with_stata.rst index 6c518983d5904..e039843b22065 100644 --- a/doc/source/comparison_with_stata.rst +++ b/doc/source/comparison_with_stata.rst @@ -102,9 +102,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({ - 'x': [1, 3, 5], - 'y': [2, 4, 6]}) + df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) df @@ -128,7 +126,8 @@ the data set if presented with a url. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -278,17 +277,17 @@ see the :ref:`timeseries documentation` for more details. tips['date1_year'] = tips['date1'].dt.year tips['date2_month'] = tips['date2'].dt.month tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips['months_between'] = (tips['date2'].dt.to_period('M') + - tips['date1'].dt.to_period('M')) - tips[['date1','date2','date1_year','date2_month', - 'date1_next','months_between']].head() + tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', + 'months_between']].head() .. ipython:: python :suppress: - tips = tips.drop(['date1','date2','date1_year', - 'date2_month','date1_next','months_between'], axis=1) + tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', + 'date1_next', 'months_between'], axis=1) Selection of Columns ~~~~~~~~~~~~~~~~~~~~ @@ -472,7 +471,7 @@ The following tables will be used in the merge examples 'value': np.random.randn(4)}) df1 df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + 'value': np.random.randn(4)}) df2 In Stata, to perform a merge, one data set must be in memory @@ -661,7 +660,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex','smoker']).first() + tips.groupby(['sex', 'smoker']).first() Other Considerations diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 0d2021de8f88e..251dce5141ea5 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -4,14 +4,15 @@ :suppress: import numpy as np + import matplotlib.pyplot as plt + + import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt + pd.options.display.max_rows = 15 + plt.close('all') - pd.options.display.max_rows=15 .. _computation: @@ -75,7 +76,8 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -127,7 +129,8 @@ Wikipedia has articles covering the above correlation coefficients: .. 
ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.iloc[::2] = np.nan # Series with Series @@ -163,9 +166,10 @@ compute the correlation based on histogram intersection: .. ipython:: python # histogram intersection - histogram_intersection = lambda a, b: np.minimum( - np.true_divide(a, a.sum()), np.true_divide(b, b.sum()) - ).sum() + def histogram_intersection(a, b): + return np.minimum(np.true_divide(a, a.sum()), + np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to @@ -192,7 +196,7 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python s = pd.Series(np.random.np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s['d'] = s['b'] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -202,7 +206,7 @@ ranking. .. ipython:: python df = pd.DataFrame(np.random.np.random.randn(10, 6)) - df[4] = df[2][:5] # some ties + df[4] = df[2][:5] # some ties df df.rank(1) @@ -243,7 +247,8 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. ipython:: python - s = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) s = s.cumsum() s @@ -258,7 +263,7 @@ These object provide tab-completion of the available methods and properties. .. code-block:: ipython - In [14]: r. + In [14]: r. # noqa: E225, E999 r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var @@ -336,7 +341,9 @@ compute the mean absolute deviation on a rolling basis: .. ipython:: python - mad = lambda x: np.fabs(x - x.mean()).mean() + def mad(x): + return np.fabs(x - x.mean()).mean() + @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') @@ -376,7 +383,8 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), + index=pd.date_range('1/1/2000', periods=10)) ser.rolling(window=5, win_type='triang').mean() @@ -423,7 +431,9 @@ This can be particularly useful for a non-regular time frequency index. .. ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + index=pd.date_range('20130101 09:00:00', + periods=5, + freq='s')) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -445,12 +455,12 @@ Using a non-regular, but still monotonic index, rolling with an integer window d .. 
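A supplementary sketch (the frames and column names are illustrative, not taken from the patch) of the :meth:`~DataFrame.corrwith` method mentioned above, which correlates like-labeled columns of two DataFrames:

.. code-block:: python

   df1 = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])
   df2 = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])

   # pairwise correlation of the like-labeled columns
   df1.corrwith(df2)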
ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index = pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + index=pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) dft dft.rolling(2).sum() @@ -496,11 +506,11 @@ from present information back to past information. This allows the rolling windo .. ipython:: python df = pd.DataFrame({'x': 1}, - index = [pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) + index=[pd.Timestamp('20130101 09:00:01'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:04'), + pd.Timestamp('20130101 09:00:06')]) df["right"] = df.rolling('2s', closed='right').x.sum() # default df["both"] = df.rolling('2s', closed='both').x.sum() @@ -601,7 +611,8 @@ can even be omitted: .. ipython:: python - covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) + covs = (df[['B', 'C', 'D']].rolling(window=50) + .cov(df[['A', 'B', 'C']], pairwise=True)) covs.loc['2002-09-22':] .. ipython:: python @@ -637,7 +648,7 @@ perform multiple computations on the data. These operations are similar to the : dfa = pd.DataFrame(np.random.randn(1000, 3), index=pd.date_range('1/1/2000', periods=1000), columns=['A', 'B', 'C']) - r = dfa.rolling(window=60,min_periods=1) + r = dfa.rolling(window=60, min_periods=1) r We can aggregate by passing a function to the entire DataFrame, or select a @@ -649,7 +660,7 @@ Series (or multiple Series) via standard ``__getitem__``. r['A'].aggregate(np.sum) - r[['A','B']].aggregate(np.sum) + r[['A', 'B']].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. @@ -683,24 +694,21 @@ By passing a dict to ``aggregate`` you can apply a different aggregation to the columns of a ``DataFrame``: .. ipython:: python - :okexcept: - :okwarning: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the windowed object .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) .. _stats.moments.expanding: diff --git a/doc/source/io.rst b/doc/source/io.rst index 2b91836d5449d..372a7b8a325e7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5,25 +5,23 @@ .. 
ipython:: python :suppress: - import os import csv - from pandas.compat import StringIO, BytesIO - import pandas as pd - ExcelWriter = pd.ExcelWriter + import os + import matplotlib.pyplot as plt import numpy as np - np.random.seed(123456) + import pandas as pd + from pandas.compat import StringIO, BytesIO + + randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt plt.close('all') - - import pandas.util.testing as tm pd.options.display.max_rows = 15 clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, index=['x', 'y', 'z']) + =============================== IO Tools (Text, CSV, HDF5, ...) =============================== @@ -146,7 +144,10 @@ usecols : list-like or callable, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) @@ -192,7 +193,10 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -367,7 +371,10 @@ columns: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) df = pd.read_csv(StringIO(data), dtype=object) @@ -388,7 +395,11 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = "col_1\n1\n2\n'A'\n4.22" + data = ("col_1\n" + "1\n" + "2\n" + "'A'\n" + "4.22") df = pd.read_csv(StringIO(data), converters={'col_1': str}) df df['col_1'].apply(type).value_counts() @@ -427,7 +438,8 @@ worth trying. .. ipython:: python :okwarning: - df = pd.DataFrame({'col_1': list(range(500000)) + ['a', 'b'] + list(range(500000))}) + col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) + df = pd.DataFrame({'col_1': col_1}) df.to_csv('foo.csv') mixed_df = pd.read_csv('foo.csv') mixed_df['col_1'].apply(type).value_counts() @@ -455,7 +467,10 @@ Specifying Categorical dtype .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes @@ -479,7 +494,6 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes @@ -525,7 +539,10 @@ used as the column names: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) pd.read_csv(StringIO(data)) @@ -544,7 +561,11 @@ If the header is in a row other than the first, pass the row number to .. ipython:: python - data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('skip this skip it\n' + 'a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') pd.read_csv(StringIO(data), header=1) .. note:: @@ -565,7 +586,9 @@ distinguish between them so as to prevent overwriting data: .. ipython :: python - data = 'a,b,a\n0,1,2\n3,4,5' + data = ('a,b,a\n' + '0,1,2\n' + '3,4,5') pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -633,7 +656,13 @@ be ignored. By default, completely blank lines will be ignored as well. .. 
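A supplementary, self-contained sketch (not part of the patch) of the per-column form of the ``dtype`` keyword discussed above, reusing the small ``a,b,c`` example:

.. code-block:: python

   data = ('a,b,c\n'
           '1,2,3\n'
           '4,5,6\n'
           '7,8,9')
   df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
   df.dtypes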
ipython:: python - data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + data = ('\n' + 'a,b,c\n' + ' \n' + '# commented line\n' + '1,2,3\n' + '\n' + '4,5,6') print(data) pd.read_csv(StringIO(data), comment='#') @@ -641,7 +670,12 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + data = ('a,b,c\n' + '\n' + '1,2,3\n' + '\n' + '\n' + '4,5,6') pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -652,20 +686,32 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = '#comment\na,b,c\nA,B,C\n1,2,3' + data = ('#comment\n' + 'a,b,c\n' + 'A,B,C\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' + data = ('A,B,C\n' + '#comment\n' + 'a,b,c\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: - .. ipython:: python +.. ipython:: python - data = ('# empty\n# second empty line\n# third empty' - 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0') - print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + data = ('# empty\n' + '# second empty line\n' + '# third emptyline\n' + 'X,Y,Z\n' + '1,2,3\n' + 'A,B,C\n' + '1,2.,4.\n' + '5.,NaN,10.0\n') + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) .. _io.comments: @@ -677,10 +723,10 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -718,7 +764,10 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python - data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1') + data = (b'word,length\n' + b'Tr\xc3\xa4umen,7\n' + b'Gr\xc3\xbc\xc3\x9fe,5') + data = data.decode('utf8').encode('latin-1') df = pd.read_csv(BytesIO(data), encoding='latin-1') df df['word'][1] @@ -738,12 +787,16 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data)) .. ipython:: python - data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('index,a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -754,7 +807,9 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -764,7 +819,9 @@ If a subset of data is being parsed using the ``usecols`` option, the .. 
ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data), usecols=['b', 'c']) pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) @@ -812,12 +869,12 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -895,9 +952,8 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - import pandas.io.date_converters as conv df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) + date_parser=pd.io.date_converters.parse_date_time) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -990,9 +1046,12 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + data = ("date,value,cat\n" + "1/6/2000,5,a\n" + "2/6/2000,10,b\n" + "3/6/2000,15,c") with open('tmp.csv', 'w') as fh: - fh.write(data) + fh.write(data) .. ipython:: python @@ -1016,9 +1075,12 @@ writing to a file). For example: val = '0.3066101993807095471566981359501369297504425048828125' data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='round_trip')['c'][0] - float(val)) .. _io.thousands: @@ -1033,10 +1095,10 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1132,10 +1194,10 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1144,7 +1206,7 @@ as a ``Series``: print(open('tmp.csv').read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv('tmp.csv', squeeze=True) output type(output) @@ -1166,7 +1228,9 @@ options as follows: .. 
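A supplementary, self-contained sketch (an assumption about typical usage rather than part of the patch) of the ``thousands`` keyword discussed above, so that ``123,000``-style values parse as integers; the boolean-values example announced above then follows:

.. code-block:: python

   data = ("ID|level|category\n"
           "Patient1|123,000|x\n"
           "Patient2|23,000|y\n"
           "Patient3|1,234,018|z")
   df = pd.read_csv(StringIO(data), sep='|', thousands=',')
   df['level'].dtype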
ipython:: python - data= 'a,b,c\n1,Yes,2\n3,No,4' + data = ('a,b,c\n' + '1,Yes,2\n' + '3,No,4') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) @@ -1183,11 +1247,17 @@ too many fields will raise an error by default: .. ipython:: python :suppress: - data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') .. code-block:: ipython - In [27]: data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + In [27]: data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') In [28]: pd.read_csv(StringIO(data)) --------------------------------------------------------------------------- @@ -1437,7 +1507,7 @@ returned object: .. ipython:: python - df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) + df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) df df.loc[1978] @@ -1480,7 +1550,6 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - import os os.remove('mi.csv') os.remove('mi2.csv') @@ -1966,9 +2035,8 @@ Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), - columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), + index=[str(i) for i in range(4)]) si si.index si.columns @@ -2020,11 +2088,11 @@ data: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -2034,11 +2102,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -2059,7 +2127,6 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - import os os.remove('test.json') .. _io.json_normalize: @@ -2081,20 +2148,16 @@ into a flat table. .. ipython:: python data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': { - 'governor': 'Rick Scott' - }, - 'counties': [{'name': 'Dade', 'population': 12345}, + 'shortname': 'FL', + 'info': {'governor': 'Rick Scott'}, + 'counties': [{'name': 'Dade', 'population': 12345}, {'name': 'Broward', 'population': 40000}, {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': { - 'governor': 'John Kasich' - }, - 'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] + {'state': 'Ohio', + 'shortname': 'OH', + 'info': {'governor': 'John Kasich'}, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) @@ -2142,11 +2205,10 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3), - }, index=pd.Index(range(3), name='idx')) + df = pd.DataFrame({'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + index=pd.Index(range(3), name='idx')) df df.to_json(orient='table', date_format="iso") @@ -2322,7 +2384,6 @@ as a string: .. 
ipython:: python :suppress: - import os file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) .. ipython:: python @@ -2820,8 +2881,8 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a':[1, 2, 3, 4], 'b':[5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'],['c', 'd']])) + df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df @@ -2842,7 +2903,8 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], names=['c1', 'c2']) + df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], + names=['c1', 'c2']) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) df @@ -2850,7 +2912,6 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python :suppress: - import os os.remove('path_to_file.xlsx') @@ -2997,7 +3058,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with ExcelWriter('path_to_file.xlsx') as writer: + with pd.ExcelWriter('path_to_file.xlsx') as writer: df1.to_excel(writer, sheet_name='Sheet1') df2.to_excel(writer, sheet_name='Sheet2') @@ -3029,7 +3090,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter(bio, engine='xlsxwriter') + writer = pd.ExcelWriter(bio, engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1') # Save the workbook @@ -3082,7 +3143,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. from pandas import options # noqa: E402 @@ -3172,7 +3233,6 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python :suppress: - import os os.remove('foo.pkl') .. warning:: @@ -3249,7 +3309,6 @@ The default is to 'infer': .. ipython:: python :suppress: - import os os.remove("data.pkl.compress") os.remove("data.pkl.xz") os.remove("data.pkl.gz") @@ -3306,7 +3365,7 @@ pandas objects. .. ipython:: python - pd.to_msgpack('foo2.msg', {'dict': [{ 'df': df }, {'string': 'foo'}, + pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, {'scalar': 1.}, {'s': s}]}) pd.read_msgpack('foo2.msg') @@ -3365,7 +3424,6 @@ dict: .. ipython:: python - np.random.seed(1234) index = pd.date_range('1/1/2000', periods=8) s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) df = pd.DataFrame(randn(8, 3), index=index, @@ -3421,7 +3479,6 @@ Closing a Store and using a context manager: :suppress: store.close() - import os os.remove('store.h5') @@ -3434,8 +3491,8 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) - df_tl.to_hdf('store_tl.h5','table', append=True) + df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) + df_tl.to_hdf('store_tl.h5', 'table', append=True) pd.read_hdf('store_tl.h5', 'table', where=['index>2']) .. 
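A supplementary sketch (the file name is illustrative) of the compression-by-extension behaviour described in the pickle section above: with the default ``compression='infer'``, a ``.gz`` suffix selects gzip on both the write and the read:

.. code-block:: python

   df.to_pickle('data.pkl.gz')      # gzip inferred from the extension
   pd.read_pickle('data.pkl.gz')    # round-trips transparently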
ipython:: python @@ -3447,10 +3504,6 @@ similar to how ``read_csv`` and ``to_csv`` work. HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. -.. ipython:: python - :suppress: - - import os .. ipython:: python @@ -3459,12 +3512,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b df_with_missing df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + format='table', mode='w') pd.read_hdf('file.h5', 'df_with_missing') df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) + format='table', mode='w', dropna=True) pd.read_hdf('file.h5', 'df_with_missing') @@ -3478,13 +3531,13 @@ This is also true for the major axis of a ``Panel``: .. ipython:: python matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], - [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], - [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_major_axis_all_missing=pd.Panel(matrix, - items=['Item1', 'Item2', 'Item3'], - major_axis=[1, 2], - minor_axis=['A', 'B', 'C']) + panel_with_major_axis_all_missing = pd.Panel(matrix, + items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], + minor_axis=['A', 'B', 'C']) panel_with_major_axis_all_missing @@ -3585,7 +3638,7 @@ everything in the sub-store and **below**, so be *careful*. store.put('foo/bar/bah', df) store.append('food/orange', df) - store.append('food/apple', df) + store.append('food/apple', df) store # a list of keys are returned @@ -3660,14 +3713,15 @@ defaults to `nan`. df_mixed = pd.DataFrame({'A': randn(8), 'B': randn(8), 'C': np.array(randn(8), dtype='float32'), - 'string':'string', + 'string': 'string', 'int': 1, 'bool': True, 'datetime64': pd.Timestamp('20010102')}, index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed.loc[df_mixed.index[3:5], + ['A', 'B', 'string', 'datetime64']] = np.nan - store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) + store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() @@ -3820,7 +3874,8 @@ Works with a Panel as well. store.append('wp', wp) store - store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', + "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3863,7 +3918,10 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i, seconds=10) for i in range(10) ])) + dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), + 'B': [pd.Timestamp('20130101') + timedelta(days=i, + seconds=10) + for i in range(10)]}) dftd['C'] = dftd['A'] - dftd['B'] dftd store.append('dftd', dftd, data_columns=True) @@ -3940,14 +3998,14 @@ be ``data_columns``. 
df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4: 6], 'string'] = np.nan - df_dc.loc[df_dc.index[7: 9], 'string'] = 'bar' + df_dc.loc[df_dc.index[4:6], 'string'] = np.nan + df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1: 3], ['B', 'C']] = 1.0 + df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) store.select('df_dc', where='B > 0') # getting creative @@ -3976,7 +4034,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python for df in store.select('df', chunksize=3): - print(df) + print(df) .. note:: @@ -4003,12 +4061,12 @@ chunks. store.append('dfeq', dfeq, data_columns=['number']) def chunks(l, n): - return [l[i: i+n] for i in range(0, len(l), n)] + return [l[i:i + n] for i in range(0, len(l), n)] evens = [2, 4, 6, 8, 10] coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select('dfeq', where=c)) Advanced Queries ++++++++++++++++ @@ -4105,13 +4163,13 @@ results. .. ipython:: python df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, - df_mt, selector='df1_mt') + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, + df_mt, selector='df1_mt') store # individual tables were created @@ -4120,7 +4178,7 @@ results. # as a multiple store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector = 'df1_mt') + selector='df1_mt') Delete from a Table @@ -4159,7 +4217,7 @@ the table using a ``where`` that selects all but the missing data. .. ipython:: python # returns the number of rows deleted - store.remove('wp', 'major_axis > 20000102' ) + store.remove('wp', 'major_axis > 20000102') store.select('wp') .. warning:: @@ -4332,7 +4390,7 @@ stored in a more efficient manner. .. ipython:: python dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8) }) + 'B': np.random.randn(8)}) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -4346,7 +4404,6 @@ stored in a more efficient manner. :okexcept: cstore.close() - import os os.remove('cats.h5') @@ -4374,7 +4431,7 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. ipython:: python - dfs = pd.DataFrame(dict(A='foo', B='bar'), index=list(range(5))) + dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) dfs # A and B have a size of 30 @@ -4393,7 +4450,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame(dict(A=['foo', 'bar', 'nan'])) + dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) dfss store.append('dfss', dfss) @@ -4420,11 +4477,10 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. 
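A supplementary sketch (not part of the patch; it assumes the ``df_dc`` table appended above with ``data_columns``) showing that several indexed data columns can be combined in a single ``where`` expression:

.. code-block:: python

   # rows where both numeric data columns are positive and the string column matches
   store.select('df_dc', where='B > 0 & C > 0 & string == "foo"')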
ipython:: python - np.random.seed(1) df_for_r = pd.DataFrame({"first": np.random.rand(100), "second": np.random.rand(100), "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + index=range(100)) df_for_r.head() store_export = pd.HDFStore('export.h5') @@ -4435,7 +4491,6 @@ It is possible to write an ``HDFStore`` object that can easily be imported into :suppress: store_export.close() - import os os.remove('export.h5') In R this file can be read into a ``data.frame`` object using the ``rhdf5`` @@ -4523,7 +4578,6 @@ Performance :suppress: store.close() - import os os.remove('store.h5') @@ -4589,7 +4643,6 @@ Read from a feather file. .. ipython:: python :suppress: - import os os.remove('example.feather') @@ -4673,7 +4726,6 @@ Read only certain columns of a parquet file. .. ipython:: python :suppress: - import os os.remove('example_pa.parquet') os.remove('example_fp.parquet') @@ -4722,7 +4774,8 @@ Parquet supports partitioning of data based on the values of one or more columns .. ipython:: python df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(fname='test', engine='pyarrow', partition_cols=['a'], compression=None) + df.to_parquet(fname='test', engine='pyarrow', + partition_cols=['a'], compression=None) The `fname` specifies the parent directory to which data will be saved. The `partition_cols` are the column names by which the dataset will be partitioned. @@ -4835,14 +4888,15 @@ the database using :func:`~pandas.DataFrame.to_sql`. import datetime c = ['id', 'Date', 'Col_1', 'Col_2', 'Col_3'] - d = [(26, datetime.datetime(2010,10,18), 'X', 27.5, True), - (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), - (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] + d = [(26, datetime.datetime(2010, 10, 18), 'X', 27.5, True), + (42, datetime.datetime(2010, 10, 19), 'Y', -12.5, False), + (63, datetime.datetime(2010, 10, 20), 'Z', 5.73, True)] - data = pd.DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python + data data.to_sql('data', engine) With some databases, writing large DataFrames can result in errors due to @@ -4999,7 +5053,8 @@ Specifying this will return an iterator through chunks of the query result: .. ipython:: python - for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): + for chunk in pd.read_sql_query("SELECT * FROM data_chunks", + engine, chunksize=5): print(chunk) You can also run a plain query without creating a ``DataFrame`` with @@ -5064,12 +5119,12 @@ If you have an SQLAlchemy description of your database you can express where con metadata = sa.MetaData() data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + sa.Column('index', sa.Integer), + sa.Column('Date', sa.DateTime), + sa.Column('Col_1', sa.String), + sa.Column('Col_2', sa.Float), + sa.Column('Col_3', sa.Boolean), + ) pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 == True), engine) @@ -5239,7 +5294,6 @@ values will have ``object`` data type. .. ipython:: python :suppress: - import os os.remove('stata.dta') .. _io.stata-categorical: @@ -5452,9 +5506,6 @@ And here's the code: .. 
code-block:: python - import os - import pandas as pd - import sqlite3 from numpy.random import randn sz = 1000000 diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 4fa1cb8be9234..bca7b6a601dd2 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -4,18 +4,12 @@ .. ipython:: python :suppress: - from datetime import datetime, timedelta, time import numpy as np import pandas as pd - from pandas import offsets + np.random.seed(123456) - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 - import dateutil - import pytz - from dateutil.relativedelta import relativedelta + pd.options.display.max_rows = 15 ******************************** Time Series / Date functionality @@ -32,7 +26,10 @@ Parsing time series information from various sources and formats .. ipython:: python - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime(2018, 1, 1)]) + import datetime + + dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), + datetime.datetime(2018, 1, 1)]) dti Generate sequences of fixed-frequency dates and time spans @@ -69,7 +66,7 @@ Performing date and time arithmetic with absolute or relative time increments saturday = friday + pd.Timedelta('1 day') saturday.day_name() # Add 1 business day (Friday --> Monday) - monday = friday + pd.tseries.offsets.BDay() + monday = friday + pd.offsets.BDay() monday.day_name() pandas provides a relatively compact and self-contained set of tools for @@ -110,12 +107,14 @@ However, :class:`Series` and :class:`DataFrame` can directly also support the ti pd.Series(pd.date_range('2000', freq='D', periods=3)) -:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime`` and ``timedelta`` -data when the time data is used as data itself. The ``Period`` and ``DateOffset`` data will be stored as ``object`` data. +:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` +and ``Period`` data when passed into those constructors. ``DateOffset`` +data however will be stored as ``object`` data. .. ipython:: python pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which @@ -141,7 +140,7 @@ time. .. ipython:: python - pd.Timestamp(datetime(2012, 5, 1)) + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp('2012-05-01') pd.Timestamp(2012, 5, 1) @@ -163,7 +162,9 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')] + dates = [pd.Timestamp('2012-05-01'), + pd.Timestamp('2012-05-02'), + pd.Timestamp('2012-05-03')] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -327,7 +328,7 @@ which can be specified. These are computed from the starting point specified by 1349979305, 1350065705], unit='s') pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500 ], unit='ms') + 1349720105400, 1349720105500], unit='ms') .. note:: @@ -400,7 +401,9 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. 
ipython:: python - dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + dates = [datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3)] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -418,8 +421,8 @@ to create a ``DatetimeIndex``. The default frequency for ``date_range`` is a .. ipython:: python - start = datetime(2011, 1, 1) - end = datetime(2012, 1, 1) + start = datetime.datetime(2011, 1, 1) + end = datetime.datetime(2012, 1, 1) index = pd.date_range(start, end) index @@ -486,7 +489,7 @@ used if a custom frequency string is passed. weekmask = 'Mon Wed Fri' - holidays = [datetime(2011, 1, 5), datetime(2011, 3, 14)] + holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) @@ -564,7 +567,7 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: ts['1/31/2011'] - ts[datetime(2011, 12, 25):] + ts[datetime.datetime(2011, 12, 25):] ts['10/31/2011':'12/31/2011'] @@ -583,9 +586,8 @@ would include matching times on an included date: .. ipython:: python - dft = pd.DataFrame(randn(100000,1), - columns=['A'], - index=pd.date_range('20130101',periods=100000,freq='T')) + dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], + index=pd.date_range('20130101', periods=100000, freq='T')) dft dft['2013'] @@ -622,10 +624,9 @@ We are stopping on the included end-point as it is part of the index: dft2 = pd.DataFrame(np.random.randn(20, 1), columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + index=pd.MultiIndex.from_product( + [pd.date_range('20130101', periods=10, freq='12H'), + ['a', 'b']])) dft2 dft2.loc['2013-01-05'] idx = pd.IndexSlice @@ -681,7 +682,7 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) + index=series_minute.index) dft_minute['2011-12-31 23'] @@ -693,18 +694,16 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc['2011-12-31 23:59'] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. ipython:: python series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', - '2012-01', - '2012-02'])) + pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly['2011-12'] # returns Series Exact Indexing @@ -716,13 +715,14 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime(2013, 1, 1):datetime(2013,2,28)] + dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime(2013, 1, 1, 10, 12, 0):datetime(2013, 2, 28, 10, 12, 0)] + dft[datetime.datetime(2013, 1, 1, 10, 12, 0): + datetime.datetime(2013, 2, 28, 10, 12, 0)] Truncating & Fancy Indexing @@ -823,120 +823,119 @@ on :ref:`.dt accessors`. DateOffset Objects ------------------ -In the preceding examples, we created ``DatetimeIndex`` objects at various -frequencies by passing in :ref:`frequency strings ` -like 'M', 'W', and 'BM' to the ``freq`` keyword. 
Under the hood, these frequency -strings are being translated into an instance of :class:`DateOffset`, -which represents a regular frequency increment. Specific offset logic like -"month", "business day", or "one hour" is represented in its various subclasses. - -.. csv-table:: - :header: "Class name", "Description" - :widths: 15, 65 - - DateOffset, "Generic offset class, defaults to 1 calendar day" - BDay, "business day (weekday)" - CDay, "custom business day" - Week, "one week, optionally anchored on a day of the week" - WeekOfMonth, "the x-th day of the y-th week of each month" - LastWeekOfMonth, "the x-th day of the last week of each month" - MonthEnd, "calendar month end" - MonthBegin, "calendar month begin" - BMonthEnd, "business month end" - BMonthBegin, "business month begin" - CBMonthEnd, "custom business month end" - CBMonthBegin, "custom business month begin" - SemiMonthEnd, "15th (or other day_of_month) and calendar month end" - SemiMonthBegin, "15th (or other day_of_month) and calendar month begin" - QuarterEnd, "calendar quarter end" - QuarterBegin, "calendar quarter begin" - BQuarterEnd, "business quarter end" - BQuarterBegin, "business quarter begin" - FY5253Quarter, "retail (aka 52-53 week) quarter" - YearEnd, "calendar year end" - YearBegin, "calendar year begin" - BYearEnd, "business year end" - BYearBegin, "business year begin" - FY5253, "retail (aka 52-53 week) year" - BusinessHour, "business hour" - CustomBusinessHour, "custom business hour" - Hour, "one hour" - Minute, "one minute" - Second, "one second" - Milli, "one millisecond" - Micro, "one microsecond" - Nano, "one nanosecond" - -The basic ``DateOffset`` takes the same arguments as -``dateutil.relativedelta``, which works as follows: - -.. ipython:: python - - d = datetime(2008, 8, 18, 9, 0) - d + relativedelta(months=4, days=5) - -We could have done the same thing with ``DateOffset``: - -.. ipython:: python - - from pandas.tseries.offsets import * - d + DateOffset(months=4, days=5) +In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify +a frequency that defined: -The key features of a ``DateOffset`` object are: +* how the date times in :class:`DatetimeIndex` were spaced when using :meth:`date_range` +* the frequency of a :class:`Period` or :class:`PeriodIndex` -* It can be added / subtracted to/from a datetime object to obtain a - shifted date. -* It can be multiplied by an integer (positive or negative) so that the - increment will be applied multiple times. -* It has :meth:`~pandas.DateOffset.rollforward` and - :meth:`~pandas.DateOffset.rollback` methods for moving a date forward or - backward to the next or previous "offset date". +These frequency strings map to a :class:`DateOffset` object and its subclasses. A :class:`DateOffset` +is similar to a :class:`Timedelta` that represents a duration of time but follows specific calendar duration rules. +For example, a :class:`Timedelta` day will always increment ``datetimes`` by 24 hours, while a :class:`DateOffset` day +will increment ``datetimes`` to the same time the next day whether a day represents 23, 24 or 25 hours due to daylight +savings time. However, all :class:`DateOffset` subclasses that are an hour or smaller +(``Hour``, ``Minute``, ``Second``, ``Milli``, ``Micro``, ``Nano``) behave like +:class:`Timedelta` and respect absolute time. -Subclasses of ``DateOffset`` define the ``apply`` function which dictates -custom date increment logic, such as adding business days: - -.. 
code-block:: python - - class BDay(DateOffset): - """DateOffset increments between business days""" - def apply(self, other): - ... +The basic :class:`DateOffset` acts similar to ``dateutil.relativedelta`` (`relativedelta documentation`_) +that shifts a date time by the corresponding calendar duration specified. The +arithmetic operator (``+``) or the ``apply`` method can be used to perform the shift. .. ipython:: python - d - 5 * BDay() - d + BMonthEnd() - -The ``rollforward`` and ``rollback`` methods do exactly what you would expect: - -.. ipython:: python - - d - offset = BMonthEnd() - offset.rollforward(d) - offset.rollback(d) - -It's definitely worth exploring the ``pandas.tseries.offsets`` module and the -various docstrings for the classes. + # This particular day contains a day light savings time transition + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + # Respects absolute time + ts + pd.Timedelta(days=1) + # Respects calendar time + ts + pd.DateOffset(days=1) + friday = pd.Timestamp('2018-01-05') + friday.day_name() + # Add 2 business days (Friday --> Tuesday) + two_business_days = 2 * pd.offsets.BDay() + two_business_days.apply(friday) + friday + two_business_days + (friday + two_business_days).day_name() + +Most ``DateOffsets`` have associated frequencies strings, or offset aliases, that can be passed +into ``freq`` keyword arguments. The available date offsets and associated frequency strings can be found below: -These operations (``apply``, ``rollforward`` and ``rollback``) preserve time -(hour, minute, etc) information by default. To reset time, use ``normalize`` -before or after applying the operation (depending on whether you want the -time information included in the operation. +.. csv-table:: + :header: "Date Offset", "Frequency String", "Description" + :widths: 15, 15, 65 + + ``DateOffset``, None, "Generic offset class, defaults to 1 calendar day" + ``BDay`` or ``BusinessDay``, ``'B'``,"business day (weekday)" + ``CDay`` or ``CustomBusinessDay``, ``'C'``, "custom business day" + ``Week``, ``'W'``, "one week, optionally anchored on a day of the week" + ``WeekOfMonth``, ``'WOM'``, "the x-th day of the y-th week of each month" + ``LastWeekOfMonth``, ``'LWOM'``, "the x-th day of the last week of each month" + ``MonthEnd``, ``'M'``, "calendar month end" + ``MonthBegin``, ``'MS'``, "calendar month begin" + ``BMonthEnd`` or ``BusinessMonthEnd``, ``'BM'``, "business month end" + ``BMonthBegin`` or ``BusinessMonthBegin``, ``'BMS'``, "business month begin" + ``CBMonthEnd`` or ``CustomBusinessMonthEnd``, ``'CBM'``, "custom business month end" + ``CBMonthBegin`` or ``CustomBusinessMonthBegin``, ``'CBMS'``, "custom business month begin" + ``SemiMonthEnd``, ``'SM'``, "15th (or other day_of_month) and calendar month end" + ``SemiMonthBegin``, ``'SMS'``, "15th (or other day_of_month) and calendar month begin" + ``QuarterEnd``, ``'Q'``, "calendar quarter end" + ``QuarterBegin``, ``'QS'``, "calendar quarter begin" + ``BQuarterEnd``, ``'BQ``, "business quarter end" + ``BQuarterBegin``, ``'BQS'``, "business quarter begin" + ``FY5253Quarter``, ``'REQ'``, "retail (aka 52-53 week) quarter" + ``YearEnd``, ``'A'``, "calendar year end" + ``YearBegin``, ``'AS'`` or ``'BYS'``,"calendar year begin" + ``BYearEnd``, ``'BA'``, "business year end" + ``BYearBegin``, ``'BAS'``, "business year begin" + ``FY5253``, ``'RE'``, "retail (aka 52-53 week) year" + ``Easter``, None, "Easter holiday" + ``BusinessHour``, ``'BH'``, "business hour" + ``CustomBusinessHour``, ``'CBH'``, "custom 
business hour" + ``Day``, ``'D'``, "one absolute day" + ``Hour``, ``'H'``, "one hour" + ``Minute``, ``'T'`` or ``'min'``,"one minute" + ``Second``, ``'S'``, "one second" + ``Milli``, ``'L'`` or ``'ms'``, "one millisecond" + ``Micro``, ``'U'`` or ``'us'``, "one microsecond" + ``Nano``, ``'N'``, "one nanosecond" + +``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback` +methods for moving a date forward or backward respectively to a valid offset +date relative to the offset. For example, business offsets will roll dates +that land on the weekends (Saturday and Sunday) forward to Monday since +business offsets operate on the weekdays. + +.. ipython:: python + + ts = pd.Timestamp('2018-01-06 00:00:00') + ts.day_name() + # BusinessHour's valid offset dates are Monday through Friday + offset = pd.offsets.BusinessHour(start='09:00') + # Bring the date to the closest offset date (Monday) + offset.rollforward(ts) + # Date is brought to the closest offset date first and then the hour is added + ts + offset + +These operations preserve time (hour, minute, etc) information by default. +To reset time to midnight, use :meth:`normalize` before or after applying +the operation (depending on whether you want the time information included +in the operation). .. ipython:: python ts = pd.Timestamp('2014-01-01 09:00') - day = Day() + day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() ts = pd.Timestamp('2014-01-01 22:00') - hour = Hour() + hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() hour.apply(pd.Timestamp("2014-01-01 23:30")).normalize() +.. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html + .. _timeseries.dayvscalendarday: Day vs. CalendarDay @@ -968,27 +967,28 @@ particular day of the week: .. ipython:: python + d = datetime.datetime(2008, 8, 18, 9, 0) d - d + Week() - d + Week(weekday=4) - (d + Week(weekday=4)).weekday() + d + pd.offsets.Week() + d + pd.offsets.Week(weekday=4) + (d + pd.offsets.Week(weekday=4)).weekday() - d - Week() + d - pd.offsets.Week() The ``normalize`` option will be effective for addition and subtraction. .. ipython:: python - d + Week(normalize=True) - d - Week(normalize=True) + d + pd.offsets.Week(normalize=True) + d - pd.offsets.Week(normalize=True) Another example is parameterizing ``YearEnd`` with the specific ending month: .. ipython:: python - d + YearEnd() - d + YearEnd(month=6) + d + pd.offsets.YearEnd() + d + pd.offsets.YearEnd(month=6) .. _timeseries.offsetseries: @@ -1004,9 +1004,9 @@ apply the offset to each element. rng = pd.date_range('2012-01-01', '2012-01-03') s = pd.Series(rng) rng - rng + DateOffset(months=2) - s + DateOffset(months=2) - s - DateOffset(months=2) + rng + pd.DateOffset(months=2) + s + pd.DateOffset(months=2) + s - pd.DateOffset(months=2) If the offset class maps directly to a ``Timedelta`` (``Day``, ``Hour``, ``Minute``, ``Second``, ``Micro``, ``Milli``, ``Nano``) it can be @@ -1015,10 +1015,10 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python - s - Day(2) + s - pd.offsets.Day(2) td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) td - td + Minute(15) + td + pd.offsets.Minute(15) Note that some offsets (such as ``BQuarterEnd``) do not have a vectorized implementation. They can still be used but may @@ -1027,7 +1027,7 @@ calculate significantly slower and will show a ``PerformanceWarning`` .. ipython:: python :okwarning: - rng + BQuarterEnd() + rng + pd.offsets.BQuarterEnd() .. 
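As a supplementary contrast (not part of the patch), offsets that map directly to a fixed ``Timedelta``, such as ``Day``, are applied to the same ``rng`` in a vectorized way and do not raise the ``PerformanceWarning`` shown above:

.. code-block:: python

   rng + pd.offsets.Day(2)   # vectorized; no PerformanceWarning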
_timeseries.custombusinessdays: @@ -1043,15 +1043,17 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. ipython:: python - from pandas.tseries.offsets import CustomBusinessDay weekmask_egypt = 'Sun Mon Tue Wed Thu' # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] - bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) - dt = datetime(2013, 4, 30) + holidays = ['2012-05-01', + datetime.datetime(2013, 5, 1), + np.datetime64('2014-05-01')] + bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, + weekmask=weekmask_egypt) + dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt Let's map to the weekday names: @@ -1060,7 +1062,8 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map( + pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1069,10 +1072,10 @@ Holiday calendars can be used to provide the list of holidays. See the from pandas.tseries.holiday import USFederalHolidayCalendar - bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar()) + bday_us = pd.offsets.CustomBusinessDay(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17) + dt = datetime.datetime(2014, 1, 17) # Tuesday after MLK Day (Monday is skipped because it's a holiday) dt + bday_us @@ -1082,15 +1085,15 @@ in the usual way. .. ipython:: python - from pandas.tseries.offsets import CustomBusinessMonthBegin - bmth_us = CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin( + calendar=USFederalHolidayCalendar()) # Skip new years - dt = datetime(2013, 12, 17) + dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.DatetimeIndex(start='20100101',end='20120101',freq=bmth_us) + pd.DatetimeIndex(start='20100101', end='20120101', freq=bmth_us) .. note:: @@ -1111,13 +1114,13 @@ allowing to use specific start and end times. By default, ``BusinessHour`` uses 9:00 - 17:00 as business hours. Adding ``BusinessHour`` will increment ``Timestamp`` by hourly frequency. -If target ``Timestamp`` is out of business hours, move to the next business hour -then increment it. If the result exceeds the business hours end, the remaining +If target ``Timestamp`` is out of business hours, move to the next business hour +then increment it. If the result exceeds the business hours end, the remaining hours are added to the next business day. .. ipython:: python - bh = BusinessHour() + bh = pd.offsets.BusinessHour() bh # 2014-08-01 is Friday @@ -1134,19 +1137,19 @@ hours are added to the next business day. pd.Timestamp('2014-08-01 16:30') + bh # Adding 2 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(2) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2) # Subtracting 3 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(-3) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3) -You can also specify ``start`` and ``end`` time by keywords. The argument must -be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` -instance. 
Specifying seconds, microseconds and nanoseconds as business hour +You can also specify ``start`` and ``end`` time by keywords. The argument must +be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` +instance. Specifying seconds, microseconds and nanoseconds as business hour results in ``ValueError``. .. ipython:: python - bh = BusinessHour(start='11:00', end=time(20, 0)) + bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0)) bh pd.Timestamp('2014-08-01 13:00') + bh @@ -1159,7 +1162,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine .. ipython:: python - bh = BusinessHour(start='17:00', end='09:00') + bh = pd.offsets.BusinessHour(start='17:00', end='09:00') bh pd.Timestamp('2014-08-01 17:00') + bh @@ -1184,22 +1187,22 @@ under the default business hours (9:00 - 17:00), there is no gap (0 minutes) bet .. ipython:: python # This adjusts a Timestamp to business hour edge - BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) - BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')). # And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00')) - BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) # BusinessDay results (for reference) - BusinessHour().rollforward(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02')) # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01')) # The result is the same as rollworward because BusinessDay never overlap. - BusinessHour().apply(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02')) -``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary -holidays, you can use ``CustomBusinessHour`` offset, as explained in the +``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary +holidays, you can use ``CustomBusinessHour`` offset, as explained in the following subsection. .. _timeseries.custombusinesshour: @@ -1216,9 +1219,9 @@ as ``BusinessHour`` except that it skips specified custom holidays. .. ipython:: python from pandas.tseries.holiday import USFederalHolidayCalendar - bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) + bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17, 15) + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -1229,7 +1232,8 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB .. ipython:: python - bhour_mon = CustomBusinessHour(start='10:00', weekmask='Tue Wed Thu Fri') + bhour_mon = pd.offsets.CustomBusinessHour(start='10:00', + weekmask='Tue Wed Thu Fri') # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 @@ -1285,7 +1289,7 @@ most functions: pd.date_range(start, periods=5, freq='B') - pd.date_range(start, periods=5, freq=BDay()) + pd.date_range(start, periods=5, freq=pd.offsets.BDay()) You can combine together day and intraday offsets: @@ -1352,39 +1356,39 @@ anchor point, and moved ``|n|-1`` additional steps forwards or backwards. .. 
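A supplementary sketch (the dates are illustrative) of the equivalence between an offset alias and the corresponding ``DateOffset`` object when passed to ``date_range``, as described in the aliases section above; the anchored-offset examples announced above follow below:

.. code-block:: python

   # 'W-FRI' and Week(weekday=4) describe the same weekly-on-Friday frequency
   pd.date_range('2018-01-01', periods=5, freq='W-FRI')
   pd.date_range('2018-01-01', periods=5, freq=pd.offsets.Week(weekday=4))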
ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=1) - pd.Timestamp('2014-01-02') + MonthEnd(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') - MonthBegin(n=1) - pd.Timestamp('2014-01-02') - MonthEnd(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') + MonthBegin(n=4) - pd.Timestamp('2014-01-02') - MonthBegin(n=4) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4) If the given date *is* on an anchor point, it is moved ``|n|`` points forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-01') + MonthBegin(n=1) - pd.Timestamp('2014-01-31') + MonthEnd(n=1) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') - MonthBegin(n=1) - pd.Timestamp('2014-01-31') - MonthEnd(n=1) + pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') + MonthBegin(n=4) - pd.Timestamp('2014-01-31') - MonthBegin(n=4) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4) For the case when ``n=0``, the date is not moved if on an anchor point, otherwise it is rolled forward to the next anchor point. .. ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=0) - pd.Timestamp('2014-01-02') + MonthEnd(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0) - pd.Timestamp('2014-01-01') + MonthBegin(n=0) - pd.Timestamp('2014-01-31') + MonthEnd(n=0) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0) .. _timeseries.holiday: @@ -1427,10 +1431,13 @@ An example of how holidays and holiday calendars are defined: USMemorialDay, Holiday('July 4th', month=7, day=4, observance=nearest_workday), Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))), #same as 2*Week(weekday=2) - ] + offset=pd.DateOffset(weekday=MO(2)))] + cal = ExampleCalendar() - cal.holidays(datetime(2012, 1, 1), datetime(2012, 12, 31)) + cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31)) + +:hint: + **weekday=MO(2)** is same as **2 * Week(weekday=2)** Using this calendar, creating an index or doing offset arithmetic skips weekends and holidays (i.e., Memorial Day/July 4th). For example, the below defines @@ -1440,14 +1447,13 @@ or ``Timestamp`` objects. .. ipython:: python - from pandas.tseries.offsets import CDay pd.DatetimeIndex(start='7/1/2012', end='7/10/2012', - freq=CDay(calendar=cal)).to_pydatetime() - offset = CustomBusinessDay(calendar=cal) - datetime(2012, 5, 25) + offset - datetime(2012, 7, 3) + offset - datetime(2012, 7, 3) + 2 * offset - datetime(2012, 7, 6) + offset + freq=pd.offsets.CDay(calendar=cal)).to_pydatetime() + offset = pd.offsets.CustomBusinessDay(calendar=cal) + datetime.datetime(2012, 5, 25) + offset + datetime.datetime(2012, 7, 3) + offset + datetime.datetime(2012, 7, 3) + 2 * offset + datetime.datetime(2012, 7, 6) + offset Ranges are defined by the ``start_date`` and ``end_date`` class attributes of ``AbstractHolidayCalendar``. The defaults are shown below. @@ -1462,8 +1468,8 @@ datetime/Timestamp/string. .. 
ipython:: python - AbstractHolidayCalendar.start_date = datetime(2012, 1, 1) - AbstractHolidayCalendar.end_date = datetime(2012, 12, 31) + AbstractHolidayCalendar.start_date = datetime.datetime(2012, 1, 1) + AbstractHolidayCalendar.end_date = datetime.datetime(2012, 12, 31) cal.holidays() Every calendar class is accessible by name using the ``get_calendar`` function @@ -1490,7 +1496,7 @@ Shifting / Lagging ~~~~~~~~~~~~~~~~~~ One may want to *shift* or *lag* the values in a time series back and forward in -time. The method for this is :meth:`~Series.shift`, which is available on all of +time. The method for this is :meth:`~Series.shift`, which is available on all of the pandas objects. .. ipython:: python @@ -1500,16 +1506,16 @@ the pandas objects. ts.shift(1) The ``shift`` method accepts an ``freq`` argument which can accept a -``DateOffset`` class or other ``timedelta``-like object or also an +``DateOffset`` class or other ``timedelta``-like object or also an :ref:`offset alias `: .. ipython:: python - ts.shift(5, freq=offsets.BDay()) + ts.shift(5, freq=pd.offsets.BDay()) ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and -``Series`` objects also have a :meth:`~Series.tshift` convenience method that +``Series`` objects also have a :meth:`~Series.tshift` convenience method that changes all the dates in the index by a specified number of offsets: .. ipython:: python @@ -1522,35 +1528,35 @@ is not being realigned. Frequency Conversion ~~~~~~~~~~~~~~~~~~~~ -The primary function for changing frequencies is the :meth:`~Series.asfreq` -method. For a ``DatetimeIndex``, this is basically just a thin, but convenient -wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and +The primary function for changing frequencies is the :meth:`~Series.asfreq` +method. For a ``DatetimeIndex``, this is basically just a thin, but convenient +wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay()) - ts = pd.Series(randn(3), index=dr) + dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + ts = pd.Series(np.random.randn(3), index=dr) ts - ts.asfreq(BDay()) + ts.asfreq(pd.offsets.BDay()) ``asfreq`` provides a further convenience so you can specify an interpolation method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method='pad') Filling Forward / Backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is +Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is documented in the :ref:`missing data section `. Converting to Python Datetimes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``DatetimeIndex`` can be converted to an array of Python native +``DatetimeIndex`` can be converted to an array of Python native :py:class:`datetime.datetime` objects using the ``to_pydatetime`` method. .. _timeseries.resampling: @@ -1563,13 +1569,13 @@ Resampling The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible. See the :ref:`whatsnew docs ` for a comparison with prior versions. -Pandas has a simple, powerful, and efficient functionality for performing -resampling operations during frequency conversion (e.g., converting secondly -data into 5-minutely data). 
This is extremely common in, but not limited to, +Pandas has a simple, powerful, and efficient functionality for performing +resampling operations during frequency conversion (e.g., converting secondly +data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -:meth:`~Series.resample` is a time-based groupby, followed by a reduction method -on each of its groups. See some :ref:`cookbook examples ` for +:meth:`~Series.resample` is a time-based groupby, followed by a reduction method +on each of its groups. See some :ref:`cookbook examples ` for some advanced strategies. Starting in version 0.18.1, the ``resample()`` function can be used directly from @@ -1577,7 +1583,7 @@ Starting in version 0.18.1, the ``resample()`` function can be used directly fro .. note:: - ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with + ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with a time-based offset, see a discussion :ref:`here `. Basics @@ -1624,7 +1630,7 @@ labels. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample('5Min').mean() # by default label='left' ts.resample('5Min', label='left').mean() @@ -1632,8 +1638,8 @@ labels. .. note:: - The default values for ``label`` and ``closed`` is 'left' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + The default values for ``label`` and ``closed`` is 'left' for all + frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. .. ipython:: python @@ -1680,9 +1686,9 @@ Sparse Resampling ~~~~~~~~~~~~~~~~~ Sparse timeseries are the ones where you have a lot fewer points relative -to the amount of time you are looking to resample. Naively upsampling a sparse -series can potentially generate lots of intermediate values. When you don't want -to use a method to fill these values, e.g. ``fill_method`` is ``None``, then +to the amount of time you are looking to resample. Naively upsampling a sparse +series can potentially generate lots of intermediate values. When you don't want +to use a method to fill these values, e.g. ``fill_method`` is ``None``, then intermediate values will be filled with ``NaN``. Since ``resample`` is a time-based groupby, the following is a method to efficiently @@ -1737,7 +1743,7 @@ We can select a specific column or columns using standard getitem. r['A'].mean() - r[['A','B']].mean() + r[['A', 'B']].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: @@ -1758,21 +1764,21 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, + 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. 
ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1784,9 +1790,9 @@ to resample based on datetimelike column in the frame, it can passed to the df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), 'a': np.arange(5)}, index=pd.MultiIndex.from_arrays([ - [1,2,3,4,5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v','d'])) + [1, 2, 3, 4, 5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v', 'd'])) df df.resample('M', on='date').sum() @@ -1845,13 +1851,13 @@ If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, .. ipython:: python p = pd.Period('2014-07-01 09:00', freq='H') - p + Hour(2) - p + timedelta(minutes=120) + p + pd.offsets.Hour(2) + p + datetime.timedelta(minutes=120) p + np.timedelta64(7200, 's') .. code-block:: ipython - In [1]: p + Minute(5) + In [1]: p + pd.offsets.Minute(5) Traceback ... ValueError: Input has different freq from Period(freq=H) @@ -1861,11 +1867,11 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. ipython:: python p = pd.Period('2014-07', freq='M') - p + MonthEnd(3) + p + pd.offsets.MonthEnd(3) .. code-block:: ipython - In [1]: p + MonthBegin(3) + In [1]: p + pd.offsets.MonthBegin(3) Traceback ... ValueError: Input has different freq from Period(freq=M) @@ -1923,11 +1929,11 @@ objects: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') idx - idx + Hour(2) + idx + pd.offsets.Hour(2) idx = pd.period_range('2014-07', periods=5, freq='M') idx - idx + MonthEnd(3) + idx + pd.offsets.MonthEnd(3) ``PeriodIndex`` has its own dtype named ``period``, refer to :ref:`Period Dtypes `. @@ -1977,7 +1983,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI ps['2011-01'] - ps[datetime(2011, 12, 25):] + ps[datetime.datetime(2011, 12, 25):] ps['10/31/2011':'12/31/2011'] @@ -1987,9 +1993,11 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par ps['2011'] - dfp = pd.DataFrame(np.random.randn(600,1), + dfp = pd.DataFrame(np.random.randn(600, 1), columns=['A'], - index=pd.period_range('2013-01-01 9:00', periods=600, freq='T')) + index=pd.period_range('2013-01-01 9:00', + periods=600, + freq='T')) dfp dfp['2013-01-01 10H'] @@ -2178,6 +2186,8 @@ time zones by starting with ``dateutil/``. .. ipython:: python + import dateutil + # pytz rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='Europe/London') @@ -2199,6 +2209,8 @@ which gives you more control over which time zone is used: .. ipython:: python + import pytz + # pytz tz_pytz = pytz.timezone('Europe/London') rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', @@ -2297,7 +2309,8 @@ To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or .. ipython:: python - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2350,7 +2363,8 @@ constructor as well as ``tz_localize``. 
rng_hourly.tz_localize('US/Eastern', ambiguous=rng_hourly_dst).tolist() rng_hourly.tz_localize('US/Eastern', ambiguous='NaT').tolist() - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2373,7 +2387,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava .. ipython:: python - dti = pd.date_range(start='2015-03-29 01:30:00', periods=3, freq='H') + dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2401,14 +2415,14 @@ TZ Aware Dtypes .. ipython:: python - s_naive = pd.Series(pd.date_range('20130101',periods=3)) + s_naive = pd.Series(pd.date_range('20130101', periods=3)) s_naive ``Series/DatetimeIndex`` with a timezone **aware** value are represented with a dtype of ``datetime64[ns, tz]``. .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern')) + s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) s_aware Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see :ref:`here `. diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 2c9aecdde2f45..f9e3dc0f5c348 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -378,6 +378,7 @@ Backwards incompatible API changes "worked" purely due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `decode` and `len` (:issue:`23011`, :issue:`23163`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: @@ -1215,6 +1216,7 @@ Performance Improvements The speed increase is both when indexing by label (using .loc) and position(.iloc) (:issue:`20395`) Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`) +- Improved performance of :meth:`CategoricalIndex.equals` when comparing to another :class:`CategoricalIndex` (:issue:`24023`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) @@ -1230,7 +1232,7 @@ Performance Improvements - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - +- Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`) .. 
_whatsnew_0240.docs: @@ -1256,6 +1258,7 @@ Categorical - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). - In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) +- Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) Datetimelike ^^^^^^^^^^^^ @@ -1418,6 +1421,7 @@ MultiIndex I/O ^^^ + .. _whatsnew_0240.bug_fixes.nan_with_str_dtype: Proper handling of `np.NaN` in a string data-typed column with the Python engine @@ -1481,6 +1485,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) Plotting @@ -1532,6 +1537,7 @@ Reshaping - Bug in :func:`pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) - Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) - Bug in ``Series`` construction when passing no data and ``dtype=str`` (:issue:`22477`) +- Bug in :func:`cut` with ``bins`` as an overlapping ``IntervalIndex`` where multiple bins were returned per item instead of raising a ``ValueError`` (:issue:`23980`) .. 
_whatsnew_0240.bug_fixes.sparse: diff --git a/environment.yml b/environment.yml index fc35f1290f1b1..4daaa90247fa8 100644 --- a/environment.yml +++ b/environment.yml @@ -4,22 +4,21 @@ channels: - conda-forge dependencies: # required - - NumPy + - numpy>=1.15 - python=3 - python-dateutil>=2.5.0 - pytz # development - - Cython>=0.28.2 + - cython>=0.28.2 - flake8 - flake8-comprehensions - - flake8-rst=0.4.2 + - flake8-rst>=0.6.0 - gitpython - - hypothesis>=3.58.0 + - hypothesis>=3.82 - isort - moto - - pytest>=3.6 - - setuptools>=24.2.0 + - pytest>=4.0 - sphinx - sphinxcontrib-spelling @@ -28,7 +27,6 @@ dependencies: - blosc - bottleneck>=1.2.0 - fastparquet>=0.1.2 - - gcsfs - html5lib - ipython>=5.6.0 - ipykernel @@ -36,15 +34,13 @@ dependencies: - lxml - matplotlib>=2.0.0 - nbsphinx - - numexpr>=2.6.1 + - numexpr>=2.6.8 - openpyxl - pyarrow>=0.7.0 - - pymysql - pytables>=3.4.2 - pytest-cov - pytest-xdist - - s3fs - - scipy>=0.18.1 + - scipy>=1.1 - seaborn - sqlalchemy - statsmodels diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1dc71264c94dd..a459057555cf3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1245,6 +1245,10 @@ cdef class TextReader: result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=i)) return result, na_count elif dtype.kind == 'S': diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 609608a0948c5..a3e6c7e344940 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -520,9 +520,10 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # specify error conditions assert is_raise or is_ignore or is_coerce + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + try: - result = np.empty(n, dtype='M8[ns]') - iresult = result.view('i8') for i in range(n): val = values[i] @@ -571,16 +572,13 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', elif is_datetime64_object(val): seen_datetime = 1 - if get_datetime64_value(val) == NPY_NAT: - iresult[i] = NPY_NAT - else: - try: - iresult[i] = get_datetime64_nanos(val) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + try: + iresult[i] = get_datetime64_nanos(val) + except OutOfBoundsDatetime: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition @@ -706,62 +704,85 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', raise TypeError("{typ} is not convertible to datetime" .format(typ=type(val))) - if seen_datetime and seen_integer: - # we have mixed datetimes & integers - - if is_coerce: - # coerce all of the integers/floats to NaT, preserve - # the datetimes and other convertibles - for i in range(n): - val = values[i] - if is_integer_object(val) or is_float_object(val): - result[i] = NPY_NAT - elif is_raise: - raise ValueError( - "mixed datetimes and integers in passed array") - else: - raise TypeError - - if seen_datetime_offset and not utc_convert: - # GH 17697 - # 1) If all the offsets are equal, return one offset for - # the parsed dates to (maybe) pass to DatetimeIndex - # 2) If the offsets are different, then force the parsing down the - # object path where an array of datetimes - # (with individual dateutil.tzoffsets) are returned - is_same_offsets = 
len(out_tzoffset_vals) == 1 - if not is_same_offsets: - return array_to_datetime_object(values, is_raise, - dayfirst, yearfirst) - else: - tz_offset = out_tzoffset_vals.pop() - tz_out = pytz.FixedOffset(tz_offset / 60.) - return result, tz_out except OutOfBoundsDatetime: if is_raise: raise - oresult = np.empty(n, dtype=object) - for i in range(n): - val = values[i] + return ignore_errors_out_of_bounds_fallback(values), tz_out - # set as nan except if its a NaT - if checknull_with_nat(val): - if isinstance(val, float): - oresult[i] = np.nan - else: - oresult[i] = NaT - elif is_datetime64_object(val): - if get_datetime64_value(val) == NPY_NAT: - oresult[i] = NaT - else: - oresult[i] = val.item() - else: - oresult[i] = val - return oresult, tz_out except TypeError: return array_to_datetime_object(values, is_raise, dayfirst, yearfirst) + if seen_datetime and seen_integer: + # we have mixed datetimes & integers + + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + + if seen_datetime_offset and not utc_convert: + # GH#17697 + # 1) If all the offsets are equal, return one offset for + # the parsed dates to (maybe) pass to DatetimeIndex + # 2) If the offsets are different, then force the parsing down the + # object path where an array of datetimes + # (with individual dateutil.tzoffsets) are returned + is_same_offsets = len(out_tzoffset_vals) == 1 + if not is_same_offsets: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + else: + tz_offset = out_tzoffset_vals.pop() + tz_out = pytz.FixedOffset(tz_offset / 60.) + return result, tz_out + + +cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): + """ + Fallback for array_to_datetime if an OutOfBoundsDatetime is raised + and errors == "ignore" + + Parameters + ---------- + values : ndarray[object] + + Returns + ------- + ndarray[object] + """ + cdef: + Py_ssize_t i, n = len(values) + object val + + oresult = np.empty(n, dtype=object) + + for i in range(n): + val = values[i] + + # set as nan except if its a NaT + if checknull_with_nat(val): + if isinstance(val, float): + oresult[i] = np.nan + else: + oresult[i] = NaT + elif is_datetime64_object(val): + if get_datetime64_value(val) == NPY_NAT: + oresult[i] = NaT + else: + oresult[i] = val.item() + else: + oresult[i] = val + return oresult + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 67c2793e4bcef..4a34065fe471f 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -62,8 +62,11 @@ cdef inline int64_t get_datetime64_nanos(object val) except? 
-1: NPY_DATETIMEUNIT unit npy_datetime ival - unit = get_datetime64_unit(val) ival = get_datetime64_value(val) + if ival == NPY_NAT: + return NPY_NAT + + unit = get_datetime64_unit(val) if unit != NPY_FR_ns: pandas_datetime_to_datetimestruct(ival, unit, &dts) @@ -283,10 +286,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, if ts is None or ts is NaT: obj.value = NPY_NAT elif is_datetime64_object(ts): - if ts.view('i8') == NPY_NAT: - obj.value = NPY_NAT - else: - obj.value = get_datetime64_nanos(ts) + obj.value = get_datetime64_nanos(ts) + if obj.value != NPY_NAT: dt64_to_dtstruct(obj.value, &obj.dts) elif is_integer_object(ts): if ts == NPY_NAT: @@ -887,8 +888,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, int64_t *tdata int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins int64_t HOURS_NS = HOUR_SECONDS * 1000000000 - ndarray[int64_t] trans, result, result_a, result_b, dst_hours - ndarray[int64_t] trans_idx, grp, delta, a_idx, b_idx, one_diff + ndarray[int64_t] trans, result, result_a, result_b, dst_hours, delta + ndarray trans_idx, grp, a_idx, b_idx, one_diff npy_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False bint shift = False, fill_nonexist = False diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5db851d4bf021..42696e4796fe0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -347,6 +347,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) + + # GH23814, for perf, if values._values already an instance of + # Categorical, set values to codes, and run fastpath + if (isinstance(values, (ABCSeries, ABCIndexClass)) and + isinstance(values._values, type(self))): + values = values._values.codes.copy() + if categories is None: + categories = dtype.categories + fastpath = True + else: # If dtype=None and values is not categorical, create a new dtype dtype = CategoricalDtype(categories, ordered) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 83ee335aa5465..a6f254c79fb51 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -10,11 +10,12 @@ from pandas._libs.tslibs.period import ( DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds -from pandas._libs.tslibs.timestamps import maybe_integer_op_deprecated +from pandas._libs.tslibs.timestamps import ( + RoundTo, maybe_integer_op_deprecated, round_nsint64) import pandas.compat as compat from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) -from pandas.util._decorators import deprecate_kwarg +from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_any_dtype, is_datetime64_dtype, @@ -80,6 +81,189 @@ def _get_attributes_dict(self): return {k: getattr(self, k, None) for k in self._attributes} +class DatelikeOps(object): + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + def strftime(self, date_format): + from pandas import Index + return Index(self.format(date_format=date_format), + dtype=compat.text_type) + strftime.__doc__ = """ + Convert to Index using specified date_format. 
+ + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format doc <{0}>`__ + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d"). + + Returns + ------- + Index + Index of formatted strings + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%B %d, %Y, %r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """.format("https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior") + + +class TimelikeOps(object): + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _round_doc = ( + """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times + + .. versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. 
+ + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """) + + _round_example = ( + """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """) + + _floor_example = ( + """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + ) + + _ceil_example = ( + """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + ) + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + values = _ensure_datetimelike_to_i8(self) + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + + attribs = self._get_attributes_dict() + attribs['freq'] = None + if 'tz' in attribs: + attribs['tz'] = None + return self._ensure_localized( + self._shallow_copy(result, **attribs), ambiguous, nonexistent + ) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round( + freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent + ) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -1023,3 +1207,39 @@ def validate_dtype_freq(dtype, freq): raise IncompatibleFrequency('specified freq and dtype ' 'are different') return freq + + +def _ensure_datetimelike_to_i8(other, to_utc=False): + """ + Helper for coercing an input scalar or array to i8. + + Parameters + ---------- + other : 1d array + to_utc : bool, default False + If True, convert the values to UTC before extracting the i8 values + If False, extract the i8 values directly. 
+ + Returns + ------- + i8 1d array + """ + from pandas import Index + from pandas.core.arrays import PeriodArray + + if lib.is_scalar(other) and isna(other): + return iNaT + elif isinstance(other, (PeriodArray, ABCIndexClass)): + # convert tz if needed + if getattr(other, 'tz', None) is not None: + if to_utc: + other = other.tz_convert('UTC') + else: + other = other.tz_localize(None) + else: + try: + return np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerced to int + other = Index(other) + return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 4d3caaacca1c1..050442c530314 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -156,7 +156,9 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin): +class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, + dtl.TimelikeOps, + dtl.DatelikeOps): """ Assumes that subclass __new__/__init__ defines: tz diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 856a01e41ce13..6a7ce7033efa0 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -129,7 +129,7 @@ def method(self, other): return method -class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin): +class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" __array_priority__ = 1000 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 06519da9a26d5..f50be694b47c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1298,10 +1298,10 @@ def to_dict(self, orient='dict', into=dict): >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1.0, 0.5], [2.0, 0.75]]} + 'data': [[1, 0.5], [2, 0.75]]} >>> df.to_dict('records') - [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}] + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} @@ -1317,8 +1317,8 @@ def to_dict(self, orient='dict', into=dict): >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) - [defaultdict(, {'col1': 1.0, 'col2': 0.5}), - defaultdict(, {'col1': 2.0, 'col2': 0.75})] + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " @@ -1334,16 +1334,18 @@ def to_dict(self, orient='dict', into=dict): elif orient.lower().startswith('sp'): return into_c((('index', self.index.tolist()), ('columns', self.columns.tolist()), - ('data', lib.map_infer(self.values.ravel(), - com.maybe_box_datetimelike) - .reshape(self.values.shape).tolist()))) + ('data', [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False)] + ))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [into_c((k, com.maybe_box_datetimelike(v)) - for k, v in zip(self.columns, np.atleast_1d(row))) - for row in self.values] + return [ + into_c((k, com.maybe_box_datetimelike(v)) + for k, v in compat.iteritems(row._asdict())) + for row in self.itertuples(index=False)] elif orient.lower().startswith('i'): if not self.index.is_unique: raise ValueError( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 08c07da39128f..c58c84b422209 100644 --- 
a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3420,71 +3420,102 @@ class max_speed def xs(self, key, axis=0, level=None, drop_level=True): """ - Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. Defaults to cross-section on the rows (axis=0). + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. Parameters ---------- - key : object - Some label contained in the index, or partially in a MultiIndex - axis : int, default 0 - Axis to retrieve cross-section on + key : label or tuple of label + Label contained in the index, or partially in a MultiIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to retrieve cross-section on. level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. - drop_level : boolean, default True + drop_level : bool, default True If False, returns object with same levels as self. + Returns + ------- + Series or DataFrame + Cross-section from the original Series or DataFrame + corresponding to the selected index levels. + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + DataFrame.iloc : Purely integer-location based indexing + for selection by position. + + Notes + ----- + `xs` can not be used to set values. + + MultiIndex Slicers is a generic way to get/set values on + any level or levels. + It is a superset of `xs` functionality, see + :ref:`MultiIndex Slicers `. + Examples -------- + >>> d = {'num_legs': [4, 4, 2, 2], + ... 'num_wings': [0, 0, 2, 2], + ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], + ... 'animal': ['cat', 'dog', 'bat', 'penguin'], + ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> df = pd.DataFrame(data=d) + >>> df = df.set_index(['class', 'animal', 'locomotion']) >>> df - A B C - a 4 5 2 - b 4 0 9 - c 9 7 3 - >>> df.xs('a') - A 4 - B 5 - C 2 - Name: a - >>> df.xs('C', axis=1) - a 2 - b 9 - c 3 - Name: C + num_legs num_wings + class animal locomotion + mammal cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + bird penguin walks 2 2 - >>> df - A B C D - first second third - bar one 1 4 1 8 9 - two 1 7 5 5 0 - baz one 1 6 6 8 0 - three 2 5 3 5 3 - >>> df.xs(('baz', 'three')) - A B C D - third - 2 5 3 5 3 - >>> df.xs('one', level=1) - A B C D - first third - bar 1 4 1 8 9 - baz 1 6 6 8 0 - >>> df.xs(('baz', 2), level=[0, 'third']) - A B C D - second - three 5 3 5 3 + Get values at specified index - Returns - ------- - xs : Series or DataFrame + >>> df.xs('mammal') + num_legs num_wings + animal locomotion + cat walks 4 0 + dog walks 4 0 + bat flies 2 2 - Notes - ----- - xs is only for getting, not setting values. + Get values at several indexes + + >>> df.xs(('mammal', 'dog')) + num_legs num_wings + locomotion + walks 4 0 + + Get values at specified index and level + + >>> df.xs('cat', level=1) + num_legs num_wings + class locomotion + mammal walks 4 0 + + Get values at several indexes and levels + + >>> df.xs(('bird', 'walks'), + ... level=[0, 'locomotion']) + num_legs num_wings + animal + penguin 2 2 + + Get values at specified column and axis - MultiIndex Slicers is a generic way to get/set values on any level or - levels. 
It is a superset of xs functionality, see - :ref:`MultiIndex Slicers ` + >>> df.xs('num_wings', axis=1) + class animal locomotion + mammal cat walks 0 + dog walks 0 + bat flies 2 + bird penguin walks 2 + Name: num_wings, dtype: int64 """ axis = self._get_axis_number(axis) labels = self._get_axis(axis) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 6b84e8deea493..91c7648d5cf2e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -13,7 +13,7 @@ is_scalar) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCCategorical, ABCSeries -from pandas.core.dtypes.missing import array_equivalent, isna +from pandas.core.dtypes.missing import isna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -283,7 +283,9 @@ def equals(self, other): try: other = self._is_dtype_compat(other) - return array_equivalent(self._data, other) + if isinstance(other, type(self)): + other = other._data + return self._data.equals(other) except (TypeError, ValueError): pass diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5e25efe77d8b9..0dedd8fe1cf4b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -7,8 +7,6 @@ import numpy as np from pandas._libs import NaT, iNaT, lib -from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 -import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly @@ -19,11 +17,10 @@ is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, is_scalar, is_string_dtype) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna from pandas.core import algorithms, ops -from pandas.core.arrays import PeriodArray -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.arrays.datetimelike import ( + DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8) import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta @@ -33,188 +30,6 @@ _index_doc_kwargs = dict(ibase._index_doc_kwargs) -class DatelikeOps(object): - """ - Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. - """ - - def strftime(self, date_format): - return Index(self.format(date_format=date_format), - dtype=compat.text_type) - strftime.__doc__ = """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format doc <{0}>`__ - - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). - - Returns - ------- - Index - Index of formatted strings - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... 
periods=3, freq='s') - >>> rng.strftime('%B %d, %Y, %r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """.format("https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") - - -class TimelikeOps(object): - """ - Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. - """ - - _round_doc = ( - """ - Perform {op} operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - :ref:`frequency aliases ` for - a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times - - .. versionadded:: 0.24.0 - nonexistent : 'shift', 'NaT', default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift' will shift the nonexistent time forward to the closest - existing time - - 'NaT' will return NaT where there are nonexistent times - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 - - Returns - ------- - DatetimeIndex, TimedeltaIndex, or Series - Index of the same type for a DatetimeIndex or TimedeltaIndex, - or a Series with the same index for a Series. - - Raises - ------ - ValueError if the `freq` cannot be converted. 
- - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') - """) - - _round_example = ( - """>>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """) - - _floor_example = ( - """>>> rng.floor('H') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.floor("H") - 0 2018-01-01 11:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - ) - - _ceil_example = ( - """>>> rng.ceil('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.ceil("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 13:00:00 - dtype: datetime64[ns] - """ - ) - - def _round(self, freq, mode, ambiguous, nonexistent): - # round the local times - values = _ensure_datetimelike_to_i8(self) - result = round_nsint64(values, mode, freq) - result = self._maybe_mask_results(result, fill_value=NaT) - - attribs = self._get_attributes_dict() - attribs['freq'] = None - if 'tz' in attribs: - attribs['tz'] = None - return self._ensure_localized( - self._shallow_copy(result, **attribs), ambiguous, nonexistent - ) - - @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round( - freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent - ) - - @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) - - @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) - - class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): """ common ops mixin to support a unified interface datetimelike Index """ @@ -754,39 +569,6 @@ def _time_shift(self, periods, freq=None): return result -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - Helper for coercing an input scalar or array to i8. - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. 
- - Returns - ------- - i8 1d array - """ - if is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, (PeriodArray, ABCIndexClass)): - # convert tz if needed - if getattr(other, 'tz', None) is not None: - if to_utc: - other = other.tz_convert('UTC') - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view('i8') - except TypeError: - # period array cannot be coerced to int - other = Index(other) - return other.asi8 - - def wrap_arithmetic_op(self, other, result): if result is NotImplemented: return NotImplemented diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 01901d022da32..8f36096d128c2 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -31,8 +31,7 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatelikeOps, DatetimeIndexOpsMixin, TimelikeOps, wrap_array_method, - wrap_field_accessor) + DatetimeIndexOpsMixin, wrap_array_method, wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools @@ -62,8 +61,7 @@ def _new_DatetimeIndex(cls, d): return result -class DatetimeIndex(DatetimeArray, DatelikeOps, TimelikeOps, - DatetimeIndexOpsMixin, Int64Index): +class DatetimeIndex(DatetimeArray, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 56df454bddf1c..3cdefb02ef8b3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -26,7 +26,7 @@ import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index from pandas.core.indexes.datetimelike import ( - DatelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op) + DatetimeIndexOpsMixin, wrap_arithmetic_op) from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name @@ -83,7 +83,7 @@ def _delegate_method(self, name, *args, **kwargs): if x not in {"asfreq", "to_timestamp"}], typ="method", overwrite=True) -class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, +class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ Immutable ndarray holding ordinal values indicating regular periods in diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ed4e43df8f41a..e33d61d29d302 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -22,7 +22,7 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, TimelikeOps, wrap_arithmetic_op, wrap_array_method, + DatetimeIndexOpsMixin, wrap_arithmetic_op, wrap_array_method, wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name @@ -31,8 +31,7 @@ from pandas.tseries.frequencies import to_offset -class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, - TimelikeOps, Int64Index): +class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and which can be 
boxed to timedelta objects diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 8ad2a48e8767c..5d5f6cf8102be 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,7 +43,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, and maximum values of `x`. * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of `x` is done. - * IntervalIndex : Defines the exact bins to be used. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. right : bool, default True Indicates whether `bins` includes the rightmost edge or not. If @@ -217,7 +218,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, bins[-1] += adj elif isinstance(bins, IntervalIndex): - pass + if bins.is_overlapping: + raise ValueError('Overlapping IntervalIndex is not accepted.') + else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4d9f1567b371a..447eac8a57013 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, is_list_like, is_re, is_scalar, is_string_like) -from pandas.core.dtypes.generic import ABCIndex, ABCMultiIndex, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d @@ -931,7 +931,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, ABCIndex): + if isinstance(arr, ABCIndexClass): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -1920,7 +1920,7 @@ def __iter__(self): def _wrap_result(self, result, use_codes=True, name=None, expand=None, fill_value=np.nan): - from pandas.core.index import Index, MultiIndex + from pandas import Index, Series, MultiIndex # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -1928,7 +1928,8 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... 
if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes, + # if self._orig is a CategoricalIndex, there is no .cat-accessor + result = take_1d(result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7ade35f93a858..aadca1fcb3bef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -27,9 +27,9 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype) + ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, + is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_scalar, is_string_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, # type specified in dtype param if cast_type and not is_dtype_equal(cvals, cast_type): + try: + if (is_bool_dtype(cast_type) and + not is_categorical_dtype(cast_type) + and na_count > 0): + raise ValueError("Bool column has NA values in " + "column {column}" + .format(column=c)) + except (AttributeError, TypeError): + # invalid input to is_bool_dtype + pass cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 61fe9d12c173c..f1eb6a33eddeb 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -150,7 +150,7 @@ def test_to_records_index_name(self): def test_to_records_with_unicode_index(self): # GH13172 # unicode_literals conflict with to_records - result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\ + result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a') \ .to_records() expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) tm.assert_almost_equal(result, expected) @@ -281,17 +281,23 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) - def test_to_dict_box_scalars(self): - # 14216 + # orient - orient argument to to_dict function + # item_getter - function for extracting value from + # the resulting dict using column name and index + @pytest.mark.parametrize('orient,item_getter', [ + ('dict', lambda d, col, idx: d[col][idx]), + ('records', lambda d, col, idx: d[idx][col]), + ('list', lambda d, col, idx: d[col][idx]), + ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]), + ('index', lambda d, col, idx: d[idx][col]) + ]) + def test_to_dict_box_scalars(self, orient, item_getter): + # 14216, 23753 # make sure that we are boxing properly - d = {'a': [1], 'b': ['b']} - - result = DataFrame(d).to_dict() - assert isinstance(list(result['a'])[0], (int, long)) - assert isinstance(list(result['b'])[0], (int, long)) - - result = DataFrame(d).to_dict(orient='records') - assert isinstance(result[0]['a'], (int, long)) + df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, 'a', 0), (int, long)) + assert isinstance(item_getter(result, 'b', 0), float) def test_frame_to_dict_tz(self): # GH18372 When converting to dict with orient='records' columns of diff --git 
a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 921984bc44e50..1b6d2ee8a062e 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -421,3 +421,21 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data, na_values", [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", 'foo'), + ("false,1\nfoo,1\ntrue", ['foo']), + ("false,1\nfoo,1\ntrue", {'a': 'foo'}), +]) +def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): + parser = all_parsers + msg = ("(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0)") + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, names=['a', 'b'], + dtype={'a': 'bool'}, na_values=na_values) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index f04e9a55a6c8d..19f1a9a8b65c7 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -91,6 +91,14 @@ def test_bins_from_intervalindex(self): tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype='int8')) + def test_bins_not_overlapping_from_intervalindex(self): + # see gh-23980 + msg = "Overlapping IntervalIndex is not accepted" + ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)]) + + with pytest.raises(ValueError, match=msg): + cut([5, 6], bins=ii) + def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] pytest.raises(ValueError, cut, data, [0.1, 1.5, 1, 10]) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d6c6a8652e728..7a88a96e9c609 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -218,9 +218,6 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if box == Index and dtype == 'category': - pytest.xfail(reason='Broken methods on CategoricalIndex; ' - 'see GH 23556') if (method_name in ['partition', 'rpartition'] and box == Index and inferred_dtype == 'empty'): pytest.xfail(reason='Method cannot deal with empty Index') @@ -247,6 +244,7 @@ def test_api_per_method(self, box, dtype, + ['mixed', 'mixed-integer'] * mixed_allowed) if inferred_dtype in allowed_types: + # xref GH 23555, GH 23556 method(*args, **kwargs) # works! 
else: # GH 23011, GH 23163 diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 9f5b4f7b90d9f..84bc1863aadd9 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -10,272 +10,319 @@ import pandas.util.testing as tm -class TestHashing(object): - - @pytest.fixture(params=[ - Series([1, 2, 3] * 3, dtype='int32'), - Series([None, 2.5, 3.5] * 3, dtype='float32'), - Series(['a', 'b', 'c'] * 3, dtype='category'), - Series(['d', 'e', 'f'] * 3), - Series([True, False, True] * 3), - Series(pd.date_range('20130101', periods=9)), - Series(pd.date_range('20130101', periods=9, tz='US/Eastern')), - Series(pd.timedelta_range('2000', periods=9))]) - def series(self, request): - return request.param - - def test_consistency(self): - # check that our hash doesn't change because of a mistake - # in the actual code; this is the ground truth - result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) - expected = Series(np.array([3600424527151052760, 1374399572096150070, - 477881037637427054], dtype='uint64'), - index=['foo', 'bar', 'baz']) - tm.assert_series_equal(result, expected) - - def test_hash_array(self, series): - a = series.values - tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) - - def test_hash_array_mixed(self): - result1 = hash_array(np.array([3, 4, 'All'])) - result2 = hash_array(np.array(['3', '4', 'All'])) - result3 = hash_array(np.array([3, 4, 'All'], dtype=object)) - tm.assert_numpy_array_equal(result1, result2) - tm.assert_numpy_array_equal(result1, result3) - - @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) - def test_hash_array_errors(self, val): - msg = 'must pass a ndarray-like' - with pytest.raises(TypeError, match=msg): - hash_array(val) - - def check_equal(self, obj, **kwargs): - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - - kwargs.pop('index', None) - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - - def check_not_equal_with_index(self, obj): - - # check that we are not hashing the same if - # we include the index - if not isinstance(obj, Index): - a = hash_pandas_object(obj, index=True) - b = hash_pandas_object(obj, index=False) - if len(obj): - assert not (a == b).all() - - def test_hash_tuples(self): - tups = [(1, 'one'), (1, 'two'), (2, 'one')] - result = hash_tuples(tups) - expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values - tm.assert_numpy_array_equal(result, expected) - - result = hash_tuples(tups[0]) - assert result == expected[0] - - @pytest.mark.parametrize('tup', [ - (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), - ('A', pd.Timestamp("2012-01-01"))]) - def test_hash_tuple(self, tup): - # test equivalence between hash_tuples and hash_tuple - result = hash_tuple(tup) - expected = hash_tuples([tup])[0] - assert result == expected - - @pytest.mark.parametrize('val', [ - 1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz='Europe/Brussels'), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), - pd.Timedelta('1 days'), datetime.timedelta(1), - pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), - np.nan, pd.NaT, None]) - def test_hash_scalar(self, val): - result = _hash_scalar(val) - expected = hash_array(np.array([val], dtype=object), categorize=True) - assert result[0] == expected[0] - - @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) - 
def test_hash_tuples_err(self, val): - msg = 'must be convertible to a list-of-tuples' - with pytest.raises(TypeError, match=msg): - hash_tuples(val) - - def test_multiindex_unique(self): - mi = MultiIndex.from_tuples([(118, 472), (236, 118), - (51, 204), (102, 51)]) - assert mi.is_unique is True - result = hash_pandas_object(mi) - assert result.is_unique is True - - def test_multiindex_objects(self): - mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], - labels=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=['col1', 'col2']) - recons = mi._sort_levels_monotonic() - - # these are equal - assert mi.equals(recons) - assert Index(mi.values).equals(Index(recons.values)) - - # _hashed_values and hash_pandas_object(..., index=False) - # equivalency - expected = hash_pandas_object( - mi, index=False).values - result = mi._hashed_values - tm.assert_numpy_array_equal(result, expected) - - expected = hash_pandas_object( - recons, index=False).values - result = recons._hashed_values - tm.assert_numpy_array_equal(result, expected) - - expected = mi._hashed_values - result = recons._hashed_values - - # values should match, but in different order - tm.assert_numpy_array_equal(np.sort(result), - np.sort(expected)) - - @pytest.mark.parametrize('obj', [ - Series([1, 2, 3]), - Series([1.0, 1.5, 3.2]), - Series([1.0, 1.5, np.nan]), - Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), - Series(['a', 'b', 'c']), - Series(['a', np.nan, 'c']), - Series(['a', None, 'c']), - Series([True, False, True]), - Series(), - Index([1, 2, 3]), - Index([True, False, True]), - DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), - DataFrame(), - tm.makeMissingDataframe(), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), - Series(tm.makePeriodIndex()), - Series(pd.date_range('20130101', periods=3, tz='US/Eastern')), - MultiIndex.from_product([range(5), ['foo', 'bar', 'baz'], - pd.date_range('20130101', periods=2)]), - MultiIndex.from_product([pd.CategoricalIndex(list('aabc')), range(3)]) - ]) - def test_hash_pandas_object(self, obj): - self.check_equal(obj) - self.check_not_equal_with_index(obj) - - def test_hash_pandas_object2(self, series): - self.check_equal(series) - self.check_not_equal_with_index(series) - - @pytest.mark.parametrize('obj', [ - Series([], dtype='float64'), Series([], dtype='object'), Index([])]) - def test_hash_pandas_empty_object(self, obj): - # these are by-definition the same with - # or w/o the index as the data is empty - self.check_equal(obj) - - @pytest.mark.parametrize('s1', [ - Series(['a', 'b', 'c', 'd']), - Series([1000, 2000, 3000, 4000]), - Series(pd.date_range(0, periods=4))]) - @pytest.mark.parametrize('categorize', [True, False]) - def test_categorical_consistency(self, s1, categorize): - # GH15143 - # Check that categoricals hash consistent with their values, not codes - # This should work for categoricals of any dtype - s2 = s1.astype('category').cat.set_categories(s1) - s3 = s2.cat.set_categories(list(reversed(s1))) - - # These should all hash identically - h1 = hash_pandas_object(s1, categorize=categorize) - h2 = hash_pandas_object(s2, categorize=categorize) - h3 = hash_pandas_object(s3, categorize=categorize) - tm.assert_series_equal(h1, h2) - tm.assert_series_equal(h1, h3) - - def test_categorical_with_nan_consistency(self): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], - categories=pd.date_range('2012-01-01', periods=5, name='B')) - expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes( 
- [-1, 0], - categories=[pd.Timestamp('2012-01-01')]) - result = hash_array(c, categorize=False) - assert result[0] in expected - assert result[1] in expected - - @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") - def test_pandas_errors(self): - with pytest.raises(TypeError): - hash_pandas_object(pd.Timestamp('20130101')) - - obj = tm.makePanel() - - with pytest.raises(TypeError): - hash_pandas_object(obj) - - def test_hash_keys(self): - # using different hash keys, should have different hashes - # for the same data - - # this only matters for object dtypes - obj = Series(list('abc')) - a = hash_pandas_object(obj, hash_key='9876543210123456') - b = hash_pandas_object(obj, hash_key='9876543210123465') - assert (a != b).all() - - def test_invalid_key(self): - # this only matters for object dtypes - msg = 'key should be a 16-byte string encoded' - with pytest.raises(ValueError, match=msg): - hash_pandas_object(Series(list('abc')), hash_key='foo') - - def test_alread_encoded(self): - # if already encoded then ok - - obj = Series(list('abc')).str.encode('utf8') - self.check_equal(obj) - - def test_alternate_encoding(self): - - obj = Series(list('abc')) - self.check_equal(obj, encoding='ascii') - - @pytest.mark.parametrize('l_exp', range(8)) - @pytest.mark.parametrize('l_add', [0, 1]) - def test_same_len_hash_collisions(self, l_exp, l_add): - length = 2**(l_exp + 8) + l_add - s = tm.rands_array(length, 2) - result = hash_array(s, 'utf8') - assert not result[0] == result[1] - - def test_hash_collisions(self): - - # hash collisions are bad - # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 - L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa - 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa - - # these should be different! 
- result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') - expected1 = np.array([14963968704024874985], dtype=np.uint64) - tm.assert_numpy_array_equal(result1, expected1) - - result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') - expected2 = np.array([16428432627716348016], dtype=np.uint64) - tm.assert_numpy_array_equal(result2, expected2) - - result = hash_array(np.asarray(L, dtype=object), 'utf8') - tm.assert_numpy_array_equal( - result, np.concatenate([expected1, expected2], axis=0)) +@pytest.fixture(params=[ + Series([1, 2, 3] * 3, dtype="int32"), + Series([None, 2.5, 3.5] * 3, dtype="float32"), + Series(["a", "b", "c"] * 3, dtype="category"), + Series(["d", "e", "f"] * 3), + Series([True, False, True] * 3), + Series(pd.date_range("20130101", periods=9)), + Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), + Series(pd.timedelta_range("2000", periods=9))]) +def series(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def index(request): + return request.param + + +def _check_equal(obj, **kwargs): + """ + Check that hashing an objects produces the same value each time. + + Parameters + ---------- + obj : object + The object to hash. + kwargs : kwargs + Keyword arguments to pass to the hashing function. + """ + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + +def _check_not_equal_with_index(obj): + """ + Check the hash of an object with and without its index is not the same. + + Parameters + ---------- + obj : object + The object to hash. + """ + if not isinstance(obj, Index): + a = hash_pandas_object(obj, index=True) + b = hash_pandas_object(obj, index=False) + + if len(obj): + assert not (a == b).all() + + +def test_consistency(): + # Check that our hash doesn't change because of a mistake + # in the actual code; this is the ground truth. + result = hash_pandas_object(Index(["foo", "bar", "baz"])) + expected = Series(np.array([3600424527151052760, 1374399572096150070, + 477881037637427054], dtype="uint64"), + index=["foo", "bar", "baz"]) + tm.assert_series_equal(result, expected) + + +def test_hash_array(series): + arr = series.values + tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) + + +@pytest.mark.parametrize("arr2", [ + np.array([3, 4, "All"]), + np.array([3, 4, "All"], dtype=object), +]) +def test_hash_array_mixed(arr2): + result1 = hash_array(np.array(["3", "4", "All"])) + result2 = hash_array(arr2) + + tm.assert_numpy_array_equal(result1, result2) + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_array_errors(val): + msg = "must pass a ndarray-like" + with pytest.raises(TypeError, match=msg): + hash_array(val) + + +def test_hash_tuples(): + tuples = [(1, "one"), (1, "two"), (2, "one")] + result = hash_tuples(tuples) + + expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values + tm.assert_numpy_array_equal(result, expected) + + result = hash_tuples(tuples[0]) + assert result == expected[0] + + +@pytest.mark.parametrize("tup", [ + (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), + ("A", pd.Timestamp("2012-01-01"))]) +def test_hash_tuple(tup): + # Test equivalence between + # hash_tuples and hash_tuple. 
+ result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + + assert result == expected + + +@pytest.mark.parametrize("val", [ + 1, 1.4, "A", b"A", u"A", pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz="Europe/Brussels"), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), + pd.Timedelta("1 days"), datetime.timedelta(1), + pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1), + np.nan, pd.NaT, None]) +def test_hash_scalar(val): + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), categorize=True) + + assert result[0] == expected[0] + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_tuples_err(val): + msg = "must be convertible to a list-of-tuples" + with pytest.raises(TypeError, match=msg): + hash_tuples(val) + + +def test_multiindex_unique(): + mi = MultiIndex.from_tuples([(118, 472), (236, 118), + (51, 204), (102, 51)]) + assert mi.is_unique is True + + result = hash_pandas_object(mi) + assert result.is_unique is True + + +def test_multiindex_objects(): + mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"]) + recons = mi._sort_levels_monotonic() + + # These are equal. + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # _hashed_values and hash_pandas_object(..., index=False) equivalency. + expected = hash_pandas_object(mi, index=False).values + result = mi._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = hash_pandas_object(recons, index=False).values + result = recons._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = mi._hashed_values + result = recons._hashed_values + + # Values should match, but in different order. + tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) + + +@pytest.mark.parametrize("obj", [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + Series(), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + MultiIndex.from_product([range(5), ["foo", "bar", "baz"], + pd.date_range("20130101", periods=2)]), + MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]) +]) +def test_hash_pandas_object(obj, index): + _check_equal(obj, index=index) + _check_not_equal_with_index(obj) + + +def test_hash_pandas_object2(series, index): + _check_equal(series, index=index) + _check_not_equal_with_index(series) + + +@pytest.mark.parametrize("obj", [ + Series([], dtype="float64"), Series([], dtype="object"), Index([])]) +def test_hash_pandas_empty_object(obj, index): + # These are by-definition the same with + # or without the index as the data is empty. 
+ _check_equal(obj, index=index) + + +@pytest.mark.parametrize("s1", [ + Series(["a", "b", "c", "d"]), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4))]) +@pytest.mark.parametrize("categorize", [True, False]) +def test_categorical_consistency(s1, categorize): + # see gh-15143 + # + # Check that categoricals hash consistent with their values, + # not codes. This should work for categoricals of any dtype. + s2 = s1.astype("category").cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + + # These should all hash identically. + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + + +def test_categorical_with_nan_consistency(): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range("2012-01-01", periods=5, name="B")) + expected = hash_array(c, categorize=False) + + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp("2012-01-01")]) + result = hash_array(c, categorize=False) + + assert result[0] in expected + assert result[1] in expected + + +@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") +@pytest.mark.parametrize("obj", [pd.Timestamp("20130101"), tm.makePanel()]) +def test_pandas_errors(obj): + msg = "Unexpected type for hashing" + with pytest.raises(TypeError, match=msg): + hash_pandas_object(obj) + + +def test_hash_keys(): + # Using different hash keys, should have + # different hashes for the same data. + # + # This only matters for object dtypes. + obj = Series(list("abc")) + + a = hash_pandas_object(obj, hash_key="9876543210123456") + b = hash_pandas_object(obj, hash_key="9876543210123465") + + assert (a != b).all() + + +def test_invalid_key(): + # This only matters for object dtypes. + msg = "key should be a 16-byte string encoded" + + with pytest.raises(ValueError, match=msg): + hash_pandas_object(Series(list("abc")), hash_key="foo") + + +def test_already_encoded(index): + # If already encoded, then ok. + obj = Series(list("abc")).str.encode("utf8") + _check_equal(obj, index=index) + + +def test_alternate_encoding(index): + obj = Series(list("abc")) + _check_equal(obj, index=index, encoding="ascii") + + +@pytest.mark.parametrize("l_exp", range(8)) +@pytest.mark.parametrize("l_add", [0, 1]) +def test_same_len_hash_collisions(l_exp, l_add): + length = 2**(l_exp + 8) + l_add + s = tm.rands_array(length, 2) + + result = hash_array(s, "utf8") + assert not result[0] == result[1] + + +def test_hash_collisions(): + # Hash collisions are bad. 
+ # + # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 + hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa + + # These should be different. + result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") + expected1 = np.array([14963968704024874985], dtype=np.uint64) + tm.assert_numpy_array_equal(result1, expected1) + + result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8") + expected2 = np.array([16428432627716348016], dtype=np.uint64) + tm.assert_numpy_array_equal(result2, expected2) + + result = hash_array(np.asarray(hashes, dtype=object), "utf8") + tm.assert_numpy_array_equal(result, np.concatenate([expected1, + expected2], axis=0)) diff --git a/requirements-dev.txt b/requirements-dev.txt index d01a21ac5fed5..5e2da69df5f26 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,23 +1,21 @@ -NumPy +numpy>=1.15 python-dateutil>=2.5.0 pytz -Cython>=0.28.2 +cython>=0.28.2 flake8 flake8-comprehensions -flake8-rst==0.4.2 +flake8-rst>=0.6.0 gitpython -hypothesis>=3.58.0 +hypothesis>=3.82 isort moto -pytest>=3.6 -setuptools>=24.2.0 +pytest>=4.0 sphinx sphinxcontrib-spelling beautifulsoup4>=4.2.1 blosc bottleneck>=1.2.0 fastparquet>=0.1.2 -gcsfs html5lib ipython>=5.6.0 ipykernel @@ -25,15 +23,13 @@ jinja2 lxml matplotlib>=2.0.0 nbsphinx -numexpr>=2.6.1 +numexpr>=2.6.8 openpyxl pyarrow>=0.7.0 -pymysql tables>=3.4.2 pytest-cov pytest-xdist -s3fs -scipy>=0.18.1 +scipy>=1.1 seaborn sqlalchemy statsmodels diff --git a/setup.cfg b/setup.cfg index 8fba814188af5..cc7393e5a09b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,25 +31,65 @@ exclude = env # exclude asv benchmark environments from linting [flake8-rst] -ignore = - F821, # undefined name - W391, # blank line at end of file [Seems to be a bug (v0.4.1)] +ignore = E402, # module level import not at top of file + W503, # line break before binary operator exclude = doc/source/whatsnew/v0.7.0.rst + doc/source/whatsnew/v0.7.3.rst + doc/source/whatsnew/v0.8.0.rst + doc/source/whatsnew/v0.9.0.rst + doc/source/whatsnew/v0.9.1.rst + doc/source/whatsnew/v0.10.0.rst doc/source/whatsnew/v0.10.1.rst + doc/source/whatsnew/v0.11.0.rst doc/source/whatsnew/v0.12.0.rst doc/source/whatsnew/v0.13.0.rst doc/source/whatsnew/v0.13.1.rst doc/source/whatsnew/v0.14.0.rst + doc/source/whatsnew/v0.14.1.rst doc/source/whatsnew/v0.15.0.rst + doc/source/whatsnew/v0.15.1.rst + doc/source/whatsnew/v0.15.2.rst doc/source/whatsnew/v0.16.0.rst + doc/source/whatsnew/v0.16.1.rst doc/source/whatsnew/v0.16.2.rst doc/source/whatsnew/v0.17.0.rst 
+ doc/source/whatsnew/v0.17.1.rst doc/source/whatsnew/v0.18.0.rst doc/source/whatsnew/v0.18.1.rst + doc/source/whatsnew/v0.19.0.rst doc/source/whatsnew/v0.20.0.rst doc/source/whatsnew/v0.21.0.rst + doc/source/whatsnew/v0.22.0.rst doc/source/whatsnew/v0.23.0.rst + doc/source/whatsnew/v0.23.1.rst + doc/source/whatsnew/v0.23.2.rst + doc/source/whatsnew/v0.24.0.rst + doc/source/10min.rst + doc/source/advanced.rst + doc/source/basics.rst + doc/source/categorical.rst + doc/source/comparison_with_r.rst + doc/source/comparison_with_sql.rst + doc/source/comparison_with_stata.rst + doc/source/computation.rst + doc/source/contributing.rst + doc/source/contributing_docstring.rst + doc/source/dsintro.rst + doc/source/enhancingperf.rst + doc/source/extending.rst + doc/source/groupby.rst + doc/source/indexing.rst + doc/source/io.rst + doc/source/merging.rst + doc/source/missing_data.rst + doc/source/options.rst + doc/source/release.rst + doc/source/reshaping.rst + doc/source/timedeltas.rst + doc/source/timeseries.rst + doc/source/visualization.rst + [yapf] based_on_style = pep8 From 332d14b06ca33d8d003ea508c92e9649501ec585 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 3 Dec 2018 08:36:41 +0100 Subject: [PATCH 06/16] Review (jreback) --- doc/source/whatsnew/v0.24.0.rst | 4 ++-- pandas/core/strings.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4ccdc985a751a..f5c6a4a5bbf6c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -374,8 +374,8 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) -- The `.str`-accessor will perform more rigorous type checking for inputs. Previously, some types that were never intended to be used - "worked" purely due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `decode` and `len` (:issue:`23011`, :issue:`23163`) +- The `.str`-accessor will perform more rigorous type checking for inputs. Previously, some types that were never intended to be used, + "worked" due to limitations of dtype checking -- e.g. ``bytes``, which is now disabled except for `decode` and `len` (:issue:`23011`, :issue:`23163`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0e86460c64ddd..138b867c5b5a8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1777,8 +1777,6 @@ def forbid_nonstring_types(forbidden, name=None): # deal with None forbidden = [] if forbidden is None else forbidden - # deal with single string instead of list - forbidden = [forbidden] if isinstance(forbidden, str) else forbidden allowed_types = {'string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer'} - set(forbidden) From e34097fd04565137a8e8d4d837cac936dd37f2b8 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 4 Mar 2019 08:33:25 +0100 Subject: [PATCH 07/16] Add method name for casefold --- pandas/core/strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 9d4df9b1944b8..b9565f1f430e8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3118,6 +3118,7 @@ def rindex(self, sub, start=0, end=None): docstring=_shared_docs['casemethods'] % _shared_docs['swapcase']) casefold = _noarg_wrapper(lambda x: x.casefold(), + name='casefold', docstring=_shared_docs['casemethods'] % _shared_docs['casefold']) From 25a046c2050dae4353baf4e8d515d308ffc29a33 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 4 Mar 2019 19:31:26 +0100 Subject: [PATCH 08/16] Adapt error msg --- pandas/tests/test_strings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e580fdd4b0580..bb3cf063c569b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3416,7 +3416,8 @@ def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - with pytest.raises(TypeError, match="can't concat str to bytes"): + with pytest.raises(TypeError, + match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) else: result = lhs.str.cat(rhs) From c9c74966e90ce6745c3c9481163ed2404988629c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 8 Mar 2019 15:22:54 +0100 Subject: [PATCH 09/16] Review (jreback) --- doc/source/user_guide/text.rst | 11 +++++++++++ doc/source/whatsnew/v0.25.0.rst | 9 +++++++++ pandas/core/strings.py | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 6f21a7d9beb36..cf5b6b85b5ac2 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -70,6 +70,17 @@ and replacing any remaining white spaces with underscores: ``.str`` methods which operate on elements of type ``list`` are not available on such a ``Series``. +.. _text.warn_types: + +.. warning:: + + Before v.0.25.0, the ``.str``-accessor did only the most rudimentary type checks. Starting with + v.0.25.0, the type of the Series is inferred (like it has already been the case for ``Index.str``), + and the allowed types (i.e. strings) are enforced more rigorously. + + Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few + exceptions, other uses are not supported, and may be disabled at a later point. + Splitting and Replacing Strings ------------------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 136ee274ac5e7..858338595bb65 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -172,6 +172,15 @@ Conversion Strings ^^^^^^^ +**The ``.str``-accessor performs stricter type checks** + +Due to the lack of a native string dtype in numpy, :attr:`Series.str` only checked whether the data was of ``object`` dtype. +From now on, the inferred dtype of the Series is checked to be correct (particularly, not ``'bytes'``), as :attr:`Index.str` does already. + +For more details, see this :ref:`warning`. 
+ +**Other bugs** + - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) - - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b9565f1f430e8..0c18bf6745e09 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1764,7 +1764,7 @@ def forbid_nonstring_types(forbidden, name=None): Parameters ---------- - forbidden : list or None + forbidden : list-of-str or None List of forbidden non-string types, may be one or more of `['bytes', 'mixed', 'mixed-integer']`. name : str, default None From bf4d7cf7949052d76070b6f6db0f8c2673ebc637 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 27 Mar 2019 08:31:42 +0100 Subject: [PATCH 10/16] Review (jreback) --- doc/source/whatsnew/v0.25.0.rst | 45 ++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 05463c35c2e57..6acc04cde6267 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -153,6 +153,42 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. +The ``.str``-accessor performs stricter type checks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Due to the lack of a native string dtype in numpy, :attr:`Series.str` only checked whether the data was of ``object`` dtype. +From now on, the inferred dtype of the Series is checked to be correct (particularly, not ``'bytes'``), as :attr:`Index.str` does already. + +*Previous Behaviour*: + +.. code-block:: python + + In [1]: s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) + + In [2]: s + Out[2]: + 0 b'a' + 1 b'ba' + 2 b'cba' + dtype: object + + In [3]: s.str.startswith(b'a') + Out[3]: + 0 True + 1 False + 2 False + dtype: bool + +*New Behaviour*: + +.. ipython:: python + :okexcept: + + s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) + s + s.str.startswith(b'a') + +For more details, see this :ref:`warning`. .. _whatsnew_0250.api_breaking.deps: @@ -282,15 +318,6 @@ Conversion Strings ^^^^^^^ -**The ``.str``-accessor performs stricter type checks** - -Due to the lack of a native string dtype in numpy, :attr:`Series.str` only checked whether the data was of ``object`` dtype. -From now on, the inferred dtype of the Series is checked to be correct (particularly, not ``'bytes'``), as :attr:`Index.str` does already. - -For more details, see this :ref:`warning`. - -**Other bugs** - - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) - - From 0c7e23302c4faeb2109f4d5cdeabe30fd349e468 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 15:46:52 +0200 Subject: [PATCH 11/16] fix merge artefact --- doc/source/whatsnew/v0.25.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6945d062c82a..543514f131e73 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -302,8 +302,6 @@ This change is backward compatible for direct usage of Pandas, but if you subcla Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). ->>>>>>> upstream/master - .. 
_whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies From 1168ca2fb2fdabeefe8d228fb4ec513078f96db6 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 16:06:29 +0200 Subject: [PATCH 12/16] Review (jreback) --- doc/source/user_guide/text.rst | 3 +-- pandas/core/strings.py | 30 ++++++++++++++++++++---------- pandas/tests/test_strings.py | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index a1803cd04945d..87c75e8bcd91f 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -75,8 +75,7 @@ and replacing any remaining whitespaces with underscores: .. warning:: Before v.0.25.0, the ``.str``-accessor did only the most rudimentary type checks. Starting with - v.0.25.0, the type of the Series is inferred (like it has already been the case for ``Index.str``), - and the allowed types (i.e. strings) are enforced more rigorously. + v.0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously. Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few exceptions, other uses are not supported, and may be disabled at a later point. diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f695095f672d3..3831195518c65 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1862,6 +1862,26 @@ def __init__(self, data): @staticmethod def _validate(data): + """ + Auxiliary function for string methods, infers and checks dtype of data. + + This is a "first line of defence" and just checks that the dtype is in + the *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types. 
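For illustration only (a sketch, not part of the diff): the per-method refinement described above is applied by decorating the string-only accessor methods with ``forbid_nonstring_types``; the ``upper`` body below is a simplified, hypothetical stand-in for the wrapped methods in the patch::

    @forbid_nonstring_types(['bytes'])
    def upper(self):
        # bytes-typed data now raises TypeError here, even though it passes
        # the first-line check in _validate
        result = _na_map(lambda x: x.upper(), self._parent)
        return self._wrap_result(result)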
+ + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ if isinstance(data, ABCMultiIndex): raise AttributeError('Can only use .str accessor with Index, ' 'not MultiIndex') @@ -1877,14 +1897,6 @@ def _validate(data): inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - # this is a "first line of defence" and just checks that the type - # is in the *union* of the allowed types over all methods below; - # this restriction is then refined on a per-method basis using the - # decorator @forbid_nonstring_types - # - # this really should exclude all series/index with any non-string - # values, but that isn't practical for performance reasons until we - # have a str dtype (GH 9343 / 13877) raise AttributeError("Can only use .str accessor with string " "values!") return inferred_dtype @@ -2574,7 +2586,6 @@ def rpartition(self, sep=' ', expand=True): return self._wrap_result(result, expand=expand) @copy(str_get) - @forbid_nonstring_types(['bytes']) def get(self, i): result = str_get(self._parent, i) return self._wrap_result(result) @@ -2715,7 +2726,6 @@ def zfill(self, width): return self._wrap_result(result) @copy(str_slice) - @forbid_nonstring_types(['bytes']) def slice(self, start=None, stop=None, step=None): result = str_slice(self._parent, start, stop, step) return self._wrap_result(result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d89cdb6cb01ee..1ba0ef3918fb7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -223,7 +223,7 @@ def test_api_per_method(self, box, dtype, t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['decode', 'len'] + bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, From ab980ecce12ee77b943a17e1842243d3fb03aa0c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 17:08:24 +0200 Subject: [PATCH 13/16] commit whatsnew changes --- doc/source/whatsnew/v0.25.0.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 543514f131e73..2dc3a0655a7c8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -198,8 +198,10 @@ cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. The ``.str``-accessor performs stricter type checks ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Due to the lack of a native string dtype in numpy, :attr:`Series.str` only checked whether the data was of ``object`` dtype. -From now on, the inferred dtype of the Series is checked to be correct (particularly, not ``'bytes'``), as :attr:`Index.str` does already. +Due to the lack of more fine-grained dtypes, :attr:`Series.str` so far only checked whether the data was +of ``object`` dtype. 
:attr:`Series.str` will now infer the dtype data *within* the Series; in particular, +``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, +:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. *Previous Behaviour*: From a9968896f24e98f62a88fc2afaf41bc9c4d21bab Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 17:10:07 +0200 Subject: [PATCH 14/16] retrigger azure From 4adef353d52c788046ad24c944a7b9d50a6c4344 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 17:17:07 +0200 Subject: [PATCH 15/16] remove mentions of 'unicode' --- pandas/core/strings.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3831195518c65..f0d382f78aa3d 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1734,11 +1734,11 @@ def forbid_nonstring_types(forbidden, name=None): :meth:`StringMethods.__init__` allows the *union* of types its different methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer']. + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - The default string types ['string', 'unicode', 'empty'] are allowed for all - methods. For the additional types ['bytes', 'mixed', 'mixed-integer'], each - method then needs to forbid the types it is not intended for. + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. Parameters ---------- @@ -1766,8 +1766,8 @@ def forbid_nonstring_types(forbidden, name=None): # deal with None forbidden = [] if forbidden is None else forbidden - allowed_types = {'string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer'} - set(forbidden) + allowed_types = {'string', 'empty', 'bytes', + 'mixed', 'mixed-integer'} - set(forbidden) def _forbid_nonstring_types(func): func_name = func.__name__ if name is None else name @@ -1887,8 +1887,7 @@ def _validate(data): 'not MultiIndex') # see _libs/lib.pyx for list of inferred types - allowed_types = ['string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer'] + allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] values = getattr(data, 'values', data) # Series / Index values = getattr(values, 'categories', values) # categorical / normal From f62e344f49dfe5df95a4d7caf26585e387de5386 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 May 2019 17:17:30 +0200 Subject: [PATCH 16/16] improve docstring for ._validate --- pandas/core/strings.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f0d382f78aa3d..bd756491abd2f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1863,12 +1863,13 @@ def __init__(self, data): @staticmethod def _validate(data): """ - Auxiliary function for string methods, infers and checks dtype of data. + Auxiliary function for StringMethods, infers and checks dtype of data. 
- This is a "first line of defence" and just checks that the dtype is in - the *union* of the allowed types over all string methods below; this + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types. + @forbid_nonstring_types (more info in the corresponding docstring). This really should exclude all series/index with any non-string values, but that isn't practical for performance reasons until we have a str