Skip to content

Commit 434af4b

Browse files
committed
Merge remote-tracking branch 'upstream/master' into str_infer
2 parents 0454cc3 + 4cac923 commit 434af4b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+2314
-1167
lines changed

asv_bench/benchmarks/indexing.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22

33
import numpy as np
44
import pandas.util.testing as tm
5-
from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
6-
Float64Index, IntervalIndex, CategoricalIndex,
5+
from pandas import (Series, DataFrame, Panel, MultiIndex,
6+
Int64Index, UInt64Index, Float64Index,
7+
IntervalIndex, CategoricalIndex,
78
IndexSlice, concat, date_range)
89

910

1011
class NumericSeriesIndexing(object):
1112

1213
goal_time = 0.2
1314
params = [
14-
(Int64Index, Float64Index),
15+
(Int64Index, UInt64Index, Float64Index),
1516
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
1617
]
1718
param_names = ['index_dtype', 'index_structure']
+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import numpy as np
2+
3+
from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
4+
ObjectEngine)
5+
6+
7+
class NumericEngineIndexing(object):
8+
9+
goal_time = 0.2
10+
params = [[Int64Engine, UInt64Engine, Float64Engine],
11+
[np.int64, np.uint64, np.float64],
12+
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
13+
]
14+
param_names = ['engine', 'dtype', 'index_type']
15+
16+
def setup(self, engine, dtype, index_type):
17+
N = 10**5
18+
values = list([1] * N + [2] * N + [3] * N)
19+
arr = {
20+
'monotonic_incr': np.array(values, dtype=dtype),
21+
'monotonic_decr': np.array(list(reversed(values)),
22+
dtype=dtype),
23+
'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
24+
}[index_type]
25+
26+
self.data = engine(lambda: arr, len(arr))
27+
# code below avoids populating the mapping etc. while timing.
28+
self.data.get_loc(2)
29+
30+
def time_get_loc(self, engine, dtype, index_type):
31+
self.data.get_loc(2)
32+
33+
34+
class ObjectEngineIndexing(object):
35+
36+
goal_time = 0.2
37+
params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
38+
param_names = ['index_type']
39+
40+
def setup(self, index_type):
41+
N = 10**5
42+
values = list('a' * N + 'b' * N + 'c' * N)
43+
arr = {
44+
'monotonic_incr': np.array(values, dtype=object),
45+
'monotonic_decr': np.array(list(reversed(values)), dtype=object),
46+
'non_monotonic': np.array(list('abc') * N, dtype=object),
47+
}[index_type]
48+
49+
self.data = ObjectEngine(lambda: arr, len(arr))
50+
# code below avoids populating the mapping etc. while timing.
51+
self.data.get_loc('b')
52+
53+
def time_get_loc(self, index_type):
54+
self.data.get_loc('b')

ci/code_checks.sh

+5-5
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
echo "inside $0"
1818
[[ $LINT ]] || { echo "NOT Linting. To lint use: LINT=true $0 $1"; exit 0; }
19-
[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" ]] || { echo "Unkown command $1. Usage: $0 [lint|patterns|doctests]"; exit 9999; }
19+
[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" ]] || { echo "Unknown command $1. Usage: $0 [lint|patterns|doctests]"; exit 9999; }
2020

2121
source activate pandas
2222
RET=0
@@ -122,22 +122,22 @@ fi
122122
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
123123

124124
MSG='Doctests frame.py' ; echo $MSG
125-
pytest --doctest-modules -v pandas/core/frame.py \
125+
pytest -q --doctest-modules pandas/core/frame.py \
126126
-k"-axes -combine -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_stata"
127127
RET=$(($RET + $?)) ; echo $MSG "DONE"
128128

129129
MSG='Doctests series.py' ; echo $MSG
130-
pytest --doctest-modules -v pandas/core/series.py \
130+
pytest -q --doctest-modules pandas/core/series.py \
131131
-k"-nonzero -reindex -searchsorted -to_dict"
132132
RET=$(($RET + $?)) ; echo $MSG "DONE"
133133

134134
MSG='Doctests generic.py' ; echo $MSG
135-
pytest --doctest-modules -v pandas/core/generic.py \
135+
pytest -q --doctest-modules pandas/core/generic.py \
136136
-k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs"
137137
RET=$(($RET + $?)) ; echo $MSG "DONE"
138138

139139
MSG='Doctests top-level reshaping functions' ; echo $MSG
140-
pytest --doctest-modules -v \
140+
pytest -q --doctest-modules \
141141
pandas/core/reshape/concat.py \
142142
pandas/core/reshape/pivot.py \
143143
pandas/core/reshape/reshape.py \

doc/source/extending.rst

+16
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ There are two approaches for providing operator support for your ExtensionArray:
135135
2. Use an operator implementation from pandas that depends on operators that are already defined
136136
on the underlying elements (scalars) of the ExtensionArray.
137137

138+
.. note::
139+
140+
Regardless of the approach, you may want to set ``__array_priority__``
141+
if you want your implementation to be called when involved in binary operations
142+
with NumPy arrays.
143+
138144
For the first approach, you define selected operators, e.g., ``__add__``, ``__le__``, etc. that
139145
you want your ``ExtensionArray`` subclass to support.
140146

@@ -173,6 +179,16 @@ or not that succeeds depends on whether the operation returns a result
173179
that's valid for the ``ExtensionArray``. If an ``ExtensionArray`` cannot
174180
be reconstructed, an ndarray containing the scalars is returned instead.
175181

182+
For ease of implementation and consistency with operations between pandas
183+
and NumPy ndarrays, we recommend *not* handling Series and Indexes in your binary ops.
184+
Instead, you should detect these cases and return ``NotImplemented``.
185+
When pandas encounters an operation like ``op(Series, ExtensionArray)``, pandas
186+
will
187+
188+
1. unbox the array from the ``Series`` (roughly ``Series.values``)
189+
2. call ``result = op(values, ExtensionArray)``
190+
3. re-box the result in a ``Series``
191+
176192
.. _extending.extension.testing:
177193

178194
Testing Extension Arrays

doc/source/whatsnew/v0.24.0.txt

+139-2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ Other Enhancements
198198
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
199199
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
200200
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
201+
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
202+
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
201203
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
202204
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
203205
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
@@ -235,6 +237,97 @@ If installed, we now require:
235237
| scipy | 0.18.1 | |
236238
+-----------------+-----------------+----------+
237239

240+
.. _whatsnew_0240.api_breaking.csv_line_terminator:
241+
242+
`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv``
243+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
244+
245+
:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'``
246+
for the default line terminator (:issue:`20353`).
247+
This change only has an effect when running on Windows, where ``'\r\n'`` was used as the line terminator
248+
even when ``'\n'`` was passed in ``line_terminator``.
249+
250+
Previous Behavior on Windows:
251+
252+
.. code-block:: ipython
253+
254+
In [1]: data = pd.DataFrame({
255+
...: "string_with_lf": ["a\nbc"],
256+
...: "string_with_crlf": ["a\r\nbc"]
257+
...: })
258+
259+
In [2]: # When passing file PATH to to_csv, line_terminator does not work, and csv is saved with '\r\n'.
260+
...: # Also, this converts all '\n's in the data to '\r\n'.
261+
...: data.to_csv("test.csv", index=False, line_terminator='\n')
262+
263+
In [3]: with open("test.csv", mode='rb') as f:
264+
...: print(f.read())
265+
b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n'
266+
267+
In [4]: # When passing file OBJECT with newline option to to_csv, line_terminator works.
268+
...: with open("test2.csv", mode='w', newline='\n') as f:
269+
...: data.to_csv(f, index=False, line_terminator='\n')
270+
271+
In [5]: with open("test2.csv", mode='rb') as f:
272+
...: print(f.read())
273+
b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
274+
275+
276+
New Behavior on Windows:
277+
278+
- By passing ``line_terminator`` explicitly, line terminator is set to that character.
279+
- The value of ``line_terminator`` only affects the line terminator of CSV,
280+
so it does not change the value inside the data.
281+
282+
.. code-block:: ipython
283+
284+
In [1]: data = pd.DataFrame({
285+
...: "string_with_lf": ["a\nbc"],
286+
...: "string_with_crlf": ["a\r\nbc"]
287+
...: })
288+
289+
In [2]: data.to_csv("test.csv", index=False, line_terminator='\n')
290+
291+
In [3]: with open("test.csv", mode='rb') as f:
292+
...: print(f.read())
293+
b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
294+
295+
296+
- On Windows, the value of ``os.linesep`` is ``'\r\n'``,
297+
so if ``line_terminator`` is not set, ``'\r\n'`` is used for line terminator.
298+
- Again, it does not affect the value inside the data.
299+
300+
.. code-block:: ipython
301+
302+
In [1]: data = pd.DataFrame({
303+
...: "string_with_lf": ["a\nbc"],
304+
...: "string_with_crlf": ["a\r\nbc"]
305+
...: })
306+
307+
In [2]: data.to_csv("test.csv", index=False)
308+
309+
In [3]: with open("test.csv", mode='rb') as f:
310+
...: print(f.read())
311+
b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
312+
313+
314+
- For file objects, specifying ``newline`` is not sufficient to set the line terminator.
315+
You must pass in the ``line_terminator`` explicitly, even in this case.
316+
317+
.. code-block:: ipython
318+
319+
In [1]: data = pd.DataFrame({
320+
...: "string_with_lf": ["a\nbc"],
321+
...: "string_with_crlf": ["a\r\nbc"]
322+
...: })
323+
324+
In [2]: with open("test2.csv", mode='w', newline='\n') as f:
325+
...: data.to_csv(f, index=False)
326+
327+
In [3]: with open("test2.csv", mode='rb') as f:
328+
...: print(f.read())
329+
b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
330+
238331
.. _whatsnew_0240.api_breaking.interval_values:
239332

240333
``IntervalIndex.values`` is now an ``IntervalArray``
@@ -442,15 +535,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
442535
Raise ValueError in ``DataFrame.to_dict(orient='index')``
443536
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
444537

445-
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
538+
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
446539
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
447540

448541
.. ipython:: python
449542
:okexcept:
450543

451544
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
452545
df
453-
546+
454547
df.to_dict(orient='index')
455548

456549
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -628,6 +721,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
628721
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
629722
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
630723
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
724+
- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`)
631725
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
632726
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
633727
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
@@ -713,6 +807,8 @@ Other API Changes
713807
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
714808
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
715809
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
810+
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
811+
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
716812
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
717813
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
718814

@@ -732,6 +828,7 @@ Deprecations
732828
many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)
733829
- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`)
734830
- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`)
831+
- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`).
735832

736833
.. _whatsnew_0240.prior_deprecations:
737834

@@ -749,6 +846,8 @@ Removal of prior version deprecations/changes
749846
- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`)
750847
- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`)
751848
- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`)
849+
- :meth:`SparseArray.get_values` and :meth:`SparseArray.to_dense` have dropped the ``fill`` parameter (:issue:`14686`)
850+
- :meth:`SparseSeries.to_dense` has dropped the ``sparse_only`` parameter (:issue:`14686`)
752851

753852
.. _whatsnew_0240.performance:
754853

@@ -790,6 +889,7 @@ Categorical
790889
^^^^^^^^^^^
791890

792891
- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
892+
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
793893
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
794894
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
795895

@@ -873,6 +973,7 @@ Numeric
873973
- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`)
874974
- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`)
875975
- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`)
976+
- Bug in :meth:`Series.rpow` with object dtype returning ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
876977

877978
Strings
878979
^^^^^^^
@@ -925,6 +1026,41 @@ MultiIndex
9251026
I/O
9261027
^^^
9271028

1029+
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
1030+
1031+
Proper handling of `np.NaN` in a string data-typed column with the Python engine
1032+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1033+
1034+
There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
1035+
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
1036+
``na_filter=True``. Now, these missing values are converted to the string
1037+
missing indicator, ``np.nan``. (:issue:`20377`)
1038+
1039+
.. ipython:: python
1040+
:suppress:
1041+
1042+
from pandas.compat import StringIO
1043+
1044+
Previous Behavior:
1045+
1046+
.. code-block:: ipython
1047+
1048+
In [5]: data = 'a,b,c\n1,,3\n4,5,6'
1049+
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
1050+
In [7]: df.loc[0, 'b']
1051+
Out[7]:
1052+
'nan'
1053+
1054+
Current Behavior:
1055+
1056+
.. ipython:: python
1057+
1058+
data = 'a,b,c\n1,,3\n4,5,6'
1059+
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
1060+
df.loc[0, 'b']
1061+
1062+
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
1063+
9281064
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
9291065
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
9301066
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
@@ -988,6 +1124,7 @@ Sparse
9881124
- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array.
9891125
- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
9901126
- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
1127+
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
9911128

9921129
Build Changes
9931130
^^^^^^^^^^^^^

0 commit comments

Comments
 (0)