pandas-dev
diff --git a/‎asv_bench/benchmarks/indexing.py
+4-3 b/‎asv_bench/benchmarks/indexing.py
+4-3
diff --git a/‎asv_bench/benchmarks/indexing_engines.py
+54 b/‎asv_bench/benchmarks/indexing_engines.py
+54
diff --git a/‎ci/code_checks.sh
+5-5 b/‎ci/code_checks.sh
+5-5
diff --git a/‎doc/source/extending.rst
+16 b/‎doc/source/extending.rst
+16
diff --git a/‎doc/source/whatsnew/v0.24.0.txt
+139-2 b/‎doc/source/whatsnew/v0.24.0.txt
+139-2
@@ -2,16 +2,17 @@
 
 import numpy as np
 import pandas.util.testing as tm
-from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
-                    Float64Index, IntervalIndex, CategoricalIndex,
+from pandas import (Series, DataFrame, Panel, MultiIndex,
+                    Int64Index, UInt64Index, Float64Index,
+                    IntervalIndex, CategoricalIndex,
                     IndexSlice, concat, date_range)
 
 
 class NumericSeriesIndexing(object):
 
     goal_time = 0.2
     params = [
-        (Int64Index, Float64Index),
+        (Int64Index, UInt64Index, Float64Index),
         ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
     ]
     param_names = ['index_dtype', 'index_structure']
 
@@ -0,0 +1,54 @@
+import numpy as np
+
+from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
+                                ObjectEngine)
+
+
+class NumericEngineIndexing(object):
+
+    goal_time = 0.2
+    params = [[Int64Engine, UInt64Engine, Float64Engine],
+              [np.int64, np.uint64, np.float64],
+              ['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
+              ]
+    param_names = ['engine', 'dtype', 'index_type']
+
+    def setup(self, engine, dtype, index_type):
+        N = 10**5
+        values = list([1] * N + [2] * N + [3] * N)
+        arr = {
+            'monotonic_incr': np.array(values, dtype=dtype),
+            'monotonic_decr': np.array(list(reversed(values)),
+                                       dtype=dtype),
+            'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
+        }[index_type]
+
+        self.data = engine(lambda: arr, len(arr))
+        # code belows avoids populating the mapping etc. while timing.
+        self.data.get_loc(2)
+
+    def time_get_loc(self, engine, dtype, index_type):
+        self.data.get_loc(2)
+
+
+class ObjectEngineIndexing(object):
+
+    goal_time = 0.2
+    params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
+    param_names = ['index_type']
+
+    def setup(self, index_type):
+        N = 10**5
+        values = list('a' * N + 'b' * N + 'c' * N)
+        arr = {
+            'monotonic_incr': np.array(values, dtype=object),
+            'monotonic_decr': np.array(list(reversed(values)), dtype=object),
+            'non_monotonic': np.array(list('abc') * N, dtype=object),
+        }[index_type]
+
+        self.data = ObjectEngine(lambda: arr, len(arr))
+        # code belows avoids populating the mapping etc. while timing.
+        self.data.get_loc('b')
+
+    def time_get_loc(self, index_type):
+        self.data.get_loc('b')
@@ -16,7 +16,7 @@
 
 echo "inside $0"
 [[ $LINT ]] || { echo "NOT Linting. To lint use: LINT=true $0 $1"; exit 0; }
-[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" ]] || { echo "Unkown command $1. Usage: $0 [lint|patterns|doctests]"; exit 9999; }
+[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" ]] || { echo "Unknown command $1. Usage: $0 [lint|patterns|doctests]"; exit 9999; }
 
 source activate pandas
 RET=0
@@ -122,22 +122,22 @@ fi
 if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
 
     MSG='Doctests frame.py' ; echo $MSG
-    pytest --doctest-modules -v pandas/core/frame.py \
+    pytest -q --doctest-modules pandas/core/frame.py \
         -k"-axes -combine -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_stata"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Doctests series.py' ; echo $MSG
-    pytest --doctest-modules -v pandas/core/series.py \
+    pytest -q --doctest-modules pandas/core/series.py \
         -k"-nonzero -reindex -searchsorted -to_dict"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Doctests generic.py' ; echo $MSG
-    pytest --doctest-modules -v pandas/core/generic.py \
+    pytest -q --doctest-modules pandas/core/generic.py \
         -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Doctests top-level reshaping functions' ; echo $MSG
-    pytest --doctest-modules -v \
+    pytest -q --doctest-modules \
         pandas/core/reshape/concat.py \
         pandas/core/reshape/pivot.py \
         pandas/core/reshape/reshape.py \
 
@@ -135,6 +135,12 @@ There are two approaches for providing operator support for your ExtensionArray:
 2. Use an operator implementation from pandas that depends on operators that are already defined
    on the underlying elements (scalars) of the ExtensionArray.
 
+.. note::
+
+   Regardless of the approach, you may want to set ``__array_priority__``
+   if you want your implementation to be called when involved in binary operations
+   with NumPy arrays.
+
 For the first approach, you define selected operators, e.g., ``__add__``, ``__le__``, etc. that
 you want your ``ExtensionArray`` subclass to support.
 
@@ -173,6 +179,16 @@ or not that succeeds depends on whether the operation returns a result
 that's valid for the ``ExtensionArray``. If an ``ExtensionArray`` cannot
 be reconstructed, an ndarray containing the scalars returned instead.
 
+For ease of implementation and consistency with operations between pandas
+and NumPy ndarrays, we recommend *not* handling Series and Indexes in your binary ops.
+Instead, you should detect these cases and return ``NotImplemented``.
+When pandas encounters an operation like ``op(Series, ExtensionArray)``, pandas
+will
+
+1. unbox the array from the ``Series`` (roughly ``Series.values``)
+2. call ``result = op(values, ExtensionArray)``
+3. re-box the result in a ``Series``
+
 .. _extending.extension.testing:
 
 Testing Extension Arrays
 
@@ -198,6 +198,8 @@ Other Enhancements
 - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
 - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
 - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
+- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
+  all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
 - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
@@ -235,6 +237,97 @@ If installed, we now require:
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
 
+.. _whatsnew_0240.api_breaking.csv_line_terminator:
+
+`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'``
+ for the default line terminator (:issue:`20353`).
+This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator
+even when ``'\n'`` was passed in ``line_terminator``.
+
+Previous Behavior on Windows:
+
+.. code-block:: ipython
+
+In [1]: data = pd.DataFrame({
+   ...:     "string_with_lf": ["a\nbc"],
+   ...:     "string_with_crlf": ["a\r\nbc"]
+   ...: })
+
+In [2]: # When passing file PATH to to_csv, line_terminator does not work, and csv is saved with '\r\n'.
+   ...: # Also, this converts all '\n's in the data to '\r\n'.
+   ...: data.to_csv("test.csv", index=False, line_terminator='\n')
+
+In [3]: with open("test.csv", mode='rb') as f:
+   ...:     print(f.read())
+b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n'
+
+In [4]: # When passing file OBJECT with newline option to to_csv, line_terminator works.
+   ...: with open("test2.csv", mode='w', newline='\n') as f:
+   ...:     data.to_csv(f, index=False, line_terminator='\n')
+
+In [5]: with open("test2.csv", mode='rb') as f:
+   ...:     print(f.read())
+b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
+
+
+New Behavior on Windows:
+
+- By passing ``line_terminator`` explicitly, line terminator is set to that character.
+- The value of ``line_terminator`` only affects the line terminator of CSV,
+  so it does not change the value inside the data.
+
+.. code-block:: ipython
+
+In [1]: data = pd.DataFrame({
+   ...:     "string_with_lf": ["a\nbc"],
+   ...:     "string_with_crlf": ["a\r\nbc"]
+   ...: })
+
+In [2]: data.to_csv("test.csv", index=False, line_terminator='\n')
+
+In [3]: with open("test.csv", mode='rb') as f:
+   ...:     print(f.read())
+b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
+
+
+- On Windows, the value of ``os.linesep`` is ``'\r\n'``,
+  so if ``line_terminator`` is not set, ``'\r\n'`` is used for line terminator.
+- Again, it does not affect the value inside the data.
+
+.. code-block:: ipython
+
+In [1]: data = pd.DataFrame({
+   ...: "string_with_lf": ["a\nbc"],
+   ...: "string_with_crlf": ["a\r\nbc"]
+   ...: })
+
+In [2]: data.to_csv("test.csv", index=False)
+
+In [3]: with open("test.csv", mode='rb') as f:
+   ...:     print(f.read())
+b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
+
+
+- For files objects, specifying ``newline`` is not sufficient to set the line terminator.
+  You must pass in the ``line_terminator`` explicitly, even in this case.
+
+.. code-block:: ipython
+
+In [1]: data = pd.DataFrame({
+   ...: "string_with_lf": ["a\nbc"],
+   ...: "string_with_crlf": ["a\r\nbc"]
+   ...: })
+
+In [2]: with open("test2.csv", mode='w', newline='\n') as f:
+   ...:     data.to_csv(f, index=False)
+
+In [3]: with open("test2.csv", mode='rb') as f:
+   ...:     print(f.read())
+b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
+
 .. _whatsnew_0240.api_breaking.interval_values:
 
 ``IntervalIndex.values`` is now an ``IntervalArray``
@@ -442,15 +535,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
 Raise ValueError in ``DataFrame.to_dict(orient='index')``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with 
+Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
 ``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
 
 .. ipython:: python
     :okexcept:
 
     df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
     df
-    
+
     df.to_dict(orient='index')
 
 .. _whatsnew_0240.api.datetimelike.normalize:
@@ -628,6 +721,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`).
 - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
+- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`)
 - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
 - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
@@ -713,6 +807,8 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
+- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
+  has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 
@@ -732,6 +828,7 @@ Deprecations
   many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)
 - :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`)
 - :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`)
+- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`).
 
 .. _whatsnew_0240.prior_deprecations:
 
@@ -749,6 +846,8 @@ Removal of prior version deprecations/changes
 - :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`)
 - :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`)
 - Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`)
+- :meth:`SparseArray.get_values` and :meth:`SparseArray.to_dense` have dropped the ``fill`` parameter (:issue:`14686`)
+- :meth:`SparseSeries.to_dense` has dropped the ``sparse_only`` parameter (:issue:`14686`)
 
 .. _whatsnew_0240.performance:
 
@@ -790,6 +889,7 @@ Categorical
 ^^^^^^^^^^^
 
 - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
+- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
 - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
 - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
 
@@ -873,6 +973,7 @@ Numeric
 - Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`)
 - Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`)
 - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the narray as ``timedelta64[ns]`` dtype (:issue:`23114`)
+- Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
 
 Strings
 ^^^^^^^
@@ -925,6 +1026,41 @@ MultiIndex
 I/O
 ^^^
 
+.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
+
+Proper handling of `np.NaN` in a string data-typed column with the Python engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There was bug in :func:`read_excel` and :func:`read_csv` with the Python
+engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
+``na_filter=True``. Now, these missing values are converted to the string
+missing indicator, ``np.nan``. (:issue `20377`)
+
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [5]: data = 'a,b,c\n1,,3\n4,5,6'
+   In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   In [7]: df.loc[0, 'b']
+   Out[7]:
+   'nan'
+
+Current Behavior:
+
+.. ipython:: python
+
+   data = 'a,b,c\n1,,3\n4,5,6'
+   df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   df.loc[0, 'b']
+
+Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
+
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
@@ -988,6 +1124,7 @@ Sparse
 - Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array.
 - Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
 - Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
+- Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`)
 
 Build Changes
 ^^^^^^^^^^^^^