From 8390f55e2bfcf6df14f8fb3c3a31372a8550a9a1 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sat, 20 Jul 2019 18:31:06 +0000 Subject: [PATCH 1/6] ENH: treat list of namedtuples like list of dict in DataFrame() --- doc/source/whatsnew/v0.25.0.rst | 50 ++++++++++++++++++++++ pandas/_libs/lib.pyx | 55 ++++++++++++++++++------- pandas/core/arrays/timedeltas.py | 5 ++- pandas/core/dtypes/inference.py | 6 ++- pandas/core/frame.py | 3 -- pandas/core/internals/construction.py | 29 +++++++------ pandas/tests/frame/test_constructors.py | 32 +++++++++++++- 7 files changed, 144 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 42e756635e739..63267ebfe7cbf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -235,6 +235,7 @@ Other enhancements - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`) - :func:`pandas.io.excel.read_excel` supports reading OpenDocument tables. Specify ``engine='odf'`` to enable. Consult the :ref:`IO User Guide ` for more details (:issue:`9070`) - :class:`Interval`, :class:`IntervalIndex`, and :class:`~arrays.IntervalArray` have gained an :attr:`~Interval.is_empty` attribute denoting if the given interval(s) are empty (:issue:`27219`) +- :class:`DataFrame` now treats lists of typing.NameTuple equivalently to lists of nametuples. The behavior of the latter has changed in this release, please see the relevant section in "Breaking Changes". .. 
_whatsnew_0250.api_breaking: @@ -803,6 +804,55 @@ order of the resulting DataFrame has changed compared to previous pandas verison pd.DataFrame(data) +DataFrame constructor treats list of namedtuple/dict in the same way +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, only the first element in the list was checked and if it was a +namedtuple, the field names of that single tuple were used as the column names. +Subsequent tuples were assumed to be of the same type, and their values were +looked up by position. As a consequence, if subsequent tuples of different types +were included, any additional fields were ignored, and if similarly named fields +appeard in a different order, alignment was not performed. + +This behavior has now changed so that namedtuples are treated much as list of +dict behaves, i.e as a "list of records". + +Additionaly, this change implies a change in the semantics of the `columns` +argument to :class:`DataFrame` when passing a list of namedtuples. Previously, +`columns` has "rename" semantics, now it has the same "lookup" semantics as a +list of records. Meaning that any name given in `columns` which doesn't appear +as a key in the record will be assigned a NaN value. + +Due to this change, The performance of constructing frames from a list +of namedtuples is roughly 50% slower. + +.. ipython:: python + + from collections import namedtuple + Foo = namedtuple("Foo", list("ab")) + tuples = [Foo(1, 3), Foo(2, 4)] + +*Previous Behavior*: + +The columns were lexicographically sorted previously, + +.. code-block:: python + + In [1]: pd.DataFrame(tuples, columns=['y', 'z']) + Out[1]: + y z + 0 1 3 + 1 2 4 + +*New Behavior*: + +The column order now matches the insertion-order of the keys in the ``dict``, +considering all the records from top to bottom. + +.. ipython:: python + + pd.DataFrame(tuples, columns=['Q', 'a']) + .. 
_whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 27ee685acfde7..9d1a07ef34371 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,4 +1,4 @@ -from collections import abc +from collections import abc, OrderedDict from decimal import Decimal from fractions import Fraction from numbers import Number @@ -312,28 +312,53 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(dicts: list, columns: list): +def dicts_to_array(dicts: list, _columns : list): cdef: - Py_ssize_t i, j, k, n - ndarray[object, ndim=2] result - dict row + Py_ssize_t i, j, n + object result, columns + object row object col, onan = np.nan + dict d, nt_lookup - k = len(columns) n = len(dicts) + have_columns = len(_columns) > 0 + columns = OrderedDict.fromkeys(list(_columns or [])) + result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in _columns) - result = np.empty((n, k), dtype='O') - + nt_lookup = {} for i in range(n): row = dicts[i] - for j in range(k): - col = columns[j] - if col in row: - result[i, j] = row[col] - else: - result[i, j] = onan + if hasattr(row, 'keys'): + d = row + for k in d: + v = d[k] + if k not in columns: + if have_columns: + continue + columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + result[k][i] = v + elif hasattr(row, "_fields"): + if type(row) not in nt_lookup: + l = [] + for j, k in enumerate(row._fields): + if k in columns or not have_columns: + # include this field in result + l.append((k, j)) + # create an array to store it + if k not in columns: + columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + # save (column_name, index) pairs + nt_lookup[type(row)] = l + + for k, j in nt_lookup[type(row)]: + result[k][i] = row[j] + else: + msg = "'%s' at row %d is not a valid record type" + raise ValueError(msg % (type(row), i)) - return result + return 
list(columns), list(result.values()) def fast_zip(list ndarrays): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9d622d92e0979..ca6caa6053a03 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( + Components, array_to_timedelta64, parse_timedelta_unit, precision_from_unit, @@ -901,7 +902,9 @@ def components(self): def f(x): if isna(x): - return [np.nan] * len(columns) + return Components( + np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan + ) return x.components else: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 461b5cc6232cd..843d372ee7103 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -3,7 +3,7 @@ from collections import abc from numbers import Number import re -from typing import Pattern +from typing import NamedTuple, Pattern import numpy as np @@ -380,7 +380,9 @@ def is_named_tuple(obj): False """ - return isinstance(obj, tuple) and hasattr(obj, "_fields") + return isinstance(obj, NamedTuple) or ( + isinstance(obj, tuple) and hasattr(obj, "_fields") + ) def is_hashable(obj): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c15f4ad8e1900..5b7bea1fb2e0c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -65,7 +65,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, @@ -444,8 +443,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: - if is_named_tuple(data[0]) and columns is None: - columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) diff --git 
a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c437f686bd17b..cb2853023a600 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -34,6 +34,7 @@ is_integer_dtype, is_iterator, is_list_like, + is_named_tuple, is_object_dtype, pandas_dtype, ) @@ -460,12 +461,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( + if isinstance(data[0], abc.Mapping) or is_named_tuple(data[0]): + return _list_of_records_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype ) + elif isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], ABCSeries): return _list_of_series_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype @@ -535,8 +536,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): - """Convert list of dicts to numpy arrays +def _list_of_records_to_arrays(data, columns, coerce_float=False, dtype=None): + """Convert list of OrderedDict to numpy array if `columns` is not passed, column names are inferred from the records - for OrderedDict and (on Python>=3.6) dicts, the column names match @@ -556,17 +557,19 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ - if columns is None: - gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + if not PY36 and columns is None: + gen = (list(x.keys() if 
hasattr(x, "keys") else x._fields) for x in data) + sort = not any(isinstance(d, OrderedDict) or is_named_tuple(d) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + else: + columns = list(columns) if columns is not None else [] # assure that they are of the base dict class and not of derived # classes - data = [(type(d) is dict) and d or dict(d) for d in data] - - content = list(lib.dicts_to_array(data, list(columns)).T) + data = [ + ((type(d) is dict) and d) or (is_named_tuple(d) and d) or dict(d) for d in data + ] + columns, content = lib.dicts_to_array(data, columns) return _convert_object_array( content, columns, dtype=dtype, coerce_float=coerce_float ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 29e46ac70c943..3fded92c7467b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1336,8 +1336,36 @@ def test_constructor_list_of_namedtuples(self): tm.assert_frame_equal(result, expected) # with columns - expected = DataFrame({"y": [1, 2], "z": [3, 4]}) - result = DataFrame(tuples, columns=["y", "z"]) + # namedtuples now behave like records, so columns + # act like lookups, not rename + expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]}) + result = DataFrame(tuples, columns=["a", "x"]) + tm.assert_frame_equal(result, expected) + + # new-style NamedTuple + # NOTE: Enable after py3.5 support is dropped + # from typing import NamedTuple + # class named_tuple3(NamedTuple): + # a: int + # b: int + # named_tuple3 = namedtuple("named_tuple3", list("ab")) + # tuples = [named_tuple3(1, 3), named_tuple3(2, 4)] + # expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + # result = DataFrame(tuples) + # tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = DataFrame(tuples, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # hetero columns + named_tuple1 = namedtuple("Pandas", 
list("ab")) + named_tuple2 = namedtuple("sandaP", list("yabx")) + tuples = [named_tuple1(1, 2), named_tuple2(3, 4, 5, 6)] + result = DataFrame(tuples) + expected = pd.DataFrame( + {"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]} + ) tm.assert_frame_equal(result, expected) def test_constructor_list_of_dict_order(self): From 04b52953a7976d03151824fb3a2cb2de9e3be802 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sat, 20 Jul 2019 14:16:14 -0500 Subject: [PATCH 2/6] Insertion Order is kept even on Python3.5 --- pandas/tests/frame/test_constructors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3fded92c7467b..999a0e81a29a5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1365,7 +1365,7 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples) expected = pd.DataFrame( {"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]} - ) + )[["a", "b", "y", "x"]] tm.assert_frame_equal(result, expected) def test_constructor_list_of_dict_order(self): From 6c114f5932c52d05ae246b78267b9e99108db68e Mon Sep 17 00:00:00 2001 From: pilkibun Date: Sat, 20 Jul 2019 14:17:40 -0500 Subject: [PATCH 3/6] DOC --- doc/source/whatsnew/v0.25.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 63267ebfe7cbf..25529b59de165 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -824,7 +824,9 @@ list of records. Meaning that any name given in `columns` which doesn't appear as a key in the record will be assigned a NaN value. Due to this change, The performance of constructing frames from a list -of namedtuples is roughly 50% slower. +of namedtuples is now roughly 50% slower. + +This change affects all supported python versions. .. 
ipython:: python From 1115bece528937db1003780d3c5254ba2ee1e3d8 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:26:41 -0500 Subject: [PATCH 4/6] move to v1.0.0 --- doc/source/whatsnew/v0.25.0.rst | 52 ------------------------------- doc/source/whatsnew/v1.0.0.rst | 55 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 25529b59de165..42e756635e739 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -235,7 +235,6 @@ Other enhancements - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`) - :func:`pandas.io.excel.read_excel` supports reading OpenDocument tables. Specify ``engine='odf'`` to enable. Consult the :ref:`IO User Guide ` for more details (:issue:`9070`) - :class:`Interval`, :class:`IntervalIndex`, and :class:`~arrays.IntervalArray` have gained an :attr:`~Interval.is_empty` attribute denoting if the given interval(s) are empty (:issue:`27219`) -- :class:`DataFrame` now treats lists of typing.NameTuple equivalently to lists of nametuples. The behavior of the latter has changed in this release, please see the relevant section in "Breaking Changes". .. _whatsnew_0250.api_breaking: @@ -804,57 +803,6 @@ order of the resulting DataFrame has changed compared to previous pandas verison pd.DataFrame(data) -DataFrame constructor treats list of namedtuple/dict in the same way -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, only the first element in the list was checked and if it was a -namedtuple, the field names of that single tuple were used as the column names. -Subsequent tuples were assumed to be of the same type, and their values were -looked up by position. 
As a consequence, if subsequent tuples of different types -were included, any additional fields were ignored, and if similarly named fields -appeard in a different order, alignment was not performed. - -This behavior has now changed so that namedtuples are treated much as list of -dict behaves, i.e as a "list of records". - -Additionaly, this change implies a change in the semantics of the `columns` -argument to :class:`DataFrame` when passing a list of namedtuples. Previously, -`columns` has "rename" semantics, now it has the same "lookup" semantics as a -list of records. Meaning that any name given in `columns` which doesn't appear -as a key in the record will be assigned a NaN value. - -Due to this change, The performance of constructing frames from a list -of namedtuples is now roughly 50% slower. - -This change affects all supported python versions. - -.. ipython:: python - - from collections import namedtuple - Foo = namedtuple("Foo", list("ab")) - tuples = [Foo(1, 3), Foo(2, 4)] - -*Previous Behavior*: - -The columns were lexicographically sorted previously, - -.. code-block:: python - - In [1]: pd.DataFrame(tuples, columns=['y', 'z']) - Out[1]: - y z - 0 1 3 - 1 2 4 - -*New Behavior*: - -The column order now matches the insertion-order of the keys in the ``dict``, -considering all the records from top to bottom. - -.. ipython:: python - - pd.DataFrame(tuples, columns=['Q', 'a']) - .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c352a36bf6de1..e30e42c0ed2ea 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -42,6 +42,61 @@ Backwards incompatible API changes - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). 
- +DataFrame constructor treats list of namedtuple/dict in the same way +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Lists of :class:`typing.NamedTuple` are now treated the same way as lists of :func:`collections.namedtuple` +by the :class:`DataFrame` constructor. + +The treatment of such lists has also changed in this release. Previously, only +the first element in the list was checked, and if it was a namedtuple, the +field names of that single tuple were used as the column names. Subsequent +tuples were assumed to be of the same type, and their values were looked up by +position. As a consequence, if subsequent tuples of different types were +included, any additional fields were dropped, and if similarly named fields +appeared in a different order, alignment was not performed. + +This behavior has now changed so that namedtuples are treated in the same way +as a list of dicts, i.e. as a "list of records" (:issue:`27329`, :issue:`27494`). + +Additionally, this change implies a change in the semantics of the ``columns`` +argument to :class:`DataFrame` when passing a list of namedtuples. Previously, +``columns`` had "rename" semantics; now it has the same "lookup" semantics as a +list of records, meaning that any name given in ``columns`` which doesn't appear +as a key in the record will be assigned a NaN value. + +Due to this change, constructing frames from a list of +namedtuples is now roughly 50% slower. + +This change affects all supported Python versions. + +.. ipython:: python + + from collections import namedtuple + Foo = namedtuple("Foo", list("ab")) + tuples = [Foo(1, 3), Foo(2, 4)] + +*Previous Behavior*: + +``columns`` previously renamed the namedtuple fields by position: + +.. code-block:: python + + In [1]: pd.DataFrame(tuples, columns=['y', 'z']) + Out[1]: + y z + 0 1 3 + 1 2 4 + +*New Behavior*: + +``columns`` now selects fields by name. When it is omitted, the column order matches the +insertion-order of the fields, considering all the records from top to bottom. 
+ +.. ipython:: python + + pd.DataFrame(tuples, columns=['Q', 'a']) + Other API changes ^^^^^^^^^^^^^^^^^ From 1972e53e8b9196c9a1e80cbcff3d6fcf5d234a9d Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:37:03 -0500 Subject: [PATCH 5/6] undo rename --- pandas/_libs/lib.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index da627885c2936..7a6a2ecc96bba 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -312,18 +312,18 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(dicts: list, _columns : list): +def dicts_to_array(dicts: list, columns : list): cdef: Py_ssize_t i, j, n - object result, columns + object result, result_columns object row object col, onan = np.nan dict d, nt_lookup n = len(dicts) - have_columns = len(_columns) > 0 - columns = OrderedDict.fromkeys(list(_columns or [])) - result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in _columns) + have_columns = len(columns) > 0 + result_columns = OrderedDict.fromkeys(list(columns or [])) + result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in columns) nt_lookup = {} for i in range(n): @@ -332,22 +332,22 @@ def dicts_to_array(dicts: list, _columns : list): d = row for k in d: v = d[k] - if k not in columns: + if k not in result_columns: if have_columns: continue - columns[k] = None + result_columns[k] = None result[k] = np.full(n, np.nan, dtype='O') result[k][i] = v elif hasattr(row, "_fields"): if type(row) not in nt_lookup: l = [] for j, k in enumerate(row._fields): - if k in columns or not have_columns: + if k in result_columns or not have_columns: # include this field in result l.append((k, j)) # create an array to store it - if k not in columns: - columns[k] = None + if k not in result_columns: + result_columns[k] = None result[k] = np.full(n, np.nan, dtype='O') # save (column_name, 
index) pairs nt_lookup[type(row)] = l @@ -358,7 +358,7 @@ def dicts_to_array(dicts: list, _columns : list): msg = "'%s' at row %d is not a valid record type" raise ValueError(msg % (type(row), i)) - return list(columns), list(result.values()) + return list(result_columns), list(result.values()) def fast_zip(list ndarrays): From 2e3998198147547383ebefbd03eea3165d392c8f Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:40:46 -0500 Subject: [PATCH 6/6] Split into new test --- pandas/tests/frame/test_constructors.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 4f5a6649d962a..be3d22f2619c1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1331,9 +1331,16 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples) tm.assert_frame_equal(result, expected) - # with columns - # namedtuples now behave like records, so columns - # act like lookups, not rename + def test_constructor_list_of_namedtuples_new_behavior(self): + # GH27329 + from collections import namedtuple + + named_tuple = namedtuple("Pandas", list("ab")) + tuples = [named_tuple(1, 3), named_tuple(2, 4)] + + # namedtuples now behave like records, so if `columns` + # is passed it's treated as field selection. Previously + # it was treated as a rename. expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]}) result = DataFrame(tuples, columns=["a", "x"]) tm.assert_frame_equal(result, expected)