diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c352a36bf6de1..e30e42c0ed2ea 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -42,6 +42,61 @@ Backwards incompatible API changes - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - +DataFrame constructor treats list of namedtuple/dict in the same way +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Lists of :class:`typing.NamedTuple` are now treated the same way as lists of :func:`collections.namedtuple` +by the :class:`DataFrame` constructor. + +The treatment of such lists has also changed in this release. Previously, only +the first element in the list was checked, and if it was a namedtuple, the +field names of that single tuple were used as the column names. Subsequent +tuples were assumed to be of the same type, and their values were looked up by +position. As a consequence, if subsequent tuples of different types were +included, any additional fields were dropped, and if similarly named fields +appeared in a different order, alignment was not performed. + +This behavior has now changed so that namedtuples are treated in the same way +as a list of dicts, i.e. as a "list of records" (:issue:`27329`, :issue:`27494`). + +Additionally, this change implies a change in the semantics of the ``columns`` +argument to :class:`DataFrame` when passing a list of namedtuples. Previously, +``columns`` had "rename" semantics; now it has the same "lookup" semantics as for a +list of records, meaning that any name given in ``columns`` which doesn't appear +as a key in the record will be assigned a NaN value. + +Due to this change, constructing frames from a list of +namedtuples is now roughly 50% slower. + +This change affects all supported Python versions. + +.. 
ipython:: python + + from collections import namedtuple + Foo = namedtuple("Foo", list("ab")) + tuples = [Foo(1, 3), Foo(2, 4)] + +*Previous Behavior*: + +Names passed in ``columns`` were applied by position, renaming the namedtuple fields, + +.. code-block:: python + + In [1]: pd.DataFrame(tuples, columns=['y', 'z']) + Out[1]: + y z + 0 1 3 + 1 2 4 + +*New Behavior*: + +Names passed in ``columns`` are now looked up among the namedtuple fields; +a name that matches no field yields a column filled with ``NaN``. + +.. ipython:: python + + pd.DataFrame(tuples, columns=['Q', 'a']) + Other API changes ^^^^^^^^^^^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d430cb3d3913f..7a6a2ecc96bba 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,4 +1,4 @@ -from collections import abc +from collections import abc, OrderedDict from decimal import Decimal from fractions import Fraction from numbers import Number @@ -312,28 +312,53 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(dicts: list, columns: list): +def dicts_to_array(dicts: list, columns : list): cdef: - Py_ssize_t i, j, k, n - ndarray[object, ndim=2] result - dict row + Py_ssize_t i, j, n + object result, result_columns + object row object col, onan = np.nan + dict d, nt_lookup - k = len(columns) n = len(dicts) + have_columns = len(columns) > 0 + result_columns = OrderedDict.fromkeys(list(columns or [])) + result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in columns) - result = np.empty((n, k), dtype='O') - + nt_lookup = {} for i in range(n): row = dicts[i] - for j in range(k): - col = columns[j] - if col in row: - result[i, j] = row[col] - else: - result[i, j] = onan + if hasattr(row, 'keys'): + d = row + for k in d: + v = d[k] + if k not in result_columns: + if have_columns: + continue + result_columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + result[k][i] = v + elif hasattr(row, "_fields"): + if type(row) not in 
nt_lookup: + l = [] + for j, k in enumerate(row._fields): + if k in result_columns or not have_columns: + # include this field in result + l.append((k, j)) + # create an array to store it + if k not in result_columns: + result_columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + # save (column_name, index) pairs + nt_lookup[type(row)] = l + + for k, j in nt_lookup[type(row)]: + result[k][i] = row[j] + else: + msg = "'%s' at row %d is not a valid record type" + raise ValueError(msg % (type(row), i)) - return result + return list(result_columns), list(result.values()) def fast_zip(list ndarrays): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9d622d92e0979..ca6caa6053a03 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( + Components, array_to_timedelta64, parse_timedelta_unit, precision_from_unit, @@ -901,7 +902,9 @@ def components(self): def f(x): if isna(x): - return [np.nan] * len(columns) + return Components( + np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan + ) return x.components else: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 461b5cc6232cd..843d372ee7103 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -3,7 +3,7 @@ from collections import abc from numbers import Number import re -from typing import Pattern +from typing import NamedTuple, Pattern import numpy as np @@ -380,7 +380,9 @@ def is_named_tuple(obj): False """ - return isinstance(obj, tuple) and hasattr(obj, "_fields") + return isinstance(obj, NamedTuple) or ( + isinstance(obj, tuple) and hasattr(obj, "_fields") + ) def is_hashable(obj): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cdbe0e9d22eb4..df32c6aad7820 100644 --- 
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -65,7 +65,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, @@ -441,8 +440,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: - if is_named_tuple(data[0]) and columns is None: - columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 74b16f0e72883..61013462f8d80 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -25,6 +25,7 @@ is_extension_array_dtype, is_integer_dtype, is_list_like, + is_named_tuple, is_object_dtype, ) from pandas.core.dtypes.generic import ( @@ -449,12 +450,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( + if isinstance(data[0], abc.Mapping) or is_named_tuple(data[0]): + return _list_of_records_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype ) + elif isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], ABCSeries): return _list_of_series_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype @@ -524,8 +525,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): - """Convert list of dicts to numpy arrays +def 
_list_of_records_to_arrays(data, columns, coerce_float=False, dtype=None): + """Convert list of OrderedDict to numpy array if `columns` is not passed, column names are inferred from the records - for OrderedDict and (on Python>=3.6) dicts, the column names match @@ -545,17 +546,19 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ - if columns is None: - gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + if not PY36 and columns is None: + gen = (list(x.keys() if hasattr(x, "keys") else x._fields) for x in data) + sort = not any(isinstance(d, OrderedDict) or is_named_tuple(d) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + else: + columns = list(columns) if columns is not None else [] # assure that they are of the base dict class and not of derived # classes - data = [(type(d) is dict) and d or dict(d) for d in data] - - content = list(lib.dicts_to_array(data, list(columns)).T) + data = [ + ((type(d) is dict) and d) or (is_named_tuple(d) and d) or dict(d) for d in data + ] + columns, content = lib.dicts_to_array(data, columns) return _convert_object_array( content, columns, dtype=dtype, coerce_float=coerce_float ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ebffeeaa3063e..be3d22f2619c1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1331,9 +1331,44 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples) tm.assert_frame_equal(result, expected) - # with columns - expected = DataFrame({"y": [1, 2], "z": [3, 4]}) - result = DataFrame(tuples, columns=["y", "z"]) + def test_constructor_list_of_namedtuples_new_behavior(self): + # GH27329 + from collections import namedtuple + + named_tuple = namedtuple("Pandas", list("ab")) + tuples = [named_tuple(1, 3), 
named_tuple(2, 4)] + + # namedtuples now behave like records, so if `columns` + # is passed it's treated as field selection. Previously + # it was treated as a rename. + expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]}) + result = DataFrame(tuples, columns=["a", "x"]) + tm.assert_frame_equal(result, expected) + + # new-style NamedTuple + # NOTE: Enable after py3.5 support is dropped + # from typing import NamedTuple + # class named_tuple3(NamedTuple): + # a: int + # b: int + # named_tuple3 = namedtuple("named_tuple3", list("ab")) + # tuples = [named_tuple3(1, 3), named_tuple3(2, 4)] + # expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + # result = DataFrame(tuples) + # tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = DataFrame(tuples, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # hetero columns + named_tuple1 = namedtuple("Pandas", list("ab")) + named_tuple2 = namedtuple("sandaP", list("yabx")) + tuples = [named_tuple1(1, 2), named_tuple2(3, 4, 5, 6)] + result = DataFrame(tuples) + expected = pd.DataFrame( + {"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]} + )[["a", "b", "y", "x"]] tm.assert_frame_equal(result, expected) def test_constructor_list_of_dict_order(self):