ENH: treat list of namedtuples like list of dict in DataFrame() #27494

Closed
wants to merge 7 commits into from
55 changes: 55 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -42,6 +42,61 @@ Backwards incompatible API changes
- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
-

DataFrame constructor treats list of namedtuple/dict in the same way
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Lists of :class:`typing.NamedTuple` are now treated the same way as lists of
:class:`collections.namedtuple` by the :class:`DataFrame` constructor.

The treatment of such lists has also changed in this release. Previously, only
the first element in the list was checked, and if it was a namedtuple, the
field names of that single tuple were used as the column names. Subsequent
tuples were assumed to be of the same type, and their values were looked up by
position. As a consequence, if tuples of a different type were included, any
extra fields they carried were dropped, and fields with the same names
appearing in a different order were not aligned by name.
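
For example, with two namedtuple types whose fields appear in a different
order, values were taken by position rather than by field name (an
illustration of the behavior described above; ``Foo`` and ``Bar`` are
made-up types):

.. code-block:: python

    from collections import namedtuple

    Foo = namedtuple("Foo", ["a", "b"])
    Bar = namedtuple("Bar", ["b", "a"])

    # Only Foo, the first element, was inspected, so the columns were
    # "a" and "b". Bar(3, 4) was then read positionally: 3 was placed
    # under "a" and 4 under "b", even though Bar's own field "a" holds 4.
    pd.DataFrame([Foo(1, 2), Bar(3, 4)])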

This behavior has now changed so that a list of namedtuples is treated in the
same way as a list of dicts, i.e. as a "list of records" (:issue:`27329`,
:issue:`27494`).

Additionally, this changes the semantics of the ``columns`` argument to
:class:`DataFrame` when passing a list of namedtuples. Previously, ``columns``
had "rename" semantics; now it has the same "lookup" semantics as a list of
records, meaning that any name given in ``columns`` which does not appear as a
field in the records produces a column of NaN values.

Due to this change, constructing a :class:`DataFrame` from a list of
namedtuples is now roughly 50% slower.

This change affects all supported Python versions.

.. ipython:: python

from collections import namedtuple
Foo = namedtuple("Foo", list("ab"))
tuples = [Foo(1, 3), Foo(2, 4)]

*Previous Behavior*:

Previously, ``columns`` simply renamed the fields, placing the tuple values
under the given names,

.. code-block:: python

In [1]: pd.DataFrame(tuples, columns=['y', 'z'])
Out[1]:
   y  z
0  1  3
1  2  4

*New Behavior*:

The names given in ``columns`` are now looked up in the records: ``'a'``
matches a field and keeps its values, while ``'Q'`` does not appear in any
record and is therefore filled with NaN. When ``columns`` is not passed, the
column order follows the order in which fields are first seen, considering
all the records from top to bottom.

.. ipython:: python

pd.DataFrame(tuples, columns=['Q', 'a'])

Other API changes
^^^^^^^^^^^^^^^^^

55 changes: 40 additions & 15 deletions pandas/_libs/lib.pyx
@@ -1,4 +1,4 @@
from collections import abc
from collections import abc, OrderedDict
from decimal import Decimal
from fractions import Fraction
from numbers import Number
@@ -312,28 +312,53 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):

@cython.wraparound(False)
@cython.boundscheck(False)
def dicts_to_array(dicts: list, columns: list):
def dicts_to_array(dicts: list, columns : list):
cdef:
Py_ssize_t i, j, k, n
ndarray[object, ndim=2] result
dict row
Py_ssize_t i, j, n
object result, result_columns
object row
object col, onan = np.nan
dict d, nt_lookup

k = len(columns)
n = len(dicts)
have_columns = len(columns) > 0
result_columns = OrderedDict.fromkeys(list(columns or []))
result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in columns)

result = np.empty((n, k), dtype='O')

nt_lookup = {}
for i in range(n):
row = dicts[i]
for j in range(k):
col = columns[j]
if col in row:
result[i, j] = row[col]
else:
result[i, j] = onan
if hasattr(row, 'keys'):
d = row
for k in d:
v = d[k]
if k not in result_columns:
if have_columns:
continue
result_columns[k] = None
result[k] = np.full(n, np.nan, dtype='O')
result[k][i] = v
elif hasattr(row, "_fields"):
if type(row) not in nt_lookup:
l = []
for j, k in enumerate(row._fields):
if k in result_columns or not have_columns:
# include this field in result
l.append((k, j))
# create an array to store it
if k not in result_columns:
result_columns[k] = None
result[k] = np.full(n, np.nan, dtype='O')
# save (column_name, index) pairs
nt_lookup[type(row)] = l

for k, j in nt_lookup[type(row)]:
result[k][i] = row[j]
else:
msg = "'%s' at row %d is not a valid record type"
raise ValueError(msg % (type(row), i))

return result
return list(result_columns), list(result.values())


def fast_zip(list ndarrays):
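For reviewers who prefer plain Python to Cython, a rough sketch of the
accumulation the rewritten dicts_to_array performs (names simplified; the
per-type caching of namedtuple field positions is omitted, and
records_to_columns is an illustrative stand-in, not pandas API):

    from collections import OrderedDict

    import numpy as np

    def records_to_columns(records, columns):
        # One NaN-filled object array per requested column, in order.
        have_columns = len(columns) > 0
        n = len(records)
        result = OrderedDict((c, np.full(n, np.nan, dtype="O")) for c in columns)
        for i, row in enumerate(records):
            if hasattr(row, "keys"):         # dict-like record
                items = row.items()
            elif hasattr(row, "_fields"):    # namedtuple record
                items = zip(row._fields, row)
            else:
                raise ValueError(
                    "'%s' at row %d is not a valid record type" % (type(row), i)
                )
            for key, value in items:
                if key not in result:
                    if have_columns:
                        # Explicit columns were requested: ignore other fields.
                        continue
                    result[key] = np.full(n, np.nan, dtype="O")
                result[key][i] = value
        return list(result), list(result.values())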
5 changes: 4 additions & 1 deletion pandas/core/arrays/timedeltas.py
@@ -9,6 +9,7 @@
from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
from pandas._libs.tslibs.fields import get_timedelta_field
from pandas._libs.tslibs.timedeltas import (
Components,
array_to_timedelta64,
parse_timedelta_unit,
precision_from_unit,
@@ -901,7 +902,9 @@ def components(self):

def f(x):
if isna(x):
return [np.nan] * len(columns)
return Components(
np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
)
return x.components

else:
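Context for this hunk: the components property in pandas/core/arrays/timedeltas.py
builds a frame from a list of per-element results, and once the first element is
a Components namedtuple the new list-of-records handling expects every element to
be a record, so missing values now yield an all-NaN Components namedtuple instead
of a bare list. A standalone sketch of the idea (Components is re-declared locally
as a stand-in for the real tuple in pandas._libs.tslibs.timedeltas, and
components_or_nan is an illustrative name, not the pandas helper):

    from collections import namedtuple

    import numpy as np

    # Local stand-in with the same field names as the real Components tuple.
    Components = namedtuple(
        "Components",
        ["days", "hours", "minutes", "seconds",
         "milliseconds", "microseconds", "nanoseconds"],
    )

    def components_or_nan(value):
        # Mirrors the patched helper: a missing value becomes an all-NaN
        # record, so every row handed to the DataFrame constructor is a
        # namedtuple with the same fields.
        if value is None:  # stand-in for pandas.isna(value)
            return Components(*([np.nan] * 7))
        return value.components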
6 changes: 4 additions & 2 deletions pandas/core/dtypes/inference.py
@@ -3,7 +3,7 @@
from collections import abc
from numbers import Number
import re
from typing import Pattern
from typing import NamedTuple, Pattern

import numpy as np

@@ -380,7 +380,9 @@ def is_named_tuple(obj):
False
"""

return isinstance(obj, tuple) and hasattr(obj, "_fields")
return isinstance(obj, NamedTuple) or (
isinstance(obj, tuple) and hasattr(obj, "_fields")
)


def is_hashable(obj):
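A quick sanity check of what the updated predicate accepts: classes created
with typing.NamedTuple are ordinary tuple subclasses that carry _fields, so
they already satisfy the second clause (a standalone sketch, not part of the
PR; Point and OldStyle are made-up examples):

    from collections import namedtuple
    from typing import NamedTuple

    class Point(NamedTuple):
        x: int
        y: int

    OldStyle = namedtuple("OldStyle", ["x", "y"])

    for obj in (Point(1, 2), OldStyle(1, 2), (1, 2)):
        # Both namedtuple flavours are tuple subclasses exposing _fields;
        # a plain tuple is not a record.
        print(type(obj).__name__, isinstance(obj, tuple), hasattr(obj, "_fields"))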
3 changes: 0 additions & 3 deletions pandas/core/frame.py
@@ -65,7 +65,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_nested_list_like,
is_object_dtype,
is_scalar,
@@ -441,8 +440,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
data = list(data)
if len(data) > 0:
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

29 changes: 16 additions & 13 deletions pandas/core/internals/construction.py
@@ -25,6 +25,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
@@ -449,12 +450,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
if columns is not None:
return [[]] * len(columns), columns
return [], [] # columns if columns is not None else []
if isinstance(data[0], (list, tuple)):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], abc.Mapping):
return _list_of_dict_to_arrays(
if isinstance(data[0], abc.Mapping) or is_named_tuple(data[0]):
return _list_of_records_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
)
elif isinstance(data[0], (list, tuple)):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], ABCSeries):
return _list_of_series_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
@@ -524,8 +525,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
return values.T, columns


def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
"""Convert list of dicts to numpy arrays
def _list_of_records_to_arrays(data, columns, coerce_float=False, dtype=None):
"""Convert list of OrderedDict to numpy array

if `columns` is not passed, column names are inferred from the records
- for OrderedDict and (on Python>=3.6) dicts, the column names match
@@ -545,17 +546,19 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
tuple
arrays, columns
"""
if columns is None:
gen = (list(x.keys()) for x in data)
types = (dict, OrderedDict) if PY36 else OrderedDict
sort = not any(isinstance(d, types) for d in data)
if not PY36 and columns is None:
gen = (list(x.keys() if hasattr(x, "keys") else x._fields) for x in data)
sort = not any(isinstance(d, OrderedDict) or is_named_tuple(d) for d in data)
columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
else:
columns = list(columns) if columns is not None else []

# assure that they are of the base dict class and not of derived
# classes
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
data = [
((type(d) is dict) and d) or (is_named_tuple(d) and d) or dict(d) for d in data
]
columns, content = lib.dicts_to_array(data, columns)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
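The reordering in to_arrays matters because namedtuples are themselves tuple
subclasses: the record branch (Mapping or namedtuple) has to be checked before
the generic (list, tuple) branch, otherwise namedtuple records would fall
through to the positional path. A minimal demonstration (Row is a made-up type):

    from collections import namedtuple

    Row = namedtuple("Row", ["a", "b"])
    r = Row(1, 2)

    # A namedtuple matches both checks used in the dispatch.
    print(isinstance(r, (list, tuple)))  # True
    print(hasattr(r, "_fields"))         # True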
41 changes: 38 additions & 3 deletions pandas/tests/frame/test_constructors.py
@@ -1331,9 +1331,44 @@ def test_constructor_list_of_namedtuples(self):
result = DataFrame(tuples)
tm.assert_frame_equal(result, expected)

# with columns
expected = DataFrame({"y": [1, 2], "z": [3, 4]})
result = DataFrame(tuples, columns=["y", "z"])
def test_constructor_list_of_namedtuples_new_behavior(self):
# GH27329
from collections import namedtuple

named_tuple = namedtuple("Pandas", list("ab"))
tuples = [named_tuple(1, 3), named_tuple(2, 4)]

# namedtuples now behave like records, so if `columns`
# is passed it's treated as field selection. Previously
# it was treated as a rename.
expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]})
result = DataFrame(tuples, columns=["a", "x"])
tm.assert_frame_equal(result, expected)

# new-style NamedTuple
# NOTE: Enable after py3.5 support is dropped
# from typing import NamedTuple
# class named_tuple3(NamedTuple):
# a: int
# b: int
# named_tuple3 = namedtuple("named_tuple3", list("ab"))
# tuples = [named_tuple3(1, 3), named_tuple3(2, 4)]
# expected = DataFrame({"a": [1, 2], "b": [3, 4]})
# result = DataFrame(tuples)
# tm.assert_frame_equal(result, expected)

expected = DataFrame({"a": [1, 2], "b": [3, 4]})
result = DataFrame(tuples, columns=["a", "b"])
tm.assert_frame_equal(result, expected)

# hetero columns
named_tuple1 = namedtuple("Pandas", list("ab"))
named_tuple2 = namedtuple("sandaP", list("yabx"))
tuples = [named_tuple1(1, 2), named_tuple2(3, 4, 5, 6)]
result = DataFrame(tuples)
expected = pd.DataFrame(
{"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]}
)[["a", "b", "y", "x"]]
tm.assert_frame_equal(result, expected)

def test_constructor_list_of_dict_order(self):