ENH: treat list of namedtuples like list of dict in DataFrame() #27494

Closed
wants to merge 7 commits into from
55 changes: 55 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -42,6 +42,61 @@ Backwards incompatible API changes
- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
-

DataFrame constructor treats list of namedtuple/dict in the same way
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Lists of :class:`typing.NamedTuple` are now treated the same way as lists of
:class:`collections.namedtuple` by the :class:`DataFrame` constructor.

The treatment of such lists has also changed in this release. Previously, only
the first element in the list was checked, and if it was a namedtuple, the
field names of that single tuple were used as the column names. Subsequent
tuples were assumed to be of the same type, and their values were looked up by
position. As a consequence, if tuples of a different type were included, any
extra fields they carried were dropped, and fields with the same names
appearing in a different order were not aligned by name.
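
For example, with two namedtuple types whose fields appear in a different
order, values were taken by position rather than by field name (an
illustration of the behavior described above; ``Foo`` and ``Bar`` are
made-up types):

.. code-block:: python

    from collections import namedtuple

    Foo = namedtuple("Foo", ["a", "b"])
    Bar = namedtuple("Bar", ["b", "a"])

    # Only Foo, the first element, was inspected, so the columns were
    # "a" and "b". Bar(3, 4) was then read positionally: 3 was placed
    # under "a" and 4 under "b", even though Bar's own field "a" holds 4.
    pd.DataFrame([Foo(1, 2), Bar(3, 4)])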

This behavior has now changed so that a list of namedtuples is treated in the
same way as a list of dicts, i.e. as a "list of records" (:issue:`27329`,
:issue:`27494`).

Additionally, this changes the semantics of the ``columns`` argument to
:class:`DataFrame` when passing a list of namedtuples. Previously, ``columns``
had "rename" semantics; now it has the same "lookup" semantics as a list of
records, meaning that any name given in ``columns`` which does not appear as a
field in the records produces a column of NaN values.

Due to this change, constructing a :class:`DataFrame` from a list of
namedtuples is now roughly 50% slower.

This change affects all supported Python versions.

.. ipython:: python

from collections import namedtuple
Foo = namedtuple("Foo", list("ab"))
tuples = [Foo(1, 3), Foo(2, 4)]

*Previous Behavior*:

Previously, ``columns`` simply renamed the fields, placing the tuple values
under the given names,

.. code-block:: python

In [1]: pd.DataFrame(tuples, columns=['y', 'z'])
Out[1]:
   y  z
0  1  3
1  2  4

*New Behavior*:

The names given in ``columns`` are now looked up in the records: ``'a'``
matches a field and keeps its values, while ``'Q'`` does not appear in any
record and is therefore filled with NaN. When ``columns`` is not passed, the
column order follows the order in which fields are first seen, considering
all the records from top to bottom.

.. ipython:: python

pd.DataFrame(tuples, columns=['Q', 'a'])

Other API changes
^^^^^^^^^^^^^^^^^

55 changes: 40 additions & 15 deletions pandas/_libs/lib.pyx
@@ -1,4 +1,4 @@
from collections import abc
from collections import abc, OrderedDict
from decimal import Decimal
from fractions import Fraction
from numbers import Number
@@ -312,28 +312,53 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):

@cython.wraparound(False)
@cython.boundscheck(False)
def dicts_to_array(dicts: list, columns: list):
def dicts_to_array(dicts: list, columns : list):
cdef:
Py_ssize_t i, j, k, n
ndarray[object, ndim=2] result
dict row
Py_ssize_t i, j, n
object result, result_columns
object row
object col, onan = np.nan
dict d, nt_lookup

k = len(columns)
n = len(dicts)
have_columns = len(columns) > 0
result_columns = OrderedDict.fromkeys(list(columns or []))
result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in columns)

result = np.empty((n, k), dtype='O')

nt_lookup = {}
for i in range(n):
row = dicts[i]
for j in range(k):
col = columns[j]
if col in row:
result[i, j] = row[col]
else:
result[i, j] = onan
if hasattr(row, 'keys'):
d = row
for k in d:
v = d[k]
if k not in result_columns:
if have_columns:
continue
result_columns[k] = None
result[k] = np.full(n, np.nan, dtype='O')
result[k][i] = v
elif hasattr(row, "_fields"):
if type(row) not in nt_lookup:
l = []
for j, k in enumerate(row._fields):
if k in result_columns or not have_columns:
# include this field in result
l.append((k, j))
# create an array to store it
if k not in result_columns:
result_columns[k] = None
result[k] = np.full(n, np.nan, dtype='O')
# save (column_name, index) pairs
nt_lookup[type(row)] = l

for k, j in nt_lookup[type(row)]:
result[k][i] = row[j]
else:
msg = "'%s' at row %d is not a valid record type"
raise ValueError(msg % (type(row), i))

return result
return list(result_columns), list(result.values())


def fast_zip(list ndarrays):
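For reviewers who prefer plain Python to Cython, a rough sketch of the
accumulation the rewritten dicts_to_array performs (names simplified; the
per-type caching of namedtuple field positions is omitted, and
records_to_columns is an illustrative stand-in, not pandas API):

    from collections import OrderedDict

    import numpy as np

    def records_to_columns(records, columns):
        # One NaN-filled object array per requested column, in order.
        have_columns = len(columns) > 0
        n = len(records)
        result = OrderedDict((c, np.full(n, np.nan, dtype="O")) for c in columns)
        for i, row in enumerate(records):
            if hasattr(row, "keys"):         # dict-like record
                items = row.items()
            elif hasattr(row, "_fields"):    # namedtuple record
                items = zip(row._fields, row)
            else:
                raise ValueError(
                    "'%s' at row %d is not a valid record type" % (type(row), i)
                )
            for key, value in items:
                if key not in result:
                    if have_columns:
                        # Explicit columns were requested: ignore other fields.
                        continue
                    result[key] = np.full(n, np.nan, dtype="O")
                result[key][i] = value
        return list(result), list(result.values())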
5 changes: 4 additions & 1 deletion pandas/core/arrays/timedeltas.py
@@ -9,6 +9,7 @@
from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
from pandas._libs.tslibs.fields import get_timedelta_field
from pandas._libs.tslibs.timedeltas import (
Components,
array_to_timedelta64,
parse_timedelta_unit,
precision_from_unit,
@@ -901,7 +902,9 @@ def components(self):

def f(x):
if isna(x):
return [np.nan] * len(columns)
return Components(
np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
)
return x.components

else:
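Context for this hunk: the components property in pandas/core/arrays/timedeltas.py
builds a frame from a list of per-element results, and once the first element is
a Components namedtuple the new list-of-records handling expects every element to
be a record, so missing values now yield an all-NaN Components namedtuple instead
of a bare list. A standalone sketch of the idea (Components is re-declared locally
as a stand-in for the real tuple in pandas._libs.tslibs.timedeltas, and
components_or_nan is an illustrative name, not the pandas helper):

    from collections import namedtuple

    import numpy as np

    # Local stand-in with the same field names as the real Components tuple.
    Components = namedtuple(
        "Components",
        ["days", "hours", "minutes", "seconds",
         "milliseconds", "microseconds", "nanoseconds"],
    )

    def components_or_nan(value):
        # Mirrors the patched helper: a missing value becomes an all-NaN
        # record, so every row handed to the DataFrame constructor is a
        # namedtuple with the same fields.
        if value is None:  # stand-in for pandas.isna(value)
            return Components(*([np.nan] * 7))
        return value.components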
6 changes: 4 additions & 2 deletions pandas/core/dtypes/inference.py
@@ -3,7 +3,7 @@
from collections import abc
from numbers import Number
import re
from typing import Pattern
from typing import NamedTuple, Pattern

import numpy as np

@@ -380,7 +380,9 @@ def is_named_tuple(obj):
False
"""

return isinstance(obj, tuple) and hasattr(obj, "_fields")
return isinstance(obj, NamedTuple) or (
isinstance(obj, tuple) and hasattr(obj, "_fields")
)


def is_hashable(obj):
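A quick sanity check of what the updated predicate accepts: classes created
with typing.NamedTuple are ordinary tuple subclasses that carry _fields, so
they already satisfy the second clause (a standalone sketch, not part of the
PR; Point and OldStyle are made-up examples):

    from collections import namedtuple
    from typing import NamedTuple

    class Point(NamedTuple):
        x: int
        y: int

    OldStyle = namedtuple("OldStyle", ["x", "y"])

    for obj in (Point(1, 2), OldStyle(1, 2), (1, 2)):
        # Both namedtuple flavours are tuple subclasses exposing _fields;
        # a plain tuple is not a record.
        print(type(obj).__name__, isinstance(obj, tuple), hasattr(obj, "_fields"))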
3 changes: 0 additions & 3 deletions pandas/core/frame.py
@@ -65,7 +65,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_nested_list_like,
is_object_dtype,
is_scalar,
@@ -441,8 +440,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
data = list(data)
if len(data) > 0:
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

29 changes: 16 additions & 13 deletions pandas/core/internals/construction.py
@@ -25,6 +25,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
@@ -449,12 +450,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
if columns is not None:
return [[]] * len(columns), columns
return [], [] # columns if columns is not None else []
if isinstance(data[0], (list, tuple)):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], abc.Mapping):
return _list_of_dict_to_arrays(
if isinstance(data[0], abc.Mapping) or is_named_tuple(data[0]):
return _list_of_records_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
)
elif isinstance(data[0], (list, tuple)):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], ABCSeries):
return _list_of_series_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
@@ -524,8 +525,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
return values.T, columns


def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
"""Convert list of dicts to numpy arrays
def _list_of_records_to_arrays(data, columns, coerce_float=False, dtype=None):
"""Convert list of OrderedDict to numpy array

if `columns` is not passed, column names are inferred from the records
- for OrderedDict and (on Python>=3.6) dicts, the column names match
@@ -545,17 +546,19 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
tuple
arrays, columns
"""
if columns is None:
gen = (list(x.keys()) for x in data)
types = (dict, OrderedDict) if PY36 else OrderedDict
sort = not any(isinstance(d, types) for d in data)
if not PY36 and columns is None:
gen = (list(x.keys() if hasattr(x, "keys") else x._fields) for x in data)
sort = not any(isinstance(d, OrderedDict) or is_named_tuple(d) for d in data)
columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
else:
columns = list(columns) if columns is not None else []

# assure that they are of the base dict class and not of derived
# classes
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
data = [
((type(d) is dict) and d) or (is_named_tuple(d) and d) or dict(d) for d in data
]
columns, content = lib.dicts_to_array(data, columns)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
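The reordering in to_arrays matters because namedtuples are themselves tuple
subclasses: the record branch (Mapping or namedtuple) has to be checked before
the generic (list, tuple) branch, otherwise namedtuple records would fall
through to the positional path. A minimal demonstration (Row is a made-up type):

    from collections import namedtuple

    Row = namedtuple("Row", ["a", "b"])
    r = Row(1, 2)

    # A namedtuple matches both checks used in the dispatch.
    print(isinstance(r, (list, tuple)))  # True
    print(hasattr(r, "_fields"))         # True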
41 changes: 38 additions & 3 deletions pandas/tests/frame/test_constructors.py
@@ -1331,9 +1331,44 @@ def test_constructor_list_of_namedtuples(self):
result = DataFrame(tuples)
tm.assert_frame_equal(result, expected)

# with columns
expected = DataFrame({"y": [1, 2], "z": [3, 4]})
result = DataFrame(tuples, columns=["y", "z"])
def test_constructor_list_of_namedtuples_new_behavior(self):
# GH27329
from collections import namedtuple

named_tuple = namedtuple("Pandas", list("ab"))
tuples = [named_tuple(1, 3), named_tuple(2, 4)]

# namedtuples now behave like records, so if `columns`
# is passed it's treated as field selection. Previously
# it was treated as a rename.
expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]})
result = DataFrame(tuples, columns=["a", "x"])
tm.assert_frame_equal(result, expected)

# new-style NamedTuple
# NOTE: Enable after py3.5 support is dropped
# from typing import NamedTuple
# class named_tuple3(NamedTuple):
# a: int
# b: int
# named_tuple3 = namedtuple("named_tuple3", list("ab"))
# tuples = [named_tuple3(1, 3), named_tuple3(2, 4)]
# expected = DataFrame({"a": [1, 2], "b": [3, 4]})
# result = DataFrame(tuples)
# tm.assert_frame_equal(result, expected)

expected = DataFrame({"a": [1, 2], "b": [3, 4]})
result = DataFrame(tuples, columns=["a", "b"])
tm.assert_frame_equal(result, expected)

# hetero columns
named_tuple1 = namedtuple("Pandas", list("ab"))
named_tuple2 = namedtuple("sandaP", list("yabx"))
tuples = [named_tuple1(1, 2), named_tuple2(3, 4, 5, 6)]
result = DataFrame(tuples)
expected = pd.DataFrame(
{"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]}
)[["a", "b", "y", "x"]]
tm.assert_frame_equal(result, expected)

def test_constructor_list_of_dict_order(self):