Skip to content

Commit f1684a1

Browse files
pilkibunjreback
pilkibun
authored andcommitted
ENH: Preserve key order when passing list of dicts to DataFrame on py 3.6+ (#27309)
1 parent 58f56b7 commit f1684a1

File tree

6 files changed

+110
-17
lines changed

6 files changed

+110
-17
lines changed

doc/source/whatsnew/v0.25.0.rst

+42-1
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
400400
.. _whatsnew_0250.api_breaking.groupby_categorical:
401401

402402
Categorical dtypes are preserved during groupby
403-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
403+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
404404

405405
Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`)
406406

@@ -740,6 +740,47 @@ consistent with NumPy and the rest of pandas (:issue:`21801`).
740740
cat.argsort()
741741
cat[cat.argsort()]
742742
743+
.. _whatsnew_0250.api_breaking.list_of_dict:
744+
745+
Column order is preserved when passing a list of dicts to DataFrame
746+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
747+
748+
Starting with Python 3.7 the key-order of ``dict`` is `guaranteed <https://mail.python.org/pipermail/python-dev/2017-December/151283.html>`_. In practice, this has been true since
749+
Python 3.6. The :class:`DataFrame` constructor now treats a list of dicts in the same way as
750+
it does a list of ``OrderedDict``, i.e. preserving the key order of the dicts.
751+
This change applies only when pandas is running on Python>=3.6 (:issue:`27309`).
752+
753+
.. ipython:: python
754+
755+
data = [
756+
{'name': 'Joe', 'state': 'NY', 'age': 18},
757+
{'name': 'Jane', 'state': 'KY', 'age': 19, 'hobby': 'Minecraft'},
758+
{'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'}
759+
]
760+
761+
*Previous Behavior*:
762+
763+
The columns were lexicographically sorted previously,
764+
765+
.. code-block:: python
766+
767+
In [1]: pd.DataFrame(data)
768+
Out[1]:
769+
age finances hobby name state
770+
0 18 NaN NaN Joe NY
771+
1 19 NaN Minecraft Jane KY
772+
2 20 good NaN Jean OK
773+
774+
*New Behavior*:
775+
776+
The column order now matches the insertion-order of the keys in the ``dict``,
777+
considering all the records from top to bottom. As a consequence, the column
778+
order of the resulting DataFrame has changed compared to previous pandas versions.
779+
780+
.. ipython:: python
781+
782+
pd.DataFrame(data)
783+
743784
.. _whatsnew_0250.api_breaking.deps:
744785
745786
Increased minimum versions for dependencies

pandas/core/frame.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,12 @@ class DataFrame(NDFrame):
314314
Dict can contain Series, arrays, constants, or list-like objects
315315
316316
.. versionchanged :: 0.23.0
317-
If data is a dict, argument order is maintained for Python 3.6
318-
and later.
317+
If data is a dict, column order follows insertion-order for
318+
Python 3.6 and later.
319+
320+
.. versionchanged :: 0.25.0
321+
If data is a list of dicts, column order follows insertion-order for
322+
Python 3.6 and later.
319323
320324
index : Index or array-like
321325
Index to use for resulting frame. Will default to RangeIndex if

pandas/core/internals/construction.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas._libs import lib
1111
from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime
1212
import pandas.compat as compat
13-
from pandas.compat import raise_with_traceback
13+
from pandas.compat import PY36, raise_with_traceback
1414

1515
from pandas.core.dtypes.cast import (
1616
construct_1d_arraylike_from_scalar,
@@ -536,9 +536,30 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
536536

537537

538538
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
539+
"""Convert list of dicts to numpy arrays
540+
541+
if `columns` is not passed, column names are inferred from the records
542+
- For OrderedDict and (on Python>=3.6) dicts, the column names match
543+
the key insertion-order from the first record to the last.
544+
- For other kinds of dict-likes, the keys are lexically sorted.
545+
546+
Parameters
547+
----------
548+
data : iterable
549+
collection of records (OrderedDict, dict)
550+
columns : iterable or None
551+
coerce_float : bool
552+
dtype : np.dtype
553+
554+
Returns
555+
-------
556+
tuple
557+
arrays, columns
558+
"""
539559
if columns is None:
540560
gen = (list(x.keys()) for x in data)
541-
sort = not any(isinstance(d, OrderedDict) for d in data)
561+
types = (dict, OrderedDict) if PY36 else OrderedDict
562+
sort = not any(isinstance(d, types) for d in data)
542563
columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
543564

544565
# assure that they are of the base dict class and not of derived

pandas/tests/frame/test_constructors.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -1119,7 +1119,7 @@ def test_constructor_generator(self):
11191119
expected = DataFrame({0: range(10), 1: "a"})
11201120
tm.assert_frame_equal(result, expected, check_dtype=False)
11211121

1122-
def test_constructor_list_of_dicts(self):
1122+
def test_constructor_list_of_odicts(self):
11231123
data = [
11241124
OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
11251125
OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
@@ -1340,6 +1340,26 @@ def test_constructor_list_of_namedtuples(self):
13401340
result = DataFrame(tuples, columns=["y", "z"])
13411341
tm.assert_frame_equal(result, expected)
13421342

1343+
def test_constructor_list_of_dict_order(self):
1344+
# GH10056
1345+
data = [
1346+
{"First": 1, "Second": 4, "Third": 7, "Fourth": 10},
1347+
{"Second": 5, "First": 2, "Fourth": 11, "Third": 8},
1348+
{"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13},
1349+
]
1350+
expected = DataFrame(
1351+
{
1352+
"First": [1, 2, 3],
1353+
"Second": [4, 5, 6],
1354+
"Third": [7, 8, 9],
1355+
"Fourth": [10, 11, 12],
1356+
"YYY": [None, None, 14],
1357+
"XXX": [None, None, 13],
1358+
}
1359+
)
1360+
result = DataFrame(data)
1361+
tm.assert_frame_equal(result, expected, check_like=not PY36)
1362+
13431363
def test_constructor_orient(self, float_string_frame):
13441364
data_dict = float_string_frame.T._series
13451365
recons = DataFrame.from_dict(data_dict, orient="index")

pandas/tests/indexing/test_indexing.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import numpy as np
99
import pytest
1010

11+
from pandas.compat import PY36
12+
1113
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
1214

1315
import pandas as pd
@@ -228,8 +230,10 @@ def test_setitem_dtype_upcast(self):
228230
assert df["c"].dtype == np.float64
229231

230232
df.loc[0, "c"] = "foo"
231-
expected = DataFrame([{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}])
232-
tm.assert_frame_equal(df, expected)
233+
expected = DataFrame(
234+
[{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
235+
)
236+
tm.assert_frame_equal(df, expected, check_like=not PY36)
233237

234238
# GH10280
235239
df = DataFrame(

pandas/tests/io/json/test_normalize.py

+12-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.compat import PY36
7+
68
from pandas import DataFrame, Index
79
import pandas.util.testing as tm
810

@@ -351,9 +353,9 @@ def test_non_ascii_key(self):
351353
).decode("utf8")
352354

353355
testdata = {
356+
b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
354357
"sub.A": [1, 3],
355358
"sub.B": [2, 4],
356-
b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
357359
}
358360
expected = DataFrame(testdata)
359361

@@ -366,21 +368,21 @@ def test_missing_field(self, author_missing_data):
366368
ex_data = [
367369
{
368370
"info": np.nan,
369-
"author_name.first": np.nan,
370-
"author_name.last_name": np.nan,
371371
"info.created_at": np.nan,
372372
"info.last_updated": np.nan,
373+
"author_name.first": np.nan,
374+
"author_name.last_name": np.nan,
373375
},
374376
{
375377
"info": None,
376-
"author_name.first": "Jane",
377-
"author_name.last_name": "Doe",
378378
"info.created_at": "11/08/1993",
379379
"info.last_updated": "26/05/2012",
380+
"author_name.first": "Jane",
381+
"author_name.last_name": "Doe",
380382
},
381383
]
382384
expected = DataFrame(ex_data)
383-
tm.assert_frame_equal(result, expected)
385+
tm.assert_frame_equal(result, expected, check_like=not PY36)
384386

385387
@pytest.mark.parametrize(
386388
"max_level,expected",
@@ -508,12 +510,13 @@ def test_missing_meta(self, missing_metadata):
508510
data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
509511
)
510512
ex_data = [
511-
["Massillon", 9562, "OH", "Morris St.", 44646, "Alice"],
512-
["Elizabethton", 8449, "TN", "Spring St.", 37643, np.nan],
513+
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
514+
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
513515
]
514516
columns = ["city", "number", "state", "street", "zip", "name"]
517+
columns = ["number", "street", "city", "state", "zip", "name"]
515518
expected = DataFrame(ex_data, columns=columns)
516-
tm.assert_frame_equal(result, expected)
519+
tm.assert_frame_equal(result, expected, check_like=not PY36)
517520

518521
def test_donot_drop_nonevalues(self):
519522
# GH21356

0 commit comments

Comments
 (0)