ENH: Preserve key order when passing list of dicts to DataFrame on py 3.6+ (#27309)

pilkibun · jreback · commit f1684a15d0ab · 2019-07-17T07:46:53.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -400,7 +400,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
 .. _whatsnew_0250.api_breaking.groupby_categorical:
 
 Categorical dtypes are preserved during groupby
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`)
 
@@ -740,6 +740,47 @@ consistent with NumPy and the rest of pandas (:issue:`21801`).
    cat.argsort()
    cat[cat.argsort()]
 
+.. _whatsnew_0250.api_breaking.list_of_dict:
+
+Column order is preserved when passing a list of dicts to DataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Starting with Python 3.7 the key-order of ``dict`` is `guaranteed <https://mail.python.org/pipermail/python-dev/2017-December/151283.html>`_. In practice, this has been true since
+Python 3.6. The :class:`DataFrame` constructor now treats a list of dicts in the same way as
+it does a list of ``OrderedDict``, i.e. preserving the key order of the dicts.
+This change applies only when pandas is running on Python>=3.6 (:issue:`27309`).
+
+.. ipython:: python
+
+   data = [
+       {'name': 'Joe', 'state': 'NY', 'age': 18},
+       {'name': 'Jane', 'state': 'KY', 'age': 19, 'hobby': 'Minecraft'},
+       {'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'}
+   ]
+
+*Previous Behavior*:
+
+The columns were lexicographically sorted previously,
+
+.. code-block:: python
+
+   In [1]: pd.DataFrame(data)
+   Out[1]:
+      age finances      hobby  name state
+   0   18      NaN        NaN   Joe    NY
+   1   19      NaN  Minecraft  Jane    KY
+   2   20     good        NaN  Jean    OK
+
+*New Behavior*:
+
+The column order now matches the insertion-order of the keys in the ``dict``,
+considering all the records from top to bottom. As a consequence, the column
+order of the resulting DataFrame has changed compared to previous pandas versions.
+
+.. ipython:: python
+
+   pd.DataFrame(data)
+
 .. _whatsnew_0250.api_breaking.deps:
 
 Increased minimum versions for dependencies
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -314,8 +314,12 @@ class DataFrame(NDFrame):
         Dict can contain Series, arrays, constants, or list-like objects
 
         .. versionchanged :: 0.23.0
-           If data is a dict, argument order is maintained for Python 3.6
-           and later.
+           If data is a dict, column order follows insertion-order for
+           Python 3.6 and later.
+
+        .. versionchanged :: 0.25.0
+           If data is a list of dicts, column order follows insertion-order
+           for Python 3.6 and later.
 
     index : Index or array-like
         Index to use for resulting frame. Will default to RangeIndex if
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -10,7 +10,7 @@
 from pandas._libs import lib
 from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime
 import pandas.compat as compat
-from pandas.compat import raise_with_traceback
+from pandas.compat import PY36, raise_with_traceback
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
@@ -536,9 +536,30 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
 
 
 def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+    """Convert list of dicts to numpy arrays
+
+    If `columns` is not passed, column names are inferred from the records:
+    - For OrderedDict and (on Python>=3.6) dicts, the column names match
+      the key insertion-order from the first record to the last.
+    - For other kinds of dict-likes, the keys are lexically sorted.
+
+    Parameters
+    ----------
+    data : iterable
+        collection of records (OrderedDict, dict)
+    columns : iterable or None
+    coerce_float : bool
+    dtype : np.dtype
+
+    Returns
+    -------
+    tuple
+        arrays, columns
+    """
     if columns is None:
         gen = (list(x.keys()) for x in data)
-        sort = not any(isinstance(d, OrderedDict) for d in data)
+        types = (dict, OrderedDict) if PY36 else OrderedDict
+        sort = not any(isinstance(d, types) for d in data)
         columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
 
     # assure that they are of the base dict class and not of derived
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -1119,7 +1119,7 @@ def test_constructor_generator(self):
         expected = DataFrame({0: range(10), 1: "a"})
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
-    def test_constructor_list_of_dicts(self):
+    def test_constructor_list_of_odicts(self):
         data = [
             OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
             OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
@@ -1340,6 +1340,26 @@ def test_constructor_list_of_namedtuples(self):
         result = DataFrame(tuples, columns=["y", "z"])
         tm.assert_frame_equal(result, expected)
 
+    def test_constructor_list_of_dict_order(self):
+        # GH10056
+        data = [
+            {"First": 1, "Second": 4, "Third": 7, "Fourth": 10},
+            {"Second": 5, "First": 2, "Fourth": 11, "Third": 8},
+            {"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13},
+        ]
+        expected = DataFrame(
+            {
+                "First": [1, 2, 3],
+                "Second": [4, 5, 6],
+                "Third": [7, 8, 9],
+                "Fourth": [10, 11, 12],
+                "YYY": [None, None, 14],
+                "XXX": [None, None, 13],
+            }
+        )
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected, check_like=not PY36)
+
     def test_constructor_orient(self, float_string_frame):
         data_dict = float_string_frame.T._series
         recons = DataFrame.from_dict(data_dict, orient="index")
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import PY36
+
 from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
 
 import pandas as pd
@@ -228,8 +230,10 @@ def test_setitem_dtype_upcast(self):
         assert df["c"].dtype == np.float64
 
         df.loc[0, "c"] = "foo"
-        expected = DataFrame([{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}])
-        tm.assert_frame_equal(df, expected)
+        expected = DataFrame(
+            [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
+        )
+        tm.assert_frame_equal(df, expected, check_like=not PY36)
 
         # GH10280
         df = DataFrame(
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import PY36
+
 from pandas import DataFrame, Index
 import pandas.util.testing as tm
 
@@ -351,9 +353,9 @@ def test_non_ascii_key(self):
         ).decode("utf8")
 
         testdata = {
+            b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
             "sub.A": [1, 3],
             "sub.B": [2, 4],
-            b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
         }
         expected = DataFrame(testdata)
 
@@ -366,21 +368,21 @@ def test_missing_field(self, author_missing_data):
         ex_data = [
             {
                 "info": np.nan,
-                "author_name.first": np.nan,
-                "author_name.last_name": np.nan,
                 "info.created_at": np.nan,
                 "info.last_updated": np.nan,
+                "author_name.first": np.nan,
+                "author_name.last_name": np.nan,
             },
             {
                 "info": None,
-                "author_name.first": "Jane",
-                "author_name.last_name": "Doe",
                 "info.created_at": "11/08/1993",
                 "info.last_updated": "26/05/2012",
+                "author_name.first": "Jane",
+                "author_name.last_name": "Doe",
             },
         ]
         expected = DataFrame(ex_data)
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_like=not PY36)
 
     @pytest.mark.parametrize(
         "max_level,expected",
@@ -508,12 +510,13 @@ def test_missing_meta(self, missing_metadata):
             data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
         )
         ex_data = [
-            ["Massillon", 9562, "OH", "Morris St.", 44646, "Alice"],
-            ["Elizabethton", 8449, "TN", "Spring St.", 37643, np.nan],
+            [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
+            [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
         ]
         columns = ["city", "number", "state", "street", "zip", "name"]
+        columns = ["number", "street", "city", "state", "zip", "name"]
         expected = DataFrame(ex_data, columns=columns)
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_like=not PY36)
 
     def test_donot_drop_nonevalues(self):
         # GH21356