diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 193a0edee5e96..805fe21bdcc9d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -400,7 +400,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t .. _whatsnew_0250.api_breaking.groupby_categorical: Categorical dtypes are preserved during groupby -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) @@ -741,6 +741,47 @@ consistent with NumPy and the rest of pandas (:issue:`21801`). cat.argsort() cat[cat.argsort()] +.. _whatsnew_0250.api_breaking.list_of_dict: + +Column order is preserved when passing a list of dicts to DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Starting with Python 3.7 the key-order of ``dict`` is `guaranteed <https://mail.python.org/pipermail/python-dev/2017-December/151283.html>`_. In practice, this has been true since +Python 3.6. The :class:`DataFrame` constructor now treats a list of dicts in the same way as +it does a list of ``OrderedDict``, i.e. preserving the order of the dicts. +This change applies only when pandas is running on Python>=3.6 (:issue:`27309`). + +.. ipython:: python + + data = [ + {'name': 'Joe', 'state': 'NY', 'age': 18}, + {'name': 'Jane', 'state': 'KY', 'age': 19, 'hobby': 'Minecraft'}, + {'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'} + ] + +*Previous Behavior*: + +The columns were lexicographically sorted previously, + +.. code-block:: python + + In [1]: pd.DataFrame(data) + Out[1]: + age finances hobby name state + 0 18 NaN NaN Joe NY + 1 19 NaN Minecraft Jane KY + 2 20 good NaN Jean OK + +*New Behavior*: + +The column order now matches the insertion-order of the keys in the ``dict``, +considering all the records from top to bottom.
As a consequence, the column +order of the resulting DataFrame has changed compared to previous pandas versions. + +.. ipython:: python + + pd.DataFrame(data) + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1989fd62b6ee..a4d355de3d8f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -313,8 +313,12 @@ class DataFrame(NDFrame): Dict can contain Series, arrays, constants, or list-like objects .. versionchanged :: 0.23.0 - If data is a dict, argument order is maintained for Python 3.6 - and later. + If data is a dict, column order follows insertion-order for + Python 3.6 and later. + + .. versionchanged :: 0.25.0 + If data is a list of dicts, column order follows insertion-order for + Python 3.6 and later. index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b4752039cf5b1..f44cb5207891f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,7 +10,7 @@ from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime import pandas.compat as compat -from pandas.compat import raise_with_traceback +from pandas.compat import PY36, raise_with_traceback from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -536,9 +536,30 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): + """Convert list of dicts to numpy arrays + + If `columns` is not passed, column names are inferred from the records + - For OrderedDict and (on Python>=3.6) dicts, the column names match + the key insertion-order from the first record to the last. + - For other kinds of dict-likes, the keys are lexically sorted.
+ + Parameters + ---------- + data : iterable + collection of records (OrderedDict, dict) + columns: iterables or None + coerce_float : bool + dtype : np.dtype + + Returns + ------- + tuple + arrays, columns + """ if columns is None: gen = (list(x.keys()) for x in data) - sort = not any(isinstance(d, OrderedDict) for d in data) + types = (dict, OrderedDict) if PY36 else OrderedDict + sort = not any(isinstance(d, types) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index eca827f82e296..736258899a41e 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1119,7 +1119,7 @@ def test_constructor_generator(self): expected = DataFrame({0: range(10), 1: "a"}) tm.assert_frame_equal(result, expected, check_dtype=False) - def test_constructor_list_of_dicts(self): + def test_constructor_list_of_odicts(self): data = [ OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]), OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]), @@ -1340,6 +1340,26 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples, columns=["y", "z"]) tm.assert_frame_equal(result, expected) + def test_constructor_list_of_dict_order(self): + # GH10056 + data = [ + {"First": 1, "Second": 4, "Third": 7, "Fourth": 10}, + {"Second": 5, "First": 2, "Fourth": 11, "Third": 8}, + {"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13}, + ] + expected = DataFrame( + { + "First": [1, 2, 3], + "Second": [4, 5, 6], + "Third": [7, 8, 9], + "Fourth": [10, 11, 12], + "YYY": [None, None, 14], + "XXX": [None, None, 13], + } + ) + result = DataFrame(data) + tm.assert_frame_equal(result, expected, check_like=not PY36) + def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, 
orient="index") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e06047b52ac15..f6bb5f774e758 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat import PY36 + from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd @@ -230,8 +232,10 @@ def test_setitem_dtype_upcast(self): assert df["c"].dtype == np.float64 df.loc[0, "c"] = "foo" - expected = DataFrame([{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]) - tm.assert_frame_equal(df, expected) + expected = DataFrame( + [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] + ) + tm.assert_frame_equal(df, expected, check_like=not PY36) # GH10280 df = DataFrame( diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a32103d7b29b9..3ceddfc3c1db4 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import PY36 + from pandas import DataFrame, Index import pandas.util.testing as tm @@ -351,9 +353,9 @@ def test_non_ascii_key(self): ).decode("utf8") testdata = { + b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1], "sub.A": [1, 3], "sub.B": [2, 4], - b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1], } expected = DataFrame(testdata) @@ -366,21 +368,21 @@ def test_missing_field(self, author_missing_data): ex_data = [ { "info": np.nan, - "author_name.first": np.nan, - "author_name.last_name": np.nan, "info.created_at": np.nan, "info.last_updated": np.nan, + "author_name.first": np.nan, + "author_name.last_name": np.nan, }, { "info": None, - "author_name.first": "Jane", - "author_name.last_name": "Doe", "info.created_at": "11/08/1993", "info.last_updated": "26/05/2012", + "author_name.first": "Jane", + "author_name.last_name": "Doe", }, ] expected = DataFrame(ex_data) - 
tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=not PY36) @pytest.mark.parametrize( "max_level,expected", @@ -508,12 +510,13 @@ def test_missing_meta(self, missing_metadata): data=missing_metadata, record_path="addresses", meta="name", errors="ignore" ) ex_data = [ - ["Massillon", 9562, "OH", "Morris St.", 44646, "Alice"], - ["Elizabethton", 8449, "TN", "Spring St.", 37643, np.nan], + [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"], + [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan], ] columns = ["city", "number", "state", "street", "zip", "name"] + columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=not PY36) def test_donot_drop_nonevalues(self): # GH21356 @@ -684,7 +687,7 @@ def test_with_large_max_level(self): "CreatedBy.user.family_tree.father.name": "Father001", "CreatedBy.user.family_tree.father.father.Name": "Father002", "CreatedBy.user.family_tree.father.father.father.name": "Father003", - "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", + "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501 } ] output = nested_to_record(input_data, max_level=max_level)