
Commit 13f3f5a

bhavaniravi authored and TomAugspurger committed
ENH: Add max_level param to json_normalize (#26876)
* ENH: add max_level and ignore_keys configuration to nested_to_record. The max_level param defines the level of nesting at which normalizing should stop; ignore_keys defines the keys to ignore without normalizing.
1 parent 8393e37 commit 13f3f5a
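
For orientation, a minimal usage sketch of the new parameter (not part of the commit itself); it assumes pandas 0.25 or later, where json_normalize accepts max_level:

    from pandas.io.json import json_normalize

    data = [{'CreatedBy': {'Name': 'User001'},
             'Lookup': {'TextField': 'Some text',
                        'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
             'Image': {'a': 'b'}}]

    # Stop flattening after the first nesting level; deeper dicts such as
    # Lookup.UserField are kept as dict objects in their column.
    json_normalize(data, max_level=1)
    # expected columns: CreatedBy.Name, Lookup.TextField, Lookup.UserField, Image.a

Without max_level (the previous behaviour, and still the default) every level is flattened, producing Lookup.UserField.Id and Lookup.UserField.Name instead.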

4 files changed, +256 −66 lines changed

doc/source/user_guide/io.rst

+13
@@ -2176,6 +2176,19 @@ into a flat table.
 
     json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
 
+The max_level parameter provides more control over which level to end normalization.
+With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict.
+
+.. ipython:: python
+
+    data = [{'CreatedBy': {'Name': 'User001'},
+             'Lookup': {'TextField': 'Some text',
+                        'UserField': {'Id': 'ID001',
+                                      'Name': 'Name001'}},
+             'Image': {'a': 'b'}
+             }]
+    json_normalize(data, max_level=1)
+
 .. _io.jsonl:
 
 Line delimited json

doc/source/whatsnew/v0.25.0.rst

+23
@@ -135,6 +135,29 @@ the output will truncate, if it's wider than :attr:`options.display.width`
 (default: 80 characters).
 
 
+.. _whatsnew_0250.enhancements.json_normalize_with_max_level:
+
+Json normalize with max_level param support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`json_normalize` normalizes the provided input dict to all
+nested levels. The new max_level parameter provides more control over
+which level to end normalization (:issue:`23843`):
+
+The repr now looks like this:
+
+.. ipython:: python
+
+    from pandas.io.json import json_normalize
+    data = [{
+        'CreatedBy': {'Name': 'User001'},
+        'Lookup': {'TextField': 'Some text',
+                   'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
+        'Image': {'a': 'b'}
+    }]
+    json_normalize(data, max_level=1)
+
+
 .. _whatsnew_0250.enhancements.other:
 
 Other enhancements

pandas/io/json/normalize.py

+98 −49
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 import copy
+from typing import DefaultDict, Dict, List, Optional, Union
 
 import numpy as np
 

@@ -25,9 +26,11 @@ def _convert_to_line_delimits(s):
     return convert_json_to_lines(s)
 
 
-def nested_to_record(ds, prefix="", sep=".", level=0):
+def nested_to_record(ds, prefix: str = "",
+                     sep: str = ".", level: int = 0,
+                     max_level: Optional[int] = None):
     """
-    A simplified json_normalize.
+    A simplified json_normalize
 
     Converts a nested dict into a flat dict ("record"), unlike json_normalize,
     it does not attempt to extract a subset of the data.
@@ -36,13 +39,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     ----------
     ds : dict or list of dicts
     prefix: the prefix, optional, default: ""
-    sep : string, default '.'
+    sep : str, default '.'
         Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
 
        .. versionadded:: 0.20.0
 
-    level: the number of levels in the jason string, optional, default: 0
+    level: int, optional, default: 0
+        The number of levels in the json string.
+
+    max_level: int, optional, default: None
+        The max depth to normalize.
+
+        .. versionadded:: 0.25.0
 
     Returns
     -------
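
To make the new parameters concrete, here is a small sketch of the helper's behaviour (not part of the commit); it assumes the module path of this commit, pandas/io/json/normalize.py, and that the helper is importable from there:

    # nested_to_record is a private helper; the import path below matches this
    # commit (later pandas versions move it to pandas.io.json._normalize).
    from pandas.io.json.normalize import nested_to_record

    d = {'a': {'b': {'c': 1}}, 'x': 0}

    nested_to_record(d)               # {'x': 0, 'a.b.c': 1}      -- all levels flattened
    nested_to_record(d, max_level=1)  # {'x': 0, 'a.b': {'c': 1}} -- stops after level 1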
@@ -65,10 +74,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     if isinstance(ds, dict):
         ds = [ds]
         singleton = True
-
     new_ds = []
     for d in ds:
-
         new_d = copy.deepcopy(d)
         for k, v in d.items():
             # each key gets renamed with prefix
@@ -79,62 +86,79 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
             else:
                 newkey = prefix + sep + k
 
+            # flatten if type is dict and
+            # current dict level < maximum level provided and
             # only dicts gets recurse-flattened
             # only at level>1 do we rename the rest of the keys
-            if not isinstance(v, dict):
+            if (not isinstance(v, dict) or
+                    (max_level is not None and level >= max_level)):
                 if level != 0:  # so we skip copying for top level, common case
                     v = new_d.pop(k)
                     new_d[newkey] = v
                 continue
             else:
                 v = new_d.pop(k)
-                new_d.update(nested_to_record(v, newkey, sep, level + 1))
+                new_d.update(nested_to_record(v, newkey, sep, level + 1,
+                                              max_level))
         new_ds.append(new_d)
 
     if singleton:
         return new_ds[0]
     return new_ds
 
 
-def json_normalize(data, record_path=None, meta=None,
-                   meta_prefix=None,
-                   record_prefix=None,
-                   errors='raise',
-                   sep='.'):
+def json_normalize(data: List[Dict],
+                   record_path: Optional[Union[str, List]] = None,
+                   meta: Optional[Union[str, List]] = None,
+                   meta_prefix: Optional[str] = None,
+                   record_prefix: Optional[str] = None,
+                   errors: Optional[str] = 'raise',
+                   sep: str = '.',
+                   max_level: Optional[int] = None):
     """
     Normalize semi-structured JSON data into a flat table.
 
     Parameters
     ----------
     data : dict or list of dicts
-        Unserialized JSON objects
-    record_path : string or list of strings, default None
+        Unserialized JSON objects.
+    record_path : str or list of str, default None
         Path in each object to list of records. If not passed, data will be
-        assumed to be an array of records
-    meta : list of paths (string or list of strings), default None
-        Fields to use as metadata for each record in resulting table
-    meta_prefix : string, default None
-    record_prefix : string, default None
+        assumed to be an array of records.
+    meta : list of paths (str or list of str), default None
+        Fields to use as metadata for each record in resulting table.
+    meta_prefix : str, default None
         If True, prefix records with dotted (?) path, e.g. foo.bar.field if
-        path to records is ['foo', 'bar']
+        meta is ['foo', 'bar'].
+    record_prefix : str, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar'].
     errors : {'raise', 'ignore'}, default 'raise'
+        Configures error handling.
 
         * 'ignore' : will ignore KeyError if keys listed in meta are not
-          always present
+          always present.
         * 'raise' : will raise KeyError if keys listed in meta are not
-          always present
+          always present.
 
         .. versionadded:: 0.20.0
 
-    sep : string, default '.'
-        Nested records will generate names separated by sep,
-        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+    sep : str, default '.'
+        Nested records will generate names separated by sep.
+        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
 
         .. versionadded:: 0.20.0
 
+    max_level : int, default None
+        Max number of levels(depth of dict) to normalize.
+        if None, normalizes all levels.
+
+        .. versionadded:: 0.25.0
+
     Returns
     -------
     frame : DataFrame
+    Normalize semi-structured JSON data into a flat table.
 
     Examples
     --------
@@ -149,36 +173,62 @@ def json_normalize(data, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN        NaN
 
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=0)
+                             fitness   id        name
+    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
+    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
+    2  {'height': 130, 'weight': 60}  2.0  Faye Raker
+
+    Normalizes nested data upto level 1.
+
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=1)
+       fitness.height  fitness.weight   id        name
+    0             130              60  1.0   Cole Volk
+    1             130              60  NaN    Mose Reg
+    2             130              60  2.0  Faye Raker
+
     >>> data = [{'state': 'Florida',
     ...          'shortname': 'FL',
-    ...          'info': {
-    ...               'governor': 'Rick Scott'
-    ...          },
+    ...          'info': {'governor': 'Rick Scott'},
     ...          'counties': [{'name': 'Dade', 'population': 12345},
-    ...                      {'name': 'Broward', 'population': 40000},
-    ...                      {'name': 'Palm Beach', 'population': 60000}]},
+    ...                       {'name': 'Broward', 'population': 40000},
+    ...                       {'name': 'Palm Beach', 'population': 60000}]},
     ...         {'state': 'Ohio',
     ...          'shortname': 'OH',
-    ...          'info': {
-    ...               'governor': 'John Kasich'
-    ...          },
+    ...          'info': {'governor': 'John Kasich'},
     ...          'counties': [{'name': 'Summit', 'population': 1234},
     ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
     >>> result = json_normalize(data, 'counties', ['state', 'shortname',
-    ...                                           ['info', 'governor']])
+    ...                                            ['info', 'governor']])
     >>> result
-             name  population info.governor    state shortname
-    0        Dade       12345    Rick Scott  Florida        FL
-    1     Broward       40000    Rick Scott  Florida        FL
-    2  Palm Beach       60000    Rick Scott  Florida        FL
-    3      Summit        1234   John Kasich     Ohio        OH
-    4    Cuyahoga        1337   John Kasich     Ohio        OH
+             name  population    state shortname info.governor
+    0        Dade       12345  Florida        FL    Rick Scott
+    1     Broward       40000  Florida        FL    Rick Scott
+    2  Palm Beach       60000  Florida        FL    Rick Scott
+    3      Summit        1234     Ohio        OH   John Kasich
+    4    Cuyahoga        1337     Ohio        OH   John Kasich
 
     >>> data = {'A': [1, 2]}
     >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
     0        1
     1        2
+
+    Returns normalized data with columns prefixed with the given string.
     """
     def _pull_field(js, spec):
         result = js
@@ -206,7 +256,8 @@ def _pull_field(js, spec):
             #
             # TODO: handle record value which are lists, at least error
             # reasonably
-            data = nested_to_record(data, sep=sep)
+            data = nested_to_record(data, sep=sep,
+                                    max_level=max_level)
         return DataFrame(data)
     elif not isinstance(record_path, list):
         record_path = [record_path]
@@ -219,10 +270,10 @@ def _pull_field(js, spec):
     meta = [m if isinstance(m, list) else [m] for m in meta]
 
     # Disastrously inefficient for now
-    records = []
+    records = []  # type: List
     lengths = []
 
-    meta_vals = defaultdict(list)
+    meta_vals = defaultdict(list)  # type: DefaultDict
     if not isinstance(sep, str):
         sep = str(sep)
     meta_keys = [sep.join(val) for val in meta]
@@ -241,10 +292,12 @@ def _recursive_extract(data, path, seen_meta, level=0):
         else:
             for obj in data:
                 recs = _pull_field(obj, path[0])
+                recs = [nested_to_record(r, sep=sep,
+                                         max_level=max_level)
+                        if isinstance(r, dict) else r for r in recs]
 
                 # For repeating the metadata later
                 lengths.append(len(recs))
-
                 for val, key in zip(meta, meta_keys):
                     if level + 1 > len(val):
                         meta_val = seen_meta[key]
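
The hunk above also applies max_level to records extracted via record_path, not only to top-level dicts. A hypothetical illustration (the data and call are mine, not from the commit; assumes pandas 0.25+):

    from pandas.io.json import json_normalize

    data = [{'state': 'Florida',
             'counties': [{'name': 'Dade',
                           'stats': {'census': {'2010': 12345}}}]}]

    # each record pulled from 'counties' is flattened only up to max_level
    json_normalize(data, 'counties', ['state'], max_level=1)
    # expected columns: name, stats.census (still a dict), state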
@@ -260,7 +313,6 @@ def _recursive_extract(data, path, seen_meta, level=0):
                                            "{err} is not always present"
                                            .format(err=e))
                     meta_vals[key].append(meta_val)
-
                 records.extend(recs)
 
     _recursive_extract(data, record_path, {}, level=0)
@@ -279,8 +331,5 @@ def _recursive_extract(data, path, seen_meta, level=0):
         if k in result:
             raise ValueError('Conflicting metadata name {name}, '
                              'need distinguishing prefix '.format(name=k))
-
-        # forcing dtype to object to avoid the metadata being casted to string
         result[k] = np.array(v, dtype=object).repeat(lengths)
-
     return result
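
One behavioural detail from the last hunk (the explanatory comment was dropped, the behaviour kept): metadata columns are built with dtype=object so meta values are not coerced to another type. A small sketch, assuming pandas 0.25+:

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'counties': [{'name': 'Dade'}, {'name': 'Broward'}]}]
    result = json_normalize(data, 'counties', ['id'])

    # the 'id' meta value is repeated once per record and kept as object dtype
    result['id'].dtype   # dtype('O')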
