
Commit 13f3f5a

bhavaniravi authored and TomAugspurger committed
ENH: Add max_level param to json_normalize (#26876)
* ENH: add max_level and ignore_keys configuration to nested_to_record. The max_level param defines the level of nesting at which normalizing should stop; ignore_keys defines the keys to ignore without normalizing.
1 parent 8393e37 commit 13f3f5a
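
For orientation, a minimal usage sketch of the new parameter (not part of the commit itself); it assumes pandas 0.25 or later, where json_normalize accepts max_level:

    from pandas.io.json import json_normalize

    data = [{'CreatedBy': {'Name': 'User001'},
             'Lookup': {'TextField': 'Some text',
                        'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
             'Image': {'a': 'b'}}]

    # Stop flattening after the first nesting level; deeper dicts such as
    # Lookup.UserField are kept as dict objects in their column.
    json_normalize(data, max_level=1)
    # expected columns: CreatedBy.Name, Lookup.TextField, Lookup.UserField, Image.a

Without max_level (the previous behaviour, and still the default) every level is flattened, producing Lookup.UserField.Id and Lookup.UserField.Name instead.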

4 files changed, +256 −66 lines changed

doc/source/user_guide/io.rst

+13
@@ -2176,6 +2176,19 @@ into a flat table.
 
     json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
 
+The max_level parameter provides more control over which level to end normalization.
+With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict.
+
+.. ipython:: python
+
+    data = [{'CreatedBy': {'Name': 'User001'},
+             'Lookup': {'TextField': 'Some text',
+                        'UserField': {'Id': 'ID001',
+                                      'Name': 'Name001'}},
+             'Image': {'a': 'b'}
+             }]
+    json_normalize(data, max_level=1)
+
 .. _io.jsonl:
 
 Line delimited json

doc/source/whatsnew/v0.25.0.rst

+23
@@ -135,6 +135,29 @@ the output will truncate, if it's wider than :attr:`options.display.width`
 (default: 80 characters).
 
 
+.. _whatsnew_0250.enhancements.json_normalize_with_max_level:
+
+Json normalize with max_level param support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`json_normalize` normalizes the provided input dict to all
+nested levels. The new max_level parameter provides more control over
+which level to end normalization (:issue:`23843`):
+
+The repr now looks like this:
+
+.. ipython:: python
+
+    from pandas.io.json import json_normalize
+    data = [{
+        'CreatedBy': {'Name': 'User001'},
+        'Lookup': {'TextField': 'Some text',
+                   'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
+        'Image': {'a': 'b'}
+    }]
+    json_normalize(data, max_level=1)
+
+
 .. _whatsnew_0250.enhancements.other:
 
 Other enhancements

pandas/io/json/normalize.py

+98 −49
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 import copy
+from typing import DefaultDict, Dict, List, Optional, Union
 
 import numpy as np
 

@@ -25,9 +26,11 @@ def _convert_to_line_delimits(s):
     return convert_json_to_lines(s)
 
 
-def nested_to_record(ds, prefix="", sep=".", level=0):
+def nested_to_record(ds, prefix: str = "",
+                     sep: str = ".", level: int = 0,
+                     max_level: Optional[int] = None):
     """
-    A simplified json_normalize.
+    A simplified json_normalize
 
     Converts a nested dict into a flat dict ("record"), unlike json_normalize,
     it does not attempt to extract a subset of the data.
@@ -36,13 +39,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     ----------
     ds : dict or list of dicts
     prefix: the prefix, optional, default: ""
-    sep : string, default '.'
+    sep : str, default '.'
         Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
 
        .. versionadded:: 0.20.0
 
-    level: the number of levels in the jason string, optional, default: 0
+    level: int, optional, default: 0
+        The number of levels in the json string.
+
+    max_level: int, optional, default: None
+        The max depth to normalize.
+
+        .. versionadded:: 0.25.0
 
     Returns
     -------
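
To make the new parameters concrete, here is a small sketch of the helper's behaviour (not part of the commit); it assumes the module path of this commit, pandas/io/json/normalize.py, and that the helper is importable from there:

    # nested_to_record is a private helper; the import path below matches this
    # commit (later pandas versions move it to pandas.io.json._normalize).
    from pandas.io.json.normalize import nested_to_record

    d = {'a': {'b': {'c': 1}}, 'x': 0}

    nested_to_record(d)               # {'x': 0, 'a.b.c': 1}      -- all levels flattened
    nested_to_record(d, max_level=1)  # {'x': 0, 'a.b': {'c': 1}} -- stops after level 1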
@@ -65,10 +74,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     if isinstance(ds, dict):
         ds = [ds]
         singleton = True
-
     new_ds = []
     for d in ds:
-
         new_d = copy.deepcopy(d)
         for k, v in d.items():
             # each key gets renamed with prefix
@@ -79,62 +86,79 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
             else:
                 newkey = prefix + sep + k
 
+            # flatten if type is dict and
+            # current dict level < maximum level provided and
             # only dicts gets recurse-flattened
             # only at level>1 do we rename the rest of the keys
-            if not isinstance(v, dict):
+            if (not isinstance(v, dict) or
+                    (max_level is not None and level >= max_level)):
                 if level != 0:  # so we skip copying for top level, common case
                     v = new_d.pop(k)
                     new_d[newkey] = v
                 continue
             else:
                 v = new_d.pop(k)
-                new_d.update(nested_to_record(v, newkey, sep, level + 1))
+                new_d.update(nested_to_record(v, newkey, sep, level + 1,
+                                              max_level))
         new_ds.append(new_d)
 
     if singleton:
         return new_ds[0]
     return new_ds
 
 
-def json_normalize(data, record_path=None, meta=None,
-                   meta_prefix=None,
-                   record_prefix=None,
-                   errors='raise',
-                   sep='.'):
+def json_normalize(data: List[Dict],
+                   record_path: Optional[Union[str, List]] = None,
+                   meta: Optional[Union[str, List]] = None,
+                   meta_prefix: Optional[str] = None,
+                   record_prefix: Optional[str] = None,
+                   errors: Optional[str] = 'raise',
+                   sep: str = '.',
+                   max_level: Optional[int] = None):
     """
     Normalize semi-structured JSON data into a flat table.
 
     Parameters
     ----------
     data : dict or list of dicts
-        Unserialized JSON objects
-    record_path : string or list of strings, default None
+        Unserialized JSON objects.
+    record_path : str or list of str, default None
         Path in each object to list of records. If not passed, data will be
-        assumed to be an array of records
-    meta : list of paths (string or list of strings), default None
-        Fields to use as metadata for each record in resulting table
-    meta_prefix : string, default None
-    record_prefix : string, default None
+        assumed to be an array of records.
+    meta : list of paths (str or list of str), default None
+        Fields to use as metadata for each record in resulting table.
+    meta_prefix : str, default None
         If True, prefix records with dotted (?) path, e.g. foo.bar.field if
-        path to records is ['foo', 'bar']
+        meta is ['foo', 'bar'].
+    record_prefix : str, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar'].
     errors : {'raise', 'ignore'}, default 'raise'
+        Configures error handling.
 
         * 'ignore' : will ignore KeyError if keys listed in meta are not
-          always present
+          always present.
         * 'raise' : will raise KeyError if keys listed in meta are not
-          always present
+          always present.
 
         .. versionadded:: 0.20.0
 
-    sep : string, default '.'
-        Nested records will generate names separated by sep,
-        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+    sep : str, default '.'
+        Nested records will generate names separated by sep.
+        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
 
         .. versionadded:: 0.20.0
 
+    max_level : int, default None
+        Max number of levels(depth of dict) to normalize.
+        if None, normalizes all levels.
+
+        .. versionadded:: 0.25.0
+
     Returns
     -------
     frame : DataFrame
+    Normalize semi-structured JSON data into a flat table.
 
     Examples
     --------
@@ -149,36 +173,62 @@ def json_normalize(data, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN        NaN
 
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=0)
+                             fitness   id        name
+    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
+    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
+    2  {'height': 130, 'weight': 60}  2.0  Faye Raker
+
+    Normalizes nested data upto level 1.
+
+    >>> data = [{'id': 1,
+    ...          'name': "Cole Volk",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'name': "Mose Reg",
+    ...          'fitness': {'height': 130, 'weight': 60}},
+    ...         {'id': 2, 'name': 'Faye Raker',
+    ...          'fitness': {'height': 130, 'weight': 60}}]
+    >>> json_normalize(data, max_level=1)
+       fitness.height  fitness.weight   id        name
+    0             130              60  1.0   Cole Volk
+    1             130              60  NaN    Mose Reg
+    2             130              60  2.0  Faye Raker
+
     >>> data = [{'state': 'Florida',
     ...          'shortname': 'FL',
-    ...          'info': {
-    ...               'governor': 'Rick Scott'
-    ...          },
+    ...          'info': {'governor': 'Rick Scott'},
     ...          'counties': [{'name': 'Dade', 'population': 12345},
-    ...                      {'name': 'Broward', 'population': 40000},
-    ...                      {'name': 'Palm Beach', 'population': 60000}]},
+    ...                       {'name': 'Broward', 'population': 40000},
+    ...                       {'name': 'Palm Beach', 'population': 60000}]},
     ...         {'state': 'Ohio',
     ...          'shortname': 'OH',
-    ...          'info': {
-    ...               'governor': 'John Kasich'
-    ...          },
+    ...          'info': {'governor': 'John Kasich'},
     ...          'counties': [{'name': 'Summit', 'population': 1234},
     ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
     >>> result = json_normalize(data, 'counties', ['state', 'shortname',
-    ...                                           ['info', 'governor']])
+    ...                                            ['info', 'governor']])
     >>> result
-             name  population info.governor    state shortname
-    0        Dade       12345    Rick Scott  Florida        FL
-    1     Broward       40000    Rick Scott  Florida        FL
-    2  Palm Beach       60000    Rick Scott  Florida        FL
-    3      Summit        1234   John Kasich     Ohio        OH
-    4    Cuyahoga        1337   John Kasich     Ohio        OH
+             name  population    state shortname info.governor
+    0        Dade       12345  Florida        FL    Rick Scott
+    1     Broward       40000  Florida        FL    Rick Scott
+    2  Palm Beach       60000  Florida        FL    Rick Scott
+    3      Summit        1234     Ohio        OH   John Kasich
+    4    Cuyahoga        1337     Ohio        OH   John Kasich
 
     >>> data = {'A': [1, 2]}
     >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
     0        1
     1        2
+
+    Returns normalized data with columns prefixed with the given string.
     """
     def _pull_field(js, spec):
         result = js
@@ -206,7 +256,8 @@ def _pull_field(js, spec):
             #
             # TODO: handle record value which are lists, at least error
             # reasonably
-            data = nested_to_record(data, sep=sep)
+            data = nested_to_record(data, sep=sep,
+                                    max_level=max_level)
         return DataFrame(data)
     elif not isinstance(record_path, list):
         record_path = [record_path]
@@ -219,10 +270,10 @@ def _pull_field(js, spec):
     meta = [m if isinstance(m, list) else [m] for m in meta]
 
     # Disastrously inefficient for now
-    records = []
+    records = []  # type: List
     lengths = []
 
-    meta_vals = defaultdict(list)
+    meta_vals = defaultdict(list)  # type: DefaultDict
     if not isinstance(sep, str):
         sep = str(sep)
     meta_keys = [sep.join(val) for val in meta]
@@ -241,10 +292,12 @@ def _recursive_extract(data, path, seen_meta, level=0):
         else:
             for obj in data:
                 recs = _pull_field(obj, path[0])
+                recs = [nested_to_record(r, sep=sep,
+                                         max_level=max_level)
+                        if isinstance(r, dict) else r for r in recs]
 
                 # For repeating the metadata later
                 lengths.append(len(recs))
-
                 for val, key in zip(meta, meta_keys):
                     if level + 1 > len(val):
                         meta_val = seen_meta[key]
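
The hunk above also applies max_level to records extracted via record_path, not only to top-level dicts. A hypothetical illustration (the data and call are mine, not from the commit; assumes pandas 0.25+):

    from pandas.io.json import json_normalize

    data = [{'state': 'Florida',
             'counties': [{'name': 'Dade',
                           'stats': {'census': {'2010': 12345}}}]}]

    # each record pulled from 'counties' is flattened only up to max_level
    json_normalize(data, 'counties', ['state'], max_level=1)
    # expected columns: name, stats.census (still a dict), state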
@@ -260,7 +313,6 @@ def _recursive_extract(data, path, seen_meta, level=0):
                                            "{err} is not always present"
                                            .format(err=e))
                     meta_vals[key].append(meta_val)
-
                 records.extend(recs)
 
     _recursive_extract(data, record_path, {}, level=0)
@@ -279,8 +331,5 @@ def _recursive_extract(data, path, seen_meta, level=0):
         if k in result:
             raise ValueError('Conflicting metadata name {name}, '
                              'need distinguishing prefix '.format(name=k))
-
-        # forcing dtype to object to avoid the metadata being casted to string
         result[k] = np.array(v, dtype=object).repeat(lengths)
-
     return result
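
One behavioural detail from the last hunk (the explanatory comment was dropped, the behaviour kept): metadata columns are built with dtype=object so meta values are not coerced to another type. A small sketch, assuming pandas 0.25+:

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'counties': [{'name': 'Dade'}, {'name': 'Broward'}]}]
    result = json_normalize(data, 'counties', ['id'])

    # the 'id' meta value is repeated once per record and kept as object dtype
    result['id'].dtype   # dtype('O')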
