3
3
4
4
from collections import defaultdict
5
5
import copy
6
+ from typing import DefaultDict , Dict , List , Optional , Union
6
7
7
8
import numpy as np
8
9
@@ -25,9 +26,11 @@ def _convert_to_line_delimits(s):
25
26
return convert_json_to_lines (s )
26
27
27
28
28
- def nested_to_record (ds , prefix = "" , sep = "." , level = 0 ):
29
+ def nested_to_record (ds , prefix : str = "" ,
30
+ sep : str = "." , level : int = 0 ,
31
+ max_level : Optional [int ] = None ):
29
32
"""
30
- A simplified json_normalize.
33
+ A simplified json_normalize.
31
34
32
35
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
33
36
it does not attempt to extract a subset of the data.
@@ -36,13 +39,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
36
39
----------
37
40
ds : dict or list of dicts
38
41
prefix: the prefix, optional, default: ""
39
- sep : string , default '.'
42
+ sep : str , default '.'
40
43
Nested records will generate names separated by sep,
41
44
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
42
45
43
46
.. versionadded:: 0.20.0
44
47
45
- level: the number of levels in the jason string, optional, default: 0
48
+ level: int, optional, default: 0
49
+ The number of levels in the json string.
50
+
51
+ max_level: int, optional, default: None
52
+ The max depth to normalize.
53
+
54
+ .. versionadded:: 0.25.0
46
55
47
56
Returns
48
57
-------
@@ -65,10 +74,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
65
74
if isinstance (ds , dict ):
66
75
ds = [ds ]
67
76
singleton = True
68
-
69
77
new_ds = []
70
78
for d in ds :
71
-
72
79
new_d = copy .deepcopy (d )
73
80
for k , v in d .items ():
74
81
# each key gets renamed with prefix
@@ -79,62 +86,79 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
79
86
else :
80
87
newkey = prefix + sep + k
81
88
89
+ # flatten if type is dict and
90
+ # current dict level < maximum level provided and
82
91
# only dicts get recurse-flattened
83
92
# only at level>0 do we rename the rest of the keys
84
- if not isinstance (v , dict ):
93
+ if (not isinstance (v , dict ) or
94
+ (max_level is not None and level >= max_level )):
85
95
if level != 0 : # so we skip copying for top level, common case
86
96
v = new_d .pop (k )
87
97
new_d [newkey ] = v
88
98
continue
89
99
else :
90
100
v = new_d .pop (k )
91
- new_d .update (nested_to_record (v , newkey , sep , level + 1 ))
101
+ new_d .update (nested_to_record (v , newkey , sep , level + 1 ,
102
+ max_level ))
92
103
new_ds .append (new_d )
93
104
94
105
if singleton :
95
106
return new_ds [0 ]
96
107
return new_ds
97
108
98
109
99
- def json_normalize (data , record_path = None , meta = None ,
100
- meta_prefix = None ,
101
- record_prefix = None ,
102
- errors = 'raise' ,
103
- sep = '.' ):
110
+ def json_normalize (data : List [Dict ],
111
+ record_path : Optional [Union [str , List ]] = None ,
112
+ meta : Optional [Union [str , List ]] = None ,
113
+ meta_prefix : Optional [str ] = None ,
114
+ record_prefix : Optional [str ] = None ,
115
+ errors : Optional [str ] = 'raise' ,
116
+ sep : str = '.' ,
117
+ max_level : Optional [int ] = None ):
104
118
"""
105
119
Normalize semi-structured JSON data into a flat table.
106
120
107
121
Parameters
108
122
----------
109
123
data : dict or list of dicts
110
- Unserialized JSON objects
111
- record_path : string or list of strings , default None
124
+ Unserialized JSON objects.
125
+ record_path : str or list of str , default None
112
126
Path in each object to list of records. If not passed, data will be
113
- assumed to be an array of records
114
- meta : list of paths (string or list of strings), default None
115
- Fields to use as metadata for each record in resulting table
116
- meta_prefix : string, default None
117
- record_prefix : string, default None
127
+ assumed to be an array of records.
128
+ meta : list of paths (str or list of str), default None
129
+ Fields to use as metadata for each record in resulting table.
130
+ meta_prefix : str, default None
118
131
If given, prefix records with dotted path, e.g. foo.bar.field if
119
- path to records is ['foo', 'bar']
132
+ meta is ['foo', 'bar'].
133
+ record_prefix : str, default None
134
+ If given, prefix records with dotted path, e.g. foo.bar.field if
135
+ path to records is ['foo', 'bar'].
120
136
errors : {'raise', 'ignore'}, default 'raise'
137
+ Configures error handling.
121
138
122
139
* 'ignore' : will ignore KeyError if keys listed in meta are not
123
- always present
140
+ always present.
124
141
* 'raise' : will raise KeyError if keys listed in meta are not
125
- always present
142
+ always present.
126
143
127
144
.. versionadded:: 0.20.0
128
145
129
- sep : string , default '.'
130
- Nested records will generate names separated by sep,
131
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
146
+ sep : str , default '.'
147
+ Nested records will generate names separated by sep.
148
+ e.g., for sep='.', {'foo': {'bar': 0} } -> foo.bar.
132
149
133
150
.. versionadded:: 0.20.0
134
151
152
+ max_level : int, default None
153
+ Max number of levels (depth of dict) to normalize.
154
+ If None, normalizes all levels.
155
+
156
+ .. versionadded:: 0.25.0
157
+
135
158
Returns
136
159
-------
137
160
frame : DataFrame
161
+ The semi-structured JSON data normalized into a flat table.
138
162
139
163
Examples
140
164
--------
@@ -149,36 +173,62 @@ def json_normalize(data, record_path=None, meta=None,
149
173
1 NaN NaN Regner NaN Mose NaN
150
174
2 2.0 Faye Raker NaN NaN NaN NaN
151
175
176
+ >>> data = [{'id': 1,
177
+ ... 'name': "Cole Volk",
178
+ ... 'fitness': {'height': 130, 'weight': 60}},
179
+ ... {'name': "Mose Reg",
180
+ ... 'fitness': {'height': 130, 'weight': 60}},
181
+ ... {'id': 2, 'name': 'Faye Raker',
182
+ ... 'fitness': {'height': 130, 'weight': 60}}]
183
+ >>> json_normalize(data, max_level=0)
184
+ fitness id name
185
+ 0 {'height': 130, 'weight': 60} 1.0 Cole Volk
186
+ 1 {'height': 130, 'weight': 60} NaN Mose Reg
187
+ 2 {'height': 130, 'weight': 60} 2.0 Faye Raker
188
+
189
+ Normalizes nested data up to level 1.
190
+
191
+ >>> data = [{'id': 1,
192
+ ... 'name': "Cole Volk",
193
+ ... 'fitness': {'height': 130, 'weight': 60}},
194
+ ... {'name': "Mose Reg",
195
+ ... 'fitness': {'height': 130, 'weight': 60}},
196
+ ... {'id': 2, 'name': 'Faye Raker',
197
+ ... 'fitness': {'height': 130, 'weight': 60}}]
198
+ >>> json_normalize(data, max_level=1)
199
+ fitness.height fitness.weight id name
200
+ 0 130 60 1.0 Cole Volk
201
+ 1 130 60 NaN Mose Reg
202
+ 2 130 60 2.0 Faye Raker
203
+
152
204
>>> data = [{'state': 'Florida',
153
205
... 'shortname': 'FL',
154
- ... 'info': {
155
- ... 'governor': 'Rick Scott'
156
- ... },
206
+ ... 'info': {'governor': 'Rick Scott'},
157
207
... 'counties': [{'name': 'Dade', 'population': 12345},
158
- ... {'name': 'Broward', 'population': 40000},
159
- ... {'name': 'Palm Beach', 'population': 60000}]},
208
+ ... {'name': 'Broward', 'population': 40000},
209
+ ... {'name': 'Palm Beach', 'population': 60000}]},
160
210
... {'state': 'Ohio',
161
211
... 'shortname': 'OH',
162
- ... 'info': {
163
- ... 'governor': 'John Kasich'
164
- ... },
212
+ ... 'info': {'governor': 'John Kasich'},
165
213
... 'counties': [{'name': 'Summit', 'population': 1234},
166
214
... {'name': 'Cuyahoga', 'population': 1337}]}]
167
215
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
168
- ... ['info', 'governor']])
216
+ ... ['info', 'governor']])
169
217
>>> result
170
- name population info.governor state shortname
171
- 0 Dade 12345 Rick Scott Florida FL
172
- 1 Broward 40000 Rick Scott Florida FL
173
- 2 Palm Beach 60000 Rick Scott Florida FL
174
- 3 Summit 1234 John Kasich Ohio OH
175
- 4 Cuyahoga 1337 John Kasich Ohio OH
218
+ name population state shortname info.governor
219
+ 0 Dade 12345 Florida FL Rick Scott
220
+ 1 Broward 40000 Florida FL Rick Scott
221
+ 2 Palm Beach 60000 Florida FL Rick Scott
222
+ 3 Summit 1234 Ohio OH John Kasich
223
+ 4 Cuyahoga 1337 Ohio OH John Kasich
176
224
177
225
>>> data = {'A': [1, 2]}
178
226
>>> json_normalize(data, 'A', record_prefix='Prefix.')
179
227
Prefix.0
180
228
0 1
181
229
1 2
230
+
231
+ Returns normalized data with columns prefixed with the given string.
182
232
"""
183
233
def _pull_field (js , spec ):
184
234
result = js
@@ -206,7 +256,8 @@ def _pull_field(js, spec):
206
256
#
207
257
# TODO: handle record value which are lists, at least error
208
258
# reasonably
209
- data = nested_to_record (data , sep = sep )
259
+ data = nested_to_record (data , sep = sep ,
260
+ max_level = max_level )
210
261
return DataFrame (data )
211
262
elif not isinstance (record_path , list ):
212
263
record_path = [record_path ]
@@ -219,10 +270,10 @@ def _pull_field(js, spec):
219
270
meta = [m if isinstance (m , list ) else [m ] for m in meta ]
220
271
221
272
# Disastrously inefficient for now
222
- records = []
273
+ records = [] # type: List
223
274
lengths = []
224
275
225
- meta_vals = defaultdict (list )
276
+ meta_vals = defaultdict (list ) # type: DefaultDict
226
277
if not isinstance (sep , str ):
227
278
sep = str (sep )
228
279
meta_keys = [sep .join (val ) for val in meta ]
@@ -241,10 +292,12 @@ def _recursive_extract(data, path, seen_meta, level=0):
241
292
else :
242
293
for obj in data :
243
294
recs = _pull_field (obj , path [0 ])
295
+ recs = [nested_to_record (r , sep = sep ,
296
+ max_level = max_level )
297
+ if isinstance (r , dict ) else r for r in recs ]
244
298
245
299
# For repeating the metadata later
246
300
lengths .append (len (recs ))
247
-
248
301
for val , key in zip (meta , meta_keys ):
249
302
if level + 1 > len (val ):
250
303
meta_val = seen_meta [key ]
@@ -260,7 +313,6 @@ def _recursive_extract(data, path, seen_meta, level=0):
260
313
"{err} is not always present"
261
314
.format (err = e ))
262
315
meta_vals [key ].append (meta_val )
263
-
264
316
records .extend (recs )
265
317
266
318
_recursive_extract (data , record_path , {}, level = 0 )
@@ -279,8 +331,5 @@ def _recursive_extract(data, path, seen_meta, level=0):
279
331
if k in result :
280
332
raise ValueError ('Conflicting metadata name {name}, '
281
333
'need distinguishing prefix ' .format (name = k ))
282
-
283
- # forcing dtype to object to avoid the metadata being casted to string
284
334
result [k ] = np .array (v , dtype = object ).repeat (lengths )
285
-
286
335
return result
0 commit comments