import json
from pathlib import Path
-from typing import Dict, List, Union
+from typing import IO, Any, Dict, List, Optional, Union

import numpy as np
@@ -37,7 +37,12 @@ def convert_list_to_batch_data(
    list_data: List[Dict[str, np.ndarray]]
) -> Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]:
    """
-    Convert list of dataset to one single batch dataset
+    Convert a list of datasets to one single batch dataset
+
+    Example data formats:
+        input:  [{"node": <1d-array>, "line": <1d-array>}, {"node": <1d-array>, "line": <1d-array>}]
+        output: {"node": <2d-array>, "line": <2d-array>}
+          -or-: {"indptr": <1d-array>, "data": <1d-array>}

    Args:
        list_data: list of dataset
@@ -48,25 +53,38 @@ def convert_list_to_batch_data(
    """

    # List all *unique* types
-    all_types = list({x for single_batch in list_data for x in single_batch.keys()})
+    components = {x for dataset in list_data for x in dataset.keys()}

    batch_data = {}
-    for comp_type in all_types:
-        # use 2D array if the type exists in all single dataset and the size is the same
-        if np.all([comp_type in x for x in list_data]) and np.unique([x[comp_type].size for x in list_data]).size == 1:
-            batch_data[comp_type] = np.stack([x[comp_type] for x in list_data], axis=0)
+    for component in components:
+
+        # Create a 2D array if the component exists in all datasets and the number of objects is the same in each dataset
+        comp_exists_in_all_datasets = all(component in x for x in list_data)
+        all_sizes_are_the_same = lambda: all(x[component].size == list_data[0][component].size for x in list_data)
+        if comp_exists_in_all_datasets and all_sizes_are_the_same():
+            batch_data[component] = np.stack([x[component] for x in list_data], axis=0)
            continue
+
        # otherwise use indptr/data dict
        indptr = [0]
        data = []
-        for single_batch in list_data:
-            if comp_type not in single_batch:
-                indptr.append(indptr[-1])
+        for dataset in list_data:
+
+            if component in dataset:
+                # If the current dataset contains the component, increase the indptr for this batch and append the data
+                objects = dataset[component]
+                indptr.append(indptr[-1] + len(objects))
+                data.append(objects)
+
            else:
-                single_data = single_batch[comp_type]
-                indptr.append(indptr[-1] + single_data.shape[0])
-                data.append(single_data)
-        batch_data[comp_type] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+                # If the current dataset does not contain the component, add the last indptr again.
+                indptr.append(indptr[-1])
+
+        # Convert the index pointers to a numpy array and combine the list of object numpy arrays into a single
+        # numpy array. All objects of all batches are now stored in one large array; the index pointers define
+        # which elements of the array (rows) belong to which batch.
+        batch_data[component] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+
    return batch_data
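As a rough usage sketch of the function above (not part of the diff; plain integer arrays stand in for the structured component arrays that initialize_array would normally produce):

    import numpy as np

    list_data = [
        {"node": np.array([1, 2]), "line": np.array([10])},
        {"node": np.array([3, 4]), "line": np.array([20, 30])},
    ]
    batch = convert_list_to_batch_data(list_data)
    # "node" has the same size in both datasets, so it becomes a dense 2x2 array: [[1, 2], [3, 4]]
    # "line" has differing sizes, so it becomes a sparse structure:
    #   {"indptr": array([0, 1, 3]), "data": array([10, 20, 30])}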
@@ -83,27 +101,46 @@ def convert_python_to_numpy(
        A single or batch dataset for power-grid-model

    """
-    if isinstance(data, dict):
-        return_dict = {}
-        for component_name, component_list in data.items():
-            arr: np.ndarray = initialize_array(data_type, component_name, len(component_list))
-            for i, component in enumerate(component_list):
-                for property_name, value in component.items():
-                    if property_name not in arr[i].dtype.names:
-                        raise ValueError(f"Invalid property '{property_name}' for {component_name} {data_type} data.")
-                    try:
-                        arr[i][property_name] = value
-                    except ValueError as ex:
-                        raise ValueError(f"Invalid '{property_name}' value for {component_name} {data_type} data: {ex}")
-
-            return_dict[component_name] = arr
-        return return_dict

+    # If the input data is a list, we are dealing with batch data. Each element in the list is a batch. We'll
+    # first convert each batch separately, by recursively calling this function for each batch. Then the numpy
+    # data for all batches is converted into a proper and compact numpy structure.
    if isinstance(data, list):
        list_data = [convert_python_to_numpy(json_dict, data_type=data_type) for json_dict in data]
        return convert_list_to_batch_data(list_data)

-    raise TypeError("Only list or dict is allowed in JSON data!")
+    # This should be a normal (non-batch) structure, with a list of objects (dictionaries) per component.
+    if not isinstance(data, dict):
+        raise TypeError("Only list or dict is allowed in JSON data!")
+
+    dataset: Dict[str, np.ndarray] = {}
+    for component, objects in data.items():
+
+        # We'll initialize a 1d-array with NaN values for all the objects of this component type
+        dataset[component] = initialize_array(data_type, component, len(objects))
+
+        for i, obj in enumerate(objects):
+            # As each object is a separate dictionary, and the properties may differ per object, we need to check
+            # all properties; non-existing properties will raise an error below.
+            for property, value in obj.items():
+                if property == "extra":
+                    # The "extra" property is a special one. It can store any type of information associated with
+                    # an object, but it will not be used in the calculations. Therefore it is not included in the
+                    # numpy array, so we can skip this property
+                    continue
+
+                if property not in dataset[component].dtype.names:
+                    # If a property doesn't exist, the user made a mistake. Let's be merciless in that case,
+                    # for their own good.
+                    raise ValueError(f"Invalid property '{property}' for {component} {data_type} data.")
+
+                # Now just assign the value and raise an error if the value cannot be stored in the specific
+                # numpy array data format for this property.
+                try:
+                    dataset[component][i][property] = value
+                except ValueError as ex:
+                    raise ValueError(f"Invalid '{property}' value for {component} {data_type} data: {ex}")
+    return dataset
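For orientation, a minimal sketch of the non-batch path (not part of the diff; component and property values are illustrative):

    input_data = {
        "node": [
            {"id": 1, "u_rated": 10500.0, "extra": "original substation name"},  # "extra" is skipped
            {"id": 2, "u_rated": 10500.0},
        ]
    }
    dataset = convert_python_to_numpy(input_data, data_type="input")
    # dataset["node"] is a structured numpy array of length 2; any property that was not given
    # keeps the NaN/sentinel value that initialize_array() filled in.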


def convert_batch_to_list_data(
@@ -117,29 +154,52 @@ def convert_batch_to_list_data(
    Returns:
        list of single dataset
    """
-    list_data = []
-    # return empty list
+
+    # If the batch data is empty, return an empty list
    if not batch_data:
-        return list_data
-    # get n_batch
-    one_data = next(iter(batch_data.values()))
-    if isinstance(one_data, dict):
-        n_batch = one_data["indptr"].size - 1
+        return []
+
+    # Get the data for an arbitrary component; assuming that the number of batches of each component is the same.
+    # The structure may differ per component
+    example_batch_data = next(iter(batch_data.values()))
+
+    if isinstance(example_batch_data, np.ndarray):
+        # We expect the batch data to be a 2d numpy array of n_batches x n_objects
+        if len(example_batch_data.shape) != 2:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data.shape[0]
+    elif isinstance(example_batch_data, dict):
+        # If the batch data is a dictionary, we assume that it is an indptr/data structure (otherwise it is an
+        # invalid dictionary). There is always one more index pointer than there are batches.
+        if "indptr" not in example_batch_data:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data["indptr"].size - 1
    else:
-        n_batch = one_data.shape[0]
-    # convert
-    for i in range(n_batch):
-        single_dataset = {}
-        for key, batch in batch_data.items():
-            if isinstance(batch, dict):
-                single_dataset[key] = batch["data"][batch["indptr"][i] : batch["indptr"][i + 1]]
-            else:
-                single_dataset[key] = batch[i, ...]
-        list_data.append(single_dataset)
+        # If the batch data is not a numpy array and not a dictionary, it is invalid
+        raise ValueError("Invalid batch data format")
+
+    # Initialize a list of empty dictionaries, one for each batch
+    # Note that [{}] * n_batches would result in a list of n_batches references to one and the same dict.
+    list_data = [{} for _ in range(n_batches)]
+
+    # While the number of batches must be the same for each component, the structure (2d numpy array or indptr/data)
+    # doesn't have to be. Therefore, we'll check the structure for each component and copy the data accordingly.
+    for component, data in batch_data.items():
+        if isinstance(data, np.ndarray):
+            # For 2d numpy arrays, copy each batch into an element of the list
+            for i, batch in enumerate(data):
+                list_data[i][component] = batch
+        else:
+            # For indptr/data structures, use the indptr to select the items for each batch.
+            indptr = data["indptr"]
+            for i, (idx0, idx1) in enumerate(zip(indptr[:-1], indptr[1:])):
+                list_data[i][component] = data["data"][idx0:idx1]
    return list_data
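To make the indptr slicing concrete, a small sketch with made-up values (not part of the diff):

    import numpy as np

    batch_data = {"sym_load": {"indptr": np.array([0, 2, 2, 3]), "data": np.array([10, 11, 12])}}
    list_data = convert_batch_to_list_data(batch_data)
    # indptr [0, 2, 2, 3] describes 3 batches: rows 0..2, an empty batch, and row 2..3, so:
    # [{"sym_load": array([10, 11])}, {"sym_load": array([])}, {"sym_load": array([12])}]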


-def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]) -> Union[Dict, List]:
+def convert_numpy_to_python(
+    data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]
+) -> Union[Dict[str, List[Dict[str, Union[int, float]]]], List[Dict[str, List[Dict[str, Union[int, float]]]]]]:
    """
    Convert internal numpy arrays to native python data
    If an attribute is not available (NaN value), it will not be exported.
@@ -150,18 +210,29 @@ def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarr
        A json list for batch dataset

    """
-    # check the dataset is single or batch
-    if data:
-        one_data = next(iter(data.values()))
-        # it is batch dataset if it is 2D array of a dict of indptr/data
-        if isinstance(one_data, dict) or one_data.ndim == 2:
-            list_data = convert_batch_to_list_data(data)
-            return [convert_numpy_to_python(x) for x in list_data]
-    # otherwise it is single dataset
-    single_dataset: Dict[str, np.ndarray] = data
+    # Check if the dataset is a single dataset or batch dataset
+    # It is a batch dataset if it is a 2D array or an indptr/data structure
+    example_data = next(iter(data.values()))
+    is_dense_batch = isinstance(example_data, np.ndarray) and example_data.ndim == 2
+    is_sparse_batch = isinstance(example_data, dict) and "indptr" in example_data and "data" in example_data
+
+    # If it is a batch, convert the batch data to a list of batches, then convert each batch individually.
+    if is_dense_batch or is_sparse_batch:
+        list_data = convert_batch_to_list_data(data)
+        return [convert_numpy_to_python(x) for x in list_data]
+
+    # Otherwise it should be a single data set
+    if not isinstance(example_data, np.ndarray) or example_data.ndim != 1:
+        raise ValueError("Invalid data format")
+
+    # Convert each numpy array to a list of objects, which contains only the non-NaN properties:
+    # For example: {"node": [{"id": 0, ...}, {"id": 1, ...}], "line": [{"id": 2, ...}]}
    return {
-        name: [{k: item[k].tolist() for k in array.dtype.names if not is_nan(item[k])} for item in array]
-        for name, array in single_dataset.items()
+        component: [
+            {property: obj[property].tolist() for property in objects.dtype.names if not is_nan(obj[property])}
+            for obj in objects
+        ]
+        for component, objects in data.items()
    }
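A quick sketch of the single-dataset path (illustrative values; assumes the "node" input array has the id and u_rated attributes):

    import numpy as np

    node = initialize_array("input", "node", 2)
    node["id"] = [1, 2]
    node["u_rated"][0] = 10500.0  # u_rated of the second node stays NaN
    convert_numpy_to_python({"node": node})
    # {"node": [{"id": 1, "u_rated": 10500.0}, {"id": 2}]}  (the NaN property is not exported)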
@@ -181,18 +252,130 @@ def import_json_data(json_file: Path, data_type: str) -> Union[Dict[str, np.ndar
    return convert_python_to_numpy(json_data, data_type)


-def export_json_data(json_file: Path, data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]], indent=2):
+def export_json_data(
+    json_file: Path,
+    data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
+    indent: Optional[int] = 2,
+    compact: bool = False,
+    extra_info: Optional[Union[Dict[int, Any], List[Dict[int, Any]]]] = None,
+):
    """
    export json data
    Args:
        json_file: path to json file
-        data: A single or batch dataset for power-grid-model
-        indent:
-            indent of the file, default 2
+        data: a single or batch dataset for power-grid-model
+        indent: indent of the file, default 2
+        compact: write components on a single line
+        extra_info: extra information (in any json-serializable format), indexed on the object ids
+            e.g. a string representing the original id, or a dictionary storing even more information.

    Returns:
        Save to file
    """
    json_data = convert_numpy_to_python(data)
+    if extra_info is not None:
+        _inject_extra_info(data=json_data, extra_info=extra_info)
+
    with open(json_file, mode="w", encoding="utf-8") as file_pointer:
-        json.dump(json_data, file_pointer, indent=indent)
+        if compact and indent:
+            is_batch_data = isinstance(json_data, list)
+            max_level = 4 if is_batch_data else 3
+            _compact_json_dump(json_data, file_pointer, indent=indent, max_level=max_level)
+        else:
+            json.dump(json_data, file_pointer, indent=indent)
+
+
+def _inject_extra_info(
+    data: Union[Dict[str, List[Dict[str, Union[float, int]]]], List[Dict[str, List[Dict[str, Union[float, int]]]]]],
+    extra_info: Union[Dict[int, Any], List[Dict[int, Any]]],
+):
+    """
+    Injects extra info into the objects by ID
+
+    Args:
+        data: Power Grid Model Python data, as written to pgm json files.
+        extra_info: A dictionary indexed by object id. The value may be anything.
+
+    """
+    if isinstance(data, list):
+        if isinstance(extra_info, list):
+            # If both data and extra_info are lists, expect one extra info set per batch
+            for batch, info in zip(data, extra_info):
+                _inject_extra_info(batch, info)
+        else:
+            # If only data is a list, copy extra_info for each batch
+            for batch in data:
+                _inject_extra_info(batch, extra_info)
+    elif isinstance(data, dict):
+        if not isinstance(extra_info, dict):
+            raise TypeError("Invalid extra info data type")
+        for component, objects in data.items():
+            for obj in objects:
+                if obj["id"] in extra_info:
+                    obj["extra"] = extra_info[obj["id"]]
+    else:
+        raise TypeError("Invalid data type")
+
+
+def _compact_json_dump(data: Any, io_stream: IO[str], indent: int, max_level: int, level: int = 0):
+    """Custom compact JSON writer that is intended to put data belonging to a single object on a single line.
+
+    For example:
+    {
+        "node": [
+            {"id": 0, "u_rated": 10500.0, "extra": {"original_id": 123}},
+            {"id": 1, "u_rated": 10500.0, "extra": {"original_id": 456}},
+        ],
+        "line": [
+            {"id": 2, "node_from": 0, "node_to": 1, ...}
+        ]
+    }
+
+    The function is being called recursively, starting at level 0 and recursing until max_level is reached. It is
+    basically a full json writer, but for efficiency reasons, on the last levels the native json.dump method is used.
+    """
+
+    # Let's define a 'tab' indent, depending on the level
+    tab = " " * level * indent
+
+    # If we are at the max_level, or the data simply doesn't contain any more levels, write the indent and serialize
+    # the data on a single line.
+    if level >= max_level or not isinstance(data, (list, dict)):
+        io_stream.write(tab)
+        json.dump(data, io_stream, indent=None)
+        return
+
+    # We'll need the number of objects later on
+    n_obj = len(data)
+
+    # If the data is a list:
+    # 1. start with an opening bracket
+    # 2. dump each element in the list
+    # 3. add a comma and a new line after each element, except for the last element, where we don't need a comma.
+    # 4. finish with a closing bracket
+    if isinstance(data, list):
+        io_stream.write(tab + "[\n")
+        for i, obj in enumerate(data, start=1):
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 1)
+            io_stream.write(",\n" if i < n_obj else "\n")
+        io_stream.write(tab + "]")
+        return
+
+    # If the data is a dictionary:
+    # 1. start with an opening curly bracket
+    # 2. for each element: write its key, plus a colon ':'
+    # 3. if the next level would be the max_level, add a space and dump the element on a single line,
+    #    else add a new line before dumping the element recursively.
+    # 4. add a comma and a new line after each element, except for the last element, where we don't need a comma.
+    # 5. finish with a closing curly bracket
+    io_stream.write(tab + "{\n")
+    for i, (key, obj) in enumerate(data.items(), start=1):
+        io_stream.write(tab + " " * indent + f'"{key}":')
+        if level == max_level - 1 or not isinstance(obj, (list, dict)):
+            io_stream.write(" ")
+            json.dump(obj, io_stream, indent=None)
+        else:
+            io_stream.write("\n")
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 2)
+        io_stream.write(",\n" if i < n_obj else "\n")
+    io_stream.write(tab + "}")
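For illustration, calling the compact writer directly on a single dataset with max_level=3 (illustrative values, not part of the diff) yields one line per object:

    import io

    data = {"node": [{"id": 1, "u_rated": 10500.0}, {"id": 2, "u_rated": 400.0}]}
    stream = io.StringIO()
    _compact_json_dump(data, stream, indent=2, max_level=3)
    print(stream.getvalue())
    # {
    #   "node":
    #     [
    #       {"id": 1, "u_rated": 10500.0},
    #       {"id": 2, "u_rated": 400.0}
    #     ]
    # }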