 
 import json
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import IO, Any, Dict, List, Optional, Union
 
 import numpy as np
 
@@ -37,7 +37,12 @@ def convert_list_to_batch_data(
     list_data: List[Dict[str, np.ndarray]]
 ) -> Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]:
     """
-    Convert list of dataset to one single batch dataset
+    Convert a list of datasets to one single batch dataset
+
+    Example data formats:
+        input:  [{"node": <1d-array>, "line": <1d-array>}, {"node": <1d-array>, "line": <1d-array>}]
+        output: {"node": <2d-array>, "line": <2d-array>}
+         -or-:  {"indptr": <1d-array>, "data": <1d-array>}
     Args:
         list_data: list of dataset
 
@@ -48,25 +53,38 @@ def convert_list_to_batch_data(
     """
 
     # List all *unique* types
-    all_types = list({x for single_batch in list_data for x in single_batch.keys()})
+    components = {x for dataset in list_data for x in dataset.keys()}
 
     batch_data = {}
-    for comp_type in all_types:
-        # use 2D array if the type exists in all single dataset and the size is the same
-        if np.all([comp_type in x for x in list_data]) and np.unique([x[comp_type].size for x in list_data]).size == 1:
-            batch_data[comp_type] = np.stack([x[comp_type] for x in list_data], axis=0)
+    for component in components:
+
+        # Create a 2D array if the component exists in all datasets and the number of objects is the same in each dataset
+        comp_exists_in_all_datasets = all(component in x for x in list_data)
+        all_sizes_are_the_same = lambda: all(x[component].size == list_data[0][component].size for x in list_data)
+        if comp_exists_in_all_datasets and all_sizes_are_the_same():
+            batch_data[component] = np.stack([x[component] for x in list_data], axis=0)
             continue
+
         # otherwise use indptr/data dict
         indptr = [0]
         data = []
-        for single_batch in list_data:
-            if comp_type not in single_batch:
-                indptr.append(indptr[-1])
+        for dataset in list_data:
+
+            if component in dataset:
+                # If the current dataset contains the component, increase the indptr for this batch and append the data
+                objects = dataset[component]
+                indptr.append(indptr[-1] + len(objects))
+                data.append(objects)
+
             else:
-                single_data = single_batch[comp_type]
-                indptr.append(indptr[-1] + single_data.shape[0])
-                data.append(single_data)
-        batch_data[comp_type] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+                # If the current dataset does not contain the component, add the last indptr again.
+                indptr.append(indptr[-1])
+
+        # Convert the index pointers to a numpy array and combine the list of object numpy arrays into a single
+        # numpy array. All objects of all batches are now stored in one large array; the index pointers define
+        # which elements of the array (rows) belong to which batch.
+        batch_data[component] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+
     return batch_data
 
 
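
As a rough illustration of the behaviour above (the component names and sizes are made up, and plain float arrays stand in for the structured arrays the model actually uses):

    import numpy as np

    list_data = [
        {"node": np.zeros(2), "line": np.zeros(1)},  # dataset 1: 2 nodes, 1 line
        {"node": np.ones(2), "line": np.ones(3)},    # dataset 2: 2 nodes, 3 lines
    ]
    batch_data = convert_list_to_batch_data(list_data)
    # "node" has the same size in every dataset, so it is stacked into a 2 x 2 array;
    # "line" sizes differ, so it becomes {"indptr": array([0, 1, 4]), "data": <1d-array of length 4>}
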
@@ -83,27 +101,46 @@ def convert_python_to_numpy(
         A single or batch dataset for power-grid-model
 
     """
-    if isinstance(data, dict):
-        return_dict = {}
-        for component_name, component_list in data.items():
-            arr: np.ndarray = initialize_array(data_type, component_name, len(component_list))
-            for i, component in enumerate(component_list):
-                for property_name, value in component.items():
-                    if property_name not in arr[i].dtype.names:
-                        raise ValueError(f"Invalid property '{property_name}' for {component_name} {data_type} data.")
-                    try:
-                        arr[i][property_name] = value
-                    except ValueError as ex:
-                        raise ValueError(f"Invalid '{property_name}' value for {component_name} {data_type} data: {ex}")
-
-            return_dict[component_name] = arr
-        return return_dict
 
+    # If the input data is a list, we are dealing with batch data. Each element in the list is a batch. We'll
+    # first convert each batch separately, by recursively calling this function for each batch. Then the numpy
+    # data for all batches is converted into a proper and compact numpy structure.
     if isinstance(data, list):
         list_data = [convert_python_to_numpy(json_dict, data_type=data_type) for json_dict in data]
         return convert_list_to_batch_data(list_data)
 
-    raise TypeError("Only list or dict is allowed in JSON data!")
+    # This should be a normal (non-batch) structure, with a list of objects (dictionaries) per component.
+    if not isinstance(data, dict):
+        raise TypeError("Only list or dict is allowed in JSON data!")
+
+    dataset: Dict[str, np.ndarray] = {}
+    for component, objects in data.items():
+
+        # We'll initialize a 1d-array with NaN values for all the objects of this component type
+        dataset[component] = initialize_array(data_type, component, len(objects))
+
+        for i, obj in enumerate(objects):
+            # As each object is a separate dictionary, and the properties may differ per object, we need to check
+            # all properties. Properties that are not supplied for an object keep their initial (NaN) value.
+            for property, value in obj.items():
+                if property == "extra":
+                    # The "extra" property is a special one. It can store any type of information associated with
+                    # an object, but it will not be used in the calculations. Therefore it is not included in the
+                    # numpy array, so we can skip this property
+                    continue
+
+                if property not in dataset[component].dtype.names:
+                    # If a property doesn't exist, the user made a mistake. Let's be merciless in that case,
+                    # for their own good.
+                    raise ValueError(f"Invalid property '{property}' for {component} {data_type} data.")
+
+                # Now just assign the value and raise an error if the value cannot be stored in the specific
+                # numpy array data format for this property.
+                try:
+                    dataset[component][i][property] = value
+                except ValueError as ex:
+                    raise ValueError(f"Invalid '{property}' value for {component} {data_type} data: {ex}")
+    return dataset
 
 
 def convert_batch_to_list_data(
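
A minimal sketch of the non-batch path ("node" with "id" and "u_rated" is used as an example; the names must match the power-grid-model data types for the given data_type):

    input_data = {
        "node": [
            {"id": 1, "u_rated": 10500.0, "extra": {"original_id": "N1"}},  # "extra" is skipped
            {"id": 2, "u_rated": 10500.0},
        ]
    }
    input_array = convert_python_to_numpy(input_data, data_type="input")
    # input_array["node"] is a structured 1d-array of length 2;
    # any attribute that is not supplied keeps its initial (NaN) value
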
@@ -117,29 +154,52 @@ def convert_batch_to_list_data(
     Returns:
         list of single dataset
     """
-    list_data = []
-    # return empty list
+
+    # If the batch data is empty, return an empty list
     if not batch_data:
-        return list_data
-    # get n_batch
-    one_data = next(iter(batch_data.values()))
-    if isinstance(one_data, dict):
-        n_batch = one_data["indptr"].size - 1
+        return []
+
+    # Get the data for an arbitrary component; assuming that the number of batches of each component is the same.
+    # The structure may differ per component
+    example_batch_data = next(iter(batch_data.values()))
+
+    if isinstance(example_batch_data, np.ndarray):
+        # We expect the batch data to be a 2d numpy array of n_batches x n_objects
+        if len(example_batch_data.shape) != 2:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data.shape[0]
+    elif isinstance(example_batch_data, dict):
+        # If the batch data is a dictionary, we assume that it is an indptr/data structure (otherwise it is an
+        # invalid dictionary). There is always one indptr more than there are batches.
+        if "indptr" not in example_batch_data:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data["indptr"].size - 1
     else:
-        n_batch = one_data.shape[0]
-    # convert
-    for i in range(n_batch):
-        single_dataset = {}
-        for key, batch in batch_data.items():
-            if isinstance(batch, dict):
-                single_dataset[key] = batch["data"][batch["indptr"][i] : batch["indptr"][i + 1]]
-            else:
-                single_dataset[key] = batch[i, ...]
-        list_data.append(single_dataset)
+        # If the batch data is not a numpy array and not a dictionary, it is invalid
+        raise ValueError("Invalid batch data format")
+
+    # Initialize a list of empty dictionaries
+    # Note that [{}] * n_batches would result in n_batches references to the same dict.
+    list_data = [{} for _ in range(n_batches)]
+
+    # While the number of batches must be the same for each component, the structure (2d numpy array or indptr/data)
+    # doesn't have to be. Therefore, we'll check the structure for each component and copy the data accordingly.
+    for component, data in batch_data.items():
+        if isinstance(data, np.ndarray):
+            # For 2d numpy arrays, copy each batch into an element of the list
+            for i, batch in enumerate(data):
+                list_data[i][component] = batch
+        else:
+            # For indptr/data structures, use the indptr to select the items for each batch.
+            indptr = data["indptr"]
+            for i, (idx0, idx1) in enumerate(zip(indptr[:-1], indptr[1:])):
+                list_data[i][component] = data["data"][idx0:idx1]
     return list_data
 
 
-def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]) -> Union[Dict, List]:
+def convert_numpy_to_python(
+    data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]
+) -> Union[Dict[str, List[Dict[str, Union[int, float]]]], List[Dict[str, List[Dict[str, Union[int, float]]]]]]:
     """
     Convert internal numpy arrays to native python data
     If an attribute is not available (NaN value), it will not be exported.
@@ -150,18 +210,29 @@ def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarr
150210 A json list for batch dataset
151211
152212 """
153- # check the dataset is single or batch
154- if data :
155- one_data = next (iter (data .values ()))
156- # it is batch dataset if it is 2D array of a dict of indptr/data
157- if isinstance (one_data , dict ) or one_data .ndim == 2 :
158- list_data = convert_batch_to_list_data (data )
159- return [convert_numpy_to_python (x ) for x in list_data ]
160- # otherwise it is single dataset
161- single_dataset : Dict [str , np .ndarray ] = data
+    # Check if the dataset is a single dataset or a batch dataset.
+    # It is a batch dataset if it is a 2D array or an indptr/data structure.
+    example_data = next(iter(data.values()))
+    is_dense_batch = isinstance(example_data, np.ndarray) and example_data.ndim == 2
+    is_sparse_batch = isinstance(example_data, dict) and "indptr" in example_data and "data" in example_data
+
+    # If it is a batch, convert the batch data to a list of batches, then convert each batch individually.
+    if is_dense_batch or is_sparse_batch:
+        list_data = convert_batch_to_list_data(data)
+        return [convert_numpy_to_python(x) for x in list_data]
+
+    # Otherwise it should be a single dataset
+    if not isinstance(example_data, np.ndarray) or example_data.ndim != 1:
+        raise ValueError("Invalid data format")
+
+    # Convert each numpy array to a list of objects, which contain only the non-NaN properties.
+    # For example: {"node": [{"id": 0, ...}, {"id": 1, ...}], "line": [{"id": 2, ...}]}
-    return {
-        name: [{k: item[k].tolist() for k in array.dtype.names if not is_nan(item[k])} for item in array]
-        for name, array in single_dataset.items()
+    return {
+        component: [
+            {property: obj[property].tolist() for property in objects.dtype.names if not is_nan(obj[property])}
+            for obj in objects
+        ]
+        for component, objects in data.items()
     }
 
 
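
A rough sketch of the batch path, going from numpy batch data back to native Python data (shapes and ids are made up for illustration):

    node = np.stack([initialize_array("input", "node", 2), initialize_array("input", "node", 2)])  # dense: 2 x 2
    node["id"] = [[1, 2], [1, 2]]
    line = initialize_array("input", "line", 3)  # sparse: 1 + 2 lines over the two batches
    line["id"] = [3, 4, 5]
    batch_data = {"node": node, "line": {"indptr": np.array([0, 1, 3]), "data": line}}
    python_data = convert_numpy_to_python(batch_data)
    # python_data is a list of two dicts; python_data[1]["line"] contains the objects with id 4 and 5,
    # and all NaN attributes are left out
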
@@ -181,18 +252,130 @@ def import_json_data(json_file: Path, data_type: str) -> Union[Dict[str, np.ndar
     return convert_python_to_numpy(json_data, data_type)
 
 
-def export_json_data(json_file: Path, data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]], indent=2):
+def export_json_data(
+    json_file: Path,
+    data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
+    indent: Optional[int] = 2,
+    compact: bool = False,
+    extra_info: Optional[Union[Dict[int, Any], List[Dict[int, Any]]]] = None,
+):
     """
     export json data
     Args:
         json_file: path to json file
-        data: A single or batch dataset for power-grid-model
-        indent:
-            indent of the file, default 2
+        data: a single or batch dataset for power-grid-model
+        indent: indent of the file, default 2
+        compact: write each object on a single line
+        extra_info: extra information (in any json-serializable format), indexed on the object ids
+            e.g. a string representing the original id, or a dictionary storing even more information.
 
     Returns:
         Save to file
     """
     json_data = convert_numpy_to_python(data)
+    if extra_info is not None:
+        _inject_extra_info(data=json_data, extra_info=extra_info)
+
     with open(json_file, mode="w", encoding="utf-8") as file_pointer:
-        json.dump(json_data, file_pointer, indent=indent)
+        if compact and indent:
+            is_batch_data = isinstance(json_data, list)
+            max_level = 4 if is_batch_data else 3
+            _compact_json_dump(json_data, file_pointer, indent=indent, max_level=max_level)
+        else:
+            json.dump(json_data, file_pointer, indent=indent)
+
+
+def _inject_extra_info(
+    data: Union[Dict[str, List[Dict[str, Union[float, int]]]], List[Dict[str, List[Dict[str, Union[float, int]]]]]],
+    extra_info: Union[Dict[int, Any], List[Dict[int, Any]]],
+):
+    """
+    Injects extra info into the objects by ID
+
+    Args:
+        data: Power Grid Model Python data, as written to pgm json files.
+        extra_info: A dictionary indexed by object id. The value may be anything.
+
+    """
+    if isinstance(data, list):
+        if isinstance(extra_info, list):
+            # If both data and extra_info are lists, expect one extra info set per batch
+            for batch, info in zip(data, extra_info):
+                _inject_extra_info(batch, info)
+        else:
+            # If only data is a list, apply the same extra_info to every batch
+            for batch in data:
+                _inject_extra_info(batch, extra_info)
+    elif isinstance(data, dict):
+        if not isinstance(extra_info, dict):
+            raise TypeError("Invalid extra info data type")
+        for component, objects in data.items():
+            for obj in objects:
+                if obj["id"] in extra_info:
+                    obj["extra"] = extra_info[obj["id"]]
+    else:
+        raise TypeError("Invalid data type")
+
+
+def _compact_json_dump(data: Any, io_stream: IO[str], indent: int, max_level: int, level: int = 0):
+    """Custom compact JSON writer that is intended to put data belonging to a single object on a single line.
+
+    For example:
+    {
+        "node": [
+            {"id": 0, "u_rated": 10500.0, "extra": {"original_id": 123}},
+            {"id": 1, "u_rated": 10500.0, "extra": {"original_id": 456}},
+        ],
+        "line": [
+            {"id": 2, "node_from": 0, "node_to": 1, ...}
+        ]
+    }
+
+    The function is called recursively, starting at level 0 and recursing until max_level is reached. It is
+    basically a full json writer, but for efficiency reasons, on the last levels the native json.dump method is used.
+    """
+
+    # Let's define a 'tab' indent, depending on the level
+    tab = " " * level * indent
+
+    # If we are at the max_level, or the data simply doesn't contain any more levels, write the indent and serialize
+    # the data on a single line.
+    if level >= max_level or not isinstance(data, (list, dict)):
+        io_stream.write(tab)
+        json.dump(data, io_stream, indent=None)
+        return
+
+    # We'll need the number of objects later on
+    n_obj = len(data)
+
+    # If the data is a list:
+    # 1. start with an opening bracket
+    # 2. dump each element in the list
+    # 3. add a comma and a new line after each element, except for the last element, which doesn't need a comma
+    # 4. finish with a closing bracket
+    if isinstance(data, list):
+        io_stream.write(tab + "[\n")
+        for i, obj in enumerate(data, start=1):
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 1)
+            io_stream.write(",\n" if i < n_obj else "\n")
+        io_stream.write(tab + "]")
+        return
+
+    # If the data is a dictionary:
+    # 1. start with an opening curly bracket
+    # 2. for each element: write its key, plus a colon ':'
+    # 3. if the next level would be the max_level, add a space and dump the element on a single line,
+    #    else add a new line before dumping the element recursively.
+    # 4. add a comma and a new line after each element, except for the last element, which doesn't need a comma
+    # 5. finish with a closing curly bracket
+    io_stream.write(tab + "{\n")
+    for i, (key, obj) in enumerate(data.items(), start=1):
+        io_stream.write(tab + " " * indent + f'"{key}":')
+        if level == max_level - 1 or not isinstance(obj, (list, dict)):
+            io_stream.write(" ")
+            json.dump(obj, io_stream, indent=None)
+        else:
+            io_stream.write("\n")
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 2)
+        io_stream.write(",\n" if i < n_obj else "\n")
+    io_stream.write(tab + "}")
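
A short usage sketch of the new export options (the file name, ids and extra info are made up for illustration):

    from pathlib import Path

    node = initialize_array("input", "node", 2)
    node["id"] = [1, 2]
    node["u_rated"] = 10500.0
    export_json_data(
        json_file=Path("input.json"),
        data={"node": node},
        compact=True,
        extra_info={1: {"original_id": "N1"}, 2: "N2"},
    )
    # With compact=True each object is written on its own line, and the extra info is attached to the
    # exported objects with id 1 and 2 under the "extra" key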