Commit 158d062

Merge pull request #99 from alliander-opensource/feature/file-io
2 parents: 1794b76 + d9dde06

File tree

4 files changed: +450 -68 lines changed


src/power_grid_model/manual_testing.py

Lines changed: 247 additions & 64 deletions
@@ -8,7 +8,7 @@

 import json
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import IO, Any, Dict, List, Optional, Union

 import numpy as np

@@ -37,7 +37,12 @@ def convert_list_to_batch_data(
     list_data: List[Dict[str, np.ndarray]]
 ) -> Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]:
     """
-    Convert list of dataset to one single batch dataset
+    Convert a list of datasets to one single batch dataset
+
+    Example data formats:
+        input:  [{"node": <1d-array>, "line": <1d-array>}, {"node": <1d-array>, "line": <1d-array>}]
+        output: {"node": <2d-array>, "line": <2d-array>}
+          -or-: {"indptr": <1d-array>, "data": <1d-array>}
     Args:
         list_data: list of dataset

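Side note (not part of the diff): a minimal, self-contained sketch of the dense case described in the new docstring, where every dataset contains the same component with the same number of objects, so the arrays can simply be stacked into a 2D batch. The component name and values are made up for illustration.

    import numpy as np

    # Two batches, each containing the same two "node" objects -> dense 2D batch
    list_data = [
        {"node": np.array([1.0, 2.0])},
        {"node": np.array([3.0, 4.0])},
    ]
    batch = np.stack([x["node"] for x in list_data], axis=0)
    print(batch.shape)  # (2, 2): n_batches x n_objects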
@@ -48,25 +53,38 @@ def convert_list_to_batch_data(
     """

     # List all *unique* types
-    all_types = list({x for single_batch in list_data for x in single_batch.keys()})
+    components = {x for dataset in list_data for x in dataset.keys()}

     batch_data = {}
-    for comp_type in all_types:
-        # use 2D array if the type exists in all single dataset and the size is the same
-        if np.all([comp_type in x for x in list_data]) and np.unique([x[comp_type].size for x in list_data]).size == 1:
-            batch_data[comp_type] = np.stack([x[comp_type] for x in list_data], axis=0)
+    for component in components:
+
+        # Create a 2D array if the component exists in all datasets and number of objects is the same in each dataset
+        comp_exists_in_all_datasets = all(component in x for x in list_data)
+        all_sizes_are_the_same = lambda: all(x[component].size == list_data[0][component].size for x in list_data)
+        if comp_exists_in_all_datasets and all_sizes_are_the_same():
+            batch_data[component] = np.stack([x[component] for x in list_data], axis=0)
             continue
+
         # otherwise use indptr/data dict
         indptr = [0]
         data = []
-        for single_batch in list_data:
-            if comp_type not in single_batch:
-                indptr.append(indptr[-1])
+        for dataset in list_data:
+
+            if component in dataset:
+                # If the current dataset contains the component, increase the indptr for this batch and append the data
+                objects = dataset[component]
+                indptr.append(indptr[-1] + len(objects))
+                data.append(objects)
+
             else:
-                single_data = single_batch[comp_type]
-                indptr.append(indptr[-1] + single_data.shape[0])
-                data.append(single_data)
-        batch_data[comp_type] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+                # If the current dataset does not contain the component, add the last indptr again.
+                indptr.append(indptr[-1])
+
+        # Convert the index pointers to a numpy array and combine the list of object numpy arrays into a single
+        # numpy array. All objects of all batches are now stored in one large array; the index pointers define
+        # which elements of the array (rows) belong to which batch.
+        batch_data[component] = {"indptr": np.array(indptr, dtype=np.int32), "data": np.concatenate(data, axis=0)}
+
     return batch_data

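Side note (not part of the diff): a small self-contained sketch of the sparse indptr/data layout that the loop above falls back to when a component is missing from some batches or the sizes differ. The numbers are made up; in the real datasets the rows are structured numpy records rather than plain floats.

    import numpy as np

    # Batch 0 holds two objects, batch 1 holds none, batch 2 holds one
    indptr = np.array([0, 2, 2, 3], dtype=np.int32)
    data = np.array([10.0, 20.0, 30.0])

    # Batch i is data[indptr[i]:indptr[i + 1]]
    print(data[indptr[0]:indptr[1]])  # [10. 20.]
    print(data[indptr[1]:indptr[2]])  # []      (batch 1 is empty)
    print(data[indptr[2]:indptr[3]])  # [30.]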

@@ -83,27 +101,46 @@ def convert_python_to_numpy(
         A single or batch dataset for power-grid-model

     """
-    if isinstance(data, dict):
-        return_dict = {}
-        for component_name, component_list in data.items():
-            arr: np.ndarray = initialize_array(data_type, component_name, len(component_list))
-            for i, component in enumerate(component_list):
-                for property_name, value in component.items():
-                    if property_name not in arr[i].dtype.names:
-                        raise ValueError(f"Invalid property '{property_name}' for {component_name} {data_type} data.")
-                    try:
-                        arr[i][property_name] = value
-                    except ValueError as ex:
-                        raise ValueError(f"Invalid '{property_name}' value for {component_name} {data_type} data: {ex}")
-
-            return_dict[component_name] = arr
-        return return_dict

+    # If the input data is a list, we are dealing with batch data. Each element in the list is a batch. We'll
+    # first convert each batch separately, by recursively calling this function for each batch. Then the numpy
+    # data for all batches is converted into a proper and compact numpy structure.
     if isinstance(data, list):
         list_data = [convert_python_to_numpy(json_dict, data_type=data_type) for json_dict in data]
         return convert_list_to_batch_data(list_data)

-    raise TypeError("Only list or dict is allowed in JSON data!")
+    # This should be a normal (non-batch) structure, with a list of objects (dictionaries) per component.
+    if not isinstance(data, dict):
+        raise TypeError("Only list or dict is allowed in JSON data!")
+
+    dataset: Dict[str, np.ndarray] = {}
+    for component, objects in data.items():
+
+        # We'll initialize a 1d-array with NaN values for all the objects of this component type
+        dataset[component] = initialize_array(data_type, component, len(objects))
+
+        for i, obj in enumerate(objects):
+            # As each object is a separate dictionary, and the properties may differ per object, we need to check
+            # all properties. Non-existing properties raise a ValueError.
+            for property, value in obj.items():
+                if property == "extra":
+                    # The "extra" property is a special one. It can store any type of information associated with
+                    # an object, but it will not be used in the calculations. Therefore it is not included in the
+                    # numpy array, so we can skip this property.
+                    continue
+
+                if property not in dataset[component].dtype.names:
+                    # If a property doesn't exist, the user made a mistake. Let's be merciless in that case,
+                    # for their own good.
+                    raise ValueError(f"Invalid property '{property}' for {component} {data_type} data.")
+
+                # Now just assign the value and raise an error if the value cannot be stored in the specific
+                # numpy array data format for this property.
+                try:
+                    dataset[component][i][property] = value
+                except ValueError as ex:
+                    raise ValueError(f"Invalid '{property}' value for {component} {data_type} data: {ex}")
+    return dataset


 def convert_batch_to_list_data(
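Side note (not part of the diff): a sketch of the kind of Python/JSON structure convert_python_to_numpy() now accepts for a single dataset. The ids, attribute names and "extra" values are illustrative only; the point is that "extra" is skipped during conversion, while an unknown property would raise a ValueError.

    input_data = {
        "node": [
            {"id": 1, "u_rated": 10500.0, "extra": {"original_id": "Node A"}},  # "extra" is ignored
            {"id": 2, "u_rated": 10500.0},
        ],
        "line": [
            {"id": 3, "from_node": 1, "to_node": 2},
        ],
    }
    # convert_python_to_numpy(input_data, "input") would return one structured numpy array per component.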
@@ -117,29 +154,52 @@ def convert_batch_to_list_data(
     Returns:
         list of single dataset
     """
-    list_data = []
-    # return empty list
+
+    # If the batch data is empty, return an empty list
     if not batch_data:
-        return list_data
-    # get n_batch
-    one_data = next(iter(batch_data.values()))
-    if isinstance(one_data, dict):
-        n_batch = one_data["indptr"].size - 1
+        return []
+
+    # Get the data for an arbitrary component; assuming that the number of batches of each component is the same.
+    # The structure may differ per component
+    example_batch_data = next(iter(batch_data.values()))
+
+    if isinstance(example_batch_data, np.ndarray):
+        # We expect the batch data to be a 2d numpy array of n_batches x n_objects
+        if len(example_batch_data.shape) != 2:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data.shape[0]
+    elif isinstance(example_batch_data, dict):
+        # If the batch data is a dictionary, we assume that it is an indptr/data structure (otherwise it is an
+        # invalid dictionary). There is always one indptr more than there are batches.
+        if "indptr" not in example_batch_data:
+            raise ValueError("Invalid batch data format")
+        n_batches = example_batch_data["indptr"].size - 1
     else:
-        n_batch = one_data.shape[0]
-    # convert
-    for i in range(n_batch):
-        single_dataset = {}
-        for key, batch in batch_data.items():
-            if isinstance(batch, dict):
-                single_dataset[key] = batch["data"][batch["indptr"][i] : batch["indptr"][i + 1]]
-            else:
-                single_dataset[key] = batch[i, ...]
-        list_data.append(single_dataset)
+        # If the batch data is not a numpy array and not a dictionary, it is invalid
+        raise ValueError("Invalid batch data format")
+
+    # Initialize an empty list with dictionaries
+    # Note that [{}] * n_batches would result in n copies of the same dict.
+    list_data = [{} for _ in range(n_batches)]
+
+    # While the number of batches must be the same for each component, the structure (2d numpy array or indptr/data)
+    # doesn't have to be. Therefore, we'll check the structure for each component and copy the data accordingly.
+    for component, data in batch_data.items():
+        if isinstance(data, np.ndarray):
+            # For 2d numpy arrays, copy each batch into an element of the list
+            for i, batch in enumerate(data):
+                list_data[i][component] = batch
+        else:
+            # For indptr/data structures, use the indptr to select the items for each batch.
+            indptr = data["indptr"]
+            for i, (idx0, idx1) in enumerate(zip(indptr[:-1], indptr[1:])):
+                list_data[i][component] = data["data"][idx0:idx1]
     return list_data


-def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]) -> Union[Dict, List]:
+def convert_numpy_to_python(
+    data: Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]
+) -> Union[Dict[str, List[Dict[str, Union[int, float]]]], List[Dict[str, List[Dict[str, Union[int, float]]]]]]:
     """
     Convert internal numpy arrays to native python data
     If an attribute is not available (NaN value), it will not be exported.
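Side note (not part of the diff): the comment about [{}] * n_batches above refers to the usual Python aliasing pitfall, illustrated below.

    shared = [{}] * 3                    # three references to the *same* dict
    shared[0]["node"] = "x"
    print(shared)                        # [{'node': 'x'}, {'node': 'x'}, {'node': 'x'}]

    separate = [{} for _ in range(3)]    # three independent dicts, as used in the new code
    separate[0]["node"] = "x"
    print(separate)                      # [{'node': 'x'}, {}, {}]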
@@ -150,18 +210,29 @@ def convert_numpy_to_python(data: Dict[str, Union[np.ndarray, Dict[str, np.ndarr
         A json list for batch dataset

     """
-    # check the dataset is single or batch
-    if data:
-        one_data = next(iter(data.values()))
-        # it is batch dataset if it is 2D array of a dict of indptr/data
-        if isinstance(one_data, dict) or one_data.ndim == 2:
-            list_data = convert_batch_to_list_data(data)
-            return [convert_numpy_to_python(x) for x in list_data]
-    # otherwise it is single dataset
-    single_dataset: Dict[str, np.ndarray] = data
+    # Check if the dataset is a single dataset or batch dataset
+    # It is a batch dataset if it is a 2D array or an indptr/data structure
+    example_data = next(iter(data.values()))
+    is_dense_batch = isinstance(example_data, np.ndarray) and example_data.ndim == 2
+    is_sparse_batch = isinstance(example_data, dict) and "indptr" in example_data and "data" in example_data
+
+    # If it is a batch, convert the batch data to a list of batches, then convert each batch individually.
+    if is_dense_batch or is_sparse_batch:
+        list_data = convert_batch_to_list_data(data)
+        return [convert_numpy_to_python(x) for x in list_data]
+
+    # Otherwise it should be a single data set
+    if not isinstance(example_data, np.ndarray) or example_data.ndim != 1:
+        raise ValueError("Invalid data format")
+
+    # Convert each numpy array to a list of objects, which contains only the non-NaN properties:
+    # For example: {"node": [{"id": 0, ...}, {"id": 1, ...}], "line": [{"id": 2, ...}]}
     return {
-        name: [{k: item[k].tolist() for k in array.dtype.names if not is_nan(item[k])} for item in array]
-        for name, array in single_dataset.items()
+        component: [
+            {property: obj[property].tolist() for property in objects.dtype.names if not is_nan(obj[property])}
+            for obj in objects
+        ]
+        for component, objects in data.items()
     }

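Side note (not part of the diff): a self-contained sketch of the NaN-skipping comprehension in the new return statement, using np.isnan in place of the module's is_nan helper and a made-up structured dtype.

    import numpy as np

    objects = np.array([(1, 10500.0), (2, np.nan)], dtype=[("id", "i4"), ("u_rated", "f8")])
    converted = [
        {
            prop: obj[prop].tolist()
            for prop in objects.dtype.names
            if not (np.issubdtype(objects.dtype[prop], np.floating) and np.isnan(obj[prop]))
        }
        for obj in objects
    ]
    print(converted)  # [{'id': 1, 'u_rated': 10500.0}, {'id': 2}]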

@@ -181,18 +252,130 @@ def import_json_data(json_file: Path, data_type: str) -> Union[Dict[str, np.ndar
     return convert_python_to_numpy(json_data, data_type)


-def export_json_data(json_file: Path, data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]], indent=2):
+def export_json_data(
+    json_file: Path,
+    data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
+    indent: Optional[int] = 2,
+    compact: bool = False,
+    extra_info: Optional[Union[Dict[int, Any], List[Dict[int, Any]]]] = None,
+):
     """
     export json data
     Args:
         json_file: path to json file
-        data: A single or batch dataset for power-grid-model
-        indent:
-            indent of the file, default 2
+        data: a single or batch dataset for power-grid-model
+        indent: indent of the file, default 2
+        compact: write components on a single line
+        extra_info: extra information (in any json-serializable format), indexed on the object ids
+            e.g. a string representing the original id, or a dictionary storing even more information.

     Returns:
         Save to file
     """
     json_data = convert_numpy_to_python(data)
+    if extra_info is not None:
+        _inject_extra_info(data=json_data, extra_info=extra_info)
+
     with open(json_file, mode="w", encoding="utf-8") as file_pointer:
-        json.dump(json_data, file_pointer, indent=indent)
+        if compact and indent:
+            is_batch_data = isinstance(json_data, list)
+            max_level = 4 if is_batch_data else 3
+            _compact_json_dump(json_data, file_pointer, indent=indent, max_level=max_level)
+        else:
+            json.dump(json_data, file_pointer, indent=indent)
+
+
+def _inject_extra_info(
+    data: Union[Dict[str, List[Dict[str, Union[float, int]]]], List[Dict[str, List[Dict[str, Union[float, int]]]]]],
+    extra_info: Union[Dict[int, Any], List[Dict[int, Any]]],
+):
+    """
+    Injects extra info into the objects by ID
+
+    Args:
+        data: Power Grid Model Python data, as written to pgm json files.
+        extra_info: A dictionary indexed by object id. The value may be anything.
+
+    """
+    if isinstance(data, list):
+        if isinstance(extra_info, list):
+            # If both data and extra_info are lists, expect one extra info set per batch
+            for batch, info in zip(data, extra_info):
+                _inject_extra_info(batch, info)
+        else:
+            # If only data is a list, copy extra_info for each batch
+            for batch in data:
+                _inject_extra_info(batch, extra_info)
+    elif isinstance(data, dict):
+        if not isinstance(extra_info, dict):
+            raise TypeError("Invalid extra info data type")
+        for component, objects in data.items():
+            for obj in objects:
+                if obj["id"] in extra_info:
+                    obj["extra"] = extra_info[obj["id"]]
+    else:
+        raise TypeError("Invalid data type")
+
+
+def _compact_json_dump(data: Any, io_stream: IO[str], indent: int, max_level: int, level: int = 0):
+    """Custom compact JSON writer that is intended to put data belonging to a single object on a single line.
+
+    For example:
+    {
+        "node": [
+            {"id": 0, "u_rated": 10500.0, "extra": {"original_id": 123}},
+            {"id": 1, "u_rated": 10500.0, "extra": {"original_id": 456}},
+        ],
+        "line": [
+            {"id": 2, "node_from": 0, "node_to": 1, ...}
+        ]
+    }
+
+    The function is called recursively, starting at level 0 and recursing until max_level is reached. It is
+    basically a full json writer, but for efficiency reasons, on the last levels the native json.dump method is used.
+    """
+
+    # Let's define a 'tab' indent, depending on the level
+    tab = " " * level * indent
+
+    # If we are at the max_level, or the data simply doesn't contain any more levels, write the indent and serialize
+    # the data on a single line.
+    if level >= max_level or not isinstance(data, (list, dict)):
+        io_stream.write(tab)
+        json.dump(data, io_stream, indent=None)
+        return
+
+    # We'll need the number of objects later on
+    n_obj = len(data)
+
+    # If the data is a list:
+    # 1. start with an opening bracket
+    # 2. dump each element in the list
+    # 3. add a comma and a new line after each element, except for the last element, there we don't need a comma.
+    # 4. finish with a closing bracket
+    if isinstance(data, list):
+        io_stream.write(tab + "[\n")
+        for i, obj in enumerate(data, start=1):
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 1)
+            io_stream.write(",\n" if i < n_obj else "\n")
+        io_stream.write(tab + "]")
+        return
+
+    # If the data is a dictionary:
+    # 1. start with an opening curly bracket
+    # 2. for each element: write its key, plus a colon ':'
+    # 3. if the next level would be the max_level, add a space and dump the element on a single line,
+    #    else add a new line before dumping the element recursively.
+    # 4. add a comma and a new line after each element, except for the last element, there we don't need a comma.
+    # 5. finish with a closing curly bracket
+    io_stream.write(tab + "{\n")
+    for i, (key, obj) in enumerate(data.items(), start=1):
+        io_stream.write(tab + " " * indent + f'"{key}":')
+        if level == max_level - 1 or not isinstance(obj, (list, dict)):
+            io_stream.write(" ")
+            json.dump(obj, io_stream, indent=None)
+        else:
+            io_stream.write("\n")
+            _compact_json_dump(obj, io_stream, indent, max_level, level + 2)
+        io_stream.write(",\n" if i < n_obj else "\n")
+    io_stream.write(tab + "}")
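Side note (not part of the diff): one possible way to call the extended export_json_data, assuming power_grid_model (including this change) is installed; the ids and extra info values are made up. With compact=True each object is written on its own line, as in the _compact_json_dump docstring above, and the extra_info entries end up as "extra" fields on the objects with matching ids.

    from pathlib import Path

    from power_grid_model import initialize_array
    from power_grid_model.manual_testing import export_json_data

    node = initialize_array("input", "node", 2)
    node["id"] = [1, 2]
    node["u_rated"] = [10500.0, 10500.0]

    extra_info = {1: {"original_id": "Node A"}, 2: {"original_id": "Node B"}}
    export_json_data(Path("input.json"), {"node": node}, compact=True, extra_info=extra_info)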

tests/cpp_unit_tests/test_validation.cpp

Lines changed: 4 additions & 0 deletions
@@ -50,6 +50,10 @@ struct Buffer {
 void parse_single_object(void* ptr, json const& j, MetaData const& meta, Idx position) {
     meta.set_nan(ptr, position);
     for (auto const& it : j.items()) {
+        // skip extra info
+        if (it.key() == "extra") {
+            continue;
+        }
         DataAttribute const& attr = meta.find_attr(it.key());
         if (attr.numpy_type == "i1") {
             int8_t const value = it.value().get<int8_t>();
