|
| 1 | +"""DataContainer class for storing data used in pipeline processing.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import json |
| 6 | +import logging |
| 7 | +import pickle |
| 8 | +import sys |
| 9 | +from typing import Optional, Union |
| 10 | + |
| 11 | + |
class DataContainer:
    """
    A container for storing and manipulating data in a pipeline.

    Items live in a plain dictionary and can be reached through ``add`` / ``get``,
    bracket notation, the ``in`` operator and iteration over the keys. The class
    attributes below are the string keys the class predefines for common entries.

    Attributes
    ----------
    data : dict
        A dictionary to store data items.
    logger : logging.Logger
        Logger named after the concrete class.
    """

    GENERATE_CONFIGS = "generate_configs"
    CLEAN_CONFIGS = "clean_configs"
    SPLIT_CONFIGS = "split_configs"
    TARGET_SCALING_CONFIGS = "target_scaling_configs"
    RAW = "raw"
    CLEAN = "clean"
    TRAIN = "train"
    VALIDATION = "validation"
    TEST = "test"
    MODEL = "model"
    MODEL_CONFIGS = "model_configs"
    MODEL_INPUT = "model_input"
    MODEL_OUTPUT = "model_output"
    METRICS = "metrics"
    PREDICTIONS = "predictions"
    EXPLAINER = "explainer"
    TUNING_PARAMS = "tuning_params"

    # Instances are mutable and compare by content, so they are unhashable.
    # Defining __eq__ already implies this; it is spelled out for clarity.
    __hash__ = None

    def __init__(self, initial_data: Optional[dict] = None):
        """
        Initialize the DataContainer with an empty dictionary or provided data.

        Parameters
        ----------
        initial_data : dict, optional
            Initial data to populate the container. The dictionary is stored
            by reference, not copied.
        """
        self.data = initial_data if initial_data is not None else {}
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug("%s initialized", self.__class__.__name__)

    def add(self, key: str, value):
        """
        Add a new item to the container, replacing any value already stored under ``key``.

        Parameters
        ----------
        key : str
            The key under which the value is stored.
        value
            The data to be stored.

        Returns
        -------
        None
        """
        self.data[key] = value
        self.logger.debug("Data added under key: %s", key)

    def get(self, key: str, default=None):
        """
        Retrieve an item from the container by its key.

        Parameters
        ----------
        key : str
            The key of the item to retrieve.
        default
            The default value to return if the key is not found. Defaults to None.

        Returns
        -------
        The data stored under the given key or the default value.
        """
        return self.data.get(key, default)

    def __getitem__(self, key: str):
        """
        Retrieve an item using bracket notation.

        Parameters
        ----------
        key : str
            The key of the item to retrieve.

        Returns
        -------
        The data stored under the given key, or None when the key is absent
        (no KeyError is raised, because the lookup is delegated to ``get``).
        """
        return self.get(key)

    def __setitem__(self, key: str, value):
        """
        Add or update an item using bracket notation.

        Parameters
        ----------
        key : str
            The key under which the value is stored.
        value
            The data to be stored.

        Returns
        -------
        None
        """
        self.add(key, value)

    def contains(self, key: str) -> bool:
        """
        Check if the container contains an item with the specified key.

        Parameters
        ----------
        key : str
            The key to check in the container.

        Returns
        -------
        bool
            True if the key exists, False otherwise.
        """
        return key in self.data

    def __contains__(self, key: str) -> bool:
        """
        Enable usage of the 'in' keyword.

        Parameters
        ----------
        key : str
            The key to check in the container.

        Returns
        -------
        bool
            True if the key exists, False otherwise.
        """
        return self.contains(key)

    def __iter__(self):
        """
        Iterate over the keys of the container.

        Without this method Python would fall back to calling ``__getitem__``
        with 0, 1, 2, ... and, because ``__getitem__`` never raises, the loop
        would never terminate.

        Returns
        -------
        iterator
            An iterator over the stored keys, in insertion order.
        """
        return iter(self.data)

    @property
    def keys(self) -> list[str]:
        """
        Return the keys of the container.

        Returns
        -------
        list[str]
            The keys of the container.
        """
        return list(self.data)

    def save(self, file_path: str, keys: Optional[Union[str, list[str]]] = None):
        """
        Serialize the container data using pickle and save it to a file.

        Parameters
        ----------
        file_path : str
            The path of the file where the serialized data should be saved.
        keys : Optional[Union[str, List[str]]], optional
            The keys of the data to be saved. If None (or empty), all data is saved.

        Returns
        -------
        None

        Raises
        ------
        KeyError
            If one of the requested keys is not present in the container.
        """
        if isinstance(keys, str):
            keys = [keys]

        data_to_save = {k: self.data[k] for k in keys} if keys else self.data

        serialized_data = pickle.dumps(data_to_save)
        # len() is the exact payload written to disk; sys.getsizeof would also
        # count the header of the in-memory bytes object.
        data_size_mb = len(serialized_data) / (1024 * 1024)

        with open(file_path, "wb") as file:
            file.write(serialized_data)

        self.logger.info(
            "%s serialized and saved to %s. Size: %.2f MB",
            self.__class__.__name__,
            file_path,
            data_size_mb,
        )

    @classmethod
    def load(cls, file_path: str, keys: Optional[Union[str, list[str]]] = None) -> "DataContainer":
        """
        Load data from a file and return a new instance of DataContainer.

        The file is read with pickle, which can execute arbitrary code while
        deserializing. Only load files that come from a trusted source.

        Parameters
        ----------
        file_path : str
            The path of the file from which the serialized data should be read.
        keys : Optional[Union[str, List[str]]], optional
            The keys of the data to be loaded. If None (or empty), all data is loaded.
            Requested keys that are missing from the file are reported with a
            warning and otherwise ignored.

        Returns
        -------
        DataContainer
            A new instance of DataContainer populated with the deserialized data.
        """
        # SECURITY: pickle must never be used on untrusted input.
        with open(file_path, "rb") as file:
            data = pickle.load(file)

        if isinstance(keys, str):
            keys = [keys]

        # Build the lookup set once instead of scanning the list per stored item.
        requested = set(keys) if keys else set()
        if requested:
            data = {k: v for k, v in data.items() if k in requested}

        new_container = cls(initial_data=data)

        missing = requested.difference(data)
        if missing:
            new_container.logger.warning("Keys without values: %s", missing)

        new_container.logger.info("%s loaded from %s", cls.__name__, file_path)
        return new_container

    def __eq__(self, other) -> bool:
        """
        Compare this DataContainer with another for equality.

        Parameters
        ----------
        other : DataContainer
            Another DataContainer instance to compare with.

        Returns
        -------
        bool
            True if both containers hold equal data, False otherwise.
            ``NotImplemented`` is returned for other types so Python can try the
            reflected comparison; ``container == other`` then still evaluates to False.
        """
        if isinstance(other, DataContainer):
            return self.data == other.data
        return NotImplemented

    def __ne__(self, other) -> bool:
        """
        Compare this DataContainer with another for inequality.

        Parameters
        ----------
        other : DataContainer
            Another DataContainer instance to compare with.

        Returns
        -------
        bool
            True if containers are not equal, False otherwise.
        """
        result = self.__eq__(other)
        # Never negate NotImplemented: it is truthy and would report "equal".
        if result is NotImplemented:
            return NotImplemented
        return not result

    def __str__(self):
        """
        Generate a user-friendly JSON string representation of the DataContainer.

        Returns
        -------
        str
            A JSON string describing the keys and types of contents of the DataContainer.
        """
        data_summary = {key: type(value).__name__ for key, value in self.data.items()}
        return json.dumps(data_summary, indent=4)

    def __repr__(self):
        """
        Generate an official string representation of the DataContainer.

        Returns
        -------
        str
            A formal string representation of the DataContainer's state.
        """
        return f"<DataContainer({self.data})>"
0 commit comments