Skip to content

Commit ad8e35c

Browse files
committed
init project library
1 parent 6aab106 commit ad8e35c

30 files changed

+3932
-32
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ __pycache__/
66
# C extensions
77
*.so
88

9+
# ignore examples folder
10+
examples/
11+
912
# Distribution / packaging
1013
.Python
1114
build/

.vscode/settings.json

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// Set correct python path to venv's one
44
//
55
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
6-
//
6+
//
77
// Very optional: type checking. Remove the line if your project doesn't really use or respect
88
// type hints. You should give it a try, though. They're great.
99
//
@@ -25,13 +25,18 @@
2525
//
2626
"editor.formatOnSave": true,
2727
"editor.codeActionsOnSave": {
28-
"source.organizeImports": true
28+
"source.organizeImports": "explicit"
2929
},
3030
"black-formatter.importStrategy": "fromEnvironment",
3131
"isort.importStrategy": "fromEnvironment",
3232
"flake8.importStrategy": "fromEnvironment",
33-
"isort.args": ["--settings-path", "${workspaceFolder}/pyproject.toml"],
34-
"flake8.args": ["--config=${workspaceFolder}/.flake8"],
33+
"isort.args": [
34+
"--settings-path",
35+
"${workspaceFolder}/pyproject.toml"
36+
],
37+
"flake8.args": [
38+
"--config=${workspaceFolder}/.flake8"
39+
],
3540
"editor.rulers": [
3641
100 // if changing line length, also do it in .flake8 and pyproject.toml's [tool.black] section
3742
],
@@ -45,4 +50,4 @@
4550
"jupyter.interactiveWindow.textEditor.executeSelection": true,
4651
// TODO: this setting is showing a deprecation warning. Maybe we should drop it?
4752
"jupyter.generateSVGPlots": true,
48-
}
53+
}

pipeline_lib/__init__.py

Whitespace-only changes.

pipeline_lib/core/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .data_container import DataContainer # noqa: F401
2+
from .pipeline import Pipeline # noqa: F401
3+
from .steps import PipelineStep # noqa: F401

pipeline_lib/core/data_container.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
"""DataContainer class for storing data used in pipeline processing."""
2+
3+
from __future__ import annotations
4+
5+
import json
6+
import logging
7+
import pickle
8+
import sys
9+
from typing import Optional, Union
10+
11+
12+
class DataContainer:
    """
    A container for storing and manipulating data in a pipeline.

    Thin dict wrapper that adds logging, pickle-based persistence, and a set
    of well-known class-level keys that pipeline steps use to exchange
    artifacts.

    Attributes
    ----------
    data : dict
        A dictionary to store data items.
    """

    # Well-known keys used by pipeline steps to exchange artifacts.
    GENERATE_CONFIGS = "generate_configs"
    CLEAN_CONFIGS = "clean_configs"
    SPLIT_CONFIGS = "split_configs"
    TARGET_SCALING_CONFIGS = "target_scaling_configs"
    RAW = "raw"
    CLEAN = "clean"
    TRAIN = "train"
    VALIDATION = "validation"
    TEST = "test"
    MODEL = "model"
    MODEL_CONFIGS = "model_configs"
    MODEL_INPUT = "model_input"
    MODEL_OUTPUT = "model_output"
    METRICS = "metrics"
    PREDICTIONS = "predictions"
    EXPLAINER = "explainer"
    TUNING_PARAMS = "tuning_params"

    def __init__(self, initial_data: Optional[dict] = None):
        """
        Initialize the DataContainer with an empty dictionary or provided data.

        Parameters
        ----------
        initial_data : dict, optional
            Initial data to populate the container. Note: the dict is stored
            by reference, not copied.
        """
        self.data = initial_data if initial_data is not None else {}
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug(f"{self.__class__.__name__} initialized")

    def add(self, key: str, value):
        """
        Add a new item to the container.

        Parameters
        ----------
        key : str
            The key under which the value is stored.
        value
            The data to be stored.

        Returns
        -------
        None
        """
        self.data[key] = value
        self.logger.debug(f"Data added under key: {key}")

    def get(self, key: str, default=None):
        """
        Retrieve an item from the container by its key.

        Parameters
        ----------
        key : str
            The key of the item to retrieve.
        default
            The default value to return if the key is not found. Defaults to None.

        Returns
        -------
        The data stored under the given key or the default value.
        """
        return self.data.get(key, default)

    def __getitem__(self, key: str):
        """
        Retrieve an item using bracket notation.

        NOTE: unlike ``dict``, a missing key returns ``None`` rather than
        raising ``KeyError`` (this delegates to :meth:`get`). Kept for
        backward compatibility with existing callers.

        Parameters
        ----------
        key : str
            The key of the item to retrieve.

        Returns
        -------
        The data stored under the given key, or None if absent.
        """
        return self.get(key)

    def __setitem__(self, key: str, value):
        """
        Add or update an item using bracket notation.

        Parameters
        ----------
        key : str
            The key under which the value is stored.
        value
            The data to be stored.

        Returns
        -------
        None
        """
        self.add(key, value)

    def contains(self, key: str) -> bool:
        """
        Check if the container contains an item with the specified key.

        Parameters
        ----------
        key : str
            The key to check in the container.

        Returns
        -------
        bool
            True if the key exists, False otherwise.
        """
        return key in self.data

    def __contains__(self, key: str) -> bool:
        """
        Enable usage of the 'in' keyword.

        Parameters
        ----------
        key : str
            The key to check in the container.

        Returns
        -------
        bool
            True if the key exists, False otherwise.
        """
        return self.contains(key)

    @property
    def keys(self) -> list[str]:
        """
        Return the keys of the container.

        Returns
        -------
        list[str]
            The keys of the container.
        """
        return list(self.data.keys())

    def save(self, file_path: str, keys: Optional[Union[str, list[str]]] = None):
        """
        Serialize the container data using pickle and save it to a file.

        Parameters
        ----------
        file_path : str
            The path of the file where the serialized data should be saved.
        keys : Optional[Union[str, list[str]]], optional
            The keys of the data to be saved. If None, all data is saved.

        Raises
        ------
        KeyError
            If a requested key is not present in the container.

        Returns
        -------
        None
        """
        if isinstance(keys, str):
            keys = [keys]

        data_to_save = {k: self.data[k] for k in keys} if keys else self.data

        serialized_data = pickle.dumps(data_to_save)
        # len() is the exact payload size written to disk; sys.getsizeof
        # would add the bytes object's interpreter overhead and misreport it.
        data_size_mb = len(serialized_data) / 1048576  # bytes -> megabytes

        with open(file_path, "wb") as file:
            file.write(serialized_data)
        self.logger.info(
            f"{self.__class__.__name__} serialized and saved to {file_path}. Size:"
            f" {data_size_mb:.2f} MB"
        )

    @classmethod
    def load(cls, file_path: str, keys: Optional[Union[str, list[str]]] = None) -> DataContainer:
        """
        Load data from a file and return a new instance of DataContainer.

        Parameters
        ----------
        file_path : str
            The path of the file from which the serialized data should be read.
        keys : Optional[Union[str, list[str]]], optional
            The keys of the data to be loaded. If None, all data is loaded.
            Requested keys missing from the file are logged as a warning.

        Returns
        -------
        DataContainer
            A new instance of DataContainer populated with the deserialized data.
        """
        # NOTE(security): pickle can execute arbitrary code during
        # deserialization — only load files from trusted sources.
        with open(file_path, "rb") as file:
            data = pickle.loads(file.read())

        if isinstance(keys, str):
            keys = [keys]

        if keys:
            data = {k: v for k, v in data.items() if k in keys}

        new_container = cls(initial_data=data)

        if keys:
            # Warn about requested keys that were absent from the file.
            not_loaded_keys = set(keys) - set(new_container.keys)
            if not_loaded_keys:
                new_container.logger.warning(f"Keys without values: {not_loaded_keys}")

        new_container.logger.info(f"{cls.__name__} loaded from {file_path}")
        return new_container

    def __eq__(self, other) -> bool:
        """
        Compare this DataContainer with another for equality.

        Parameters
        ----------
        other : DataContainer
            Another DataContainer instance to compare with.

        Returns
        -------
        bool
            True if containers are equal, False otherwise (including when
            `other` is not a DataContainer).
        """
        if isinstance(other, DataContainer):
            return self.data == other.data
        return False

    def __ne__(self, other) -> bool:
        """
        Compare this DataContainer with another for inequality.

        Parameters
        ----------
        other : DataContainer
            Another DataContainer instance to compare with.

        Returns
        -------
        bool
            True if containers are not equal, False otherwise.
        """
        return not self.__eq__(other)

    def __str__(self):
        """
        Generate a user-friendly JSON string representation of the DataContainer.

        Returns
        -------
        str
            A JSON string describing the keys and types of contents of the DataContainer.
        """
        data_summary = {key: type(value).__name__ for key, value in self.data.items()}
        return json.dumps(data_summary, indent=4)

    def __repr__(self):
        """
        Generate an official string representation of the DataContainer.

        Returns
        -------
        str
            A formal string representation of the DataContainer's state.
        """
        return f"<DataContainer({self.data})>"

pipeline_lib/core/pipeline.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
from abc import ABC, abstractmethod
5+
from typing import Optional
6+
7+
from pipeline_lib.core.data_container import DataContainer
8+
from pipeline_lib.core.steps import PipelineStep
9+
10+
11+
class Pipeline(ABC):
    """Base class for pipelines.

    Subclasses implement :meth:`define_steps` to declare an ordered list of
    :class:`PipelineStep` instances; :meth:`run` threads a
    :class:`DataContainer` through them sequentially.
    """

    def __init__(self, initial_data: Optional[DataContainer] = None):
        """
        Parameters
        ----------
        initial_data : DataContainer, optional
            Data used by :meth:`run` when no explicit data is passed.

        Raises
        ------
        TypeError
            If any element returned by ``define_steps`` is not a PipelineStep.
        """
        # Fix: the logger must exist before run() is called. Previously it was
        # only created by init_logger(), which nothing called, so run() with no
        # `data` argument raised AttributeError on self.logger.
        self.init_logger()
        self.steps = self.define_steps()
        if not all(isinstance(step, PipelineStep) for step in self.steps):
            raise TypeError("All steps must be instances of PipelineStep")
        self.initial_data = initial_data

    def run(self, data: Optional[DataContainer] = None) -> DataContainer:
        """Run the pipeline on the given data.

        Parameters
        ----------
        data : DataContainer, optional
            Input data; falls back to ``initial_data`` when omitted.

        Returns
        -------
        DataContainer
            The data after all steps have executed.

        Raises
        ------
        ValueError
            If neither ``data`` nor ``initial_data`` is available.
        """
        if data is None:
            if self.initial_data is None:
                raise ValueError("No data given and no initial data set")
            self.logger.debug("No data given, using initial data")
            data = self.initial_data

        for i, step in enumerate(self.steps):
            self.logger.info(f"Running {step.__class__.__name__} - {i + 1} / {len(self.steps)}")
            data = step.execute(data)
        return data

    @abstractmethod
    def define_steps(self) -> list[PipelineStep]:
        """
        Subclasses should implement this method to define their specific steps.
        """

    def init_logger(self) -> None:
        """Initialize the logger. Idempotent; kept public for backward
        compatibility with subclasses that call it explicitly."""
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug(f"{self.__class__.__name__} initialized")

    def __str__(self) -> str:
        """Return a human-readable listing of the pipeline's steps."""
        step_names = [f"{i + 1}. {step.__class__.__name__}" for i, step in enumerate(self.steps)]
        return f"{self.__class__.__name__} with steps:\n" + "\n".join(step_names)

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the pipeline."""
        step_names = [f"{step.__class__.__name__}()" for step in self.steps]
        return f"{self.__class__.__name__}({', '.join(step_names)})"

0 commit comments

Comments
 (0)