Skip to content

Commit bc9a4df

Browse files
committed
improve tabular split and add calc train metrics
1 parent c32b1ab commit bc9a4df

File tree

7 files changed

+185
-85
lines changed

7 files changed

+185
-85
lines changed

pipeline_lib/core/data_container.py

Lines changed: 18 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -377,76 +377,52 @@ def validation(self, value: Any):
377377
self["validation"] = value
378378

379379
@property
380-
def model(self) -> Any:
381-
"""
382-
Get the model from the DataContainer.
383-
384-
Returns
385-
-------
386-
Any
387-
The model stored in the DataContainer.
388-
"""
389-
return self["model"]
390-
391-
@model.setter
392-
def model(self, value: Any):
393-
"""
394-
Set the model in the DataContainer.
395-
396-
Parameters
397-
----------
398-
value
399-
The model to be stored in the DataContainer.
400-
"""
401-
self["model"] = value
402-
403-
@property
404-
def model_input(self) -> Any:
380+
def test(self) -> Any:
405381
"""
406-
Get the model input from the DataContainer.
382+
Get the test data from the DataContainer.
407383
408384
Returns
409385
-------
410386
Any
411-
The model input stored in the DataContainer.
387+
The test data stored in the DataContainer.
412388
"""
413-
return self["model_input"]
389+
return self["test"]
414390

415-
@model_input.setter
416-
def model_input(self, value: Any):
391+
@test.setter
392+
def test(self, value: Any):
417393
"""
418-
Set the model input in the DataContainer.
394+
Set the test data in the DataContainer.
419395
420396
Parameters
421397
----------
422398
value
423-
The model input to be stored in the DataContainer.
399+
The test data to be stored in the DataContainer.
424400
"""
425-
self["model_input"] = value
401+
self["test"] = value
426402

427403
@property
428-
def model_output(self) -> Any:
404+
def model(self) -> Any:
429405
"""
430-
Get the model output from the DataContainer.
406+
Get the model from the DataContainer.
431407
432408
Returns
433409
-------
434410
Any
435-
The model output stored in the DataContainer.
411+
The model stored in the DataContainer.
436412
"""
437-
return self["model_output"]
413+
return self["model"]
438414

439-
@model_output.setter
440-
def model_output(self, value: Any):
415+
@model.setter
416+
def model(self, value: Any):
441417
"""
442-
Set the model output in the DataContainer.
418+
Set the model in the DataContainer.
443419
444420
Parameters
445421
----------
446422
value
447-
The model output to be stored in the DataContainer.
423+
The model to be stored in the DataContainer.
448424
"""
449-
self["model_output"] = value
425+
self["model"] = value
450426

451427
@property
452428
def metrics(self) -> Any:
@@ -568,30 +544,6 @@ def target(self, value: Any):
568544
"""
569545
self["target"] = value
570546

571-
@property
572-
def features(self) -> Any:
573-
"""
574-
Get the features from the DataContainer.
575-
576-
Returns
577-
-------
578-
Any
579-
The features stored in the DataContainer.
580-
"""
581-
return self["features"]
582-
583-
@features.setter
584-
def features(self, value: Any):
585-
"""
586-
Set the features in the DataContainer.
587-
588-
Parameters
589-
----------
590-
value
591-
The features to be stored in the DataContainer.
592-
"""
593-
self["features"] = value
594-
595547
@property
596548
def flow(self) -> Any:
597549
"""

pipeline_lib/core/model.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from abc import ABC, abstractmethod
2+
from pathlib import Path
23
from typing import List, Optional, Tuple
34

5+
import joblib
46
import pandas as pd
57

68

@@ -20,3 +22,20 @@ def fit(
2022
@abstractmethod
2123
def predict(self, X: pd.DataFrame) -> pd.Series:
2224
"""Abstract method for making predictions."""
25+
26+
def save(self, path: str) -> None:
27+
"""Save the model."""
28+
if not path.endswith(".joblib"):
29+
raise ValueError("The path must end with .joblib")
30+
joblib.dump(self, path)
31+
32+
@classmethod
33+
def from_file(cls, path: str) -> "Model":
34+
"""Load the model from a .joblib file."""
35+
if not Path(path).exists():
36+
raise FileNotFoundError(f"File not found: {path}")
37+
38+
if not path.endswith(".joblib"):
39+
raise ValueError("The path must end with .joblib")
40+
41+
return joblib.load(path)

pipeline_lib/core/pipeline.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,10 @@ def run(self, is_train: bool) -> DataContainer:
3737

3838
if is_train:
3939
steps_to_run = [step for step in self.steps if step.used_for_training]
40+
self.logger.info("Training the pipeline")
4041
else:
4142
steps_to_run = [step for step in self.steps if step.used_for_prediction]
43+
self.logger.info("Predicting with the pipeline")
4244

4345
for i, step in enumerate(steps_to_run):
4446
Pipeline.logger.info(
@@ -53,17 +55,11 @@ def run(self, is_train: bool) -> DataContainer:
5355

5456
def train(self) -> DataContainer:
5557
"""Run the pipeline on the given data."""
56-
self.logger.info("Training the pipeline")
5758
return self.run(is_train=True)
5859

5960
def predict(self) -> DataContainer:
6061
"""Run the pipeline on the given data."""
61-
self.logger.info("Predicting with the pipeline")
62-
data = self.run(is_train=False)
63-
data.predictions = data.model.predict(data.flow)
64-
self.logger.info("Predictions:")
65-
self.logger.info(data.predictions)
66-
return data
62+
return self.run(is_train=False)
6763

6864
@classmethod
6965
def from_json(cls, path: str) -> Pipeline:
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import json
2+
import time
3+
from typing import List, Optional
4+
5+
import numpy as np
6+
import pandas as pd
7+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
8+
9+
from pipeline_lib.core import DataContainer
10+
from pipeline_lib.core.model import Model
11+
from pipeline_lib.core.steps.base import PipelineStep
12+
13+
14+
class CalculateTrainMetricsStep(PipelineStep):
15+
"""Calculate metrics."""
16+
17+
used_for_prediction = False
18+
used_for_training = True
19+
20+
def __init__(self) -> None:
21+
"""Initialize CalculateMetricsStep."""
22+
super().__init__()
23+
self.init_logger()
24+
25+
def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) -> dict:
26+
return {
27+
"MAE": str(mean_absolute_error(true_values, predictions)),
28+
"RMSE": str(np.sqrt(mean_squared_error(true_values, predictions))),
29+
"R^2": str(r2_score(true_values, predictions)),
30+
"Mean Error": str(np.mean(true_values - predictions)),
31+
"Max Error": str(np.max(np.abs(true_values - predictions))),
32+
"Median Absolute Error": str(np.median(np.abs(true_values - predictions))),
33+
}
34+
35+
def _get_predictions(
36+
self, model: Model, df: pd.DataFrame, target: str, drop_columns: Optional[List[str]] = None
37+
) -> pd.Series:
38+
drop_columns = (drop_columns or []) + [target]
39+
return model.predict(df.drop(columns=drop_columns))
40+
41+
def _log_metrics(self, dataset_name: str, metrics: dict) -> None:
42+
self.logger.info(f"Metrics for {dataset_name} dataset:")
43+
for metric, value in metrics.items():
44+
self.logger.info(f"{metric}: {value}")
45+
46+
def execute(self, data: DataContainer) -> DataContainer:
47+
self.logger.debug("Starting metric calculation")
48+
49+
target_column_name = data.target
50+
if target_column_name is None:
51+
raise ValueError("Target column not found on any configuration.")
52+
53+
metrics = {}
54+
55+
for dataset_name in ["train", "validation", "test"]:
56+
start_time = time.time()
57+
dataset = getattr(data, dataset_name, None)
58+
59+
if dataset is None:
60+
self.logger.warning(
61+
f"Dataset '{dataset_name}' not found. Skipping metric calculation."
62+
)
63+
continue
64+
65+
predictions = self._get_predictions(
66+
model=data.model,
67+
df=dataset,
68+
target=target_column_name,
69+
drop_columns=data._drop_columns,
70+
)
71+
metrics[dataset_name] = self._calculate_metrics(
72+
true_values=dataset[target_column_name],
73+
predictions=predictions,
74+
)
75+
elapsed_time = time.time() - start_time
76+
self.logger.info(f"Elapsed time for {dataset_name} dataset: {elapsed_time:.2f} seconds")
77+
78+
# pretty print metrics
79+
self.logger.info(f"Metrics: {json.dumps(metrics, indent=4)}")
80+
81+
data.metrics = metrics
82+
83+
return data

pipeline_lib/core/steps/fit_model.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Optional, Type
22

33
import optuna
4-
from joblib import dump
54
from sklearn.metrics import mean_absolute_error
65

76
from pipeline_lib.core import DataContainer
@@ -70,11 +69,11 @@ def execute(self, data: DataContainer) -> DataContainer:
7069

7170
data.model = self.model
7271
data.target = self.target
73-
data.model_path = self.save_path
72+
data._drop_columns = self.drop_columns
7473

7574
if self.save_path:
7675
self.logger.info(f"Saving the model to {self.save_path}")
77-
dump(self.model, self.save_path)
76+
self.model.save(self.save_path)
7877

7978
return data
8079

pipeline_lib/core/steps/predict.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from typing import List, Optional
22

3-
from joblib import load
4-
53
from pipeline_lib.core import DataContainer
4+
from pipeline_lib.core.model import Model
65
from pipeline_lib.core.steps.base import PipelineStep
76

87

@@ -22,7 +21,7 @@ def __init__(
2221
super().__init__()
2322
self.init_logger()
2423
self.load_path = load_path
25-
self.model = load(self.load_path)
24+
self.model = Model.from_file(load_path)
2625
self.target = target
2726
self.drop_columns = drop_columns or []
2827

0 commit comments

Comments
 (0)