Skip to content

Commit

Permalink
improve tabular split and add calc train metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
diegomarvid committed Mar 25, 2024
1 parent c32b1ab commit bc9a4df
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 85 deletions.
84 changes: 18 additions & 66 deletions pipeline_lib/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,76 +377,52 @@ def validation(self, value: Any):
self["validation"] = value

@property
def model(self) -> Any:
    """
    Return the model held by this DataContainer.

    Returns
    -------
    Any
        The object stored under the ``"model"`` key.
    """
    stored_model = self["model"]
    return stored_model

@model.setter
def model(self, value: Any):
    """
    Store a model in this DataContainer.

    Parameters
    ----------
    value
        The object to store under the ``"model"`` key.
    """
    key = "model"
    self[key] = value

@property
def model_input(self) -> Any:
def test(self) -> Any:
"""
Get the model input from the DataContainer.
Get the test data from the DataContainer.
Returns
-------
Any
The model input stored in the DataContainer.
The test data stored in the DataContainer.
"""
return self["model_input"]
return self["test"]

@model_input.setter
def model_input(self, value: Any):
@test.setter
def test(self, value: Any):
"""
Set the model input in the DataContainer.
Set the test data in the DataContainer.
Parameters
----------
value
The model input to be stored in the DataContainer.
The test data to be stored in the DataContainer.
"""
self["model_input"] = value
self["test"] = value

@property
def model_output(self) -> Any:
def model(self) -> Any:
"""
Get the model output from the DataContainer.
Get the model from the DataContainer.
Returns
-------
Any
The model output stored in the DataContainer.
The model stored in the DataContainer.
"""
return self["model_output"]
return self["model"]

@model_output.setter
def model_output(self, value: Any):
@model.setter
def model(self, value: Any):
"""
Set the model output in the DataContainer.
Set the model in the DataContainer.
Parameters
----------
value
The model output to be stored in the DataContainer.
The model to be stored in the DataContainer.
"""
self["model_output"] = value
self["model"] = value

@property
def metrics(self) -> Any:
Expand Down Expand Up @@ -568,30 +544,6 @@ def target(self, value: Any):
"""
self["target"] = value

@property
def features(self) -> Any:
    """
    Return the features held by this DataContainer.

    Returns
    -------
    Any
        The object stored under the ``"features"`` key.
    """
    stored_features = self["features"]
    return stored_features

@features.setter
def features(self, value: Any):
    """
    Store the features in this DataContainer.

    Parameters
    ----------
    value
        The object to store under the ``"features"`` key.
    """
    key = "features"
    self[key] = value

@property
def flow(self) -> Any:
"""
Expand Down
19 changes: 19 additions & 0 deletions pipeline_lib/core/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Tuple

import joblib
import pandas as pd


Expand All @@ -20,3 +22,20 @@ def fit(
@abstractmethod
def predict(self, X: pd.DataFrame) -> pd.Series:
"""Abstract method for making predictions."""

def save(self, path: str) -> None:
    """Save the model to a ``.joblib`` file.

    Parameters
    ----------
    path
        Destination file path. A ``str`` or a path-like object such as
        ``pathlib.Path`` is accepted; it must end with ``.joblib``.

    Raises
    ------
    ValueError
        If ``path`` does not end with ``.joblib``.
    """
    # Normalise first: ``pathlib.Path`` has no ``endswith`` method, so a
    # path-like argument would otherwise fail with AttributeError.
    path = os.fspath(path)
    if not path.endswith(".joblib"):
        raise ValueError("The path must end with .joblib")
    joblib.dump(self, path)

@classmethod
def from_file(cls, path: str) -> "Model":
    """Load a model from a ``.joblib`` file.

    Parameters
    ----------
    path
        Location of the saved model. A ``str`` or a path-like object such
        as ``pathlib.Path`` is accepted; it must end with ``.joblib``.

    Returns
    -------
    Model
        The deserialized model, guaranteed to be an instance of ``cls``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If ``path`` does not end with ``.joblib``.
    TypeError
        If the file does not contain an instance of ``cls``.
    """
    # Normalise first: ``pathlib.Path`` has no ``endswith`` method, so a
    # path-like argument would otherwise fail with AttributeError below.
    path = os.fspath(path)

    if not Path(path).exists():
        raise FileNotFoundError(f"File not found: {path}")

    if not path.endswith(".joblib"):
        raise ValueError("The path must end with .joblib")

    # NOTE: joblib.load is pickle-based and can execute arbitrary code while
    # deserializing. Only load model files that come from a trusted source.
    model = joblib.load(path)

    # The file can hold any pickled object; fail here with a clear message
    # rather than later when ``predict`` is called on the wrong type.
    if not isinstance(model, cls):
        raise TypeError(
            f"Expected an instance of {cls.__name__} in {path}, "
            f"got {type(model).__name__}"
        )
    return model
10 changes: 3 additions & 7 deletions pipeline_lib/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ def run(self, is_train: bool) -> DataContainer:

if is_train:
steps_to_run = [step for step in self.steps if step.used_for_training]
self.logger.info("Training the pipeline")
else:
steps_to_run = [step for step in self.steps if step.used_for_prediction]
self.logger.info("Predicting with the pipeline")

for i, step in enumerate(steps_to_run):
Pipeline.logger.info(
Expand All @@ -53,17 +55,11 @@ def run(self, is_train: bool) -> DataContainer:

def train(self) -> DataContainer:
"""Run the pipeline on the given data."""
self.logger.info("Training the pipeline")
return self.run(is_train=True)

def predict(self) -> DataContainer:
"""Run the pipeline on the given data."""
self.logger.info("Predicting with the pipeline")
data = self.run(is_train=False)
data.predictions = data.model.predict(data.flow)
self.logger.info("Predictions:")
self.logger.info(data.predictions)
return data
return self.run(is_train=False)

@classmethod
def from_json(cls, path: str) -> Pipeline:
Expand Down
83 changes: 83 additions & 0 deletions pipeline_lib/core/steps/calculate_train_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import json
import time
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from pipeline_lib.core import DataContainer
from pipeline_lib.core.model import Model
from pipeline_lib.core.steps.base import PipelineStep


class CalculateTrainMetricsStep(PipelineStep):
    """Calculate regression metrics on the train, validation and test datasets.

    The step is flagged for training runs only. For every dataset present on
    the DataContainer it predicts with the fitted model, computes a set of
    error metrics and stores them, keyed by dataset name, in ``data.metrics``.
    """

    used_for_prediction = False
    used_for_training = True

    def __init__(self) -> None:
        """Initialize CalculateTrainMetricsStep."""
        super().__init__()
        self.init_logger()

    def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) -> dict:
        """Compute the error metrics for one dataset.

        Parameters
        ----------
        true_values
            Observed target values.
        predictions
            Model predictions, in the same row order as ``true_values``.

        Returns
        -------
        dict
            Metric name mapped to its value rendered as a string.
        """
        # Compare position by position. The sklearn metrics already do so,
        # but pandas arithmetic aligns on index labels, which would give
        # misaligned or NaN residuals if the prediction index differs from
        # the dataset index.
        y_true = np.asarray(true_values)
        y_pred = np.asarray(predictions)
        errors = y_true - y_pred
        abs_errors = np.abs(errors)

        return {
            "MAE": str(mean_absolute_error(y_true, y_pred)),
            "RMSE": str(np.sqrt(mean_squared_error(y_true, y_pred))),
            "R^2": str(r2_score(y_true, y_pred)),
            "Mean Error": str(np.mean(errors)),
            "Max Error": str(np.max(abs_errors)),
            "Median Absolute Error": str(np.median(abs_errors)),
        }

    def _get_predictions(
        self, model: Model, df: pd.DataFrame, target: str, drop_columns: Optional[List[str]] = None
    ) -> pd.Series:
        """Predict on ``df`` after removing the target and any dropped columns.

        Parameters
        ----------
        model
            Model used to predict.
        df
            Dataset that still contains the target column.
        target
            Name of the target column.
        drop_columns
            Extra columns to remove before predicting, if any.

        Returns
        -------
        pd.Series
            The value returned by ``model.predict``.
        """
        # Builds a new list, so the caller's ``drop_columns`` is not mutated.
        drop_columns = (drop_columns or []) + [target]
        return model.predict(df.drop(columns=drop_columns))

    def _log_metrics(self, dataset_name: str, metrics: dict) -> None:
        """Log each metric of ``dataset_name`` on its own line."""
        self.logger.info(f"Metrics for {dataset_name} dataset:")
        for metric, value in metrics.items():
            self.logger.info(f"{metric}: {value}")

    def execute(self, data: DataContainer) -> DataContainer:
        """Calculate metrics for every available dataset and store them.

        Parameters
        ----------
        data
            Container providing ``target``, ``model``, ``_drop_columns`` and
            the ``train`` / ``validation`` / ``test`` datasets.

        Returns
        -------
        DataContainer
            The same container with ``metrics`` set to a dict keyed by
            dataset name.

        Raises
        ------
        ValueError
            If ``data.target`` is ``None``.
        """
        self.logger.debug("Starting metric calculation")

        target_column_name = data.target
        if target_column_name is None:
            raise ValueError("Target column not found on any configuration.")

        metrics = {}

        for dataset_name in ("train", "validation", "test"):
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by wall-clock adjustments.
            start_time = time.perf_counter()
            dataset = getattr(data, dataset_name, None)

            if dataset is None:
                self.logger.warning(
                    f"Dataset '{dataset_name}' not found. Skipping metric calculation."
                )
                continue

            predictions = self._get_predictions(
                model=data.model,
                df=dataset,
                target=target_column_name,
                drop_columns=data._drop_columns,
            )
            metrics[dataset_name] = self._calculate_metrics(
                true_values=dataset[target_column_name],
                predictions=predictions,
            )
            elapsed_time = time.perf_counter() - start_time
            self.logger.info(f"Elapsed time for {dataset_name} dataset: {elapsed_time:.2f} seconds")

        # pretty print metrics
        self.logger.info(f"Metrics: {json.dumps(metrics, indent=4)}")

        data.metrics = metrics

        return data
5 changes: 2 additions & 3 deletions pipeline_lib/core/steps/fit_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional, Type

import optuna
from joblib import dump
from sklearn.metrics import mean_absolute_error

from pipeline_lib.core import DataContainer
Expand Down Expand Up @@ -70,11 +69,11 @@ def execute(self, data: DataContainer) -> DataContainer:

data.model = self.model
data.target = self.target
data.model_path = self.save_path
data._drop_columns = self.drop_columns

if self.save_path:
self.logger.info(f"Saving the model to {self.save_path}")
dump(self.model, self.save_path)
self.model.save(self.save_path)

return data

Expand Down
5 changes: 2 additions & 3 deletions pipeline_lib/core/steps/predict.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import List, Optional

from joblib import load

from pipeline_lib.core import DataContainer
from pipeline_lib.core.model import Model
from pipeline_lib.core.steps.base import PipelineStep


Expand All @@ -22,7 +21,7 @@ def __init__(
super().__init__()
self.init_logger()
self.load_path = load_path
self.model = load(self.load_path)
self.model = Model.from_file(load_path)
self.target = target
self.drop_columns = drop_columns or []

Expand Down
Loading

0 comments on commit bc9a4df

Please sign in to comment.