improve tabular split and add calc train metrics

diegomarvid · diegomarvid · commit bc9a4dfdc349 · 2024-03-25T15:12:18.000-03:00
diff --git a/pipeline_lib/core/data_container.py b/pipeline_lib/core/data_container.py
@@ -377,76 +377,52 @@ def validation(self, value: Any):
         self["validation"] = value
 
     @property
-    def model(self) -> Any:
-        """
-        Get the model from the DataContainer.
-
-        Returns
-        -------
-        Any
-            The model stored in the DataContainer.
-        """
-        return self["model"]
-
-    @model.setter
-    def model(self, value: Any):
-        """
-        Set the model in the DataContainer.
-
-        Parameters
-        ----------
-        value
-            The model to be stored in the DataContainer.
-        """
-        self["model"] = value
-
-    @property
-    def model_input(self) -> Any:
+    def test(self) -> Any:
         """
-        Get the model input from the DataContainer.
+        Get the test data from the DataContainer.
 
         Returns
         -------
         Any
-            The model input stored in the DataContainer.
+        The test data stored in the DataContainer.
         """
-        return self["model_input"]
+        return self["test"]
 
-    @model_input.setter
-    def model_input(self, value: Any):
+    @test.setter
+    def test(self, value: Any):
         """
-        Set the model input in the DataContainer.
+        Set the test data in the DataContainer.
 
         Parameters
         ----------
         value
-            The model input to be stored in the DataContainer.
+        The test data to be stored in the DataContainer.
         """
-        self["model_input"] = value
+        self["test"] = value
 
     @property
-    def model_output(self) -> Any:
+    def model(self) -> Any:
         """
-        Get the model output from the DataContainer.
+        Get the model from the DataContainer.
 
         Returns
         -------
         Any
-            The model output stored in the DataContainer.
+            The model stored in the DataContainer.
         """
-        return self["model_output"]
+        return self["model"]
 
-    @model_output.setter
-    def model_output(self, value: Any):
+    @model.setter
+    def model(self, value: Any):
         """
-        Set the model output in the DataContainer.
+        Set the model in the DataContainer.
 
         Parameters
         ----------
         value
-            The model output to be stored in the DataContainer.
+            The model to be stored in the DataContainer.
         """
-        self["model_output"] = value
+        self["model"] = value
 
     @property
     def metrics(self) -> Any:
@@ -568,30 +544,6 @@ def target(self, value: Any):
         """
         self["target"] = value
 
-    @property
-    def features(self) -> Any:
-        """
-        Get the features from the DataContainer.
-
-        Returns
-        -------
-        Any
-            The features stored in the DataContainer.
-        """
-        return self["features"]
-
-    @features.setter
-    def features(self, value: Any):
-        """
-        Set the features in the DataContainer.
-
-        Parameters
-        ----------
-        value
-            The features to be stored in the DataContainer.
-        """
-        self["features"] = value
-
     @property
     def flow(self) -> Any:
         """
diff --git a/pipeline_lib/core/model.py b/pipeline_lib/core/model.py
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
+from pathlib import Path
 from typing import List, Optional, Tuple
 
+import joblib
 import pandas as pd
 
 
@@ -20,3 +22,20 @@ def fit(
     @abstractmethod
     def predict(self, X: pd.DataFrame) -> pd.Series:
         """Abstract method for making predictions."""
+
+    def save(self, path: str) -> None:
+        """Save the model."""
+        if not path.endswith(".joblib"):
+            raise ValueError("The path must end with .joblib")
+        joblib.dump(self, path)
+
+    @classmethod
+    def from_file(cls, path: str) -> "Model":
+        """Load the model from a .joblib file."""
+        if not Path(path).exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not path.endswith(".joblib"):
+            raise ValueError("The path must end with .joblib")
+
+        return joblib.load(path)
diff --git a/pipeline_lib/core/pipeline.py b/pipeline_lib/core/pipeline.py
@@ -37,8 +37,10 @@ def run(self, is_train: bool) -> DataContainer:
 
         if is_train:
             steps_to_run = [step for step in self.steps if step.used_for_training]
+            self.logger.info("Training the pipeline")
         else:
             steps_to_run = [step for step in self.steps if step.used_for_prediction]
+            self.logger.info("Predicting with the pipeline")
 
         for i, step in enumerate(steps_to_run):
             Pipeline.logger.info(
@@ -53,17 +55,11 @@ def run(self, is_train: bool) -> DataContainer:
 
     def train(self) -> DataContainer:
         """Run the pipeline on the given data."""
-        self.logger.info("Training the pipeline")
         return self.run(is_train=True)
 
     def predict(self) -> DataContainer:
         """Run the pipeline on the given data."""
-        self.logger.info("Predicting with the pipeline")
-        data = self.run(is_train=False)
-        data.predictions = data.model.predict(data.flow)
-        self.logger.info("Predictions:")
-        self.logger.info(data.predictions)
-        return data
+        return self.run(is_train=False)
 
     @classmethod
     def from_json(cls, path: str) -> Pipeline:
diff --git a/pipeline_lib/core/steps/calculate_train_metrics.py b/pipeline_lib/core/steps/calculate_train_metrics.py
@@ -0,0 +1,83 @@
+import json
+import time
+from typing import List, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+
+from pipeline_lib.core import DataContainer
+from pipeline_lib.core.model import Model
+from pipeline_lib.core.steps.base import PipelineStep
+
+
+class CalculateTrainMetricsStep(PipelineStep):
+    """Calculate metrics."""
+
+    used_for_prediction = False
+    used_for_training = True
+
+    def __init__(self) -> None:
+        """Initialize CalculateMetricsStep."""
+        super().__init__()
+        self.init_logger()
+
+    def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) -> dict:
+        return {
+            "MAE": str(mean_absolute_error(true_values, predictions)),
+            "RMSE": str(np.sqrt(mean_squared_error(true_values, predictions))),
+            "R^2": str(r2_score(true_values, predictions)),
+            "Mean Error": str(np.mean(true_values - predictions)),
+            "Max Error": str(np.max(np.abs(true_values - predictions))),
+            "Median Absolute Error": str(np.median(np.abs(true_values - predictions))),
+        }
+
+    def _get_predictions(
+        self, model: Model, df: pd.DataFrame, target: str, drop_columns: Optional[List[str]] = None
+    ) -> pd.Series:
+        drop_columns = (drop_columns or []) + [target]
+        return model.predict(df.drop(columns=drop_columns))
+
+    def _log_metrics(self, dataset_name: str, metrics: dict) -> None:
+        self.logger.info(f"Metrics for {dataset_name} dataset:")
+        for metric, value in metrics.items():
+            self.logger.info(f"{metric}: {value}")
+
+    def execute(self, data: DataContainer) -> DataContainer:
+        self.logger.debug("Starting metric calculation")
+
+        target_column_name = data.target
+        if target_column_name is None:
+            raise ValueError("Target column not found on any configuration.")
+
+        metrics = {}
+
+        for dataset_name in ["train", "validation", "test"]:
+            start_time = time.time()
+            dataset = getattr(data, dataset_name, None)
+
+            if dataset is None:
+                self.logger.warning(
+                    f"Dataset '{dataset_name}' not found. Skipping metric calculation."
+                )
+                continue
+
+            predictions = self._get_predictions(
+                model=data.model,
+                df=dataset,
+                target=target_column_name,
+                drop_columns=data._drop_columns,
+            )
+            metrics[dataset_name] = self._calculate_metrics(
+                true_values=dataset[target_column_name],
+                predictions=predictions,
+            )
+            elapsed_time = time.time() - start_time
+            self.logger.info(f"Elapsed time for {dataset_name} dataset: {elapsed_time:.2f} seconds")
+
+        # pretty print metrics
+        self.logger.info(f"Metrics: {json.dumps(metrics, indent=4)}")
+
+        data.metrics = metrics
+
+        return data
diff --git a/pipeline_lib/core/steps/fit_model.py b/pipeline_lib/core/steps/fit_model.py
@@ -1,7 +1,6 @@
 from typing import Optional, Type
 
 import optuna
-from joblib import dump
 from sklearn.metrics import mean_absolute_error
 
 from pipeline_lib.core import DataContainer
@@ -70,11 +69,11 @@ def execute(self, data: DataContainer) -> DataContainer:
 
         data.model = self.model
         data.target = self.target
-        data.model_path = self.save_path
+        data._drop_columns = self.drop_columns
 
         if self.save_path:
             self.logger.info(f"Saving the model to {self.save_path}")
-            dump(self.model, self.save_path)
+            self.model.save(self.save_path)
 
         return data
 
diff --git a/pipeline_lib/core/steps/predict.py b/pipeline_lib/core/steps/predict.py
@@ -1,8 +1,7 @@
 from typing import List, Optional
 
-from joblib import load
-
 from pipeline_lib.core import DataContainer
+from pipeline_lib.core.model import Model
 from pipeline_lib.core.steps.base import PipelineStep
 
 
@@ -22,7 +21,7 @@ def __init__(
         super().__init__()
         self.init_logger()
         self.load_path = load_path
-        self.model = load(self.load_path)
+        self.model = Model.from_file(load_path)
         self.target = target
         self.drop_columns = drop_columns or []
 
diff --git a/pipeline_lib/core/steps/tabular_split.py b/pipeline_lib/core/steps/tabular_split.py