Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create Encoding step #4

Merged
merged 25 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
fb35cc9
first encoding solution
diegomarvid Apr 2, 2024
5d34015
handle high cardinality with ordinal encoder
diegomarvid Apr 2, 2024
62da964
improve logging
diegomarvid Apr 2, 2024
846adf4
swap ordinal encoding and target encoding, ensure output is in order
diegomarvid Apr 2, 2024
9f9b353
correctly handle dtypes and refactor class
diegomarvid Apr 3, 2024
42d8246
convert ordinal int to smallest possible int
diegomarvid Apr 3, 2024
045fc18
fix dtype preservation
diegomarvid Apr 3, 2024
aa85c96
remove unnecessary print from Generate Step
diegomarvid Apr 3, 2024
2a5c135
personalize encoders in json
diegomarvid Apr 4, 2024
7e6aa5d
minor change
diegomarvid Apr 4, 2024
ec50baa
check supported encoders in __init__ instead of execute
diegomarvid Apr 4, 2024
b5bd9a1
change saving method to save fitted encoders
diegomarvid Apr 4, 2024
a50bf60
remove unnecessary print
diegomarvid Apr 4, 2024
742eea5
fix fit encoders for predicting
diegomarvid Apr 4, 2024
e96f281
fix predict without target
diegomarvid Apr 4, 2024
69e5d4d
improve error message in explainer
diegomarvid Apr 4, 2024
7a41eeb
fix drop columns modification
diegomarvid Apr 4, 2024
17423cc
add feature map in json for configuration
diegomarvid Apr 5, 2024
4630a42
update default cardinality threshold
diegomarvid Apr 5, 2024
beecfa8
improve error handling in data container for explainer at training
diegomarvid Apr 5, 2024
6e9316c
update poetry
diegomarvid Apr 10, 2024
9328910
fix encode index issue
diegomarvid Apr 11, 2024
d0d1f43
change downcast to palf function
diegomarvid Apr 12, 2024
611e68e
improve docstring in column transformer function
diegomarvid Apr 12, 2024
095ef05
change target param to Generate Step
diegomarvid Apr 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion pipeline_lib/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pandas as pd
import yaml
from sklearn.compose import ColumnTransformer

from pipeline_lib.core.model import Model

Expand Down Expand Up @@ -166,7 +167,7 @@ def save(self, file_path: str, keys: Optional[Union[str, list[str]]] = None):
if isinstance(keys, str):
keys = [keys]

data_to_save = {k: self.data[k] for k in keys} if keys else self.data
data_to_save = {k: self.data.get(k) for k in keys} if keys else self.data

serialized_data = pickle.dumps(data_to_save)
data_size_bytes = sys.getsizeof(serialized_data)
Expand Down Expand Up @@ -485,6 +486,11 @@ def explainer(self) -> Any:
Any
The explainer stored in the DataContainer.
"""
if self.is_train:
raise ValueError(
"Explainer is only available for prediction. Pipeline was executed on training"
" mode."
)
return self["explainer"]

@explainer.setter
Expand Down Expand Up @@ -619,6 +625,30 @@ def is_train(self, value: bool):
"""
self["is_train"] = value

@property
def _encoder(self) -> ColumnTransformer:
    """
    Return the fitted encoder held by this DataContainer.

    Returns
    -------
    ColumnTransformer
        The encoder stored in the DataContainer.
    """
    # Stored under a fixed key; lookup goes through the container's
    # item access (defined elsewhere in this class).
    key = "encoder"
    return self[key]

@_encoder.setter
def _encoder(self, value: ColumnTransformer):
    """
    Store an encoder in this DataContainer.

    Parameters
    ----------
    value
        The encoder to be stored in the DataContainer.
    """
    key = "encoder"
    self[key] = value

def __eq__(self, other) -> bool:
"""
Compare this DataContainer with another for equality.
Expand Down
38 changes: 18 additions & 20 deletions pipeline_lib/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ class Pipeline:
step_registry = StepRegistry()
model_registry = ModelRegistry()

KEYS_TO_SAVE = ["model", "encoder", "_drop_columns", "target"]

def __init__(self, initial_data: Optional[DataContainer] = None):
self.steps = []
self.initial_data = initial_data
self.save_path = None
self.load_path = None
self.model_path = None
self.config = None
self.save_data_path = None

def add_steps(self, steps: list[PipelineStep]):
"""Add steps to the pipeline."""
Expand All @@ -35,22 +35,32 @@ def add_steps(self, steps: list[PipelineStep]):
def run(self, is_train: bool, save: bool = True) -> DataContainer:
"""Run the pipeline on the given data."""

data = DataContainer.from_pickle(self.load_path) if self.load_path else DataContainer()
data.is_train = is_train
if not self.save_data_path:
raise ValueError(
"A path for saving the data must be provided. Use the `save_data_path` attribute."
)

data = DataContainer()

if is_train:
steps_to_run = [step for step in self.steps if step.used_for_training]
self.logger.info("Training the pipeline")
else:
data = DataContainer.from_pickle(self.save_data_path)
steps_to_run = [step for step in self.steps if step.used_for_prediction]
self.logger.info("Predicting with the pipeline")

data.is_train = is_train

for i, step in enumerate(steps_to_run):
Pipeline.logger.info(
f"Running {step.__class__.__name__} - {i + 1} / {len(steps_to_run)}"
)
data = step.execute(data)

if is_train:
data.save(self.save_data_path, keys=self.KEYS_TO_SAVE)

if save:
self.save_run(data)

Expand Down Expand Up @@ -78,18 +88,15 @@ def from_json(cls, path: str) -> Pipeline:
if custom_steps_path:
cls.step_registry.load_and_register_custom_steps(custom_steps_path)

save_data_path = config["pipeline"].get("save_data_path")

pipeline = Pipeline()

pipeline.load_path = config.get("load_path")
pipeline.save_path = config.get("save_path")
pipeline.config = config
pipeline.save_data_path = save_data_path

steps = []

model_path = None
drop_columns = None
target = None

for step_config in config["pipeline"]["steps"]:
step_type = step_config["step_type"]
parameters = step_config.get("parameters", {})
Expand All @@ -103,15 +110,6 @@ def from_json(cls, path: str) -> Pipeline:
model_class_name = parameters.pop("model_class")
model_class = cls.model_registry.get_model_class(model_class_name)
parameters["model_class"] = model_class
model_path = parameters.get("save_path")
drop_columns = parameters.get("drop_columns")
target = parameters.get("target")

# if step type is prediction, add model path
if step_type == "PredictStep":
parameters["load_path"] = model_path
parameters["drop_columns"] = drop_columns
parameters["target"] = target

step_class = cls.step_registry.get_step_class(step_type)
step = step_class(**parameters)
Expand Down
Loading
Loading