From 38279c0e00ee03c2e261fd09f3e5ab03ecb29005 Mon Sep 17 00:00:00 2001
From: Diego Marvid
Date: Mon, 25 Mar 2024 11:59:59 -0300
Subject: [PATCH] delete unnecessary steps

---
 pipeline_lib/core/pipeline.py                 |   2 -
 pipeline_lib/core/steps/target_scaling.py     |   2 -
 .../tabular/xgboost/__init__.py               |   3 +-
 .../tabular/xgboost/fit_model.py              | 166 ------------------
 .../implementation/tabular/xgboost/predict.py |  48 -----
 5 files changed, 1 insertion(+), 220 deletions(-)
 delete mode 100644 pipeline_lib/implementation/tabular/xgboost/fit_model.py
 delete mode 100644 pipeline_lib/implementation/tabular/xgboost/predict.py

diff --git a/pipeline_lib/core/pipeline.py b/pipeline_lib/core/pipeline.py
index 090e3ae..112293d 100644
--- a/pipeline_lib/core/pipeline.py
+++ b/pipeline_lib/core/pipeline.py
@@ -4,8 +4,6 @@
 import logging
 from typing import Optional
 
-from joblib import load
-
 from pipeline_lib.core.data_container import DataContainer
 from pipeline_lib.core.model_registry import ModelRegistry
 from pipeline_lib.core.step_registry import StepRegistry
diff --git a/pipeline_lib/core/steps/target_scaling.py b/pipeline_lib/core/steps/target_scaling.py
index 36a1198..5be5b3c 100644
--- a/pipeline_lib/core/steps/target_scaling.py
+++ b/pipeline_lib/core/steps/target_scaling.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from pipeline_lib.core import DataContainer
 from pipeline_lib.core.steps.base import PipelineStep
 
diff --git a/pipeline_lib/implementation/tabular/xgboost/__init__.py b/pipeline_lib/implementation/tabular/xgboost/__init__.py
index 2887234..479c0b9 100644
--- a/pipeline_lib/implementation/tabular/xgboost/__init__.py
+++ b/pipeline_lib/implementation/tabular/xgboost/__init__.py
@@ -1,2 +1 @@
-from .fit_model import XGBoostFitModelStep  # noqa: F401
-from .predict import XGBoostPredictStep  # noqa: F401
+from .model import XGBoostModel  # noqa: F401
diff --git a/pipeline_lib/implementation/tabular/xgboost/fit_model.py b/pipeline_lib/implementation/tabular/xgboost/fit_model.py
deleted file mode 100644
index ed82260..0000000
--- a/pipeline_lib/implementation/tabular/xgboost/fit_model.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import time
-from typing import Optional
-
-import optuna
-from joblib import dump
-from optuna.pruners import MedianPruner
-from sklearn.metrics import mean_absolute_error
-
-from pipeline_lib.core import DataContainer
-from pipeline_lib.core.steps import FitModelStep
-
-from .model import XGBoostModel
-
-
-class XGBoostFitModelStep(FitModelStep):
-    """Fit the model with XGBoost."""
-
-    def __init__(
-        self,
-        target: str,
-        drop_columns: Optional[list[str]] = None,
-        xgb_params: Optional[dict] = None,
-        optuna_params: Optional[dict] = None,
-        save_path: Optional[str] = None,
-    ) -> None:
-        self.init_logger()
-
-        if target is None:
-            raise ValueError("Target column not found in the parameters.")
-
-        self.target = target
-        self.drop_columns = drop_columns
-
-        if optuna_params and xgb_params:
-            raise ValueError("Both optuna_params and xgb_params are defined. Please choose one.")
-
-        if not optuna_params and not xgb_params:
-            raise ValueError(
-                "No parameters defined. Please define either optuna_params or xgb_params."
-            )
-
-        self.xgb_params = xgb_params
-        self.optuna_params = optuna_params
-
-        if save_path:
-            if not save_path.endswith(".joblib"):
-                raise ValueError("Only joblib format is supported for saving the model.")
-
-        self.save_path = save_path
-
-    def execute(self, data: DataContainer) -> DataContainer:
-        self.logger.debug("Starting model fitting with XGBoost")
-
-        start_time = time.time()
-
-        df_train = data.train
-        df_valid = data.validation
-
-        if self.drop_columns:
-            df_train = df_train.drop(columns=self.drop_columns)
-            df_valid = df_valid.drop(columns=self.drop_columns)
-
-        # Prepare the data
-        X_train = df_train.drop(columns=[self.target])
-        y_train = df_train[self.target]
-
-        X_valid = df_valid.drop(columns=[self.target])
-        y_valid = df_valid[self.target]
-
-        params = self.xgb_params
-
-        if self.optuna_params:
-            params = self.optimize_with_optuna(
-                X_train, y_train, X_valid, y_valid, self.optuna_params
-            )
-            data.tuning_params = params
-
-        model = XGBoostModel(**params)
-
-        model.fit(
-            X_train,
-            y_train,
-            eval_set=[(X_valid, y_valid)],
-            verbose=True,
-        )
-
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        minutes = int(elapsed_time // 60)
-        seconds = int(elapsed_time % 60)
-        self.logger.info(f"XGBoost model fitting took {minutes} minutes and {seconds} seconds.")
-
-        # Save the model to the data container
-        data.model = model
-        data.target = self.target
-
-        if self.save_path:
-            self.logger.info(f"Saving the model to {self.save_path}")
-            dump(model, self.save_path)
-        return data
-
-    def optimize_with_optuna(self, X_train, y_train, X_valid, y_valid, optuna_params):
-        def objective(trial):
-            # Define the search space
-            max_depth = optuna_params.get("max_depth", [3, 12])
-            eta = optuna_params.get("eta", [1e-8, 1.0])
-            subsample = optuna_params.get("subsample", [0.2, 1.0])
-            colsample_bytree = optuna_params.get("colsample_bytree", [0.2, 1.0])
-            min_child_weight = optuna_params.get("min_child_weight", [1, 10])
-            n_estimators = optuna_params.get("n_estimators", [100, 1000])
-
-            param = {
-                "verbosity": 0,
-                "objective": "reg:squarederror",
-                "eval_metric": "mae",
-                "n_jobs": -1,
-                "max_depth": trial.suggest_int("max_depth", max_depth[0], max_depth[1]),
-                "eta": trial.suggest_float("eta", eta[0], eta[1], log=True),
-                "subsample": trial.suggest_float("subsample", subsample[0], subsample[1]),
-                "colsample_bytree": trial.suggest_float(
-                    "colsample_bytree", colsample_bytree[0], colsample_bytree[1]
-                ),
-                "min_child_weight": trial.suggest_int(
-                    "min_child_weight", min_child_weight[0], min_child_weight[1]
-                ),
-                "n_estimators": trial.suggest_int("n_estimators", n_estimators[0], n_estimators[1]),
-            }
-
-            model = XGBoostModel(**param)
-            model.fit(
-                X_train,
-                y_train,
-                eval_set=[(X_valid, y_valid)],
-                verbose=True,
-            )
-            preds = model.predict(X_valid)
-            mae = mean_absolute_error(y_valid, preds)
-            return mae
-
-        def optuna_logging_callback(study, trial):
-            if trial.state == optuna.trial.TrialState.COMPLETE:
-                self.logger.info(
-                    f"Trial {trial.number} finished with value: {trial.value} and parameters:"
-                    f" {trial.params}. Best is trial {study.best_trial.number} with value:"
-                    f" {study.best_value}."
-                )
-
-        optuna_trials = optuna_params.get("trials", 20)
-
-        self.logger.info(f"Optimizing XGBoost hyperparameters with {optuna_trials} trials.")
-
-        study_name = optuna_params.get("study_name", "xgboost_optimization")
-        storage = optuna_params.get("storage", "sqlite:///db.sqlite3")
-
-        study = optuna.create_study(
-            direction="minimize",
-            study_name=study_name,
-            storage=storage,
-            pruner=MedianPruner(),
-        )
-
-        study.optimize(objective, n_trials=optuna_trials, callbacks=[optuna_logging_callback])
-
-        best_params = study.best_params
-        self.logger.info(f"Best parameters found by Optuna: {best_params}")
-        return best_params
diff --git a/pipeline_lib/implementation/tabular/xgboost/predict.py b/pipeline_lib/implementation/tabular/xgboost/predict.py
deleted file mode 100644
index 8502f7a..0000000
--- a/pipeline_lib/implementation/tabular/xgboost/predict.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Optional
-
-import pandas as pd
-from joblib import load
-
-from pipeline_lib.core import DataContainer
-from pipeline_lib.core.steps import PredictStep
-
-
-class XGBoostPredictStep(PredictStep):
-    """Obtain the predictions for XGBoost model."""
-
-    def __init__(
-        self,
-        target: str,
-        load_path: str,
-        drop_columns: Optional[list[str]] = None,
-    ) -> None:
-        self.init_logger()
-
-        if not load_path.endswith(".joblib"):
-            raise ValueError("Only joblib format is supported for loading the model.")
-
-        self.target = target
-        self.load_path = load_path
-        self.drop_columns = drop_columns
-
-        self.model = load(self.load_path)
-
-    def execute(self, data: DataContainer) -> DataContainer:
-        self.logger.debug("Obtaining predictions for XGBoost model.")
-
-        model_input = data.flow
-
-        if self.drop_columns:
-            self.logger.info(f"Dropping columns: {self.drop_columns}")
-            model_input = model_input.drop(columns=self.drop_columns)
-
-        predictions = self.model.predict(model_input.drop(columns=[self.target]))
-
-        predictions_df = pd.DataFrame(predictions, columns=["prediction"])
-
-        model_input["predictions"] = predictions_df
-        data.model = self.model
-        data.model_output = model_input
-        data.target = self.target
-        data._drop_columns = self.drop_columns
-        return data