From 38279c0e00ee03c2e261fd09f3e5ab03ecb29005 Mon Sep 17 00:00:00 2001
From: Diego Marvid
Date: Mon, 25 Mar 2024 11:59:59 -0300
Subject: [PATCH] delete unnecessary steps

---
 pipeline_lib/core/pipeline.py                 |   2 -
 pipeline_lib/core/steps/target_scaling.py     |   2 -
 .../tabular/xgboost/__init__.py               |   3 +-
 .../tabular/xgboost/fit_model.py              | 166 ------------------
 .../implementation/tabular/xgboost/predict.py |  48 -----
 5 files changed, 1 insertion(+), 220 deletions(-)
 delete mode 100644 pipeline_lib/implementation/tabular/xgboost/fit_model.py
 delete mode 100644 pipeline_lib/implementation/tabular/xgboost/predict.py

diff --git a/pipeline_lib/core/pipeline.py b/pipeline_lib/core/pipeline.py
index 090e3ae..112293d 100644
--- a/pipeline_lib/core/pipeline.py
+++ b/pipeline_lib/core/pipeline.py
@@ -4,8 +4,6 @@
 import logging
 from typing import Optional
 
-from joblib import load
-
 from pipeline_lib.core.data_container import DataContainer
 from pipeline_lib.core.model_registry import ModelRegistry
 from pipeline_lib.core.step_registry import StepRegistry
diff --git a/pipeline_lib/core/steps/target_scaling.py b/pipeline_lib/core/steps/target_scaling.py
index 36a1198..5be5b3c 100644
--- a/pipeline_lib/core/steps/target_scaling.py
+++ b/pipeline_lib/core/steps/target_scaling.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from pipeline_lib.core import DataContainer
 from pipeline_lib.core.steps.base import PipelineStep
 
diff --git a/pipeline_lib/implementation/tabular/xgboost/__init__.py b/pipeline_lib/implementation/tabular/xgboost/__init__.py
index 2887234..479c0b9 100644
--- a/pipeline_lib/implementation/tabular/xgboost/__init__.py
+++ b/pipeline_lib/implementation/tabular/xgboost/__init__.py
@@ -1,2 +1 @@
-from .fit_model import XGBoostFitModelStep  # noqa: F401
-from .predict import XGBoostPredictStep  # noqa: F401
+from .model import XGBoostModel  # noqa: F401
diff --git a/pipeline_lib/implementation/tabular/xgboost/fit_model.py b/pipeline_lib/implementation/tabular/xgboost/fit_model.py
deleted file mode 100644
index ed82260..0000000
--- a/pipeline_lib/implementation/tabular/xgboost/fit_model.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import time
-from typing import Optional
-
-import optuna
-from joblib import dump
-from optuna.pruners import MedianPruner
-from sklearn.metrics import mean_absolute_error
-
-from pipeline_lib.core import DataContainer
-from pipeline_lib.core.steps import FitModelStep
-
-from .model import XGBoostModel
-
-
-class XGBoostFitModelStep(FitModelStep):
-    """Fit the model with XGBoost."""
-
-    def __init__(
-        self,
-        target: str,
-        drop_columns: Optional[list[str]] = None,
-        xgb_params: Optional[dict] = None,
-        optuna_params: Optional[dict] = None,
-        save_path: Optional[str] = None,
-    ) -> None:
-        self.init_logger()
-
-        if target is None:
-            raise ValueError("Target column not found in the parameters.")
-
-        self.target = target
-        self.drop_columns = drop_columns
-
-        if optuna_params and xgb_params:
-            raise ValueError("Both optuna_params and xgb_params are defined. Please choose one.")
-
-        if not optuna_params and not xgb_params:
-            raise ValueError(
-                "No parameters defined. Please define either optuna_params or xgb_params."
-            )
-
-        self.xgb_params = xgb_params
-        self.optuna_params = optuna_params
-
-        if save_path:
-            if not save_path.endswith(".joblib"):
-                raise ValueError("Only joblib format is supported for saving the model.")
-
-        self.save_path = save_path
-
-    def execute(self, data: DataContainer) -> DataContainer:
-        self.logger.debug("Starting model fitting with XGBoost")
-
-        start_time = time.time()
-
-        df_train = data.train
-        df_valid = data.validation
-
-        if self.drop_columns:
-            df_train = df_train.drop(columns=self.drop_columns)
-            df_valid = df_valid.drop(columns=self.drop_columns)
-
-        # Prepare the data
-        X_train = df_train.drop(columns=[self.target])
-        y_train = df_train[self.target]
-
-        X_valid = df_valid.drop(columns=[self.target])
-        y_valid = df_valid[self.target]
-
-        params = self.xgb_params
-
-        if self.optuna_params:
-            params = self.optimize_with_optuna(
-                X_train, y_train, X_valid, y_valid, self.optuna_params
-            )
-            data.tuning_params = params
-
-        model = XGBoostModel(**params)
-
-        model.fit(
-            X_train,
-            y_train,
-            eval_set=[(X_valid, y_valid)],
-            verbose=True,
-        )
-
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        minutes = int(elapsed_time // 60)
-        seconds = int(elapsed_time % 60)
-        self.logger.info(f"XGBoost model fitting took {minutes} minutes and {seconds} seconds.")
-
-        # Save the model to the data container
-        data.model = model
-        data.target = self.target
-
-        if self.save_path:
-            self.logger.info(f"Saving the model to {self.save_path}")
-            dump(model, self.save_path)
-        return data
-
-    def optimize_with_optuna(self, X_train, y_train, X_valid, y_valid, optuna_params):
-        def objective(trial):
-            # Define the search space
-            max_depth = optuna_params.get("max_depth", [3, 12])
-            eta = optuna_params.get("eta", [1e-8, 1.0])
-            subsample = optuna_params.get("subsample", [0.2, 1.0])
-            colsample_bytree = optuna_params.get("colsample_bytree", [0.2, 1.0])
-            min_child_weight = optuna_params.get("min_child_weight", [1, 10])
-            n_estimators = optuna_params.get("n_estimators", [100, 1000])
-
-            param = {
-                "verbosity": 0,
-                "objective": "reg:squarederror",
-                "eval_metric": "mae",
-                "n_jobs": -1,
-                "max_depth": trial.suggest_int("max_depth", max_depth[0], max_depth[1]),
-                "eta": trial.suggest_float("eta", eta[0], eta[1], log=True),
-                "subsample": trial.suggest_float("subsample", subsample[0], subsample[1]),
-                "colsample_bytree": trial.suggest_float(
-                    "colsample_bytree", colsample_bytree[0], colsample_bytree[1]
-                ),
-                "min_child_weight": trial.suggest_int(
-                    "min_child_weight", min_child_weight[0], min_child_weight[1]
-                ),
-                "n_estimators": trial.suggest_int("n_estimators", n_estimators[0], n_estimators[1]),
-            }
-
-            model = XGBoostModel(**param)
-            model.fit(
-                X_train,
-                y_train,
-                eval_set=[(X_valid, y_valid)],
-                verbose=True,
-            )
-            preds = model.predict(X_valid)
-            mae = mean_absolute_error(y_valid, preds)
-            return mae
-
-        def optuna_logging_callback(study, trial):
-            if trial.state == optuna.trial.TrialState.COMPLETE:
-                self.logger.info(
-                    f"Trial {trial.number} finished with value: {trial.value} and parameters:"
-                    f" {trial.params}. Best is trial {study.best_trial.number} with value:"
-                    f" {study.best_value}."
-                )
-
-        optuna_trials = optuna_params.get("trials", 20)
-
-        self.logger.info(f"Optimizing XGBoost hyperparameters with {optuna_trials} trials.")
-
-        study_name = optuna_params.get("study_name", "xgboost_optimization")
-        storage = optuna_params.get("storage", "sqlite:///db.sqlite3")
-
-        study = optuna.create_study(
-            direction="minimize",
-            study_name=study_name,
-            storage=storage,
-            pruner=MedianPruner(),
-        )
-
-        study.optimize(objective, n_trials=optuna_trials, callbacks=[optuna_logging_callback])
-
-        best_params = study.best_params
-        self.logger.info(f"Best parameters found by Optuna: {best_params}")
-        return best_params
diff --git a/pipeline_lib/implementation/tabular/xgboost/predict.py b/pipeline_lib/implementation/tabular/xgboost/predict.py
deleted file mode 100644
index 8502f7a..0000000
--- a/pipeline_lib/implementation/tabular/xgboost/predict.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Optional
-
-import pandas as pd
-from joblib import load
-
-from pipeline_lib.core import DataContainer
-from pipeline_lib.core.steps import PredictStep
-
-
-class XGBoostPredictStep(PredictStep):
-    """Obtain the predictions for XGBoost model."""
-
-    def __init__(
-        self,
-        target: str,
-        load_path: str,
-        drop_columns: Optional[list[str]] = None,
-    ) -> None:
-        self.init_logger()
-
-        if not load_path.endswith(".joblib"):
-            raise ValueError("Only joblib format is supported for loading the model.")
-
-        self.target = target
-        self.load_path = load_path
-        self.drop_columns = drop_columns
-
-        self.model = load(self.load_path)
-
-    def execute(self, data: DataContainer) -> DataContainer:
-        self.logger.debug("Obtaining predictions for XGBoost model.")
-
-        model_input = data.flow
-
-        if self.drop_columns:
-            self.logger.info(f"Dropping columns: {self.drop_columns}")
-            model_input = model_input.drop(columns=self.drop_columns)
-
-        predictions = self.model.predict(model_input.drop(columns=[self.target]))
-
-        predictions_df = pd.DataFrame(predictions, columns=["prediction"])
-
-        model_input["predictions"] = predictions_df
-        data.model = self.model
-        data.model_output = model_input
-        data.target = self.target
-        data._drop_columns = self.drop_columns
-        return data