diff --git a/pipeline_lib/core/data_container.py b/pipeline_lib/core/data_container.py
index a0393e6..8fe73c5 100644
--- a/pipeline_lib/core/data_container.py
+++ b/pipeline_lib/core/data_container.py
@@ -35,6 +35,7 @@ class DataContainer:
     TUNING_PARAMS = "tuning_params"
     TARGET = "target"
     IMPORTANCE = "importance"
+    DROP_COLUMNS = "drop_columns"
 
     def __init__(self, initial_data: Optional[dict] = None):
         """
diff --git a/pipeline_lib/core/steps/__init__.py b/pipeline_lib/core/steps/__init__.py
index 9c8593c..56f63c1 100644
--- a/pipeline_lib/core/steps/__init__.py
+++ b/pipeline_lib/core/steps/__init__.py
@@ -11,3 +11,4 @@
 from .predict import PredictStep  # noqa: F401
 from .tabular_split import TabularSplitStep  # noqa: F401
 from .target_scaling import TargetScalingStep  # noqa: F401
+from .explainer_dashboard import ExplainerDashboardStep  # noqa: F401
diff --git a/pipeline_lib/core/steps/explainer_dashboard.py b/pipeline_lib/core/steps/explainer_dashboard.py
index 98bf397..3ebcf67 100644
--- a/pipeline_lib/core/steps/explainer_dashboard.py
+++ b/pipeline_lib/core/steps/explainer_dashboard.py
@@ -1,19 +1,46 @@
-from typing import Optional
+import pandas as pd
+from explainerdashboard import RegressionExplainer
 
 from pipeline_lib.core import DataContainer
-
-from .base import PipelineStep
+from pipeline_lib.core.steps import PipelineStep
 
 
 class ExplainerDashboardStep(PipelineStep):
-    """Explainer Dashboard."""
-
-    def __init__(self, config: Optional[dict] = None) -> None:
-        """Initialize ExplainerDashboardStep."""
-        super().__init__(config=config)
+    """Create an explainer dashboard for the trained model."""
+    def __init__(
+        self,
+        max_samples: int = 1000,
+    ) -> None:
         self.init_logger()
+        self.max_samples = max_samples
 
     def execute(self, data: DataContainer) -> DataContainer:
-        """Execute the step."""
-        self.logger.info("Creating explainer dashboard.")
+        self.logger.debug("Starting explainer dashboard")
+
+        model = data.get(DataContainer.MODEL)
+        if model is None:
+            raise ValueError("Model not found in data container.")
+
+        target = data.get(DataContainer.TARGET)
+        if target is None:
+            raise ValueError("Target column not found in data container.")
+
+        df = data.get(DataContainer.CLEAN)
+
+        if len(df) > self.max_samples:
+            # Randomly sample a subset of data points if the dataset is larger than max_samples
+            self.logger.info(f"Sampling {self.max_samples} data points from the dataset.")
+            df = df.sample(n=self.max_samples, random_state=42)
+
+        drop_columns = data.get(DataContainer.DROP_COLUMNS)
+        if drop_columns:
+            df = df.drop(columns=drop_columns)
+
+        X_test = df.drop(columns=[target])
+        y_test = df[target]
+
+        explainer = RegressionExplainer(model, X_test, y_test)
+
+        data[DataContainer.EXPLAINER] = explainer
+
         return data
diff --git a/pipeline_lib/implementation/tabular/xgboost/__init__.py b/pipeline_lib/implementation/tabular/xgboost/__init__.py
index b299d0c..2887234 100644
--- a/pipeline_lib/implementation/tabular/xgboost/__init__.py
+++ b/pipeline_lib/implementation/tabular/xgboost/__init__.py
@@ -1,3 +1,2 @@
-from .explainer_dashboard import XGBoostExplainerDashboardStep  # noqa: F401
 from .fit_model import XGBoostFitModelStep  # noqa: F401
 from .predict import XGBoostPredictStep  # noqa: F401
diff --git a/pipeline_lib/implementation/tabular/xgboost/explainer_dashboard.py b/pipeline_lib/implementation/tabular/xgboost/explainer_dashboard.py
deleted file mode 100644
index f1b319d..0000000
--- a/pipeline_lib/implementation/tabular/xgboost/explainer_dashboard.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from explainerdashboard import RegressionExplainer
-
-from pipeline_lib.core import DataContainer
-from pipeline_lib.core.steps import ExplainerDashboardStep
-
-
-class XGBoostExplainerDashboardStep(ExplainerDashboardStep):
-    """Scale the target using Quantile Transformer."""
-
-    def execute(self, data: DataContainer) -> DataContainer:
-        self.logger.debug("Starting explainer dashboard")
-
-        model = data.get(DataContainer.MODEL)
-        if model is None:
-            raise ValueError("Model not found in data container.")
-
-        val_df = data.get(DataContainer.VALIDATION)
-        if val_df is None:
-            raise ValueError("Validation data not found in data container.")
-
-        model_configs = data[DataContainer.MODEL_CONFIGS]
-        if model_configs is None:
-            raise ValueError("Model configs not found in data container.")
-
-        target = model_configs.get("target")
-        if target is None:
-            raise ValueError("Target column not found in model_configs.")
-
-        drop_columns = model_configs.get("drop_columns")
-        if drop_columns:
-            val_df = val_df.drop(columns=drop_columns)
-
-        X_test = val_df.drop(columns=[target])
-        y_test = val_df[target]
-
-        explainer = RegressionExplainer(model, X_test, y_test)
-
-        data[DataContainer.EXPLAINER] = explainer
-
-        return data
diff --git a/pipeline_lib/implementation/tabular/xgboost/fit_model.py b/pipeline_lib/implementation/tabular/xgboost/fit_model.py
index d0a538f..ea8e44f 100644
--- a/pipeline_lib/implementation/tabular/xgboost/fit_model.py
+++ b/pipeline_lib/implementation/tabular/xgboost/fit_model.py
@@ -48,6 +48,7 @@ def execute(self, data: DataContainer) -> DataContainer:
         start_time = time.time()
 
         data[DataContainer.TARGET] = self.target
+        data[DataContainer.DROP_COLUMNS] = self.drop_columns
 
         df_train = data[DataContainer.TRAIN]
         df_valid = data[DataContainer.VALIDATION]
diff --git a/pipeline_lib/implementation/tabular/xgboost/predict.py b/pipeline_lib/implementation/tabular/xgboost/predict.py
index 9dcd57f..ce9b62f 100644
--- a/pipeline_lib/implementation/tabular/xgboost/predict.py
+++ b/pipeline_lib/implementation/tabular/xgboost/predict.py
@@ -40,6 +40,8 @@ def execute(self, data: DataContainer) -> DataContainer:
         predictions_df = pd.DataFrame(predictions, columns=["prediction"])
 
         model_input[DataContainer.PREDICTIONS] = predictions_df
+        data[DataContainer.MODEL] = self.model
         data[DataContainer.MODEL_OUTPUT] = model_input
         data[DataContainer.TARGET] = self.target
+        data[DataContainer.DROP_COLUMNS] = self.drop_columns
         return data
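
Usage note: a minimal end-to-end sketch (not part of this change) of how the relocated ExplainerDashboardStep could be exercised after the refactor. The toy DataFrame, the scikit-learn regressor, the column names, the max_samples value, and the final ExplainerDashboard(...).run() call are illustrative assumptions standing in for what earlier pipeline steps and the serving layer would normally provide.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from explainerdashboard import ExplainerDashboard

from pipeline_lib.core import DataContainer
from pipeline_lib.core.steps import ExplainerDashboardStep

# Toy data and model purely for illustration; a real pipeline would produce
# these in earlier steps (cleaning, split, fit).
clean_df = pd.DataFrame({
    "id": range(200),
    "rooms": [x % 5 + 1 for x in range(200)],
    "area": [50 + (x % 40) for x in range(200)],
})
clean_df["price"] = 1000 * clean_df["rooms"] + 20 * clean_df["area"]

model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(clean_df[["rooms", "area"]], clean_df["price"])

# Populate the container with the keys the step reads.
data = DataContainer(
    initial_data={
        DataContainer.MODEL: model,
        DataContainer.CLEAN: clean_df,
        DataContainer.TARGET: "price",
        DataContainer.DROP_COLUMNS: ["id"],
    }
)

# The step samples at most max_samples rows, drops DROP_COLUMNS,
# and stores a RegressionExplainer under DataContainer.EXPLAINER.
data = ExplainerDashboardStep(max_samples=100).execute(data)

# Wrap the stored explainer in a dashboard and serve it locally.
ExplainerDashboard(data[DataContainer.EXPLAINER]).run()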