diff --git a/pipeline_lib/core/steps/__init__.py b/pipeline_lib/core/steps/__init__.py index 56f63c1..9c8593c 100644 --- a/pipeline_lib/core/steps/__init__.py +++ b/pipeline_lib/core/steps/__init__.py @@ -11,4 +11,3 @@ from .predict import PredictStep # noqa: F401 from .tabular_split import TabularSplitStep # noqa: F401 from .target_scaling import TargetScalingStep # noqa: F401 -from .explainer_dashboard import ExplainerDashboardStep # noqa: F401 diff --git a/pipeline_lib/core/steps/explainer_dashboard.py b/pipeline_lib/core/steps/explainer_dashboard.py index 3ebcf67..41d3802 100644 --- a/pipeline_lib/core/steps/explainer_dashboard.py +++ b/pipeline_lib/core/steps/explainer_dashboard.py @@ -1,4 +1,3 @@ -import pandas as pd from explainerdashboard import RegressionExplainer from pipeline_lib.core import DataContainer @@ -7,6 +6,7 @@ class ExplainerDashboardStep(PipelineStep): """Scale the target using Quantile Transformer.""" + def __init__( self, max_samples: int = 1000, @@ -29,6 +29,10 @@ def execute(self, data: DataContainer) -> DataContainer: if len(df) > self.max_samples: # Randomly sample a subset of data points if the dataset is larger than max_samples + self.logger.info( + f"Dataset contains {len(df)} data points and max_samples is set to" + f" {self.max_samples}." + ) self.logger.info(f"Sampling {self.max_samples} data points from the dataset.") df = df.sample(n=self.max_samples, random_state=42) @@ -39,7 +43,11 @@ def execute(self, data: DataContainer) -> DataContainer: X_test = df.drop(columns=[target]) y_test = df[target] - explainer = RegressionExplainer(model, X_test, y_test,) + explainer = RegressionExplainer( + model, + X_test, + y_test, + ) data[DataContainer.EXPLAINER] = explainer