Skip to content

Commit bea1915

Browse files
MattEding authored and glemaitre committed
MNT Pipeline Refactor - Reduce Code Footprint (#654)
1 parent a0ac84d commit bea1915

File tree

1 file changed

+22
-265
lines changed

1 file changed

+22
-265
lines changed

imblearn/pipeline.py

+22-265
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,9 @@
1212
# Christos Aridas
1313
# Guillaume Lemaitre <[email protected]>
1414
# License: BSD
15-
1615
from sklearn import pipeline
1716
from sklearn.base import clone
18-
from sklearn.utils import Bunch, _print_elapsed_time
17+
from sklearn.utils import _print_elapsed_time
1918
from sklearn.utils.metaestimators import if_delegate_has_method
2019
from sklearn.utils.validation import check_memory
2120

@@ -145,7 +144,8 @@ def _validate_steps(self):
145144
):
146145
raise TypeError(
147146
"All intermediate steps of the chain should "
148-
"be estimators that implement fit and transform or sample."
147+
"be estimators that implement fit and transform or "
148+
"fit_resample."
149149
" '%s' implements both)" % (t)
150150
)
151151

@@ -167,6 +167,21 @@ def _validate_steps(self):
167167
% (estimator, type(estimator))
168168
)
169169

170+
def _iter(
171+
self, with_final=True, filter_passthrough=True, filter_resample=True
172+
):
173+
"""Generate (idx, (name, trans)) tuples from self.steps.
174+
175+
When `filter_passthrough` is `True`, 'passthrough' and None
176+
transformers are filtered out. When `filter_resample` is `True`,
177+
estimator with a method `fit_resample` are filtered out.
178+
"""
179+
it = super()._iter(with_final, filter_passthrough)
180+
if filter_resample:
181+
return filter(lambda x: not hasattr(x[-1], "fit_resample"), it)
182+
else:
183+
return it
184+
170185
# Estimator interface
171186

172187
def _fit(self, X, y=None, **fit_params):
@@ -175,7 +190,7 @@ def _fit(self, X, y=None, **fit_params):
175190
# Setup the memory
176191
memory = check_memory(self.memory)
177192

178-
fit_transform_one_cached = memory.cache(_fit_transform_one)
193+
fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
179194
fit_resample_one_cached = memory.cache(_fit_resample_one)
180195

181196
fit_params_steps = {
@@ -194,7 +209,8 @@ def _fit(self, X, y=None, **fit_params):
194209
for (step_idx,
195210
name,
196211
transformer) in self._iter(with_final=False,
197-
filter_passthrough=False):
212+
filter_passthrough=False,
213+
filter_resample=False):
198214
if (transformer is None or transformer == 'passthrough'):
199215
with _print_elapsed_time('Pipeline',
200216
self._log_message(step_idx)):
@@ -208,7 +224,7 @@ def _fit(self, X, y=None, **fit_params):
208224
else:
209225
cloned_transformer = clone(transformer)
210226
elif hasattr(memory, "cachedir"):
211-
# joblib < 0.11
227+
# joblib <= 0.11
212228
if memory.cachedir is None:
213229
# we do not clone when caching is disabled to
214230
# preserve backward compatibility
@@ -354,38 +370,6 @@ def fit_resample(self, X, y=None, **fit_params):
354370
elif hasattr(last_step, "fit_resample"):
355371
return last_step.fit_resample(Xt, yt, **fit_params)
356372

357-
@if_delegate_has_method(delegate="_final_estimator")
358-
def predict(self, X, **predict_params):
359-
"""Apply transformers/samplers to the data, and predict with the final
360-
estimator
361-
362-
Parameters
363-
----------
364-
X : iterable
365-
Data to predict on. Must fulfill input requirements of first step
366-
of the pipeline.
367-
368-
**predict_params : dict of string -> object
369-
Parameters to the ``predict`` called at the end of all
370-
transformations in the pipeline. Note that while this may be
371-
used to return uncertainties from some models with return_std
372-
or return_cov, uncertainties that are generated by the
373-
transformations in the pipeline are not propagated to the
374-
final estimator.
375-
376-
Returns
377-
-------
378-
y_pred : array-like
379-
380-
"""
381-
Xt = X
382-
for _, _, transform in self._iter(with_final=False):
383-
if hasattr(transform, "fit_resample"):
384-
pass
385-
else:
386-
Xt = transform.transform(Xt)
387-
return self.steps[-1][-1].predict(Xt, **predict_params)
388-
389373
@if_delegate_has_method(delegate="_final_estimator")
390374
def fit_predict(self, X, y=None, **fit_params):
391375
"""Applies fit_predict of last step in pipeline after transforms.
@@ -419,233 +403,6 @@ def fit_predict(self, X, y=None, **fit_params):
419403
y_pred = self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
420404
return y_pred
421405

422-
@if_delegate_has_method(delegate="_final_estimator")
423-
def predict_proba(self, X):
424-
"""Apply transformers/samplers, and predict_proba of the final
425-
estimator
426-
427-
Parameters
428-
----------
429-
X : iterable
430-
Data to predict on. Must fulfill input requirements of first step
431-
of the pipeline.
432-
433-
Returns
434-
-------
435-
y_proba : array-like, shape = [n_samples, n_classes]
436-
437-
"""
438-
Xt = X
439-
for _, _, transform in self._iter(with_final=False):
440-
if hasattr(transform, "fit_resample"):
441-
pass
442-
else:
443-
Xt = transform.transform(Xt)
444-
return self.steps[-1][-1].predict_proba(Xt)
445-
446-
@if_delegate_has_method(delegate="_final_estimator")
447-
def score_samples(self, X):
448-
"""Apply transforms, and score_samples of the final estimator.
449-
Parameters
450-
----------
451-
X : iterable
452-
Data to predict on. Must fulfill input requirements of first step
453-
of the pipeline.
454-
Returns
455-
-------
456-
y_score : ndarray, shape (n_samples,)
457-
"""
458-
Xt = X
459-
for _, _, transformer in self._iter(with_final=False):
460-
if hasattr(transformer, "fit_resample"):
461-
pass
462-
else:
463-
Xt = transformer.transform(Xt)
464-
return self.steps[-1][-1].score_samples(Xt)
465-
466-
@if_delegate_has_method(delegate="_final_estimator")
467-
def decision_function(self, X):
468-
"""Apply transformers/samplers, and decision_function of the final
469-
estimator
470-
471-
Parameters
472-
----------
473-
X : iterable
474-
Data to predict on. Must fulfill input requirements of first step
475-
of the pipeline.
476-
477-
Returns
478-
-------
479-
y_score : array-like, shape = [n_samples, n_classes]
480-
481-
"""
482-
Xt = X
483-
for _, _, transform in self._iter(with_final=False):
484-
if hasattr(transform, "fit_resample"):
485-
pass
486-
else:
487-
Xt = transform.transform(Xt)
488-
return self.steps[-1][-1].decision_function(Xt)
489-
490-
@if_delegate_has_method(delegate="_final_estimator")
491-
def predict_log_proba(self, X):
492-
"""Apply transformers/samplers, and predict_log_proba of the final
493-
estimator
494-
495-
Parameters
496-
----------
497-
X : iterable
498-
Data to predict on. Must fulfill input requirements of first step
499-
of the pipeline.
500-
501-
Returns
502-
-------
503-
y_score : array-like, shape = [n_samples, n_classes]
504-
505-
"""
506-
Xt = X
507-
for _, _, transform in self._iter(with_final=False):
508-
if hasattr(transform, "fit_resample"):
509-
pass
510-
else:
511-
Xt = transform.transform(Xt)
512-
return self.steps[-1][-1].predict_log_proba(Xt)
513-
514-
@property
515-
def transform(self):
516-
"""Apply transformers/samplers, and transform with the final estimator
517-
518-
This also works where final estimator is ``None``: all prior
519-
transformations are applied.
520-
521-
Parameters
522-
----------
523-
X : iterable
524-
Data to transform. Must fulfill input requirements of first step
525-
of the pipeline.
526-
527-
Returns
528-
-------
529-
Xt : array-like, shape = [n_samples, n_transformed_features]
530-
"""
531-
# _final_estimator is None or has transform, otherwise attribute error
532-
if self._final_estimator != "passthrough":
533-
self._final_estimator.transform
534-
return self._transform
535-
536-
def _transform(self, X):
537-
Xt = X
538-
for _, _, transform in self._iter():
539-
if hasattr(transform, "fit_resample"):
540-
pass
541-
else:
542-
Xt = transform.transform(Xt)
543-
return Xt
544-
545-
@property
546-
def inverse_transform(self):
547-
"""Apply inverse transformations in reverse order
548-
549-
All estimators in the pipeline must support ``inverse_transform``.
550-
551-
Parameters
552-
----------
553-
Xt : array-like, shape = [n_samples, n_transformed_features]
554-
Data samples, where ``n_samples`` is the number of samples and
555-
``n_features`` is the number of features. Must fulfill
556-
input requirements of last step of pipeline's
557-
``inverse_transform`` method.
558-
559-
Returns
560-
-------
561-
Xt : array-like, shape = [n_samples, n_features]
562-
"""
563-
# raise AttributeError if necessary for hasattr behaviour
564-
for _, _, transform in self._iter():
565-
transform.inverse_transform
566-
return self._inverse_transform
567-
568-
def _inverse_transform(self, X):
569-
Xt = X
570-
reverse_iter = reversed(list(self._iter()))
571-
for _, _, transform in reverse_iter:
572-
if hasattr(transform, "fit_resample"):
573-
pass
574-
else:
575-
Xt = transform.inverse_transform(Xt)
576-
return Xt
577-
578-
@if_delegate_has_method(delegate="_final_estimator")
579-
def score(self, X, y=None, sample_weight=None):
580-
"""Apply transformers/samplers, and score with the final estimator
581-
582-
Parameters
583-
----------
584-
X : iterable
585-
Data to predict on. Must fulfill input requirements of first step
586-
of the pipeline.
587-
588-
y : iterable, default=None
589-
Targets used for scoring. Must fulfill label requirements for all
590-
steps of the pipeline.
591-
592-
sample_weight : array-like, default=None
593-
If not None, this argument is passed as ``sample_weight`` keyword
594-
argument to the ``score`` method of the final estimator.
595-
596-
Returns
597-
-------
598-
score : float
599-
"""
600-
Xt = X
601-
for _, _, transform in self._iter(with_final=False):
602-
if hasattr(transform, "fit_resample"):
603-
pass
604-
else:
605-
Xt = transform.transform(Xt)
606-
score_params = {}
607-
if sample_weight is not None:
608-
score_params["sample_weight"] = sample_weight
609-
return self.steps[-1][-1].score(Xt, y, **score_params)
610-
611-
@if_delegate_has_method(delegate='_final_estimator')
612-
def score_samples(self, X):
613-
"""Apply transforms, and score_samples of the final estimator.
614-
Parameters
615-
----------
616-
X : iterable
617-
Data to predict on. Must fulfill input requirements of first step
618-
of the pipeline.
619-
Returns
620-
-------
621-
y_score : ndarray, shape (n_samples,)
622-
"""
623-
Xt = X
624-
for _, _, transformer in self._iter(with_final=False):
625-
if hasattr(transformer, "fit_resample"):
626-
pass
627-
else:
628-
Xt = transformer.transform(Xt)
629-
return self.steps[-1][-1].score_samples(Xt)
630-
631-
632-
def _fit_transform_one(transformer,
633-
X,
634-
y,
635-
weight,
636-
message_clsname='',
637-
message=None,
638-
**fit_params):
639-
with _print_elapsed_time(message_clsname, message):
640-
if hasattr(transformer, "fit_transform"):
641-
res = transformer.fit_transform(X, y, **fit_params)
642-
else:
643-
res = transformer.fit(X, y, **fit_params).transform(X)
644-
# if we have a weight for this transformer, multiply output
645-
if weight is None:
646-
return res, transformer
647-
return res * weight, transformer
648-
649406

650407
def _fit_resample_one(sampler,
651408
X,

0 commit comments

Comments (0)