[WIP] Add support for multilabel #341

Closed · wants to merge 2 commits

43 changes: 29 additions & 14 deletions imblearn/base.py
@@ -9,6 +9,8 @@
import logging
from abc import ABCMeta, abstractmethod

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.externals import six
from sklearn.utils import check_X_y
@@ -26,13 +28,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):

    _estimator_type = 'sampler'

    def _check_X_y(self, X, y):
        """Private function to check that the X and y in fitting are the same
        than in sampling."""
        X_hash, y_hash = hash_X_y(X, y)
        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
            raise RuntimeError("X and y need to be same array earlier fitted.")

    def sample(self, X, y):
        """Resample the dataset.

@@ -55,13 +50,33 @@ def sample(self, X, y):

"""

# Check the consistency of X and y
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

check_is_fitted(self, 'ratio_')
self._check_X_y(X, y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
if self.target_encoder_ is not None:
y = self.target_encoder_.inverse_transform(y)
X_hash, y_hash = hash_X_y(X, y)
if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
raise RuntimeError("X and y need to be same array earlier fitted.")

result = self._sample(X, y)

if not getattr(self, 'return_indices', False):
X_res, y_res = result
else:
X_res, y_res, indices_res = result

if self.target_encoder_ is not None:
# find the case that we have ensemble
if y_res.ndim == 2:
y_res = np.hstack([self.target_encoder_.transform(y_res_subset)
for y_res_subset in y_res])
else:
y_res = self.target_encoder_.transform(y_res)

return self._sample(X, y)
if not getattr(self, 'return_indices', False):
return X_res, y_res
else:
return X_res, y_res, indices_res

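For orientation, the round trip performed above can be reproduced with scikit-learn alone: `target_encoder_` is a `LabelBinarizer`, so an indicator-style `y` is decoded to a 1-D multiclass vector before `_sample` runs and re-encoded afterwards. A minimal sketch (illustrative only, not part of the patch):

    import numpy as np
    from sklearn.preprocessing import LabelBinarizer

    y = np.array([0] * 6 + [1] * 3 + [2] * 1)

    encoder = LabelBinarizer()
    y_indicator = encoder.fit_transform(y)        # (10, 3) indicator matrix

    # what sample() does on the way in: back to a 1-D multiclass vector
    y_decoded = encoder.inverse_transform(y_indicator)

    # ... _sample() would resample X and y_decoded here ...

    # and on the way out: re-encode to the caller's representation
    y_back = encoder.transform(y_decoded)
    assert np.array_equal(y_back, y_indicator)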
    def fit_sample(self, X, y):
        """Fit the statistics and resample the data directly.
@@ -153,8 +168,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        # self.sampling_type is already checked in check_ratio
        self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
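Note that `fit` and `sample` now call `check_X_y` with `multi_output=True`; the default validation rejects a 2-D target such as an indicator matrix. A short illustration with plain scikit-learn (hypothetical data, only to show the behaviour of the flag):

    import numpy as np
    from sklearn.utils import check_X_y

    X = np.arange(20, dtype=float).reshape(10, 2)
    y_2d = np.zeros((10, 3), dtype=int)
    y_2d[np.arange(10), [0] * 6 + [1] * 3 + [2]] = 1   # one-hot target, shape (10, 3)

    X_chk, y_chk = check_X_y(X, y_2d, multi_output=True)   # accepted, y stays 2-D
    print(y_chk.shape)                                      # (10, 3)

    try:
        check_X_y(X, y_2d)        # default multi_output=False
    except ValueError as exc:
        print(exc)                # rejected: y must be 1-D unless multi_output=True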
4 changes: 2 additions & 2 deletions imblearn/combine/smote_enn.py
@@ -145,8 +145,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.ratio_ = self.ratio
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)

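The `X_hash_`/`y_hash_` attributes stored here are what `sample` in `base.py` compares against, so resampling is refused for arrays other than the ones passed to `fit`. The idea, sketched with `joblib.hash` (the exact hashing done by `hash_X_y` is an imblearn internal, so treat this only as an approximation):

    import numpy as np
    from sklearn.externals import joblib   # on recent scikit-learn: `import joblib`

    X = np.arange(20, dtype=float).reshape(10, 2)
    y = np.array([0] * 7 + [1] * 3)

    X_hash_, y_hash_ = joblib.hash(X), joblib.hash(y)          # stored at fit time

    # at sample time: identical arrays give identical hashes -> accepted
    assert (joblib.hash(X), joblib.hash(y)) == (X_hash_, y_hash_)

    # a different y gives a different hash -> RuntimeError in sample()
    assert joblib.hash(np.array([0] * 5 + [1] * 5)) != y_hash_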
5 changes: 2 additions & 3 deletions imblearn/combine/smote_tomek.py
@@ -8,7 +8,6 @@
from __future__ import division

import logging
import warnings

from sklearn.utils import check_X_y

@@ -154,8 +153,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.ratio_ = self.ratio
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)

3 changes: 2 additions & 1 deletion imblearn/ensemble/balance_cascade.py
@@ -14,7 +14,7 @@
from sklearn.model_selection import cross_val_predict

from .base import BaseEnsembleSampler
from ..utils import check_ratio
from ..utils import check_ratio, check_target_type


class BalanceCascade(BaseEnsembleSampler):
@@ -137,6 +137,7 @@ def fit(self, X, y):

"""
super(BalanceCascade, self).fit(X, y)
y = check_target_type(y, self)
self.ratio_ = check_ratio(self.ratio, y, 'under-sampling')
return self

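This is also the file that explains the `y_res.ndim == 2` branch in `base.py`: ensemble samplers built on `BaseEnsembleSampler` return one resampled set per generated subset, so the target comes back 2-D and each row has to be re-encoded separately. A quick check against the imblearn release this PR targets, using plain multiclass input (`BalanceCascade` was removed in later versions):

    import numpy as np
    from imblearn.ensemble import BalanceCascade

    X = np.random.RandomState(0).uniform(size=(100, 4))
    y = np.array([0] * 90 + [1] * 10)

    X_res, y_res = BalanceCascade(random_state=0).fit_sample(X, y)
    print(y_res.ndim)   # 2: (n_subsets, n_samples_per_subset)
    print(X_res.ndim)   # 3: (n_subsets, n_samples_per_subset, n_features)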
39 changes: 39 additions & 0 deletions imblearn/utils/estimator_checks.py
@@ -24,6 +24,8 @@
from sklearn.exceptions import NotFittedError
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import set_random_state
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.multiclass import type_of_target

from imblearn.base import SamplerMixin
from imblearn.over_sampling.base import BaseOverSampler
@@ -37,6 +39,8 @@

def _yield_sampler_checks(name, Estimator):
    yield check_target_type
    yield check_multilabel_type
    # yield check_multioutput_type_error
    yield check_samplers_one_label
    yield check_samplers_no_fit_error
    yield check_samplers_X_consistancy_sample
@@ -85,6 +89,41 @@ def check_target_type(name, Estimator):
    estimator.fit(X, y)


def check_multilabel_type(name, Estimator):
    x = np.random.random((1000, 10))
    y = np.array([0] * 900 + [1] * 75 + [2] * 25)

    binarizer = LabelBinarizer(sparse_output=True)
    y_multilabel = binarizer.fit_transform(y)

    sampler = Estimator(random_state=0)
    X_res, y_res = sampler.fit_sample(x, y_multilabel)

    if isinstance(sampler, BaseEnsembleSampler):
        assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0])
    else:
        assert type_of_target(y_res) == type_of_target(y_multilabel)

    binarizer = LabelBinarizer(sparse_output=False)
    y_multilabel = binarizer.fit_transform(y)

    sampler = Estimator(random_state=0)
    X_res, y_res = sampler.fit_sample(x, y_multilabel)

    if isinstance(sampler, BaseEnsembleSampler):
        assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0])
    else:
        assert type_of_target(y_res) == type_of_target(y_multilabel)


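For reference, this is what the check feeds each sampler: `LabelBinarizer` turns the usual 1-D multiclass vector into an indicator matrix, sparse or dense depending on `sparse_output`, and `type_of_target` reports `'multilabel-indicator'` for both layouts, which is the tag the assertions above compare before and after resampling (plain scikit-learn, independent of this patch):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.utils.multiclass import type_of_target

    y = np.array([0] * 900 + [1] * 75 + [2] * 25)

    y_sparse = LabelBinarizer(sparse_output=True).fit_transform(y)
    y_dense = LabelBinarizer(sparse_output=False).fit_transform(y)

    print(sparse.issparse(y_sparse), y_sparse.shape)   # True (1000, 3)
    print(type_of_target(y_sparse))                    # 'multilabel-indicator'
    print(type_of_target(y_dense))                     # 'multilabel-indicator'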
# def check_multioutput_type_error(name, Estimator):
#     x = np.random((2, 10))
#     y = np.array([[0, 1, 1], [0, 1, 0]])

#     sampler = Estimator(random_state=0)
#     y_res = sampler.fit_sample(x)


def check_samplers_one_label(name, Sampler):
    error_string_fit = "Sampler can't balance when only one class is present."
    sampler = Sampler()
27 changes: 23 additions & 4 deletions imblearn/utils/validation.py
@@ -7,17 +7,19 @@
from numbers import Integral

import numpy as np
from scipy import sparse

from sklearn.neighbors.base import KNeighborsMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import six, joblib
from sklearn.utils.multiclass import type_of_target
from sklearn.preprocessing import LabelBinarizer

from ..exceptions import raise_isinstance_error

SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling',
'ensemble')
TARGET_KIND = ('binary', 'multiclass')
TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator')


def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
@@ -52,11 +54,11 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
        raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object)


def check_target_type(y):
def check_target_type(y, sampler):
"""Check the target types to be conform to the current samplers.

The current samplers should be compatible with ``'binary'`` and
``'multiclass'`` targets only.
The current samplers should be compatible with ``'binary'``,
``'multiclass'`` and ``'multilabel-indicator'`` targets only.

Parameters
----------
@@ -74,6 +76,23 @@
        # not allow for it
        warnings.warn("'y' should be of types {} only. Got {} instead.".format(
            TARGET_KIND, type_of_target(y)))
    elif type_of_target(y) == 'multilabel-indicator':
        if np.any(y.sum(axis=1) > 1):
            raise ValueError("'y' as 'multilabel' is supported only to"
                             " represent a 'multiclass' problem. 'y' contains"
                             " multiple tasks and samplers do not support"
                             " these targets.")
        # create a label binarizer and simulate a fit
        sampler.target_encoder_ = LabelBinarizer(
            sparse_output=sparse.issparse(y))
        sampler.target_encoder_.y_type_ = 'multiclass'
        sampler.target_encoder_.sparse_input_ = False
        sampler.target_encoder_.classes_ = np.arange(y.shape[1], dtype=int)

        return sampler.target_encoder_.inverse_transform(y)
    else:
        sampler.target_encoder_ = None

    return y

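The row-sum guard is needed because `type_of_target` cannot tell a one-label-per-row indicator matrix (a multiclass problem in disguise, which the samplers can handle through the encoder trick above) from a genuine multilabel task (which they cannot). For example:

    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    y_one_hot = np.array([[1, 0, 0],
                          [0, 1, 0],
                          [0, 0, 1]])          # one positive per row
    y_multilabel = np.array([[1, 1, 0],
                             [0, 1, 0]])       # a row with two positive labels

    print(type_of_target(y_one_hot))      # 'multilabel-indicator'
    print(type_of_target(y_multilabel))   # 'multilabel-indicator' as well

    print(np.any(y_one_hot.sum(axis=1) > 1))      # False -> accepted, decoded to 1-D
    print(np.any(y_multilabel.sum(axis=1) > 1))   # True  -> ValueError is raised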
