diff --git a/imblearn/base.py b/imblearn/base.py index 99b4ac33f..57c4593bc 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -9,6 +9,8 @@ import logging from abc import ABCMeta, abstractmethod +import numpy as np + from sklearn.base import BaseEstimator from sklearn.externals import six from sklearn.utils import check_X_y @@ -26,13 +28,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def _check_X_y(self, X, y): - """Private function to check that the X and y in fitting are the same - than in sampling.""" - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - def sample(self, X, y): """Resample the dataset. @@ -55,13 +50,33 @@ def sample(self, X, y): """ - # Check the consistency of X and y - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - check_is_fitted(self, 'ratio_') - self._check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True) + if self.target_encoder_ is not None: + y = self.target_encoder_.inverse_transform(y) + X_hash, y_hash = hash_X_y(X, y) + if self.X_hash_ != X_hash or self.y_hash_ != y_hash: + raise RuntimeError("X and y need to be same array earlier fitted.") + + result = self._sample(X, y) + + if not getattr(self, 'return_indices', False): + X_res, y_res = result + else: + X_res, y_res, indices_res = result + + if self.target_encoder_ is not None: + # find the case that we have ensemble + if y_res.ndim == 2: + y_res = np.hstack([self.target_encoder_.transform(y_res_subset) + for y_res_subset in y_res]) + else: + y_res = self.target_encoder_.transform(y_res) - return self._sample(X, y) + if not getattr(self, 'return_indices', False): + return X_res, y_res + else: + return X_res, y_res, indices_res def fit_sample(self, X, y): """Fit the statistics and resample the data directly. @@ -153,8 +168,8 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - y = check_target_type(y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True) + y = check_target_type(y, self) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # self.sampling_type is already checked in check_ratio self.ratio_ = check_ratio(self.ratio, y, self._sampling_type) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 0e1eacae9..28be4825e 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -145,8 +145,8 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - y = check_target_type(y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True) + y = check_target_type(y, self) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 3642c81a7..124a7993b 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -8,7 +8,6 @@ from __future__ import division import logging -import warnings from sklearn.utils import check_X_y @@ -154,8 +153,8 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - y = check_target_type(y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True) + y = check_target_type(y, self) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index f451d96d5..0c67c2c68 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -14,7 +14,7 @@ from sklearn.model_selection import cross_val_predict from .base import BaseEnsembleSampler -from ..utils import check_ratio +from ..utils import check_ratio, check_target_type class BalanceCascade(BaseEnsembleSampler): @@ -137,6 +137,7 @@ def fit(self, X, y): """ super(BalanceCascade, self).fit(X, y) + y = check_target_type(y, self) self.ratio_ = check_ratio(self.ratio, y, 'under-sampling') return self diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 0328ee84d..9ebaf5b9d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -24,6 +24,8 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import set_random_state +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils.multiclass import type_of_target from imblearn.base import SamplerMixin from imblearn.over_sampling.base import BaseOverSampler @@ -37,6 +39,8 @@ def _yield_sampler_checks(name, Estimator): yield check_target_type + yield check_multilabel_type + # yield check_multioutput_type_error yield check_samplers_one_label yield check_samplers_no_fit_error yield check_samplers_X_consistancy_sample @@ -85,6 +89,41 @@ def check_target_type(name, Estimator): estimator.fit(X, y) +def check_multilabel_type(name, Estimator): + x = np.random.random((1000, 10)) + y = np.array([0] * 900 + [1] * 75 + [2] * 25) + + binarizer = LabelBinarizer(sparse_output=True) + y_multilabel = binarizer.fit_transform(y) + + sampler = Estimator(random_state=0) + X_res, y_res = sampler.fit_sample(x, y_multilabel) + + if isinstance(sampler, BaseEnsembleSampler): + assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0]) + else: + assert type_of_target(y_res) == type_of_target(y_multilabel) + + binarizer = LabelBinarizer(sparse_output=False) + y_multilabel = binarizer.fit_transform(y) + + sampler = Estimator(random_state=0) + X_res, y_res = sampler.fit_sample(x, y_multilabel) + + if isinstance(sampler, BaseEnsembleSampler): + assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0]) + else: + assert type_of_target(y_res) == type_of_target(y_multilabel) + + +# def check_multioutput_type_error(name, Estimator): +# x = np.random((2, 10)) +# y = np.array([[0, 1, 1], [0, 1, 0]]) + +# sampler = Estimator(random_state=0) +# y_res = sampler.fit_sample(x) + + def check_samplers_one_label(name, Sampler): error_string_fit = "Sampler can't balance when only one class is present." sampler = Sampler() diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index a363cd18f..0a2e21e00 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -7,17 +7,19 @@ from numbers import Integral import numpy as np +from scipy import sparse from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.externals import six, joblib from sklearn.utils.multiclass import type_of_target +from sklearn.preprocessing import LabelBinarizer from ..exceptions import raise_isinstance_error SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', 'ensemble') -TARGET_KIND = ('binary', 'multiclass') +TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator') def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): @@ -52,11 +54,11 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) -def check_target_type(y): +def check_target_type(y, sampler): """Check the target types to be conform to the current samplers. - The current samplers should be compatible with ``'binary'`` and - ``'multiclass'`` targets only. + The current samplers should be compatible with ``'binary'``, + ``'multiclass'`` and ``'multilabel-indicator'`` targets only. Parameters ---------- @@ -74,6 +76,23 @@ def check_target_type(y): # not allow for it warnings.warn("'y' should be of types {} only. Got {} instead.".format( TARGET_KIND, type_of_target(y))) + elif type_of_target(y) == 'multilabel-indicator': + if np.any(y.sum(axis=1) > 1): + raise ValueError("'y' as 'multilabel' is supported only to" + " represent a 'multiclass' problem. 'y' contains" + " multiple tasks and samplers do not support" + " these targets.") + # create a label binarizer and simulate a fit + sampler.target_encoder_ = LabelBinarizer( + sparse_output=sparse.issparse(y)) + sampler.target_encoder_.y_type_ = 'multiclass' + sampler.target_encoder_.sparse_input_ = False + sampler.target_encoder_.classes_ = np.arange(y.shape[1], dtype=int) + + return sampler.target_encoder_.inverse_transform(y) + else: + sampler.target_encoder_ = None + return y