[WIP] Add support for multilabel #341

Closed · wants to merge 2 commits

43 changes: 29 additions & 14 deletions imblearn/base.py
@@ -9,6 +9,8 @@
import logging
from abc import ABCMeta, abstractmethod

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.externals import six
from sklearn.utils import check_X_y
@@ -26,13 +28,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):

    _estimator_type = 'sampler'

    def _check_X_y(self, X, y):
        """Private function to check that the X and y in fitting are the same
        than in sampling."""
        X_hash, y_hash = hash_X_y(X, y)
        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
            raise RuntimeError("X and y need to be same array earlier fitted.")

    def sample(self, X, y):
        """Resample the dataset.

@@ -55,13 +50,33 @@ def sample(self, X, y):

"""

# Check the consistency of X and y
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

check_is_fitted(self, 'ratio_')
self._check_X_y(X, y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
if self.target_encoder_ is not None:
y = self.target_encoder_.inverse_transform(y)
X_hash, y_hash = hash_X_y(X, y)
if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
raise RuntimeError("X and y need to be same array earlier fitted.")

result = self._sample(X, y)

if not getattr(self, 'return_indices', False):
X_res, y_res = result
else:
X_res, y_res, indices_res = result

if self.target_encoder_ is not None:
# find the case that we have ensemble
if y_res.ndim == 2:
y_res = np.hstack([self.target_encoder_.transform(y_res_subset)
for y_res_subset in y_res])
else:
y_res = self.target_encoder_.transform(y_res)

return self._sample(X, y)
if not getattr(self, 'return_indices', False):
return X_res, y_res
else:
return X_res, y_res, indices_res

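For orientation, the round trip performed above can be reproduced with scikit-learn alone: `target_encoder_` is a `LabelBinarizer`, so an indicator-style `y` is decoded to a 1-D multiclass vector before `_sample` runs and re-encoded afterwards. A minimal sketch (illustrative only, not part of the patch):

    import numpy as np
    from sklearn.preprocessing import LabelBinarizer

    y = np.array([0] * 6 + [1] * 3 + [2] * 1)

    encoder = LabelBinarizer()
    y_indicator = encoder.fit_transform(y)        # (10, 3) indicator matrix

    # what sample() does on the way in: back to a 1-D multiclass vector
    y_decoded = encoder.inverse_transform(y_indicator)

    # ... _sample() would resample X and y_decoded here ...

    # and on the way out: re-encode to the caller's representation
    y_back = encoder.transform(y_decoded)
    assert np.array_equal(y_back, y_indicator)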
    def fit_sample(self, X, y):
        """Fit the statistics and resample the data directly.
@@ -153,8 +168,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        # self.sampling_type is already checked in check_ratio
        self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
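Note that `fit` and `sample` now call `check_X_y` with `multi_output=True`; the default validation rejects a 2-D target such as an indicator matrix. A short illustration with plain scikit-learn (hypothetical data, only to show the behaviour of the flag):

    import numpy as np
    from sklearn.utils import check_X_y

    X = np.arange(20, dtype=float).reshape(10, 2)
    y_2d = np.zeros((10, 3), dtype=int)
    y_2d[np.arange(10), [0] * 6 + [1] * 3 + [2]] = 1   # one-hot target, shape (10, 3)

    X_chk, y_chk = check_X_y(X, y_2d, multi_output=True)   # accepted, y stays 2-D
    print(y_chk.shape)                                      # (10, 3)

    try:
        check_X_y(X, y_2d)        # default multi_output=False
    except ValueError as exc:
        print(exc)                # rejected: y must be 1-D unless multi_output=True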
4 changes: 2 additions & 2 deletions imblearn/combine/smote_enn.py
@@ -145,8 +145,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.ratio_ = self.ratio
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)

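The `X_hash_`/`y_hash_` attributes stored here are what `sample` in `base.py` compares against, so resampling is refused for arrays other than the ones passed to `fit`. The idea, sketched with `joblib.hash` (the exact hashing done by `hash_X_y` is an imblearn internal, so treat this only as an approximation):

    import numpy as np
    from sklearn.externals import joblib   # on recent scikit-learn: `import joblib`

    X = np.arange(20, dtype=float).reshape(10, 2)
    y = np.array([0] * 7 + [1] * 3)

    X_hash_, y_hash_ = joblib.hash(X), joblib.hash(y)          # stored at fit time

    # at sample time: identical arrays give identical hashes -> accepted
    assert (joblib.hash(X), joblib.hash(y)) == (X_hash_, y_hash_)

    # a different y gives a different hash -> RuntimeError in sample()
    assert joblib.hash(np.array([0] * 5 + [1] * 5)) != y_hash_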
5 changes: 2 additions & 3 deletions imblearn/combine/smote_tomek.py
@@ -8,7 +8,6 @@
from __future__ import division

import logging
import warnings

from sklearn.utils import check_X_y

@@ -154,8 +153,8 @@ def fit(self, X, y):
            Return self.

        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], multi_output=True)
        y = check_target_type(y, self)
        self.ratio_ = self.ratio
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)

3 changes: 2 additions & 1 deletion imblearn/ensemble/balance_cascade.py
@@ -14,7 +14,7 @@
from sklearn.model_selection import cross_val_predict

from .base import BaseEnsembleSampler
from ..utils import check_ratio
from ..utils import check_ratio, check_target_type


class BalanceCascade(BaseEnsembleSampler):
@@ -137,6 +137,7 @@ def fit(self, X, y):

"""
super(BalanceCascade, self).fit(X, y)
y = check_target_type(y, self)
self.ratio_ = check_ratio(self.ratio, y, 'under-sampling')
return self

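This is also the file that explains the `y_res.ndim == 2` branch in `base.py`: ensemble samplers built on `BaseEnsembleSampler` return one resampled set per generated subset, so the target comes back 2-D and each row has to be re-encoded separately. A quick check against the imblearn release this PR targets, using plain multiclass input (`BalanceCascade` was removed in later versions):

    import numpy as np
    from imblearn.ensemble import BalanceCascade

    X = np.random.RandomState(0).uniform(size=(100, 4))
    y = np.array([0] * 90 + [1] * 10)

    X_res, y_res = BalanceCascade(random_state=0).fit_sample(X, y)
    print(y_res.ndim)   # 2: (n_subsets, n_samples_per_subset)
    print(X_res.ndim)   # 3: (n_subsets, n_samples_per_subset, n_features)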
39 changes: 39 additions & 0 deletions imblearn/utils/estimator_checks.py
@@ -24,6 +24,8 @@
from sklearn.exceptions import NotFittedError
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import set_random_state
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.multiclass import type_of_target

from imblearn.base import SamplerMixin
from imblearn.over_sampling.base import BaseOverSampler
@@ -37,6 +39,8 @@

def _yield_sampler_checks(name, Estimator):
    yield check_target_type
    yield check_multilabel_type
    # yield check_multioutput_type_error
    yield check_samplers_one_label
    yield check_samplers_no_fit_error
    yield check_samplers_X_consistancy_sample
@@ -85,6 +89,41 @@ def check_target_type(name, Estimator):
    estimator.fit(X, y)


def check_multilabel_type(name, Estimator):
    x = np.random.random((1000, 10))
    y = np.array([0] * 900 + [1] * 75 + [2] * 25)

    binarizer = LabelBinarizer(sparse_output=True)
    y_multilabel = binarizer.fit_transform(y)

    sampler = Estimator(random_state=0)
    X_res, y_res = sampler.fit_sample(x, y_multilabel)

    if isinstance(sampler, BaseEnsembleSampler):
        assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0])
    else:
        assert type_of_target(y_res) == type_of_target(y_multilabel)

    binarizer = LabelBinarizer(sparse_output=False)
    y_multilabel = binarizer.fit_transform(y)

    sampler = Estimator(random_state=0)
    X_res, y_res = sampler.fit_sample(x, y_multilabel)

    if isinstance(sampler, BaseEnsembleSampler):
        assert type_of_target(y_res[0]) == type_of_target(y_multilabel[0])
    else:
        assert type_of_target(y_res) == type_of_target(y_multilabel)


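For reference, this is what the check feeds each sampler: `LabelBinarizer` turns the usual 1-D multiclass vector into an indicator matrix, sparse or dense depending on `sparse_output`, and `type_of_target` reports `'multilabel-indicator'` for both layouts, which is the tag the assertions above compare before and after resampling (plain scikit-learn, independent of this patch):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.utils.multiclass import type_of_target

    y = np.array([0] * 900 + [1] * 75 + [2] * 25)

    y_sparse = LabelBinarizer(sparse_output=True).fit_transform(y)
    y_dense = LabelBinarizer(sparse_output=False).fit_transform(y)

    print(sparse.issparse(y_sparse), y_sparse.shape)   # True (1000, 3)
    print(type_of_target(y_sparse))                    # 'multilabel-indicator'
    print(type_of_target(y_dense))                     # 'multilabel-indicator'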
# def check_multioutput_type_error(name, Estimator):
#     x = np.random((2, 10))
#     y = np.array([[0, 1, 1], [0, 1, 0]])

#     sampler = Estimator(random_state=0)
#     y_res = sampler.fit_sample(x)


def check_samplers_one_label(name, Sampler):
    error_string_fit = "Sampler can't balance when only one class is present."
    sampler = Sampler()
27 changes: 23 additions & 4 deletions imblearn/utils/validation.py
@@ -7,17 +7,19 @@
from numbers import Integral

import numpy as np
from scipy import sparse

from sklearn.neighbors.base import KNeighborsMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import six, joblib
from sklearn.utils.multiclass import type_of_target
from sklearn.preprocessing import LabelBinarizer

from ..exceptions import raise_isinstance_error

SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling',
'ensemble')
TARGET_KIND = ('binary', 'multiclass')
TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator')


def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
@@ -52,11 +54,11 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
        raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object)


def check_target_type(y):
def check_target_type(y, sampler):
"""Check the target types to be conform to the current samplers.

The current samplers should be compatible with ``'binary'`` and
``'multiclass'`` targets only.
The current samplers should be compatible with ``'binary'``,
``'multiclass'`` and ``'multilabel-indicator'`` targets only.

Parameters
----------
@@ -74,6 +76,23 @@
        # not allow for it
        warnings.warn("'y' should be of types {} only. Got {} instead.".format(
            TARGET_KIND, type_of_target(y)))
    elif type_of_target(y) == 'multilabel-indicator':
        if np.any(y.sum(axis=1) > 1):
            raise ValueError("'y' as 'multilabel' is supported only to"
                             " represent a 'multiclass' problem. 'y' contains"
                             " multiple tasks and samplers do not support"
                             " these targets.")
        # create a label binarizer and simulate a fit
        sampler.target_encoder_ = LabelBinarizer(
            sparse_output=sparse.issparse(y))
        sampler.target_encoder_.y_type_ = 'multiclass'
        sampler.target_encoder_.sparse_input_ = False
        sampler.target_encoder_.classes_ = np.arange(y.shape[1], dtype=int)

        return sampler.target_encoder_.inverse_transform(y)
    else:
        sampler.target_encoder_ = None

    return y

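The row-sum guard is needed because `type_of_target` cannot tell a one-label-per-row indicator matrix (a multiclass problem in disguise, which the samplers can handle through the encoder trick above) from a genuine multilabel task (which they cannot). For example:

    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    y_one_hot = np.array([[1, 0, 0],
                          [0, 1, 0],
                          [0, 0, 1]])          # one positive per row
    y_multilabel = np.array([[1, 1, 0],
                             [0, 1, 0]])       # a row with two positive labels

    print(type_of_target(y_one_hot))      # 'multilabel-indicator'
    print(type_of_target(y_multilabel))   # 'multilabel-indicator' as well

    print(np.any(y_one_hot.sum(axis=1) > 1))      # False -> accepted, decoded to 1-D
    print(np.any(y_multilabel.sum(axis=1) > 1))   # True  -> ValueError is raised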
