Skip to content

EasyEnsembleGeneralization #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,9 @@ target/
*.sln
*.pyproj
*.suo
*.vs
*.vs
/*.csproj
/.spyproject
/.vscode
/bin/Debug
/obj/x86/Debug
3 changes: 2 additions & 1 deletion imblearn/ensemble/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

from .easy_ensemble import EasyEnsemble
from .easy_ensemble_generalization import EasyEnsembleGeneralization
from .balance_cascade import BalanceCascade

__all__ = ['EasyEnsemble', 'BalanceCascade']
__all__ = ['EasyEnsemble', 'EasyEnsembleGeneralization', 'BalanceCascade']
201 changes: 201 additions & 0 deletions imblearn/ensemble/easy_ensemble_generalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""Easy Ensemble Generalization."""

# Authors: Christos Aridas
#
# License: MIT
from __future__ import print_function

import numpy as np
from sklearn.base import ClassifierMixin, clone
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble.base import BaseEnsemble, _set_random_states
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_is_fitted

from ..pipeline import Pipeline
from ..under_sampling import RandomUnderSampler

# Largest value representable by a signed 32-bit integer.
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm it is still needed.
MAX_INT = np.iinfo(np.int32).max


class EasyEnsembleGeneralization(BaseEnsemble, ClassifierMixin):
    """Generalization of the Easy Ensemble algorithm for imbalanced datasets.

    Every ensemble member is a pipeline made of a clone of ``base_sampler``
    followed by a clone of ``base_estimator``. The members are combined
    through a ``VotingClassifier``.

    Parameters
    ----------
    base_estimator : object or None, optional (default=None)
        The estimator cloned for every member of the ensemble. The fitted
        clones are stored in ``self.estimators_``. If None, a
        ``DecisionTreeClassifier`` is used.

    base_sampler : object or None, optional (default=None)
        The sampler cloned for every member of the ensemble. It must expose
        a ``random_state`` attribute. If None, a ``RandomUnderSampler`` is
        used.

    n_estimators : int, optional (default=5)
        The number of base estimators in the ensemble.

    voting : str, {'hard', 'soft'} (default='soft')
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for ``fit``.
        If -1, then the number of jobs is set to the number of cores.

    Attributes
    ----------
    estimators_ : list of classifiers
        The fitted estimators, i.e. the ``'estimator'`` step of every fitted
        pipeline.

    classes_ : array-like, shape = [n_classes]
        The classes labels.

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.ensemble import EasyEnsembleGeneralization as EEG
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eeg = EEG(voting='soft', random_state=0)
    >>> eeg = eeg.fit(X,y)
    >>> print(eeg.predict(X))
    [1 1 1 2 2 2]
    >>>
    """

    def __init__(self,
                 base_estimator=None,
                 base_sampler=None,
                 n_estimators=5,
                 voting='soft',
                 random_state=None,
                 n_jobs=1):

        self.base_estimator = base_estimator
        self.base_sampler = base_sampler
        self.n_estimators = n_estimators
        self.voting = voting
        self.random_state = random_state
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(EasyEnsembleGeneralization, self)._validate_estimator(
            default=DecisionTreeClassifier())

    def _validate_sampler(self):
        """Check the sampler and set the base_sampler_ attribute.

        Raises
        ------
        ValueError
            If the resolved sampler has no ``random_state`` attribute.
        """
        if self.base_sampler is None:
            self.base_sampler_ = RandomUnderSampler()
        else:
            self.base_sampler_ = self.base_sampler

        # Inspect the resolved sampler: ``self.base_sampler`` may be None
        # when the default sampler is in use.
        if not hasattr(self.base_sampler_, 'random_state'):
            raise ValueError('Base sampler must have a random_state parameter')

    def fit(self, X, y, sample_weight=None):
        """Build an ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels).

        sample_weight : array-like, shape = [n_samples] or None
            Currently ignored: the weights are not forwarded to the ensemble
            members. A warning is emitted when a value is given.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the sampler has no ``random_state`` attribute.
        """
        if sample_weight is not None:
            # Local import keeps this block self-contained.
            import warnings
            warnings.warn("sample_weight is not forwarded to the ensemble "
                          "members and is ignored.")

        check_classification_targets(y)

        self._validate_estimator()
        self._validate_sampler()

        random_state = check_random_state(self.random_state)

        steps = [('sampler', self.base_sampler_),
                 ('estimator', self.base_estimator_)]
        pipeline_template = Pipeline(steps)

        # One independently seeded (sampler, estimator) pipeline per member.
        ensemble_members = []
        for i in range(self.n_estimators):
            pipeline = clone(pipeline_template)
            _set_random_states(pipeline, random_state)
            ensemble_members.append((str(i), pipeline))

        self._voting = VotingClassifier(ensemble_members,
                                        voting=self.voting,
                                        n_jobs=self.n_jobs)
        self._voting.fit(X, y)

        self.classes_ = self._voting.classes_
        self.estimators_ = [pipeline.named_steps['estimator']
                            for pipeline in self._voting.estimators_]

        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        maj : array-like, shape = [n_samples]
            Predicted class labels, as returned by the underlying
            ``VotingClassifier``.
        """
        check_is_fitted(self, "_voting")
        return self._voting.predict(X)

    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for all samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        avg : array-like, shape = [n_samples, n_classes]
            Class probabilities per sample, as returned by the underlying
            ``VotingClassifier``.
        """
        check_is_fitted(self, "_voting")
        return self._voting.predict_proba(X)
79 changes: 79 additions & 0 deletions imblearn/ensemble/tests/test_easy_ensemble_generalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Testing for the EasyEnsembleGeneralization classifier"""

from __future__ import print_function

import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.utils.testing import assert_almost_equal, assert_array_equal
from sklearn.utils.testing import assert_equal, assert_true, assert_false
from sklearn.utils.testing import assert_raise_message

from imblearn.ensemble import EasyEnsembleGeneralization as EEG

# Seed passed as ``random_state`` by the tests below.
RND_SEED = 0
# Toy dataset: 20 samples with 2 features each.
X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
[1.25192108, -0.22367336], [0.53366841, -0.30312976],
[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
[0.83680821, 1.72827342], [0.3084254, 0.33299982],
[0.70472253, -0.73309052], [0.28893132, -0.38761769],
[1.15514042, 0.0129463], [0.88407872, 0.35454207],
[1.31301027, -0.92648734], [-1.11515198, -0.93689695],
[-0.18410027, -0.45194484], [0.9281014, 0.53085498],
[-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
[0.08711622, 0.93259929], [1.70580611, -0.11219234]])
# Binary labels for X: 8 samples of class 0 and 12 samples of class 1.
y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])


def test_estimator_init():
    """Fitting with ``n_estimators=0`` must raise a ``ValueError``."""
    expected_message = "n_estimators must be greater than zero, got 0."
    empty_ensemble = EEG(n_estimators=0)
    assert_raise_message(ValueError, expected_message,
                         empty_ensemble.fit, X, y)


def test_predict_proba_hardvoting():
    """``predict_proba`` must raise ``AttributeError`` under hard voting."""
    expected_message = "predict_proba is not available when voting='hard'"
    hard_ensemble = EEG(voting='hard', random_state=RND_SEED)
    hard_ensemble = hard_ensemble.fit(X, y)
    assert_raise_message(AttributeError, expected_message,
                         hard_ensemble.predict_proba, X)


def test_notfitted():
    """``predict_proba`` on an unfitted instance must raise NotFittedError."""
    expected_message = (
        "This EasyEnsembleGeneralization instance is not fitted yet. "
        "Call 'fit' with appropriate arguments before using this method.")
    unfitted = EEG()
    assert_raise_message(NotFittedError, expected_message,
                         unfitted.predict_proba, X)


def test_majority_label():
    """Check the mean 5-fold cross-validated ROC AUC with soft voting."""
    soft_ensemble = EEG(voting='soft', random_state=RND_SEED)
    mean_auc = cross_val_score(
        soft_ensemble, X, y, cv=5, scoring='roc_auc').mean()
    print(mean_auc)
    assert_almost_equal(mean_auc, 0.65, decimal=2)


def test_predict_on_toy_problem():
    """Manually check predicted class labels for the toy dataset."""
    eeg = EEG(voting='hard', random_state=RND_SEED)
    # Compare label by label. Wrapping both sides in ``all()`` reduced each
    # sequence to a single boolean, so the check was nearly vacuous.
    assert_array_equal(eeg.fit(X, y).predict(X[0:6]), [0, 1, 0, 0, 0, 1])


def test_gridsearch():
    """Check that the classifier can be tuned through ``GridSearchCV``."""
    param_grid = dict(voting=['soft', 'hard'], n_estimators=[2, 3, 4])
    search = GridSearchCV(estimator=EEG(random_state=RND_SEED),
                          param_grid=param_grid,
                          cv=3)
    search.fit(X, y)


def test_parallel_predict():
    """Check that ``n_jobs=1`` and ``n_jobs=2`` give identical outputs."""
    serial = EEG(voting='soft', random_state=RND_SEED, n_jobs=1)
    serial = serial.fit(X, y)
    parallel = EEG(voting='soft', random_state=RND_SEED, n_jobs=2)
    parallel = parallel.fit(X, y)

    assert_array_equal(serial.predict(X), parallel.predict(X))
    assert_array_equal(serial.predict_proba(X), parallel.predict_proba(X))