Skip to content

Commit aa1f233

Browse files
author
chkoar
committed
EasyEnsembleGeneralization Step1
1 parent 2c0628f commit aa1f233

File tree

4 files changed

+285
-2
lines changed

4 files changed

+285
-2
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,9 @@ target/
6666
*.sln
6767
*.pyproj
6868
*.suo
69-
*.vs
69+
*.vs
70+
/*.csproj
71+
/.spyproject
72+
/.vscode
73+
/bin/Debug
74+
/obj/x86/Debug

imblearn/ensemble/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55

66
from .easy_ensemble import EasyEnsemble
7+
from .easy_ensemble_generalization import EasyEnsembleGeneralization
78
from .balance_cascade import BalanceCascade
89

9-
__all__ = ['EasyEnsemble', 'BalanceCascade']
10+
__all__ = ['EasyEnsemble', 'EasyEnsembleGeneralization', 'BalanceCascade']
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
"Easy Ensemble Generalization"
2+
3+
# Authors: Christos Aridas
4+
#
5+
# License: MIT
6+
7+
import numpy as np
8+
from sklearn.base import ClassifierMixin, clone
9+
from sklearn.ensemble import VotingClassifier
10+
from sklearn.ensemble.base import BaseEnsemble, _set_random_states
11+
from sklearn.tree import DecisionTreeClassifier
12+
from sklearn.utils import check_random_state
13+
from sklearn.utils.validation import check_is_fitted
14+
15+
from ..pipeline import Pipeline
16+
from ..under_sampling import RandomUnderSampler
17+
18+
# Largest value representable by a signed 32-bit integer (2**31 - 1).
# NOTE(review): not referenced by any code visible in this module.
MAX_INT = np.iinfo(np.int32).max
19+
20+
21+
class EasyEnsembleGeneralization(BaseEnsemble, ClassifierMixin):
    """Generalization of the Easy Ensemble algorithm for imbalanced datasets.

    Every ensemble member is a pipeline made of a clone of the sampler
    followed by a clone of the estimator. The members are combined with a
    ``VotingClassifier``.

    Parameters
    ----------
    base_estimator : object or None, optional (default=None)
        The classifier that is cloned for every member of the ensemble.
        If None, a ``DecisionTreeClassifier`` is used.

    base_sampler : object or None, optional (default=None)
        The sampler that is cloned for every member of the ensemble. It
        must expose a ``random_state`` attribute. If None, a
        ``RandomUnderSampler`` is used.

    n_estimators : int, optional (default=5)
        The number of base estimators in the ensemble.

    voting : str, {'hard', 'soft'} (default='soft')
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for ``fit``.
        If -1, then the number of jobs is set to the number of cores.

    Attributes
    ----------
    base_sampler_ : object
        The sampler used as template for the ensemble members.

    estimators_ : list of classifiers
        The fitted estimators, one taken from each fitted pipeline.

    classes_ : array-like, shape = [n_classes]
        The classes labels.

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.ensemble import EasyEnsembleGeneralization as EEG
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eeg = EEG(voting='hard', random_state=0)
    >>> eeg = eeg.fit(X, y)
    >>> print(eeg.predict(X))
    [1 1 1 2 2 2]
    """

    def __init__(self,
                 base_estimator=None,
                 base_sampler=None,
                 n_estimators=5,
                 voting='soft',
                 random_state=None,
                 n_jobs=1):

        self.base_estimator = base_estimator
        self.base_sampler = base_sampler
        self.n_estimators = n_estimators
        self.voting = voting
        self.random_state = random_state
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(EasyEnsembleGeneralization, self)._validate_estimator(
            default=DecisionTreeClassifier())

    def _validate_sampler(self):
        """Check the sampler and set the base_sampler_ attribute.

        Raises
        ------
        ValueError
            If the resolved sampler has no ``random_state`` attribute.
        """
        if self.base_sampler is not None:
            self.base_sampler_ = self.base_sampler
        else:
            self.base_sampler_ = RandomUnderSampler()

        # Check the resolved sampler, not ``self.base_sampler``: the latter
        # is None whenever the default sampler is in use.
        if not hasattr(self.base_sampler_, 'random_state'):
            raise ValueError('Base sampler must have a random_state parameter')

    def fit(self, X, y, sample_weight=None):
        """Build an ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels).

        sample_weight : array-like, shape = [n_samples] or None
            Accepted for API compatibility. It is currently not forwarded
            to the underlying estimators.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        self._validate_estimator()
        self._validate_sampler()

        steps = [('sampler', self.base_sampler_),
                 ('estimator', self.base_estimator_)]
        pipeline_template = Pipeline(steps)

        # Each member is an independent clone that receives its own seeds
        # drawn from the shared random number generator.
        pipelines = []
        for _ in range(self.n_estimators):
            pipeline = clone(pipeline_template)
            _set_random_states(pipeline, random_state)
            pipelines.append(pipeline)

        # VotingClassifier expects a list of (name, estimator) tuples.
        ensemble_members = [(str(i), pipeline)
                            for i, pipeline in enumerate(pipelines)]

        self._voting = VotingClassifier(ensemble_members,
                                        voting=self.voting,
                                        n_jobs=self.n_jobs)
        self._voting.fit(X, y)

        self.classes_ = self._voting.classes_
        self.estimators_ = [pipeline.named_steps['estimator']
                            for pipeline in self._voting.estimators_]

        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input samples, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        maj : array-like, shape = [n_samples]
            Predicted class labels.
        """
        check_is_fitted(self, "_voting")
        return self._voting.predict(X)

    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for all samples in X.

        The call is delegated to the fitted ``VotingClassifier``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input samples, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg : array-like, shape = [n_samples, n_classes]
            Average probability for each class per sample.
        """
        check_is_fitted(self, "_voting")
        return self._voting.predict_proba(X)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""Testing for the EasyEnsembleGeneralization classifier"""
2+
3+
from __future__ import print_function
4+
5+
import numpy as np
6+
from sklearn.exceptions import NotFittedError
7+
from sklearn.model_selection import GridSearchCV, cross_val_score
8+
from sklearn.utils.testing import assert_almost_equal, assert_array_equal
9+
from sklearn.utils.testing import assert_equal, assert_true, assert_false
10+
from sklearn.utils.testing import assert_raise_message
11+
12+
from imblearn.ensemble import EasyEnsembleGeneralization as EEG
13+
14+
# Seed passed as ``random_state`` to the ensembles under test.
RND_SEED = 0
# Toy dataset with two features: 20 samples, one row per sample.
X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
              [1.25192108, -0.22367336], [0.53366841, -0.30312976],
              [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
              [0.83680821, 1.72827342], [0.3084254, 0.33299982],
              [0.70472253, -0.73309052], [0.28893132, -0.38761769],
              [1.15514042, 0.0129463], [0.88407872, 0.35454207],
              [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
              [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
              [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
              [0.08711622, 0.93259929], [1.70580611, -0.11219234]])
# Binary labels aligned with the rows of X (8 zeros and 12 ones).
y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
26+
27+
28+
def test_estimator_init():
    """Check that fitting with zero estimators raises a ValueError."""
    expected_error = "n_estimators must be greater than zero, got 0."
    empty_ensemble = EEG(n_estimators=0)
    assert_raise_message(ValueError, expected_error,
                         empty_ensemble.fit, X, y)
33+
34+
35+
def test_predict_proba_hardvoting():
    """Check that predict_proba raises AttributeError under hard voting."""
    expected_error = "predict_proba is not available when voting='hard'"
    hard_voter = EEG(voting='hard', random_state=RND_SEED)
    hard_voter = hard_voter.fit(X, y)
    assert_raise_message(AttributeError, expected_error,
                         hard_voter.predict_proba, X)
39+
40+
41+
def test_notfitted():
    """Check that predict_proba on an unfitted ensemble raises."""
    expected_error = ("This EasyEnsembleGeneralization instance is not fitted yet. Call \'fit\'"
                      " with appropriate arguments before using this method.")
    unfitted = EEG()
    assert_raise_message(NotFittedError, expected_error,
                         unfitted.predict_proba, X)
46+
47+
48+
def test_majority_label():
    """Check the mean cross-validated ROC AUC of a soft-voting ensemble."""
    soft_voter = EEG(voting='soft', random_state=RND_SEED)
    fold_scores = cross_val_score(soft_voter, X, y, cv=5, scoring='roc_auc')
    mean_auc = fold_scores.mean()
    print(mean_auc)
    assert_almost_equal(mean_auc, 0.65, decimal=2)
54+
55+
56+
def test_predict_on_toy_problem():
    """Manually check predicted class labels for the toy dataset."""
    eeg = EEG(voting='hard', random_state=RND_SEED)
    predicted = eeg.fit(X, y).predict(X[0:6])
    # Compare label by label. Wrapping both sides in ``all()`` reduces each
    # of them to a single boolean, which hides wrong predictions.
    # The expected labels are the training labels y[0:6].
    assert_array_equal(predicted, [0, 1, 0, 0, 0, 1])
60+
61+
62+
def test_gridsearch():
    """Check that the ensemble can be tuned through GridSearchCV."""
    search_space = dict(voting=['soft', 'hard'], n_estimators=[2, 3, 4])
    candidate = EEG(random_state=RND_SEED)
    searcher = GridSearchCV(estimator=candidate,
                            param_grid=search_space,
                            cv=3)
    searcher.fit(X, y)
71+
72+
73+
def test_parallel_predict():
    """Check that one job and two jobs give identical predictions."""
    serial, parallel = [
        EEG(voting='soft', random_state=RND_SEED, n_jobs=jobs).fit(X, y)
        for jobs in (1, 2)
    ]

    assert_array_equal(serial.predict(X), parallel.predict(X))
    assert_array_equal(serial.predict_proba(X), parallel.predict_proba(X))

0 commit comments

Comments
 (0)