Add EasyEnsembleGeneralization

chkoar · chkoar · commit efec02eebd36 · 2017-07-20T05:38:46.000+03:00
diff --git a/imblearn/ensemble/__init__.py b/imblearn/ensemble/__init__.py
@@ -4,6 +4,7 @@
 """
 
 from .easy_ensemble import EasyEnsemble
+from .easy_ensemble_generalization import EasyEnsembleGeneralization
 from .balance_cascade import BalanceCascade
 
-__all__ = ['EasyEnsemble', 'BalanceCascade']
+__all__ = ['EasyEnsemble', 'EasyEnsembleGeneralization', 'BalanceCascade']
diff --git a/imblearn/ensemble/easy_ensemble_generalization.py b/imblearn/ensemble/easy_ensemble_generalization.py
@@ -0,0 +1,208 @@
+"Easy Ensemble Generalization"
+
+# Authors: Christos Aridas
+#
+# License: MIT
+
+import numpy as np
+
+from sklearn.base import ClassifierMixin, clone
+from sklearn.ensemble import BaseEnsemble, VotingClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils import check_random_state
+from sklearn.utils.validation import check_is_fitted
+
+from ..pipeline import Pipeline
+from ..under_sampling import RandomUnderSampler as ROS
+
+
+MAX_INT = np.iinfo(np.int32).max  # exclusive upper bound for drawn seeds
+
+
+class EasyEnsembleGeneralization(BaseEnsemble, ClassifierMixin):
+    """Classifier that generalizes the Easy Ensemble algorithm for imbalanced
+    datasets.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The estimator that is cloned for each member of the ensemble. The
+        fitted clones are stored in the ``estimators_`` attribute. If None,
+        then a ``DecisionTreeClassifier`` is supplied as the default base
+        estimator.
+
+    base_sampler : object or None, optional (default=None)
+        The sampler cloned for each ensemble member; it must expose a
+        ``random_state`` parameter. If None, ``RandomUnderSampler`` is used.
+
+    n_estimators : int, optional (default=5)
+        The number of base estimators in the ensemble.
+
+    voting : str, {'hard', 'soft'} (default='hard')
+        If 'hard', uses predicted class labels for majority rule voting.
+        Else if 'soft', predicts the class label based on the argmax of
+        the sums of the predicted probabilities, which is recommended for
+        an ensemble of well-calibrated classifiers.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for ``fit``.
+        If -1, then the number of jobs is set to the number of cores.
+
+    Attributes
+    ----------
+    estimators_ : list of classifiers
+        The collection of fitted estimators.
+
+    classes_ : array-like, shape = [n_classes]
+        The class labels.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from imblearn.ensemble import EasyEnsembleGeneralization as EEG
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> eeg = EEG(voting='hard', random_state=0)
+    >>> eeg = eeg.fit(X, y)
+    >>> print(eeg.predict(X))
+    [1 1 1 2 2 2]
+
+    """
+
+    def __init__(self,
+                 base_estimator=None,
+                 base_sampler=None,
+                 n_estimators=5,
+                 voting='hard',
+                 random_state=None,
+                 n_jobs=1):
+
+        self.base_estimator = base_estimator
+        self.base_sampler = base_sampler
+        self.n_estimators = n_estimators
+        self.voting = voting
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+    def _validate_estimator(self):
+        """Check the estimator and set the base_estimator_ attribute."""
+        super(EasyEnsembleGeneralization, self)._validate_estimator(
+            default=DecisionTreeClassifier())
+
+    def _validate_sampler(self):
+        """Check the sampler and set the base_sampler_ attribute.
+
+        Falls back to ``RandomUnderSampler`` when ``base_sampler`` is None.
+        """
+        if self.base_sampler is not None:
+            self.base_sampler_ = self.base_sampler
+        else:
+            # ``ROS`` is this module's alias of ``RandomUnderSampler``.
+            self.base_sampler_ = ROS()
+
+    def fit(self, X, y, sample_weight=None):
+        """Build an ensemble of estimators from the training set (X, y).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
+            The training input samples. Sparse matrices are accepted only if
+            they are supported by the base estimator.
+
+        y : array-like, shape = [n_samples]
+            The target values (class labels).
+
+        sample_weight : array-like, shape = [n_samples] or None
+            Currently ignored: the argument is accepted but it is never
+            forwarded to the samplers or the estimators.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+
+        Raises
+        ------
+        ValueError
+            If the sampler does not expose a ``random_state`` attribute.
+        """
+        self._validate_estimator()
+        self._validate_sampler()
+
+        # Inspect the resolved sampler (``base_sampler_``): ``base_sampler``
+        # itself is None whenever the default sampler is in use.
+        if not hasattr(self.base_sampler_, 'random_state'):
+            raise ValueError(
+                'Base sampler must have a random_state parameter')
+
+        # Draw one seed per estimator and one seed per sampler.
+        random_state = check_random_state(self.random_state)
+        estimator_seeds = random_state.randint(MAX_INT, size=self.n_estimators)
+        sampler_seeds = random_state.randint(MAX_INT, size=self.n_estimators)
+        seed_estimator = hasattr(self.base_estimator_, 'random_state')
+
+        pipelines = []
+        for est_seed, smp_seed in zip(estimator_seeds, sampler_seeds):
+            sampler = clone(self.base_sampler_)
+            sampler.set_params(random_state=smp_seed)
+
+            estimator = clone(self.base_estimator_)
+            if seed_estimator:
+                estimator.set_params(random_state=est_seed)
+
+            steps = [('sampler', sampler), ('estimator', estimator)]
+            pipelines.append(Pipeline(steps))
+
+        ensemble_members = [[str(i), pipeline]
+                            for i, pipeline in enumerate(pipelines)]
+
+        self._voting = VotingClassifier(ensemble_members,
+                                        voting=self.voting,
+                                        n_jobs=self.n_jobs)
+        self._voting.fit(X, y)
+
+        self.classes_ = self._voting.classes_
+        self.estimators_ = [pipeline.named_steps['estimator']
+                            for pipeline in self._voting.estimators_]
+
+        return self
+
+    def predict(self, X):
+        """Predict class labels for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            The samples to classify, where n_samples is the number of samples
+            and n_features is the number of features.
+
+        Returns
+        -------
+        maj : array-like, shape = [n_samples]
+            Predicted class labels.
+        """
+        check_is_fitted(self, "_voting")
+        return self._voting.predict(X)
+
+    def predict_proba(self, X):
+        """Compute probabilities of possible outcomes for all samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            The samples for which class probabilities are computed.
+
+        Returns
+        -------
+        avg : array-like, shape = [n_samples, n_classes]
+            Average predicted probability for each class per sample.
+        """
+        check_is_fitted(self, "_voting")
+        # Delegated to the inner VotingClassifier built by ``fit``.
+        return self._voting.predict_proba(X)
diff --git a/imblearn/ensemble/tests/test_easy_ensemble_generalization.py b/imblearn/ensemble/tests/test_easy_ensemble_generalization.py
@@ -0,0 +1,79 @@
+"""Testing for the EasyEnsembleGeneralization classifier"""
+
+from __future__ import print_function
+
+import numpy as np
+from sklearn.exceptions import NotFittedError
+from sklearn.model_selection import GridSearchCV, cross_val_score
+from sklearn.utils.testing import assert_almost_equal, assert_array_equal
+from sklearn.utils.testing import assert_equal, assert_true, assert_false
+from sklearn.utils.testing import assert_raise_message
+
+from imblearn.ensemble import EasyEnsembleGeneralization as EEG
+
+RND_SEED = 0  # random_state shared by the tests below
+X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
+              [1.25192108, -0.22367336], [0.53366841, -0.30312976],
+              [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
+              [0.83680821, 1.72827342], [0.3084254, 0.33299982],
+              [0.70472253, -0.73309052], [0.28893132, -0.38761769],
+              [1.15514042, 0.0129463], [0.88407872, 0.35454207],
+              [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
+              [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
+              [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
+              [0.08711622, 0.93259929], [1.70580611, -0.11219234]])
+y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
+
+
+def test_estimator_init():
+    """Fitting with ``n_estimators=0`` must raise a ValueError."""
+    expected = "n_estimators must be greater than zero, got 0."
+    clf = EEG(n_estimators=0)
+    assert_raise_message(ValueError, expected, clf.fit, X, y)
+
+
+def test_predict_proba_hardvoting():
+    fitted = EEG(random_state=RND_SEED, voting='hard').fit(X, y)
+    expected = "predict_proba is not available when voting='hard'"
+    assert_raise_message(AttributeError, expected, fitted.predict_proba, X)
+
+
+def test_notfitted():
+    """Calling ``predict_proba`` before ``fit`` must raise NotFittedError."""
+    msg = ("This EasyEnsembleGeneralization instance is not fitted yet. "
+           "Call 'fit' with appropriate arguments before using this method.")
+    assert_raise_message(NotFittedError, msg, EEG().predict_proba, X)
+
+
+def test_majority_label():
+    """Check the mean 5-fold ROC AUC of a soft-voting ensemble."""
+    clf = EEG(random_state=RND_SEED, voting='soft')
+    mean_auc = cross_val_score(clf, X, y, scoring='roc_auc', cv=5).mean()
+    print(mean_auc)
+    assert_almost_equal(mean_auc, 0.625, decimal=2)
+
+
+def test_predict_on_toy_problem():
+    """Toy check. NOTE(review): compares all(...) booleans, not labels."""
+    eeg = EEG(voting='hard', random_state=RND_SEED)
+    assert_equal(all(eeg.fit(X, y).predict(X[0:6])), all([0, 1, 0, 0, 0, 1]))
+
+
+def test_gridsearch():
+    """Smoke-test that the classifier can be tuned with GridSearchCV."""
+    search_space = {
+        'voting': ['soft', 'hard'],
+        'n_estimators': [2, 3, 4],
+    }
+    clf = EEG(random_state=RND_SEED)
+    search = GridSearchCV(estimator=clf, param_grid=search_space, cv=3)
+    search.fit(X, y)
+
+
+def test_parallel_predict():
+    """Fitting with one or two jobs must give identical predictions."""
+    common = dict(voting='soft', random_state=RND_SEED)
+    serial = EEG(n_jobs=1, **common).fit(X, y)
+    parallel = EEG(n_jobs=2, **common).fit(X, y)
+    assert_array_equal(serial.predict(X), parallel.predict(X))
+    assert_array_equal(serial.predict_proba(X), parallel.predict_proba(X))