Refactor and add DaggingRegressor

chkoar · Jan 3, 2019 · d9f7260 · d9f7260
1 parent a8e81c0
commit d9f7260
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 59 deletions.
diff --git a/README.rst b/README.rst
@@ -21,7 +21,7 @@ Example
 
 .. code-block:: python
 
-    from dagging import Dagging
+    from dagging import DaggingClassifier
     from sklearn.datasets import load_iris 
 
     # Load Iris from scikit-learn.

diff --git a/dagging/__init__.py b/dagging/__init__.py
@@ -1,5 +1,5 @@
 from ._version import get_versions
-from ._dagging import Dagging, DaggingClassifier # noqa
+from ._dagging import DaggingClassifier, DaggingRegressor  # noqa
 
 __version__ = get_versions()['version']
 del get_versions
diff --git a/dagging/_dagging.py b/dagging/_dagging.py
@@ -1,60 +1,22 @@
 import numpy as np
 
-from sklearn.base import ClassifierMixin
+from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor
 from sklearn.ensemble.base import BaseEnsemble
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import check_cv
 from sklearn.preprocessing import LabelEncoder
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.utils.validation import check_is_fitted, check_random_state
 
 
-class Dagging(BaseEnsemble, ClassifierMixin):
-    """A Dagging classifier.
-    This meta classifier creates a number of disjoint, stratified folds out of
-    the data and feeds each chunk of data to a copy of the supplied base
-    classifier. Predictions are made via hard or soft voting.
-    Useful for base classifiers that are quadratic or worse in time behavior,
-    regarding number of instances in the training data.
-
-    Parameters
-    ----------
-    base_estimator : object or None, optional (default=None)
-        The base estimator to fit on random subsets of the dataset.
-        If None, then the base estimator is a decision tree.
-    n_estimators : int, optional (default=3)
-        The number of base estimators in the ensemble.
-    voting : str, {'hard', 'soft'} (default='soft')
-        If 'hard', uses predicted class labels for majority rule voting.
-        Else if 'soft', predicts the class label based on the argmax of
-        the sums of the predicted probabilities, which is recommended for
-        an ensemble of well-calibrated classifiers.
-    random_state : int, RandomState instance or None, optional (default=None)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
-
-    Attributes
-    ----------
-    base_estimator_ : estimator
-        The base estimator from which the ensemble is grown.
-    estimators_ : list of estimators
-        The collection of fitted base estimators.
-    References
-    ----------
-    .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
-           In: Fourteenth international Conference on Machine Learning,
-           San Francisco, CA, 367-375, 1997
-    """
-
+class BaseDagging(BaseEnsemble):
     def __init__(self,
                  base_estimator=None,
                  n_estimators=3,
                  voting='soft',
                  random_state=None):
-        super(Dagging, self).__init__(
+        super(BaseDagging, self).__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators)
         self.voting = voting
@@ -78,25 +40,29 @@ def fit(self, X, y):
         self : object
         """
         X, y = check_X_y(X, y)
-        check_classification_targets(y)
-
-        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
-            raise NotImplementedError('Multilabel and multi-output'
-                                      ' classification is not supported.')
 
         if self.voting not in ('soft', 'hard'):
             raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                              % self.voting)
 
         self._validate_estimator()
+        if is_classifier(self.base_estimator_):
+            check_classification_targets(y)
+            if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
+                raise NotImplementedError('Multilabel and multi-output'
+                                          ' classification is not supported.')
+            self.le_ = LabelEncoder().fit(y)
+            self.classes_ = self.le_.classes_
+            transformed_y = self.le_.transform(y)
+        else:
+            transformed_y = y
 
-        self.le_ = LabelEncoder().fit(y)
-        self.classes_ = self.le_.classes_
         self.estimators_ = []
 
-        transformed_y = self.le_.transform(y)
         rs = check_random_state(self.random_state)
-        splitter = StratifiedKFold(self.n_estimators, random_state=rs)
+        splitter = check_cv(cv=self.n_estimators,
+                            y=transformed_y,
+                            classifier=is_classifier(self.base_estimator_))
 
         for _, index in splitter.split(X, transformed_y):
             estimator = self._make_estimator(append=False,
@@ -106,6 +72,57 @@ def fit(self, X, y):
 
         return self
 
+
+class DaggingClassifier(BaseDagging, ClassifierMixin):
+    """A Dagging classifier.
+    This meta classifier creates a number of disjoint, stratified folds out of
+    the data and feeds each chunk of data to a copy of the supplied base
+    classifier. Predictions are made via hard or soft voting.
+    Useful for base classifiers that are quadratic or worse in time behavior,
+    regarding number of instances in the training data.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The base estimator to fit on random subsets of the dataset.
+        If None, then the base estimator is a decision tree.
+    n_estimators : int, optional (default=3)
+        The number of base estimators in the ensemble.
+    voting : str, {'hard', 'soft'} (default='soft')
+        If 'hard', uses predicted class labels for majority rule voting.
+        Else if 'soft', predicts the class label based on the argmax of
+        the sums of the predicted probabilities, which is recommended for
+        an ensemble of well-calibrated classifiers.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+    References
+    ----------
+    .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
+           In: Fourteenth international Conference on Machine Learning,
+           San Francisco, CA, 367-375, 1997
+    """
+
+    def __init__(self,
+                 base_estimator=None,
+                 n_estimators=3,
+                 voting='soft',
+                 random_state=None):
+        super(DaggingClassifier, self).__init__(
+            base_estimator=base_estimator,
+            n_estimators=n_estimators,
+            voting=voting,
+            random_state=random_state)
+
     def predict(self, X):
         """ Predict class labels for X.
         Parameters
@@ -168,8 +185,72 @@ def predict_proba(self):
 
     def _validate_estimator(self):
         """Check the estimator and set the base_estimator_ attribute."""
-        super(Dagging, self)._validate_estimator(
+        super(DaggingClassifier, self)._validate_estimator(
             default=DecisionTreeClassifier())
 
 
-DaggingClassifier = Dagging
+class DaggingRegressor(BaseDagging, RegressorMixin):
+    """A Dagging regressor.
+    This meta regressor creates a number of disjoint folds out of the data
+    and feeds each chunk of data to a copy of the supplied base regressor.
+    Predictions are made by averaging the predictions of the base regressors.
+    Useful for base regressors that are quadratic or worse in time behavior,
+    regarding number of instances in the training data.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The base estimator to fit on random subsets of the dataset.
+        If None, then the base estimator is a decision tree.
+    n_estimators : int, optional (default=3)
+        The number of base estimators in the ensemble.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+    References
+    ----------
+    .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
+           In: Fourteenth international Conference on Machine Learning,
+           San Francisco, CA, 367-375, 1997
+    """
+
+    def __init__(self,
+                 base_estimator=None,
+                 n_estimators=3,
+                 random_state=None):
+        super(DaggingRegressor, self).__init__(
+            base_estimator=base_estimator,
+            n_estimators=n_estimators,
+            random_state=random_state)
+
+    def predict(self, X):
+        """ Predict regression targets for X.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Input samples, where n_samples is the number of samples and
+            n_features is the number of features.
+        Returns
+        -------
+        y : array-like, shape = [n_samples]
+            Predicted values, averaged over the base regressors.
+        """
+        check_is_fitted(self, 'estimators_')
+        predictions = []
+        for estimator in self.estimators_:
+            predictions.append(estimator.predict(X))
+        return np.average(predictions, axis=0)
+
+    def _validate_estimator(self):
+        """Check the estimator and set the base_estimator_ attribute."""
+        super(DaggingRegressor, self)._validate_estimator(
+            default=DecisionTreeRegressor())
diff --git a/dagging/tests/test_dagging.py b/dagging/tests/test_dagging.py
@@ -1,8 +1,13 @@
 from sklearn.utils.estimator_checks import check_estimator
 
-from dagging import Dagging
+from dagging import DaggingClassifier, DaggingRegressor
 
 
-def test_check_estimator():
-    model = Dagging(random_state=0)
+def test_check_dagging_classifier():
+    model = DaggingClassifier(random_state=0, n_estimators=2)
+    check_estimator(model)
+
+
+def test_check_dagging_regressor():
+    model = DaggingRegressor(random_state=0)
     check_estimator(model)