Skip to content

Commit

Permalink
Refactor and add DaggingRegressor
Browse files Browse the repository at this point in the history
  • Loading branch information
ichkoar committed Jan 3, 2019
1 parent a8e81c0 commit d9f7260
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 59 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Example

.. code-block:: python
from dagging import Dagging
from dagging import DaggingClassifier
from sklearn.datasets import load_iris
# Load Iris from scikit-learn.
Expand Down
2 changes: 1 addition & 1 deletion dagging/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ._version import get_versions
from ._dagging import Dagging, DaggingClassifier # noqa
from ._dagging import DaggingClassifier, DaggingRegressor # noqa

__version__ = get_versions()['version']
del get_versions
189 changes: 135 additions & 54 deletions dagging/_dagging.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,22 @@
import numpy as np

from sklearn.base import ClassifierMixin
from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor
from sklearn.ensemble.base import BaseEnsemble
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import check_cv
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_is_fitted, check_random_state


class Dagging(BaseEnsemble, ClassifierMixin):
"""A Dagging classifier.
This meta classifier creates a number of disjoint, stratified folds out of
the data and feeds each chunk of data to a copy of the supplied base
classifier. Predictions are made via hard or soft voting.
Useful for base classifiers that are quadratic or worse in time behavior,
regarding number of instances in the training data.
Parameters
----------
base_estimator : object or None, optional (default=None)
The base estimator to fit on random subsets of the dataset.
If None, then the base estimator is a decision tree.
n_estimators : int, optional (default=3)
The number of base estimators in the ensemble.
voting : str, {'hard', 'soft'} (default='soft')
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
base_estimator_ : estimator
The base estimator from which the ensemble is grown.
estimators_ : list of estimators
The collection of fitted base estimators.
References
----------
.. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
In: Fourteenth international Conference on Machine Learning,
San Francisco, CA, 367-375, 1997
"""

class BaseDagging(BaseEnsemble):
def __init__(self,
base_estimator=None,
n_estimators=3,
voting='soft',
random_state=None):
super(Dagging, self).__init__(
super(BaseDagging, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators)
self.voting = voting
Expand All @@ -78,25 +40,29 @@ def fit(self, X, y):
self : object
"""
X, y = check_X_y(X, y)
check_classification_targets(y)

if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
raise NotImplementedError('Multilabel and multi-output'
' classification is not supported.')

if self.voting not in ('soft', 'hard'):
raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
% self.voting)

self._validate_estimator()
if is_classifier(self.base_estimator_):
check_classification_targets(y)
if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
raise NotImplementedError('Multilabel and multi-output'
' classification is not supported.')
self.le_ = LabelEncoder().fit(y)
self.classes_ = self.le_.classes_
transformed_y = self.le_.transform(y)
else:
transformed_y = y

self.le_ = LabelEncoder().fit(y)
self.classes_ = self.le_.classes_
self.estimators_ = []

transformed_y = self.le_.transform(y)
rs = check_random_state(self.random_state)
splitter = StratifiedKFold(self.n_estimators, random_state=rs)
splitter = check_cv(cv=self.n_estimators,
y=transformed_y,
classifier=is_classifier(self.base_estimator_))

for _, index in splitter.split(X, transformed_y):
estimator = self._make_estimator(append=False,
Expand All @@ -106,6 +72,57 @@ def fit(self, X, y):

return self


class DaggingClassifier(BaseDagging, ClassifierMixin):
"""A Dagging classifier.
This meta classifier creates a number of disjoint, stratified folds out of
the data and feeds each chunk of data to a copy of the supplied base
classifier. Predictions are made via hard or soft voting.
Useful for base classifiers that are quadratic or worse in time behavior,
regarding number of instances in the training data.
Parameters
----------
base_estimator : object or None, optional (default=None)
The base estimator to fit on random subsets of the dataset.
If None, then the base estimator is a decision tree.
n_estimators : int, optional (default=3)
The number of base estimators in the ensemble.
voting : str, {'hard', 'soft'} (default='soft')
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
base_estimator_ : estimator
The base estimator from which the ensemble is grown.
estimators_ : list of estimators
The collection of fitted base estimators.
References
----------
.. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
In: Fourteenth international Conference on Machine Learning,
San Francisco, CA, 367-375, 1997
"""

def __init__(self,
base_estimator=None,
n_estimators=3,
voting='soft',
random_state=None):
super(DaggingClassifier, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
voting=voting,
random_state=random_state)

def predict(self, X):
""" Predict class labels for X.
Parameters
Expand Down Expand Up @@ -168,8 +185,72 @@ def predict_proba(self):

def _validate_estimator(self):
"""Check the estimator and set the base_estimator_ attribute."""
super(Dagging, self)._validate_estimator(
super(DaggingClassifier, self)._validate_estimator(
default=DecisionTreeClassifier())


DaggingClassifier = Dagging
class DaggingRegressor(BaseDagging, RegressorMixin):
    """A Dagging regressor.

    This meta regressor creates a number of disjoint folds out of the data
    and feeds each chunk of data to a copy of the supplied base regressor.
    Predictions are made by averaging the predictions of the fitted base
    regressors.

    Useful for base regressors that are quadratic or worse in time behavior,
    regarding number of instances in the training data.

    Parameters
    ----------
    base_estimator : object or None, optional (default=None)
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a decision tree regressor.
    n_estimators : int, optional (default=3)
        The number of base estimators in the ensemble.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.
    estimators_ : list of estimators
        The collection of fitted base estimators.

    References
    ----------
    .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models.
           In: Fourteenth international Conference on Machine Learning,
           San Francisco, CA, 367-375, 1997
    """

    def __init__(self,
                 base_estimator=None,
                 n_estimators=3,
                 random_state=None):
        # `voting` is deliberately not forwarded: it only applies to
        # classification, so the base class default is left in place.
        super(DaggingRegressor, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            random_state=random_state)

    def predict(self, X):
        """Predict regression targets for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input samples, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        y : array-like, shape = [n_samples]
            Predicted values, computed as the unweighted mean of the
            predictions of all fitted base estimators.
        """
        check_is_fitted(self, 'estimators_')
        predictions = [estimator.predict(X) for estimator in self.estimators_]
        # Average across estimators (axis 0), keeping one value per sample.
        return np.average(predictions, axis=0)

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(DaggingRegressor, self)._validate_estimator(
            default=DecisionTreeRegressor())
11 changes: 8 additions & 3 deletions dagging/tests/test_dagging.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from sklearn.utils.estimator_checks import check_estimator

from dagging import Dagging
from dagging import DaggingClassifier, DaggingRegressor


def test_check_estimator():
model = Dagging(random_state=0)
def test_check_dagging_classifier():
model = DaggingClassifier(random_state=0, n_estimators=2)
check_estimator(model)


def test_check_dagging_regressor():
model = DaggingRegressor(random_state=0)
check_estimator(model)

0 comments on commit d9f7260

Please sign in to comment.