From d9f7260ad6076f43fd5bd505ee0c80fdc33343cc Mon Sep 17 00:00:00 2001 From: ichkoar Date: Thu, 3 Jan 2019 16:08:44 +0200 Subject: [PATCH] Refactor and add DaggingRegressor --- README.rst | 2 +- dagging/__init__.py | 2 +- dagging/_dagging.py | 189 ++++++++++++++++++++++++---------- dagging/tests/test_dagging.py | 11 +- 4 files changed, 145 insertions(+), 59 deletions(-) diff --git a/README.rst b/README.rst index ec3ac2f..8c8ebcb 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,7 @@ Example .. code-block:: python - from dagging import Dagging + from dagging import DaggingClassifier from sklearn.datasets import load_iris # Load Iris from from scikit-learn. diff --git a/dagging/__init__.py b/dagging/__init__.py index 4d5d603..f5ce638 100644 --- a/dagging/__init__.py +++ b/dagging/__init__.py @@ -1,5 +1,5 @@ from ._version import get_versions -from ._dagging import Dagging, DaggingClassifier # noqa +from ._dagging import DaggingClassifier, DaggingRegressor # noqa __version__ = get_versions()['version'] del get_versions diff --git a/dagging/_dagging.py b/dagging/_dagging.py index c6fb416..c6be2a0 100644 --- a/dagging/_dagging.py +++ b/dagging/_dagging.py @@ -1,60 +1,22 @@ import numpy as np -from sklearn.base import ClassifierMixin +from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor from sklearn.ensemble.base import BaseEnsemble -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import check_cv from sklearn.preprocessing import LabelEncoder -from sklearn.tree import DecisionTreeClassifier +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import check_X_y from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted, check_random_state -class Dagging(BaseEnsemble, ClassifierMixin): - """A Dagging classifier. 
- This meta classifier creates a number of disjoint, stratified folds out of - the data and feeds each chunk of data to a copy of the supplied base - classifier. Predictions are made via hard or soft voting. - Useful for base classifiers that are quadratic or worse in time behavior, - regarding number of instances in the training data. - - Parameters - ---------- - base_estimator : object or None, optional (default=None) - The base estimator to fit on random subsets of the dataset. - If None, then the base estimator is a decision tree. - n_estimators : int, optional (default=3) - The number of base estimators in the ensemble. - voting : str, {'hard', 'soft'} (default='soft') - If 'hard', uses predicted class labels for majority rule voting. - Else if 'soft', predicts the class label based on the argmax of - the sums of the predicted probabilities, which is recommended for - an ensemble of well-calibrated classifiers. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - base_estimator_ : estimator - The base estimator from which the ensemble is grown. - estimators_ : list of estimators - The collection of fitted base estimators. - References - ---------- - .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models. 
- In: Fourteenth international Conference on Machine Learning, - San Francisco, CA, 367-375, 1997 - """ - +class BaseDagging(BaseEnsemble): def __init__(self, base_estimator=None, n_estimators=3, voting='soft', random_state=None): - super(Dagging, self).__init__( + super(BaseDagging, self).__init__( base_estimator=base_estimator, n_estimators=n_estimators) self.voting = voting @@ -78,25 +40,29 @@ def fit(self, X, y): self : object """ X, y = check_X_y(X, y) - check_classification_targets(y) - - if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: - raise NotImplementedError('Multilabel and multi-output' - ' classification is not supported.') if self.voting not in ('soft', 'hard'): raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)" % self.voting) self._validate_estimator() + if is_classifier(self.base_estimator_): + check_classification_targets(y) + if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: + raise NotImplementedError('Multilabel and multi-output' + ' classification is not supported.') + self.le_ = LabelEncoder().fit(y) + self.classes_ = self.le_.classes_ + transformed_y = self.le_.transform(y) + else: + transformed_y = y - self.le_ = LabelEncoder().fit(y) - self.classes_ = self.le_.classes_ self.estimators_ = [] - transformed_y = self.le_.transform(y) rs = check_random_state(self.random_state) - splitter = StratifiedKFold(self.n_estimators, random_state=rs) + splitter = check_cv(cv=self.n_estimators, + y=transformed_y, + classifier=is_classifier(self.base_estimator_)) for _, index in splitter.split(X, transformed_y): estimator = self._make_estimator(append=False, @@ -106,6 +72,57 @@ def fit(self, X, y): return self + +class DaggingClassifier(BaseDagging, ClassifierMixin): + """A Dagging classifier. + This meta classifier creates a number of disjoint, stratified folds out of + the data and feeds each chunk of data to a copy of the supplied base + classifier. Predictions are made via hard or soft voting. 
+ Useful for base classifiers that are quadratic or worse in time behavior, + regarding number of instances in the training data. + + Parameters + ---------- + base_estimator : object or None, optional (default=None) + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a decision tree. + n_estimators : int, optional (default=3) + The number of base estimators in the ensemble. + voting : str, {'hard', 'soft'} (default='soft') + If 'hard', uses predicted class labels for majority rule voting. + Else if 'soft', predicts the class label based on the argmax of + the sums of the predicted probabilities, which is recommended for + an ensemble of well-calibrated classifiers. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + estimators_ : list of estimators + The collection of fitted base estimators. + References + ---------- + .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models. + In: Fourteenth international Conference on Machine Learning, + San Francisco, CA, 367-375, 1997 + """ + + def __init__(self, + base_estimator=None, + n_estimators=3, + voting='soft', + random_state=None): + super(DaggingClassifier, self).__init__( + base_estimator=base_estimator, + n_estimators=n_estimators, + voting=voting, + random_state=random_state) + def predict(self, X): """ Predict class labels for X. 
Parameters @@ -168,8 +185,72 @@ def predict_proba(self): def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" - super(Dagging, self)._validate_estimator( + super(DaggingClassifier, self)._validate_estimator( default=DecisionTreeClassifier()) -DaggingClassifier = Dagging +class DaggingRegressor(BaseDagging, RegressorMixin): + """A Dagging regressor. + This meta regressor creates a number of disjoint folds out of + the data and feeds each chunk of data to a copy of the supplied base + regressor. Predictions are made by averaging the base regressors' outputs. + Useful for base regressors that are quadratic or worse in time behavior, + regarding number of instances in the training data. + + Parameters + ---------- + base_estimator : object or None, optional (default=None) + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a decision tree. + n_estimators : int, optional (default=3) + The number of base estimators in the ensemble. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + estimators_ : list of estimators + The collection of fitted base estimators. + References + ---------- + .. [1] Ting, K. M., Witten, I. H.: Stacking Bagged and Dagged Models. 
+ In: Fourteenth international Conference on Machine Learning, + San Francisco, CA, 367-375, 1997 + """ + + def __init__(self, + base_estimator=None, + n_estimators=3, + random_state=None): + super(DaggingRegressor, self).__init__( + base_estimator=base_estimator, + n_estimators=n_estimators, + random_state=random_state) + + def predict(self, X): + """ Predict regression targets for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Input vectors, where n_samples is the number of samples and + n_features is the number of features. + Returns + ------- + y : array-like, shape = [n_samples] + Predicted values, averaged over the base estimators. + """ + check_is_fitted(self, 'estimators_') + predictions = [] + for estimator in self.estimators_: + predictions.append(estimator.predict(X)) + return np.average(predictions, axis=0) + + def _validate_estimator(self): + """Check the estimator and set the base_estimator_ attribute.""" + super(DaggingRegressor, self)._validate_estimator( + default=DecisionTreeRegressor()) diff --git a/dagging/tests/test_dagging.py b/dagging/tests/test_dagging.py index 2b18785..2b009fc 100644 --- a/dagging/tests/test_dagging.py +++ b/dagging/tests/test_dagging.py @@ -1,8 +1,13 @@ from sklearn.utils.estimator_checks import check_estimator -from dagging import Dagging +from dagging import DaggingClassifier, DaggingRegressor -def test_check_estimator(): - model = Dagging(random_state=0) +def test_check_dagging_classifier(): + model = DaggingClassifier(random_state=0, n_estimators=2) + check_estimator(model) + + +def test_check_dagging_regressor(): + model = DaggingRegressor(random_state=0) check_estimator(model)