[MRG] EHN Add a FunctionSampler (#342)

glemaitre · web-flow · commit d482829da33a · 2018-02-17T15:47:38.000-08:00
diff --git a/doc/api.rst b/doc/api.rst
@@ -92,7 +92,6 @@ Prototype selection
    combine.SMOTEENN
    combine.SMOTETomek
 
-
 .. _ensemble_ref:
 
 :mod:`imblearn.ensemble`: Ensemble methods
@@ -112,6 +111,20 @@ Prototype selection
    ensemble.BalancedBaggingClassifier
    ensemble.EasyEnsemble
 
+.. _misc_ref:
+   
+Miscellaneous
+=============
+
+Imbalanced-learn provides some fast-prototyping tools.
+
+.. currentmodule:: imblearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   FunctionSampler
 
 .. _pipeline_ref:
 
diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst
@@ -0,0 +1,40 @@
+.. _miscellaneous:
+
+======================
+Miscellaneous samplers
+======================
+
+.. currentmodule:: imblearn
+
+.. _function_sampler:
+
+Custom samplers
+---------------
+
+A fully customizable sampler, :class:`FunctionSampler`, is available in
+imbalanced-learn so that you can quickly prototype your own sampler by defining
+a single function. Additional arguments can be passed to it through the
+parameter ``kw_args``, which accepts a dictionary. The following example shows
+how to retain the first 10 elements of the arrays ``X`` and ``y``::
+
+  >>> import numpy as np
+  >>> from imblearn import FunctionSampler
+  >>> from sklearn.datasets import make_classification
+  >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
+  ...                            n_redundant=0, n_repeated=0, n_classes=3,
+  ...                            n_clusters_per_class=1,
+  ...                            weights=[0.01, 0.05, 0.94],
+  ...                            class_sep=0.8, random_state=0)
+  >>> def func(X, y):
+  ...   return X[:10], y[:10]
+  >>> sampler = FunctionSampler(func=func)
+  >>> X_res, y_res = sampler.fit_sample(X, y)
+  >>> np.all(X_res == X[:10])
+  True
+  >>> np.all(y_res == y[:10])
+  True
+
+We illustrate the use of such a sampler to implement an outlier rejection
+estimator which can be easily used within a
+:class:`imblearn.pipeline.Pipeline`:
+:ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -14,6 +14,7 @@ User Guide
    under_sampling.rst
    combine.rst
    ensemble.rst
+   miscellaneous.rst
    metrics.rst
    Dataset loading utilities <datasets/index.rst>
    developers_utils.rst
diff --git a/examples/plot_outlier_rejections.py b/examples/plot_outlier_rejections.py
@@ -0,0 +1,88 @@
+"""
+===============================================================
+Customized sampler to implement an outlier rejections estimator
+===============================================================
+
+This example illustrates the use of a custom sampler to implement an outlier
+rejections estimator. It can be used easily within a pipeline in which the
+number of samples can vary during training, which usually is a limitation of
+the current scikit-learn pipeline.
+
+"""
+
+# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
+# License: MIT
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import make_moons, make_blobs
+from sklearn.ensemble import IsolationForest
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report
+
+from imblearn import FunctionSampler
+from imblearn.pipeline import make_pipeline
+
+print(__doc__)
+
+rng = np.random.RandomState(42)
+
+
+def plot_scatter(X, y, title):
+    plt.figure()  # fresh figure; one scatter per class, class 1 drawn first
+    for k in (1, 0):
+        plt.scatter(X[y == k, 0], X[y == k, 1], label='Class #%d' % k)
+    plt.legend()
+    plt.title(title)
+
+
+# Generate contaminated training data (every draw is seeded through rng)
+moons, _ = make_moons(n_samples=500, noise=0.05, random_state=rng)
+blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25),
+                                              (1.0, 2.0)],
+                      cluster_std=0.25, random_state=rng)
+outliers = rng.uniform(low=-3, high=3, size=(500, 2))
+X_train = np.vstack([moons, blobs, outliers])
+y_train = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
+                     np.zeros(blobs.shape[0], dtype=np.int8),
+                     rng.randint(0, 2, size=outliers.shape[0],
+                                 dtype=np.int8)])
+
+plot_scatter(X_train, y_train, 'Training dataset')
+
+# Generate non-contaminated testing data (seeded through rng as well)
+moons, _ = make_moons(n_samples=50, noise=0.05, random_state=rng)
+blobs, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25),
+                                             (1.0, 2.0)],
+                      cluster_std=0.25, random_state=rng)
+X_test = np.vstack([moons, blobs])
+y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
+                    np.zeros(blobs.shape[0], dtype=np.int8)])
+
+plot_scatter(X_test, y_test, 'Testing dataset')
+
+
+def outlier_rejection(X, y):
+    """Keep only the samples that IsolationForest labels as inliers."""
+    detector = IsolationForest(max_samples=100, contamination=0.4,
+                               random_state=rng)
+    # ``predict`` marks inliers with 1, so that is the mask we keep.
+    inlier_mask = detector.fit(X).predict(X) == 1
+    return X[inlier_mask], y[inlier_mask]
+
+
+reject_sampler = FunctionSampler(func=outlier_rejection)
+X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
+plot_scatter(X_inliers, y_inliers, 'Training data without outliers')
+# Same rejection step, now run inside a pipeline ahead of the classifier
+pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
+                     LogisticRegression(random_state=rng))
+y_pred = pipe.fit(X_train, y_train).predict(X_test)
+print(classification_report(y_test, y_pred))
+# For comparison: the classifier alone, fitted on the contaminated data
+clf = LogisticRegression(random_state=rng)
+y_pred = clf.fit(X_train, y_train).predict(X_test)
+print(classification_report(y_test, y_pred))
+
+plt.show()
diff --git a/imblearn/__init__.py b/imblearn/__init__.py
@@ -26,10 +26,7 @@
     Module which allowing to create pipeline with scikit-learn estimators.
 """
 
+from .base import FunctionSampler
 from ._version import __version__
 
-# list all submodules available in imblearn and version
-__all__ = [
-    'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
-    'under_sampling', 'utils', 'pipeline', '__version__'
-]
+__all__ = ['FunctionSampler', '__version__']
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -159,3 +159,129 @@ def fit(self, X, y):
         self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
 
         return self
+
+
+def _identity(X, y):
+    """Return ``X`` and ``y`` untouched; stands in when ``func`` is None."""
+    return X, y
+
+
+class FunctionSampler(SamplerMixin):
+    """Construct a sampler from calling an arbitrary callable.
+
+    Read more in the :ref:`User Guide <function_sampler>`.
+
+    Parameters
+    ----------
+    func : callable or None, optional (default=None)
+        The callable to use for the resampling. It is called as
+        ``func(X, y, **kw_args)`` on the validated data and its return value
+        is what ``sample`` returns. If func is None, then func will be the
+        identity function.
+
+    accept_sparse : bool, optional (default=True)
+        Whether sparse inputs are supported. By default, sparse inputs are
+        supported.
+
+    kw_args : dict, optional (default=None)
+        The keyword arguments forwarded to ``func``.
+
+    Notes
+    -----
+
+    See
+    :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_classification
+    >>> from imblearn import FunctionSampler
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
+    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
+
+    We can create a sampler which selects only the first ten samples, for
+    instance.
+
+    >>> def func(X, y):
+    ...   return X[:10], y[:10]
+    >>> sampler = FunctionSampler(func=func)
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> np.all(X_res == X[:10])
+    True
+    >>> np.all(y_res == y[:10])
+    True
+
+    We can also create a specific function which takes some arguments.
+
+    >>> from collections import Counter
+    >>> from imblearn.under_sampling import RandomUnderSampler
+    >>> def func(X, y, ratio, random_state):
+    ...   return RandomUnderSampler(ratio=ratio,
+    ...                             random_state=random_state).fit_sample(X, y)
+    >>> sampler = FunctionSampler(func=func,
+    ...                           kw_args={'ratio': 'auto', 'random_state': 0})
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> print('Resampled dataset shape {}'.format(
+    ...     sorted(Counter(y_res).items())))
+    Resampled dataset shape [(0, 100), (1, 100)]
+
+    """
+
+    def __init__(self, func=None, accept_sparse=True, kw_args=None):
+        self.func = func
+        self.accept_sparse = accept_sparse
+        self.kw_args = kw_args
+        self.logger = logging.getLogger(__name__)
+
+    def _check_X_y(self, X, y):
+        """Validate ``X`` and ``y``, honouring ``accept_sparse``."""
+        # CSR/CSC matrices are let through only when sparse input is allowed.
+        sparse_formats = ['csr', 'csc'] if self.accept_sparse else False
+        X, y = check_X_y(X, y, accept_sparse=sparse_formats)
+        y = check_target_type(y)
+
+        return X, y
+
+    def fit(self, X, y):
+        """Validate the data and remember its hash for ``sample``.
+
+        Returns
+        -------
+        self : object
+            Return self.
+
+        """
+        X, y = self._check_X_y(X, y)
+        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
+        # when using a sampler, ratio_ is supposed to exist after fit
+        self.ratio_ = 'is_fitted'
+
+        return self
+
+    def _sample(self, X, y, func=None, kw_args=None):
+        """Apply ``func`` (identity when None) to the data seen in ``fit``.
+
+        Raises
+        ------
+        RuntimeError
+            If ``X`` or ``y`` does not hash to the value stored by ``fit``.
+
+        """
+        X, y = self._check_X_y(X, y)
+        check_is_fitted(self, 'ratio_')
+        X_hash, y_hash = hash_X_y(X, y)
+        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
+            raise RuntimeError("X and y need to be same array earlier fitted.")
+
+        if func is None:
+            func = _identity
+
+        # Test the ``kw_args`` received, not ``self.kw_args``: the two can
+        # differ, and unpacking ``None`` raises a TypeError.
+        return func(X, y, **(kw_args if kw_args else {}))
+
+    def sample(self, X, y):
+        """Resample ``X`` and ``y`` with ``self.func`` and ``self.kw_args``."""
+        return self._sample(X, y, func=self.func, kw_args=self.kw_args)
diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py
@@ -0,0 +1,75 @@
+"""Test for miscellaneous samplers objects."""
+
+# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
+# License: MIT
+
+import pytest
+
+from scipy import sparse
+
+from sklearn.datasets import load_iris
+from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_allclose_dense_sparse
+
+from imblearn.datasets import make_imbalance
+from imblearn import FunctionSampler
+from imblearn.under_sampling import RandomUnderSampler
+
+iris = load_iris()
+X, y = make_imbalance(iris.data, iris.target, ratio={0: 10, 1: 25},
+                      random_state=0)
+
+
+def test_function_sampler_reject_sparse():
+    # ``match`` asserts on the error text; ``message`` never checked it.
+    sampler = FunctionSampler(accept_sparse=False)
+    with pytest.raises(TypeError, match="A sparse matrix was passed, "
+                       "but dense data is required"):
+        sampler.fit(sparse.csr_matrix(X), y)
+
+
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
+def test_function_sampler_identity(X, y):
+    # With no ``func`` given, the input must come back unchanged.
+    X_out, y_out = FunctionSampler().fit_sample(X, y)
+    assert_allclose_dense_sparse(X_out, X)
+    assert_array_equal(y_out, y)
+
+
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
+def test_function_sampler_func(X, y):
+    # A user-supplied callable decides what the sampler returns.
+    def keep_head(X, y):
+        return X[:10], y[:10]
+
+    sampler = FunctionSampler(func=keep_head)
+    X_out, y_out = sampler.fit_sample(X, y)
+    assert_allclose_dense_sparse(X_out, X[:10])
+    assert_array_equal(y_out, y[:10])
+
+
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
+def test_function_sampler_func_kwargs(X, y):
+    # ``kw_args`` must reach ``func``: same output as the wrapped sampler.
+    def undersample(X, y, ratio, random_state):
+        rus = RandomUnderSampler(ratio=ratio, random_state=random_state)
+        return rus.fit_sample(X, y)
+
+    params = {'ratio': 'auto', 'random_state': 0}
+    sampler = FunctionSampler(func=undersample, kw_args=params)
+    X_out, y_out = sampler.fit_sample(X, y)
+    X_ref, y_ref = RandomUnderSampler(random_state=0).fit_sample(X, y)
+    assert_allclose_dense_sparse(X_out, X_ref)
+    assert_array_equal(y_out, y_ref)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -25,7 +25,6 @@
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import set_random_state
 
-from imblearn.base import SamplerMixin
 from imblearn.over_sampling.base import BaseOverSampler
 from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler
 from imblearn.ensemble.base import BaseEnsembleSampler
@@ -47,10 +46,10 @@ def _yield_sampler_checks(name, Estimator):
     yield check_samplers_pandas
 
 
-def _yield_all_checks(name, Estimator):
+def _yield_all_checks(name, estimator):
     # trigger our checks if this is a SamplerMixin
-    if issubclass(Estimator, SamplerMixin):
-        for check in _yield_sampler_checks(name, Estimator):
+    if hasattr(estimator, 'sample'):
+        for check in _yield_sampler_checks(name, estimator):
             yield check