move the sampler to the base module

glemaitre · glemaitre · commit 427486a6433d · 2018-02-18T00:12:50.000+01:00
diff --git a/doc/api.rst b/doc/api.rst
@@ -92,7 +92,6 @@ Prototype selection
    combine.SMOTEENN
    combine.SMOTETomek
 
-
 .. _ensemble_ref:
 
 :mod:`imblearn.ensemble`: Ensemble methods
@@ -112,6 +111,20 @@ Prototype selection
    ensemble.BalancedBaggingClassifier
    ensemble.EasyEnsemble
 
+.. _misc_ref:
+
+Miscellaneous
+=============
+
+Imbalanced-learn provides some fast-prototyping tools.
+
+.. currentmodule:: imblearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   FunctionSampler
 
 .. _pipeline_ref:
 
diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst
@@ -4,7 +4,7 @@
 Miscellaneous samplers
 ======================
 
-.. currentmodule:: imblearn.misc
+.. currentmodule:: imblearn
 
 .. _function_sampler:
 
@@ -18,7 +18,7 @@ a single function. Additional parameters can be added using the attribute
 to retain the 10 first elements of the array ``X`` and ``y``::
 
   >>> import numpy as np
-  >>> from imblearn.misc import FunctionSampler
+  >>> from imblearn import FunctionSampler
   >>> from sklearn.datasets import make_classification
   >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
   ...                            n_redundant=0, n_repeated=0, n_classes=3,
diff --git a/examples/plot_outlier_rejections.py b/examples/plot_outlier_rejections.py
@@ -21,7 +21,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report
 
-from imblearn.misc import FunctionSampler
+from imblearn import FunctionSampler
 from imblearn.pipeline import make_pipeline
 
 print(__doc__)
diff --git a/imblearn/__init__.py b/imblearn/__init__.py
@@ -26,10 +26,7 @@
     Module which allowing to create pipeline with scikit-learn estimators.
 """
 
+from .base import FunctionSampler
 from ._version import __version__
 
-# list all submodules available in imblearn and version
-__all__ = [
-    'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
-    'under_sampling', 'utils', 'pipeline', '__version__'
-]
+__all__ = ['FunctionSampler', '__version__']
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -159,3 +159,107 @@ def fit(self, X, y):
         self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
 
         return self
+
+
+def _identity(X, y):
+    return X, y  # no-op used by FunctionSampler when ``func`` is None
+
+
+class FunctionSampler(SamplerMixin):
+    """Construct a sampler from calling an arbitrary callable.
+
+    Read more in the :ref:`User Guide <function_sampler>`.
+
+    Parameters
+    ----------
+    func : callable or None, optional (default=None)
+        The callable used to resample the data. It is called as
+        ``func(X, y, **kw_args)`` and its return value is returned by
+        ``sample``. If func is None, the identity function is used.
+
+    accept_sparse : bool, optional (default=True)
+        Whether sparse inputs are supported. When True, CSR and CSC matrices
+        are accepted; when False, sparse input is rejected.
+
+    kw_args : dict, optional (default=None)
+        Keyword arguments forwarded to ``func``.
+
+    Notes
+    -----
+    See
+    :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_classification
+    >>> from imblearn import FunctionSampler
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
+    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
+
+    We can create a sampler which selects only the first ten samples.
+
+    >>> def func(X, y):
+    ...   return X[:10], y[:10]
+    >>> sampler = FunctionSampler(func=func)
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> np.all(X_res == X[:10])
+    True
+    >>> np.all(y_res == y[:10])
+    True
+
+    We can also wrap a function which takes additional arguments.
+
+    >>> from collections import Counter
+    >>> from imblearn.under_sampling import RandomUnderSampler
+    >>> def func(X, y, ratio, random_state):
+    ...   return RandomUnderSampler(ratio=ratio,
+    ...                             random_state=random_state).fit_sample(X, y)
+    >>> sampler = FunctionSampler(func=func,
+    ...                           kw_args={'ratio': 'auto', 'random_state': 0})
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> print('Resampled dataset shape {}'.format(
+    ...     sorted(Counter(y_res).items())))
+    Resampled dataset shape [(0, 100), (1, 100)]
+
+    """
+
+    def __init__(self, func=None, accept_sparse=True, kw_args=None):
+        self.func = func
+        self.accept_sparse = accept_sparse
+        self.kw_args = kw_args
+        self.logger = logging.getLogger(__name__)
+
+    def _check_X_y(self, X, y):
+        """Validate ``X`` and ``y``, honouring ``accept_sparse``."""
+        # only CSR/CSC sparse formats are accepted when sparse is enabled
+        accept_sparse = ['csr', 'csc'] if self.accept_sparse else False
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse)
+        y = check_target_type(y)
+
+        return X, y
+
+    def fit(self, X, y):
+        """Validate ``X``/``y`` and store their hashes; return ``self``."""
+        X, y = self._check_X_y(X, y)
+        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
+        # when using a sampler, ratio_ is supposed to exist after fit
+        self.ratio_ = 'is_fitted'
+        return self
+
+    def _sample(self, X, y, func=None, kw_args=None):
+        """Apply ``func`` (identity if None) to the data seen in ``fit``."""
+        X, y = self._check_X_y(X, y)
+        check_is_fitted(self, 'ratio_')
+        X_hash, y_hash = hash_X_y(X, y)
+        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
+            raise RuntimeError("X and y need to be same array earlier fitted.")
+        if func is None:
+            func = _identity
+        # test the forwarded ``kw_args`` itself rather than ``self.kw_args``
+        return func(X, y, **(kw_args if kw_args else {}))
+
+    def sample(self, X, y):
+        """Return ``func(X, y, **kw_args)`` for the data given to ``fit``."""
+        return self._sample(X, y, func=self.func, kw_args=self.kw_args)
diff --git a/imblearn/misc.py b/imblearn/misc.py
diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py
@@ -12,7 +12,7 @@
 from sklearn.utils.testing import assert_allclose_dense_sparse
 
 from imblearn.datasets import make_imbalance
-from imblearn.misc import FunctionSampler
+from imblearn import FunctionSampler
 from imblearn.under_sampling import RandomUnderSampler
 
 iris = load_iris()
@@ -28,19 +28,23 @@ def test_function_sampler_reject_sparse():
         sampler.fit(X_sparse, y)
 
 
-@pytest.mark.parametrize("X,y", [(X, y),
-                                 (sparse.csr_matrix(X), y),
-                                 (sparse.csc_matrix(X), y)])
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
 def test_function_sampler_identity(X, y):
     sampler = FunctionSampler()
     X_res, y_res = sampler.fit_sample(X, y)
     assert_allclose_dense_sparse(X_res, X)
     assert_array_equal(y_res, y)
 
 
-@pytest.mark.parametrize("X,y", [(X, y),
-                                 (sparse.csr_matrix(X), y),
-                                 (sparse.csc_matrix(X), y)])
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
 def test_function_sampler_func(X, y):
 
     def func(X, y):
@@ -52,9 +56,11 @@ def func(X, y):
     assert_array_equal(y_res, y[:10])
 
 
-@pytest.mark.parametrize("X,y", [(X, y),
-                                 (sparse.csr_matrix(X), y),
-                                 (sparse.csc_matrix(X), y)])
+@pytest.mark.parametrize(
+    "X, y",
+    [(X, y),
+     (sparse.csr_matrix(X), y),
+     (sparse.csc_matrix(X), y)])
 def test_function_sampler_func_kwargs(X, y):
 
     def func(X, y, ratio, random_state):