Merge pull request scikit-learn-contrib#185 from scikit-learn-contrib/function-transformer

dukebody · web-flow · commit b3ab3eaf5958 · 2018-12-01T20:09:27.000+01:00
Move and document FunctionTransformer.
diff --git a/README.rst b/README.rst
@@ -11,7 +11,7 @@ In particular, it provides:
 
 1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
 2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``skearn-pandas==2.0``.
-3. A ``CategoricalImputer`` that replaces null-like values with the mode and works with string columns.
+3. A couple of special transformers that work well with pandas inputs: ``CategoricalImputer`` and ``FunctionTransformer``.
 
 Installation
 ------------
@@ -406,11 +406,26 @@ Example: imputing with a fixed value:
     array(['a', 'b', 'b', 'a'], dtype=object)
 
 
+``FunctionTransformer``
+***********************
+
+Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and applies vectorization so that it can be used as a transformer.
+
+Example:
+
+    >>> from sklearn_pandas import FunctionTransformer
+    >>> array = np.array([10, 100])
+    >>> transformer = FunctionTransformer(np.log10)
+
+    >>> transformer.fit_transform(array)
+    array([1., 2.])
+
 Changelog
 ---------
 
 Unreleased
 **********
+* Add ``FunctionTransformer`` class (#117).
 * Fix column names derivation for dataframes with multi-index or non-string
   columns (#166).
 * Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -2,6 +2,5 @@
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
-from .categorical_imputer import CategoricalImputer  # NOQA
+from .transformers import CategoricalImputer, FunctionTransformer  # NOQA
 from .features_generator import gen_features  # NOQA
-from .act_as_transformer import ActAsTransformer  # NOQA
diff --git a/sklearn_pandas/act_as_transformer.py b/sklearn_pandas/act_as_transformer.py
diff --git a/sklearn_pandas/transformers.py b/sklearn_pandas/transformers.py
@@ -0,0 +1,152 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+
+def _get_mask(X, value):
+    """
+    Boolean mask of X == value; "NaN", None and float nan map to pd.isnull(X).
+    """
+    if value == "NaN" or \
+       value is None or \
+       (isinstance(value, float) and np.isnan(value)):
+        return pd.isnull(X)  # null-like placeholder: match every null in X
+    else:
+        return X == value
+
+
+class CategoricalImputer(BaseEstimator, TransformerMixin):
+    """
+    Impute missing values from a categorical/string np.ndarray or pd.Series
+    with the most frequent training value or a constant ``fill_value``.
+
+    Parameters
+    ----------
+    missing_values : string or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. None and np.nan are treated
+        as being the same, use the string value "NaN" for them.
+
+    copy : boolean, optional (default=True)
+        If True, ``transform`` works on a copy of X instead of X itself.
+
+    strategy : string, optional (default = 'most_frequent')
+        The imputation strategy.
+
+        - If "most_frequent", then replace missing using the most frequent
+          value along each column. Can be used with strings or numeric data.
+        - If "constant", then replace missing values with fill_value. Can be
+          used with strings or numeric data.
+
+    fill_value : string, optional (default='?')
+        The value that all instances of `missing_values` are replaced
+        with if `strategy` is set to `constant`. This is useful if
+        you don't want to impute with the mode, or if there are multiple
+        modes in your data and you want to choose a particular one. If
+        `strategy` is not set to `constant`, this parameter is ignored.
+
+    Attributes
+    ----------
+    fill_ : str
+        The imputation fill value, set by ``fit``.
+
+    """
+
+    def __init__(
+        self,
+        missing_values='NaN',
+        strategy='most_frequent',
+        fill_value='?',
+        copy=True
+    ):
+        self.missing_values = missing_values
+        self.copy = copy
+        self.fill_value = fill_value
+        self.strategy = strategy
+
+        strategies = ['constant', 'most_frequent']
+        if self.strategy not in strategies:
+            raise ValueError(
+                'Strategy {0} not in {1}'.format(self.strategy, strategies)
+            )
+
+    def fit(self, X, y=None):
+        """
+
+        Learn ``fill_``: the mode of the non-missing values or ``fill_value``.
+
+        Parameters
+        ----------
+            X : np.ndarray or pd.Series
+                Training data.
+
+            y : Passthrough for ``Pipeline`` compatibility.
+
+        Returns
+        -------
+            self: CategoricalImputer
+        """
+
+        mask = _get_mask(X, self.missing_values)
+        X = X[~mask]  # only non-missing values take part in the mode
+        if self.strategy == 'most_frequent':
+            modes = pd.Series(X).mode()
+        elif self.strategy == 'constant':
+            modes = np.array([self.fill_value])
+        if modes.shape[0] == 0:
+            raise ValueError('Data is empty or all values are null')
+        elif modes.shape[0] > 1:  # several values tie for the mode
+            raise ValueError('No value is repeated more than '
+                             'once in the column')
+        else:
+            self.fill_ = modes[0]
+
+        return self
+
+    def transform(self, X):
+        """
+
+        Replaces missing values in the input data with the ``fill_`` value
+        learned by ``fit``.
+
+        Parameters
+        ----------
+            X : np.ndarray or pd.Series
+                Data with values to be imputed.
+
+        Returns
+        -------
+            np.ndarray
+                Data with imputed values.
+        """
+
+        check_is_fitted(self, 'fill_')
+
+        if self.copy:
+            X = X.copy()  # keep the caller's data untouched
+
+        mask = _get_mask(X, self.missing_values)
+        X[mask] = self.fill_
+
+        return np.asarray(X)
+
+
+class FunctionTransformer(BaseEstimator, TransformerMixin):
+    """
+    Wrap an arbitrary function ``func`` as a stateless transformer; it is
+    stored as ``self.func`` so ``get_params``/``clone`` can read it back.
+    """
+
+    def __init__(self, func):
+        self.func = func  # public name required by BaseEstimator.get_params
+
+    def fit(self, x, y=None):
+        return self  # nothing to learn; present for the transformer API
+
+    def transform(self, x):
+        return np.vectorize(self.func)(x)  # apply func element-wise
+
+    def __call__(self, *args, **kwargs):
+        return self.func(*args, **kwargs)  # direct, non-vectorized call
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -32,7 +32,6 @@
 from sklearn_pandas import DataFrameMapper, cross_val_score
 from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer
 from sklearn_pandas.pipeline import TransformerPipeline
-from sklearn_pandas import ActAsTransformer
 
 
 class MockXTransformer(object):
@@ -951,17 +950,3 @@ def test_heterogeneous_output_types_input_df():
     dft = M.fit_transform(df)
     assert dft['feat1'].dtype == np.dtype('int64')
     assert dft['feat2'].dtype == np.dtype('float64')
-
-
-def test_actastransformer():
-    """
-    Test whether random transformations works
-    """
-    df = pd.DataFrame({
-        'feat1': [10, 100],
-    })
-    M = DataFrameMapper([
-        ('feat1', ActAsTransformer(np.log10))
-    ], input_df=True, df_out=True, default=None)
-    dft = M.fit_transform(df)
-    assert_array_equal([1., 2.], dft['feat1'].values)
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
@@ -2,8 +2,9 @@
 
 import numpy as np
 import pandas as pd
+from numpy.testing import assert_array_equal
 
-from sklearn_pandas import CategoricalImputer
+from sklearn_pandas import CategoricalImputer, FunctionTransformer
 from sklearn_pandas import DataFrameMapper
 
 # In sklearn18 NotFittedError was moved from utils.validation
@@ -178,3 +179,15 @@ def test_default_fill_value_for_constant_strategy(input_type):
 
     assert imputer.fill_ == '?'
     assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
+
+
+def test_function_transformer():
+    """
+    Test that FunctionTransformer applies an arbitrary function element-wise.
+    """
+    array = np.array([10, 100])
+    transformer = FunctionTransformer(np.log10)
+
+    transformed = transformer.fit_transform(array)
+
+    assert_array_equal(np.array([1., 2.]), transformed)