Merge remote-tracking branch 'upstream/master'

devforfu · devforfu · commit 55920dac3ad2 · 2019-01-29T12:39:22.000+05:00
diff --git a/README.rst b/README.rst
@@ -11,7 +11,7 @@ In particular, it provides:
 
 1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
 2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
-3. A ``CategoricalImputer`` that replaces null-like values with the mode and works with string columns.
+3. A couple of special transformers that work well with pandas inputs: ``CategoricalImputer`` and ``FunctionTransformer``.
 
 Installation
 ------------
@@ -65,7 +65,7 @@ Transformation Mapping
 Map the Columns to Transformations
 **********************************
 
-The mapper takes a list of tuples. The first is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second is an object which will perform the transformation which will be applied to that column. The third is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
+The mapper takes a list of tuples. The first element of each tuple is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second element is an object which will perform the transformation which will be applied to that column. The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
 
 Let's see an example::
 
@@ -406,11 +406,26 @@ Example: imputing with a fixed value:
     array(['a', 'b', 'b', 'a'], dtype=object)
 
 
+``FunctionTransformer``
+***********************
+
+Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and applies vectorization so that it can be used as a transformer.
+
+Example:
+
+    >>> from sklearn_pandas import FunctionTransformer
+    >>> array = np.array([10, 100])
+    >>> transformer = FunctionTransformer(np.log10)
+
+    >>> transformer.fit_transform(array)
+    array([1., 2.])
+
 Changelog
 ---------
 
-Unreleased
-**********
+1.8.0 (2018-12-01)
+******************
+* Add ``FunctionTransformer`` class (#117).
 * Fix column names derivation for dataframes with multi-index or non-string
   columns (#166).
 * Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,6 +1,6 @@
-__version__ = '1.7.0'
+__version__ = '1.8.0'
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
-from .categorical_imputer import CategoricalImputer  # NOQA
+from .transformers import CategoricalImputer, FunctionTransformer  # NOQA
 from .features_generator import gen_features  # NOQA
diff --git a/sklearn_pandas/transformers.py b/sklearn_pandas/transformers.py
@@ -0,0 +1,152 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+
+def _get_mask(X, value):
+    """
+    Compute the boolean mask of the entries of X that match value.
+
+    The string "NaN", None and a float NaN are all treated as "missing":
+    for them the mask is ``pd.isnull(X)``. Any other value is compared
+    with ``X == value``.
+    """
+    if value == "NaN" or \
+       value is None or \
+       (isinstance(value, float) and np.isnan(value)):
+        return pd.isnull(X)
+    else:
+        return X == value
+
+
+class CategoricalImputer(BaseEstimator, TransformerMixin):
+    """
+    Impute missing values from a categorical/string np.ndarray or pd.Series
+    with the most frequent value on the training data, or with a constant.
+
+    Parameters
+    ----------
+    missing_values : string or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. None and np.nan are treated
+        as being the same, use the string value "NaN" for them.
+
+    copy : boolean, optional (default=True)
+        If True, a copy of X will be created.
+
+    strategy : string, optional (default = 'most_frequent')
+        The imputation strategy.
+
+        - If "most_frequent", then replace missing using the most frequent
+          value along each column. Can be used with strings or numeric data.
+        - If "constant", then replace missing values with fill_value. Can be
+          used with strings or numeric data.
+
+    fill_value : string, optional (default='?')
+        The value that all instances of `missing_values` are replaced
+        with if `strategy` is set to `constant`. This is useful if
+        you don't want to impute with the mode, or if there are multiple
+        modes in your data and you want to choose a particular one. If
+        `strategy` is not set to `constant`, this parameter is ignored.
+
+    Attributes
+    ----------
+    fill_ : str
+        The imputation fill value
+
+    Raises
+    ------
+    ValueError
+        At construction time, if `strategy` is neither 'constant' nor
+        'most_frequent'.
+
+    """
+
+    def __init__(
+        self,
+        missing_values='NaN',
+        strategy='most_frequent',
+        fill_value='?',
+        copy=True
+    ):
+        self.missing_values = missing_values
+        self.copy = copy
+        self.fill_value = fill_value
+        self.strategy = strategy
+
+        # Unknown strategies are rejected as soon as the object is built.
+        strategies = ['constant', 'most_frequent']
+        if self.strategy not in strategies:
+            raise ValueError(
+                'Strategy {0} not in {1}'.format(self.strategy, strategies)
+            )
+
+    def fit(self, X, y=None):
+        """
+
+        Learn the fill value and store it in ``fill_``.
+
+        With the 'most_frequent' strategy this is the mode of the
+        non-missing training values; with the 'constant' strategy it is
+        ``fill_value``.
+
+        Parameters
+        ----------
+            X : np.ndarray or pd.Series
+                Training data.
+
+            y : Passthrough for ``Pipeline`` compatibility.
+
+        Returns
+        -------
+            self: CategoricalImputer
+
+        Raises
+        ------
+            ValueError
+                If no mode can be computed (no non-missing values) or if
+                there is more than one mode.
+        """
+
+        # Drop the missing entries so they cannot be picked as the mode.
+        mask = _get_mask(X, self.missing_values)
+        X = X[~mask]
+        if self.strategy == 'most_frequent':
+            modes = pd.Series(X).mode()
+        elif self.strategy == 'constant':
+            modes = np.array([self.fill_value])
+        if modes.shape[0] == 0:
+            raise ValueError('Data is empty or all values are null')
+        elif modes.shape[0] > 1:
+            # Several modes: the most frequent value is ambiguous.
+            raise ValueError('No value is repeated more than '
+                             'once in the column')
+        else:
+            self.fill_ = modes[0]
+
+        return self
+
+    def transform(self, X):
+        """
+
+        Replaces missing values in the input data with the fill value
+        learned by ``fit`` (``fill_``).
+
+        When ``copy`` is False the replacement is written into X itself.
+
+        Parameters
+        ----------
+            X : np.ndarray or pd.Series
+                Data with values to be imputed.
+
+        Returns
+        -------
+            np.ndarray
+                Data with imputed values.
+        """
+
+        # Fails if ``fit`` has not set ``fill_`` yet.
+        check_is_fitted(self, 'fill_')
+
+        if self.copy:
+            X = X.copy()
+
+        mask = _get_mask(X, self.missing_values)
+        X[mask] = self.fill_
+
+        return np.asarray(X)
+
+
+class FunctionTransformer(BaseEstimator, TransformerMixin):
+    """
+    Wrap an arbitrary function so that it can be used as a transformer.
+
+    The function is applied element-wise through ``np.vectorize``.
+
+    Parameters
+    ----------
+    func : callable
+        Function applied to every element of the input.
+    """
+
+    def __init__(self, func):
+        # Stored under the same name as the constructor argument:
+        # ``BaseEstimator`` (``get_params``, ``clone``, ``repr``) looks the
+        # parameter up by that name, which a name-mangled ``__func``
+        # attribute would hide.
+        self.func = func
+
+    def fit(self, x, y=None):
+        """Do nothing and return ``self``; there is nothing to learn."""
+        return self
+
+    def transform(self, x):
+        """Return the result of applying ``func`` element-wise to ``x``."""
+        return np.vectorize(self.func)(x)
+
+    def __call__(self, *args, **kwargs):
+        """Call the wrapped function directly, without vectorization."""
+        return self.func(*args, **kwargs)
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
@@ -2,8 +2,9 @@
 
 import numpy as np
 import pandas as pd
+from numpy.testing import assert_array_equal
 
-from sklearn_pandas import CategoricalImputer
+from sklearn_pandas import CategoricalImputer, FunctionTransformer
 from sklearn_pandas import DataFrameMapper
 
 # In sklearn18 NotFittedError was moved from utils.validation
@@ -178,3 +179,15 @@ def test_default_fill_value_for_constant_strategy(input_type):
 
     assert imputer.fill_ == '?'
     assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
+
+
+def test_function_transformer():
+    """
+    Test that FunctionTransformer applies an arbitrary function
+    (here ``np.log10``) to every element of the input array.
+    """
+    array = np.array([10, 100])
+    transformer = FunctionTransformer(np.log10)
+
+    transformed = transformer.fit_transform(array)
+
+    assert_array_equal(np.array([1., 2.]), transformed)