Merge pull request #29 from dukebody/transformers-list

calpaterson · calpaterson · commit de0e578e2289 · 2015-08-10T07:05:31.000+01:00
Allow specifying a list of transformers to apply sequentially to a set of columns. Resolves #17.
diff --git a/README.rst b/README.rst
@@ -60,13 +60,23 @@ Transformation Mapping
 Map the Columns to Transformations
 **********************************
 
-The mapper takes a list of pairs. The first is a column name from the pandas DataFrame (or a list of multiple columns, as we will see later). The second is an object which will perform the transformation which will be applied to that column::
+The mapper takes a list of pairs. The first is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second is an object which will perform the transformation which will be applied to that column::
 
     >>> mapper = DataFrameMapper([
     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
-    ...     ('children', sklearn.preprocessing.StandardScaler())
+    ...     (['children'], sklearn.preprocessing.StandardScaler())
     ... ])
 
+The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a 1-dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
+
+This behaviour mimics the pattern of pandas' dataframe ``__getitem__`` indexing:
+
+    >>> data['children'].shape
+    (8,)
+    >>> data[['children']].shape
+    (8, 1)
+
+Be aware that some transformers expect 1-dimensional input (the label-oriented ones) while others, like ``OneHotEncoder`` or ``Imputer``, expect 2-dimensional input, with the shape ``[n_samples, n_features]``.
 
 Test the Transformation
 ***********************
@@ -112,6 +122,21 @@ Now running ``fit_transform`` will run PCA on the ``children`` and ``salary`` co
            [ -6.4],
            [-15.4]])
 
+Multiple transformers for the same column
+*****************************************
+
+Multiple transformers can be applied to the same column by specifying them
+in a list::
+
+    >>> mapper3 = DataFrameMapper([
+    ...     (['age'], [sklearn.preprocessing.Imputer(),
+    ...                sklearn.preprocessing.StandardScaler()])])
+    >>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
+    >>> mapper3.fit_transform(data_3)
+    array([[-1.22474487],
+           [ 0.        ],
+           [ 1.22474487]])
+
 Columns that don't need any transformation
 ******************************************
 
diff --git a/setup.py b/setup.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from setuptools import setup
+from setuptools.command.test import test as TestCommand
 import re
 
 for line in open('sklearn_pandas/__init__.py'):
@@ -9,6 +10,24 @@
         __version__, = match.groups()
 
 
+class PyTest(TestCommand):
+    user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]
+
+    def initialize_options(self):
+        TestCommand.initialize_options(self)
+        self.pytest_args = []
+
+    def finalize_options(self):
+        TestCommand.finalize_options(self)
+        self.test_args = []
+        self.test_suite = True
+
+    def run(self):
+        import pytest
+        errno = pytest.main(self.pytest_args)
+        raise SystemExit(errno)
+
+
 setup(name='sklearn-pandas',
       version=__version__,
       description='Pandas integration with sklearn',
@@ -19,6 +38,9 @@
       keywords=['scikit', 'sklearn', 'pandas'],
       install_requires=[
           'scikit-learn>=0.13',
+          'scipy>=0.14',
           'pandas>=0.11.0',
-          'numpy>=1.6.1']
-)
+          'numpy>=1.6.1'],
+      tests_require=['pytest', 'mock'],
+      cmdclass={'test': PyTest},
+      )
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,4 +1,3 @@
-
 __version__ = '0.0.10'
 
 import numpy as np
@@ -11,6 +10,7 @@
 if sys.version_info >= (3, 0):
     basestring = str
 
+
 def cross_val_score(model, X, *args, **kwargs):
     X = DataWrapper(X)
     return cross_validation.cross_val_score(model, X, *args, **kwargs)
@@ -23,6 +23,7 @@ def fit(self, X, *params, **kwparams):
     def predict(self, X, *params, **kwparams):
         super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
 
+
 try:
     class RandomizedSearchCV(grid_search.RandomizedSearchCV):
         def fit(self, X, *params, **kwparams):
@@ -53,34 +54,45 @@ def transform(self, X):
         return np.array(X).astype(np.float)
 
 
+def _handle_feature(fea):
+    if hasattr(fea, 'toarray'):
+        # sparse arrays should be converted to regular arrays
+        # for hstack.
+        fea = fea.toarray()
+
+    if len(fea.shape) == 1:
+        fea = np.array([fea]).T
+
+    return fea
+
+
 class DataFrameMapper(BaseEstimator, TransformerMixin):
-    '''
+    """
     Map Pandas data frame column subsets to their own
     sklearn transformation.
-    '''
+    """
 
     def __init__(self, features):
-        '''
+        """
         Params:
 
         features    a list of pairs. The first element is the pandas column
                     selector. This can be a string (for one column) or a list
                     of strings. The second element is an object that supports
                     sklearn's transform interface.
-        '''
+        """
         self.features = features
 
-
     def _get_col_subset(self, X, cols):
-        '''
+        """
         Get a subset of columns from the given table X.
 
         X       a Pandas dataframe; the table to select columns from
         cols    a string or list of strings representing the columns
                 to select
 
         Returns a numpy array with the data from the selected columns
-        '''
+        """
         return_vector = False
         if isinstance(cols, basestring):
             return_vector = True
@@ -101,47 +113,47 @@ def _get_col_subset(self, X, cols):
 
         return t
 
-
     def fit(self, X, y=None):
-        '''
+        """
         Fit a transformation from the pipeline
 
         X       the data to fit
-        '''
-        for columns, transformer in self.features:
-            if transformer is not None:
-                transformer.fit(self._get_col_subset(X, columns))
+        """
+        for columns, transformers in self.features:
+            if transformers is not None:
+                if isinstance(transformers, list):
+                    # first fit_transform all transformers except the last one
+                    Xt = self._get_col_subset(X, columns)
+                    for transformer in transformers[:-1]:
+                        Xt = transformer.fit_transform(Xt)
+                    # then fit the last one without transformation
+                    transformers[-1].fit(Xt)
+                else:
+                    transformers.fit(self._get_col_subset(X, columns))
         return self
 
-
     def transform(self, X):
-        '''
+        """
         Transform the given data. Assumes that fit has already been called.
 
         X       the data to transform
-        '''
+        """
         extracted = []
-        for columns, transformer in self.features:
+        for columns, transformers in self.features:
             # columns could be a string or list of
             # strings; we don't care because pandas
             # will handle either.
-            if transformer is not None:
-                fea = transformer.transform(self._get_col_subset(X, columns))
-            else:
-                fea = self._get_col_subset(X, columns)
-
-            if hasattr(fea, 'toarray'):
-                # sparse arrays should be converted to regular arrays
-                # for hstack.
-                fea = fea.toarray()
-
-            if len(fea.shape) == 1:
-                fea = np.array([fea]).T
-            extracted.append(fea)
+            Xt = self._get_col_subset(X, columns)
+            if transformers is not None:
+                if isinstance(transformers, list):
+                    for transformer in transformers:
+                        Xt = transformer.transform(Xt)
+                else:
+                    Xt = transformers.transform(Xt)
+            extracted.append(_handle_feature(Xt))
 
         # combine the feature outputs into one array.
         # at this point we lose track of which features
         # were created from which input columns, so it's
         # assumed that that doesn't matter to the model.
         return np.hstack(extracted)
-
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -1,11 +1,13 @@
 import pytest
+from mock import Mock
 
 from pandas import DataFrame
 import pandas as pd
 from sklearn.datasets import load_iris
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import Imputer, StandardScaler
 import numpy as np
 
 from sklearn_pandas import (
@@ -31,7 +33,7 @@ def iris_dataframe():
 
 @pytest.fixture
 def cars_dataframe():
-    return pd.read_csv("tests/test_data/cars.csv.gz")
+    return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
 
 
 def test_with_iris_dataframe(iris_dataframe):
@@ -73,3 +75,53 @@ def test_with_car_dataframe(cars_dataframe):
     labels = cars_dataframe["model"]
     scores = cross_val_score(pipeline, data, labels)
     assert scores.mean() > 0.30
+
+
+def test_cols_string_array():
+    """
+    If a string is specified as the columns, the transformer
+    is called with a 1-d array as input.
+    """
+    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    mock_transformer = Mock()
+    mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
+    mapper = DataFrameMapper([("a", mock_transformer)])
+
+    mapper.fit_transform(dataframe)
+    args, kwargs = mock_transformer.fit.call_args
+    assert args[0].shape == (3,)
+
+
+def test_cols_list_column_vector():
+    """
+    If a one-element list is specified as the columns, the transformer
+    is called with a column vector as input.
+    """
+    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    mock_transformer = Mock()
+    mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
+    mapper = DataFrameMapper([(["a"], mock_transformer)])
+
+    mapper.fit_transform(dataframe)
+    args, kwargs = mock_transformer.fit.call_args
+    assert args[0].shape == (3, 1)
+
+
+def test_list_transformers():
+    """
+    Specifying a list of transformers applies them sequentially to the
+    selected column.
+    """
+    dataframe = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]})
+
+    mapper = DataFrameMapper([
+        (["a"], [Imputer(), StandardScaler()]),
+        (["b"], StandardScaler()),
+    ])
+    dmatrix = mapper.fit_transform(dataframe)
+
+    assert pd.isnull(dmatrix).sum() == 0  # no null values
+
+    # all features have mean 0 and std deviation 1 (standardized)
+    assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
+    assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()