Skip to content

Commit de0e578

Browse files
committed
Merge pull request #29 from dukebody/transformers-list
Allow specifying a list of transformers to apply sequentially to a set of columns. Resolves #17.
2 parents 425c60e + 6ed13df commit de0e578

File tree

4 files changed

+148
-37
lines changed

4 files changed

+148
-37
lines changed

README.rst

+27-2
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,23 @@ Transformation Mapping
6060
Map the Columns to Transformations
6161
**********************************
6262

63-
The mapper takes a list of pairs. The first is a column name from the pandas DataFrame (or a list of multiple columns, as we will see later). The second is an object which will perform the transformation which will be applied to that column::
63+
The mapper takes a list of pairs. The first is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second is an object which will perform the transformation which will be applied to that column::
6464

6565
>>> mapper = DataFrameMapper([
6666
... ('pet', sklearn.preprocessing.LabelBinarizer()),
67-
... ('children', sklearn.preprocessing.StandardScaler())
67+
... (['children'], sklearn.preprocessing.StandardScaler())
6868
... ])
6969

70+
The difference between specifying the column selector as `'column'` (as a simple string) and `['column']` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a 1-dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
71+
72+
This behaviour mimics the `__getitem__` indexing of pandas DataFrames:
73+
74+
>>> data['children'].shape
75+
(8,)
76+
>>> data[['children']].shape
77+
(8, 1)
78+
79+
Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like `OneHotEncoder` or `Imputer`, expect 2-dimensional input, with the shape `[n_samples, n_features]`.
7080

7181
Test the Transformation
7282
***********************
@@ -112,6 +122,21 @@ Now running ``fit_transform`` will run PCA on the ``children`` and ``salary`` co
112122
[ -6.4],
113123
[-15.4]])
114124

125+
Multiple transformers for the same column
126+
*****************************************
127+
128+
Multiple transformers can be applied to the same column by specifying them
129+
in a list::
130+
131+
>>> mapper3 = DataFrameMapper([
132+
... (['age'], [sklearn.preprocessing.Imputer(),
133+
... sklearn.preprocessing.StandardScaler()])])
134+
>>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
135+
>>> mapper3.fit_transform(data_3)
136+
array([[-1.22474487],
137+
[ 0. ],
138+
[ 1.22474487]])
139+
115140
Columns that don't need any transformation
116141
******************************************
117142

setup.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22

33
from setuptools import setup
4+
from setuptools.command.test import test as TestCommand
45
import re
56

67
for line in open('sklearn_pandas/__init__.py'):
@@ -9,6 +10,24 @@
910
__version__, = match.groups()
1011

1112

13+
class PyTest(TestCommand):
14+
user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]
15+
16+
def initialize_options(self):
17+
TestCommand.initialize_options(self)
18+
self.pytest_args = []
19+
20+
def finalize_options(self):
21+
TestCommand.finalize_options(self)
22+
self.test_args = []
23+
self.test_suite = True
24+
25+
def run(self):
26+
import pytest
27+
errno = pytest.main(self.pytest_args)
28+
raise SystemExit(errno)
29+
30+
1231
setup(name='sklearn-pandas',
1332
version=__version__,
1433
description='Pandas integration with sklearn',
@@ -19,6 +38,9 @@
1938
keywords=['scikit', 'sklearn', 'pandas'],
2039
install_requires=[
2140
'scikit-learn>=0.13',
41+
'scipy>=0.14',
2242
'pandas>=0.11.0',
23-
'numpy>=1.6.1']
24-
)
43+
'numpy>=1.6.1'],
44+
tests_require=['pytest', 'mock'],
45+
cmdclass={'test': PyTest},
46+
)

sklearn_pandas/__init__.py

+44-32
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
__version__ = '0.0.10'
32

43
import numpy as np
@@ -11,6 +10,7 @@
1110
if sys.version_info >= (3, 0):
1211
basestring = str
1312

13+
1414
def cross_val_score(model, X, *args, **kwargs):
1515
X = DataWrapper(X)
1616
return cross_validation.cross_val_score(model, X, *args, **kwargs)
@@ -23,6 +23,7 @@ def fit(self, X, *params, **kwparams):
2323
def predict(self, X, *params, **kwparams):
2424
super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
2525

26+
2627
try:
2728
class RandomizedSearchCV(grid_search.RandomizedSearchCV):
2829
def fit(self, X, *params, **kwparams):
@@ -53,34 +54,45 @@ def transform(self, X):
5354
return np.array(X).astype(np.float)
5455

5556

57+
def _handle_feature(fea):
58+
if hasattr(fea, 'toarray'):
59+
# sparse arrays should be converted to regular arrays
60+
# for hstack.
61+
fea = fea.toarray()
62+
63+
if len(fea.shape) == 1:
64+
fea = np.array([fea]).T
65+
66+
return fea
67+
68+
5669
class DataFrameMapper(BaseEstimator, TransformerMixin):
57-
'''
70+
"""
5871
Map Pandas data frame column subsets to their own
5972
sklearn transformation.
60-
'''
73+
"""
6174

6275
def __init__(self, features):
63-
'''
76+
"""
6477
Params:
6578
6679
features a list of pairs. The first element is the pandas column
6780
selector. This can be a string (for one column) or a list
6881
of strings. The second element is an object that supports
6982
sklearn's transform interface.
70-
'''
83+
"""
7184
self.features = features
7285

73-
7486
def _get_col_subset(self, X, cols):
75-
'''
87+
"""
7688
Get a subset of columns from the given table X.
7789
7890
X a Pandas dataframe; the table to select columns from
7991
cols a string or list of strings representing the columns
8092
to select
8193
8294
Returns a numpy array with the data from the selected columns
83-
'''
95+
"""
8496
return_vector = False
8597
if isinstance(cols, basestring):
8698
return_vector = True
@@ -101,47 +113,47 @@ def _get_col_subset(self, X, cols):
101113

102114
return t
103115

104-
105116
def fit(self, X, y=None):
106-
'''
117+
"""
107118
Fit a transformation from the pipeline
108119
109120
X the data to fit
110-
'''
111-
for columns, transformer in self.features:
112-
if transformer is not None:
113-
transformer.fit(self._get_col_subset(X, columns))
121+
"""
122+
for columns, transformers in self.features:
123+
if transformers is not None:
124+
if isinstance(transformers, list):
125+
# first fit_transform all transformers except the last one
126+
Xt = self._get_col_subset(X, columns)
127+
for transformer in transformers[:-1]:
128+
Xt = transformer.fit_transform(Xt)
129+
# then fit the last one without transformation
130+
transformers[-1].fit(Xt)
131+
else:
132+
transformers.fit(self._get_col_subset(X, columns))
114133
return self
115134

116-
117135
def transform(self, X):
118-
'''
136+
"""
119137
Transform the given data. Assumes that fit has already been called.
120138
121139
X the data to transform
122-
'''
140+
"""
123141
extracted = []
124-
for columns, transformer in self.features:
142+
for columns, transformers in self.features:
125143
# columns could be a string or list of
126144
# strings; we don't care because pandas
127145
# will handle either.
128-
if transformer is not None:
129-
fea = transformer.transform(self._get_col_subset(X, columns))
130-
else:
131-
fea = self._get_col_subset(X, columns)
132-
133-
if hasattr(fea, 'toarray'):
134-
# sparse arrays should be converted to regular arrays
135-
# for hstack.
136-
fea = fea.toarray()
137-
138-
if len(fea.shape) == 1:
139-
fea = np.array([fea]).T
140-
extracted.append(fea)
146+
Xt = self._get_col_subset(X, columns)
147+
if transformers is not None:
148+
if isinstance(transformers, list):
149+
for transformer in transformers:
150+
Xt = transformer.transform(Xt)
151+
else:
152+
Xt = transformers.transform(Xt)
153+
extracted.append(_handle_feature(Xt))
141154

142155
# combine the feature outputs into one array.
143156
# at this point we lose track of which features
144157
# were created from which input columns, so it's
145158
# assumed that that doesn't matter to the model.
146159
return np.hstack(extracted)
147-

tests/test_dataframe_mapper.py

+53-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import pytest
2+
from mock import Mock
23

34
from pandas import DataFrame
45
import pandas as pd
56
from sklearn.datasets import load_iris
67
from sklearn.pipeline import Pipeline
78
from sklearn.svm import SVC
89
from sklearn.feature_extraction.text import CountVectorizer
10+
from sklearn.preprocessing import Imputer, StandardScaler
911
import numpy as np
1012

1113
from sklearn_pandas import (
@@ -31,7 +33,7 @@ def iris_dataframe():
3133

3234
@pytest.fixture
3335
def cars_dataframe():
34-
return pd.read_csv("tests/test_data/cars.csv.gz")
36+
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
3537

3638

3739
def test_with_iris_dataframe(iris_dataframe):
@@ -73,3 +75,53 @@ def test_with_car_dataframe(cars_dataframe):
7375
labels = cars_dataframe["model"]
7476
scores = cross_val_score(pipeline, data, labels)
7577
assert scores.mean() > 0.30
78+
79+
80+
def test_cols_string_array():
81+
"""
82+
If a string is specified as the columns, the transformer
83+
is called with a 1-d array as input.
84+
"""
85+
dataframe = pd.DataFrame({"a": [1, 2, 3]})
86+
mock_transformer = Mock()
87+
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
88+
mapper = DataFrameMapper([("a", mock_transformer)])
89+
90+
mapper.fit_transform(dataframe)
91+
args, kwargs = mock_transformer.fit.call_args
92+
assert args[0].shape == (3,)
93+
94+
95+
def test_cols_list_column_vector():
96+
"""
97+
If a one-element list is specified as the columns, the transformer
98+
is called with a column vector as input.
99+
"""
100+
dataframe = pd.DataFrame({"a": [1, 2, 3]})
101+
mock_transformer = Mock()
102+
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
103+
mapper = DataFrameMapper([(["a"], mock_transformer)])
104+
105+
mapper.fit_transform(dataframe)
106+
args, kwargs = mock_transformer.fit.call_args
107+
assert args[0].shape == (3, 1)
108+
109+
110+
def test_list_transformers():
111+
"""
112+
Specifying a list of transformers applies them sequentially to the
113+
selected column.
114+
"""
115+
dataframe = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]})
116+
117+
mapper = DataFrameMapper([
118+
(["a"], [Imputer(), StandardScaler()]),
119+
(["b"], StandardScaler()),
120+
])
121+
dmatrix = mapper.fit_transform(dataframe)
122+
123+
assert pd.isnull(dmatrix).sum() == 0 # no null values
124+
125+
# all features have mean 0 and std deviation 1 (standardized)
126+
assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
127+
assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()

0 commit comments

Comments
 (0)