Skip to content

Commit b3ab3ea

Browse files
authored
Merge pull request scikit-learn-contrib#185 from scikit-learn-contrib/function-transformer
Move and document FunctionTransformer.
2 parents f3cf7f6 + 2725926 commit b3ab3ea

File tree

6 files changed

+183
-40
lines changed

6 files changed

+183
-40
lines changed

README.rst

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ In particular, it provides:
1111

1212
1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
1313
2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
14-
3. A ``CategoricalImputer`` that replaces null-like values with the mode and works with string columns.
14+
3. A couple of special transformers that work well with pandas inputs: ``CategoricalImputer`` and ``FunctionTransformer``.
1515
1616
Installation
1717
------------
@@ -406,11 +406,26 @@ Example: imputing with a fixed value:
406406
array(['a', 'b', 'b', 'a'], dtype=object)
407407

408408

409+
``FunctionTransformer``
410+
***********************
411+
412+
Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and applies vectorization so that it can be used as a transformer.
413+
414+
Example:
415+
416+
>>> from sklearn_pandas import FunctionTransformer
417+
>>> array = np.array([10, 100])
418+
>>> transformer = FunctionTransformer(np.log10)
419+
420+
>>> transformer.fit_transform(array)
421+
array([1., 2.])
422+
409423
Changelog
410424
---------
411425

412426
Unreleased
413427
**********
428+
* Add ``FunctionTransformer`` class (#117).
414429
* Fix column names derivation for dataframes with multi-index or non-string
415430
columns (#166).
416431
* Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'

sklearn_pandas/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,5 @@
22

33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
5-
from .categorical_imputer import CategoricalImputer # NOQA
5+
from .transformers import CategoricalImputer, FunctionTransformer # NOQA
66
from .features_generator import gen_features # NOQA
7-
from .act_as_transformer import ActAsTransformer # NOQA

sklearn_pandas/act_as_transformer.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

sklearn_pandas/transformers.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from sklearn.base import BaseEstimator, TransformerMixin
5+
from sklearn.utils.validation import check_is_fitted
6+
7+
8+
def _get_mask(X, value):
9+
"""
10+
Compute the boolean mask X == missing_values.
11+
"""
12+
if value == "NaN" or \
13+
value is None or \
14+
(isinstance(value, float) and np.isnan(value)):
15+
return pd.isnull(X)
16+
else:
17+
return X == value
18+
19+
20+
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values from a categorical/string np.ndarray or pd.Series
    with the most frequent value on the training data.

    Parameters
    ----------
    missing_values : string or "NaN", optional (default="NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. None and np.nan are treated
        as being the same, use the string value "NaN" for them.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created.

    strategy : string, optional (default = 'most_frequent')
        The imputation strategy.

        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.

    fill_value : string, optional (default='?')
        The value that all instances of `missing_values` are replaced
        with if `strategy` is set to `constant`. This is useful if
        you don't want to impute with the mode, or if there are multiple
        modes in your data and you want to choose a particular one. If
        `strategy` is not set to `constant`, this parameter is ignored.

    Attributes
    ----------
    fill_ : str
        The imputation fill value

    """

    def __init__(
        self,
        missing_values='NaN',
        strategy='most_frequent',
        fill_value='?',
        copy=True
    ):
        self.missing_values = missing_values
        self.copy = copy
        self.fill_value = fill_value
        self.strategy = strategy

        # Fail early on an unknown strategy, as callers of the constructor
        # already expect.
        self._check_strategy()

    def _check_strategy(self):
        """
        Raise ValueError if ``self.strategy`` is not a supported strategy.
        """
        strategies = ['constant', 'most_frequent']
        if self.strategy not in strategies:
            raise ValueError(
                'Strategy {0} not in {1}'.format(self.strategy, strategies)
            )

    def fit(self, X, y=None):
        """
        Determine the value used to replace missing entries.

        With the "most_frequent" strategy this is the mode of the
        non-missing values of X; with the "constant" strategy it is
        ``fill_value``.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Training data.

        y : Passthrough for ``Pipeline`` compatibility.

        Returns
        -------
        self: CategoricalImputer

        Raises
        ------
        ValueError
            If the strategy is not supported, if there is no non-missing
            value to compute a mode from, or if the mode is not unique.
        """

        # ``strategy`` may have been changed after construction (attribute
        # assignment or ``set_params``); re-validate so an unsupported value
        # produces a clear ValueError instead of an UnboundLocalError below.
        self._check_strategy()

        mask = _get_mask(X, self.missing_values)
        X = X[~mask]
        if self.strategy == 'most_frequent':
            modes = pd.Series(X).mode()
        elif self.strategy == 'constant':
            modes = np.array([self.fill_value])

        if modes.shape[0] == 0:
            raise ValueError('Data is empty or all values are null')
        elif modes.shape[0] > 1:
            # More than one mode: the most frequent value is ambiguous.
            raise ValueError('No value is repeated more than '
                             'once in the column')
        else:
            self.fill_ = modes[0]

        return self

    def transform(self, X):
        """
        Replace missing values in the input data with the fill value
        determined in ``fit``.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Data with values to be imputed. When ``copy`` is False the
            replacement is assigned into X itself.

        Returns
        -------
        np.ndarray
            Data with imputed values.
        """

        check_is_fitted(self, 'fill_')

        if self.copy:
            X = X.copy()

        mask = _get_mask(X, self.missing_values)
        X[mask] = self.fill_

        return np.asarray(X)
134+
135+
136+
class FunctionTransformer(BaseEstimator, TransformerMixin):
    """
    Wrap an arbitrary function so that it can be used as a transformer.

    The wrapped function is vectorized with ``np.vectorize`` and applied
    to the input in ``transform``; ``fit`` does nothing.

    Parameters
    ----------
    func : callable
        The function to apply to the data.
    """

    def __init__(self, func):
        # Stored under the same public name as the constructor parameter.
        # A double-underscore attribute would be name-mangled to
        # ``_FunctionTransformer__func``, which BaseEstimator.get_params()
        # (and therefore clone / parameter search) cannot look up.
        self.func = func

    def fit(self, x, y=None):
        """
        No-op; present for transformer/``Pipeline`` compatibility.

        Returns
        -------
        self: FunctionTransformer
        """
        return self

    def transform(self, x):
        """
        Apply the vectorized wrapped function to ``x`` and return the result.
        """
        return np.vectorize(self.func)(x)

    def __call__(self, *args, **kwargs):
        """
        Call the wrapped function directly (not vectorized) with the
        given arguments.
        """
        return self.func(*args, **kwargs)

tests/test_dataframe_mapper.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
from sklearn_pandas import DataFrameMapper, cross_val_score
3333
from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer
3434
from sklearn_pandas.pipeline import TransformerPipeline
35-
from sklearn_pandas import ActAsTransformer
3635

3736

3837
class MockXTransformer(object):
@@ -951,17 +950,3 @@ def test_heterogeneous_output_types_input_df():
951950
dft = M.fit_transform(df)
952951
assert dft['feat1'].dtype == np.dtype('int64')
953952
assert dft['feat2'].dtype == np.dtype('float64')
954-
955-
956-
def test_actastransformer():
957-
"""
958-
Test whether random transformations works
959-
"""
960-
df = pd.DataFrame({
961-
'feat1': [10, 100],
962-
})
963-
M = DataFrameMapper([
964-
('feat1', ActAsTransformer(np.log10))
965-
], input_df=True, df_out=True, default=None)
966-
dft = M.fit_transform(df)
967-
assert_array_equal([1., 2.], dft['feat1'].values)

tests/test_categorical_imputer.py renamed to tests/test_transformers.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import numpy as np
44
import pandas as pd
5+
from numpy.testing import assert_array_equal
56

6-
from sklearn_pandas import CategoricalImputer
7+
from sklearn_pandas import CategoricalImputer, FunctionTransformer
78
from sklearn_pandas import DataFrameMapper
89

910
# In sklearn18 NotFittedError was moved from utils.validation
@@ -178,3 +179,15 @@ def test_default_fill_value_for_constant_strategy(input_type):
178179

179180
assert imputer.fill_ == '?'
180181
assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
182+
183+
184+
def test_function_transformer():
    """
    FunctionTransformer should apply the wrapped function to the input
    when used through ``fit_transform``.
    """
    expected = np.array([1., 2.])
    log10_transformer = FunctionTransformer(np.log10)

    result = log10_transformer.fit_transform(np.array([10, 100]))

    assert_array_equal(expected, result)

0 commit comments

Comments
 (0)