Skip to content

Commit 55920da

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 89e6d11 + 08ed9d0 commit 55920da

File tree

4 files changed

+187
-7
lines changed

4 files changed

+187
-7
lines changed

README.rst

+19-4
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ In particular, it provides:
1111

1212
1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
1313
2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
14-
3. A ``CategoricalImputer`` that replaces null-like values with the mode and works with string columns.
14+
3. A couple of special transformers that work well with pandas inputs: ``CategoricalImputer`` and ``FunctionTransformer``.
1515
1616
Installation
1717
------------
@@ -65,7 +65,7 @@ Transformation Mapping
6565
Map the Columns to Transformations
6666
**********************************
6767

68-
The mapper takes a list of tuples. The first is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second is an object which will perform the transformation which will be applied to that column. The third is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
68+
The mapper takes a list of tuples. The first element of each tuple is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second element is an object which will perform the transformation which will be applied to that column. The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
6969

7070
Let's see an example::
7171

@@ -406,11 +406,26 @@ Example: imputing with a fixed value:
406406
array(['a', 'b', 'b', 'a'], dtype=object)
407407

408408

409+
``FunctionTransformer``
410+
***********************
411+
412+
Often one wants to apply simple transformations to data such as ``np.log``. ``FunctionTransformer`` is a simple wrapper that takes any function and vectorizes it with ``np.vectorize``, so that it is applied element-wise and can be used as a transformer.
413+
414+
Example:
415+
416+
>>> from sklearn_pandas import FunctionTransformer
417+
>>> array = np.array([10, 100])
418+
>>> transformer = FunctionTransformer(np.log10)
419+
420+
>>> transformer.fit_transform(array)
421+
array([1., 2.])
422+
409423
Changelog
410424
---------
411425

412-
Unreleased
413-
**********
426+
1.8.0 (2018-12-01)
427+
******************
428+
* Add ``FunctionTransformer`` class (#117).
414429
* Fix column names derivation for dataframes with multi-index or non-string
415430
columns (#166).
416431
* Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformer's

sklearn_pandas/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
__version__ = '1.7.0'
1+
__version__ = '1.8.0'
22

33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
5-
from .categorical_imputer import CategoricalImputer # NOQA
5+
from .transformers import CategoricalImputer, FunctionTransformer # NOQA
66
from .features_generator import gen_features # NOQA

sklearn_pandas/transformers.py

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from sklearn.base import BaseEstimator, TransformerMixin
5+
from sklearn.utils.validation import check_is_fitted
6+
7+
8+
def _get_mask(X, value):
9+
"""
10+
Compute the boolean mask X == missing_values.
11+
"""
12+
if value == "NaN" or \
13+
value is None or \
14+
(isinstance(value, float) and np.isnan(value)):
15+
return pd.isnull(X)
16+
else:
17+
return X == value
18+
19+
20+
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values from a categorical/string np.ndarray or pd.Series
    with the most frequent value on the training data.

    Parameters
    ----------
    missing_values : string or "NaN", optional (default="NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. None and np.nan are treated
        as being the same, use the string value "NaN" for them.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, ``transform``
        writes the imputed values into the object it was given.

    strategy : string, optional (default = 'most_frequent')
        The imputation strategy.

        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.

    fill_value : string, optional (default='?')
        The value that all instances of `missing_values` are replaced
        with if `strategy` is set to `constant`. This is useful if
        you don't want to impute with the mode, or if there are multiple
        modes in your data and you want to choose a particular one. If
        `strategy` is not set to `constant`, this parameter is ignored.

    Attributes
    ----------
    fill_ : str
        The imputation fill value

    Raises
    ------
    ValueError
        On construction or on ``fit`` if `strategy` is not one of the
        supported strategies.
    """

    # Supported values for the ``strategy`` parameter.
    _STRATEGIES = ['constant', 'most_frequent']

    def __init__(
        self,
        missing_values='NaN',
        strategy='most_frequent',
        fill_value='?',
        copy=True
    ):
        self.missing_values = missing_values
        self.copy = copy
        self.fill_value = fill_value
        self.strategy = strategy

        self._check_strategy()

    def _check_strategy(self):
        """Raise ValueError if ``self.strategy`` is not a supported one."""
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                'Strategy {0} not in {1}'.format(
                    self.strategy, self._STRATEGIES
                )
            )

    def fit(self, X, y=None):
        """
        Compute the fill value: the mode of the non-missing training data
        for the "most_frequent" strategy, or `fill_value` for "constant".

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Training data.

        y : Passthrough for ``Pipeline`` compatibility.

        Returns
        -------
        self: CategoricalImputer

        Raises
        ------
        ValueError
            If `strategy` is not supported, if there is no non-missing
            value to compute a mode from, or if the mode is not unique.
        """
        # ``strategy`` may have been changed after construction (e.g. via
        # set_params), so validate again here; otherwise neither branch
        # below would run and ``modes`` would be unbound.
        self._check_strategy()

        mask = _get_mask(X, self.missing_values)
        X = X[~mask]
        if self.strategy == 'most_frequent':
            modes = pd.Series(X).mode()
        else:  # 'constant' (guaranteed by _check_strategy)
            modes = np.array([self.fill_value])

        if modes.shape[0] == 0:
            raise ValueError('Data is empty or all values are null')
        elif modes.shape[0] > 1:
            # Several values are tied for most frequent: no unique mode.
            raise ValueError('No value is repeated more than '
                             'once in the column')
        else:
            self.fill_ = modes[0]

        return self

    def transform(self, X):
        """
        Replace missing values in the input data with the fill value
        computed by ``fit``.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Data with values to be imputed.

        Returns
        -------
        np.ndarray
            Data with imputed values.
        """
        check_is_fitted(self, 'fill_')

        if self.copy:
            X = X.copy()

        # Without the copy above, this assignment writes into the caller's
        # object.
        mask = _get_mask(X, self.missing_values)
        X[mask] = self.fill_

        return np.asarray(X)
134+
135+
136+
class FunctionTransformer(BaseEstimator, TransformerMixin):
    """
    Use this class to convert an arbitrary function into a
    transformer.

    The function is wrapped with ``np.vectorize`` and applied to the input
    in ``transform``. Fitting is a no-op.

    Parameters
    ----------
    func : callable
        The function to apply to the data.
    """

    def __init__(self, func):
        # Stored under the same public name as the constructor argument:
        # scikit-learn's get_params/set_params/clone look parameters up by
        # that name, and a name-mangled ``__func`` attribute is invisible
        # to them.
        self.func = func

    def fit(self, x, y=None):
        """Do nothing and return self (there is nothing to learn)."""
        return self

    def transform(self, x):
        """Return the result of applying the vectorized function to x."""
        return np.vectorize(self.func)(x)

    def __call__(self, *args, **kwargs):
        """Call the wrapped function directly, without vectorization."""
        return self.func(*args, **kwargs)

tests/test_categorical_imputer.py tests/test_transformers.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import numpy as np
44
import pandas as pd
5+
from numpy.testing import assert_array_equal
56

6-
from sklearn_pandas import CategoricalImputer
7+
from sklearn_pandas import CategoricalImputer, FunctionTransformer
78
from sklearn_pandas import DataFrameMapper
89

910
# In sklearn18 NotFittedError was moved from utils.validation
@@ -178,3 +179,15 @@ def test_default_fill_value_for_constant_strategy(input_type):
178179

179180
assert imputer.fill_ == '?'
180181
assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
182+
183+
184+
def test_function_transformer():
    """
    Test whether arbitrary transformations using FunctionTransformer work.
    """
    array = np.array([10, 100])
    transformer = FunctionTransformer(np.log10)

    transformed = transformer.fit_transform(array)

    # log10 yields floats; compare with a tolerance rather than bit-exact
    # equality so the test does not depend on the platform's math library.
    np.testing.assert_allclose(transformed, np.array([1., 2.]))

0 commit comments

Comments
 (0)