Skip to content

Commit ae88b71

Browse files
committed
Merge with upstream
2 parents 45992e1 + 0024cf7 commit ae88b71

File tree

7 files changed

+206
-57
lines changed

7 files changed

+206
-57
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@
33
.tox/
44
build/
55
dist/
6-
.cache/
6+
.cache/
7+
.idea/
8+
.pytest_cache/

README.rst

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -401,23 +401,29 @@ Example: imputing with a fixed value:
401401

402402
>>> from sklearn_pandas import CategoricalImputer
403403
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
404-
>>> imputer = CategoricalImputer(strategy='fixed_value', replacement='a')
404+
>>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
405405
>>> imputer.fit_transform(data)
406406
array(['a', 'b', 'b', 'a'], dtype=object)
407407

408408

409409
Changelog
410410
---------
411411

412-
Development
412+
Unreleased
413+
**********
414+
* Fix column names derivation for dataframes with multi-index or non-string
415+
columns (#166).
416+
* Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformer's
417+
native fit_transform if implemented. (#150)
418+
419+
1.7.0 (2018-08-15)
413420
******************
414421
* Fix issues with unicode names in ``get_names`` (#160).
415422
* Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
416-
* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
417-
with values other than the mode (#144).
423+
* Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
424+
with values other than the mode (#144), (#161).
418425
* Preserve input data types when no transform is supplied (#138).
419426

420-
421427
1.6.0 (2017-10-28)
422428
******************
423429
* Add column name to exception during fit/transform (#110).
@@ -497,16 +503,19 @@ Other contributors:
497503
* Ariel Rossanigo (@arielrossanigo)
498504
* Arnau Gil Amat (@arnau126)
499505
* Assaf Ben-David (@AssafBenDavid)
506+
* Brendan Herger (@bjherger)
500507
* Cal Paterson (@calpaterson)
501508
* @defvorfu
502509
* Gustavo Sena Mafra (@gsmafra)
503510
* Israel Saeta Pérez (@dukebody)
504511
* Jeremy Howard (@jph00)
505512
* Jimmy Wan (@jimmywan)
513+
* Kristof Van Engeland (@kristofve91)
506514
* Olivier Grisel (@ogrisel)
507515
* Paul Butler (@paulgb)
508516
* Richard Miller (@rwjmiller)
509517
* Ritesh Agrawal (@ragrawal)
518+
* @SandroCasagrande
510519
* Timothy Sweetser (@hacktuarial)
511520
* Vitaley Zaretskey (@vzaretsk)
512521
* Zac Stewart (@zacstewart)

sklearn_pandas/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '1.6.0'
1+
__version__ = '1.7.0'
22

33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA

sklearn_pandas/categorical_imputer.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,49 +33,46 @@ class CategoricalImputer(BaseEstimator, TransformerMixin):
3333
copy : boolean, optional (default=True)
3434
If True, a copy of X will be created.
3535
36-
strategy : string, optional (default = 'mode')
37-
If set to 'mode', replace all instances of `missing_values`
38-
with the modal value. Otherwise, replace with
39-
the value specified via `replacement`.
36+
strategy : string, optional (default = 'most_frequent')
37+
The imputation strategy.
4038
41-
replacement : string, optional (default='?')
39+
- If "most_frequent", then replace missing values using the most frequent
40+
value along each column. Can be used with strings or numeric data.
41+
- If "constant", then replace missing values with fill_value. Can be
42+
used with strings or numeric data.
43+
44+
fill_value : string, optional (default='?')
4245
The value that all instances of `missing_values` are replaced
43-
with if `strategy` is not set to 'mode'. This is useful if
46+
with if `strategy` is set to `constant`. This is useful if
4447
you don't want to impute with the mode, or if there are multiple
4548
modes in your data and you want to choose a particular one. If
46-
`strategy` is set to `mode`, this parameter is ignored.
49+
`strategy` is not set to `constant`, this parameter is ignored.
4750
4851
Attributes
4952
----------
5053
fill_ : str
51-
Most frequent value of the training data.
54+
The value used to replace missing entries: the most frequent training value, or `fill_value` when `strategy` is "constant".
5255
5356
"""
5457

5558
def __init__(
5659
self,
5760
missing_values='NaN',
58-
strategy='mode',
59-
replacement=None,
61+
strategy='most_frequent',
62+
fill_value='?',
6063
copy=True
6164
):
6265
self.missing_values = missing_values
6366
self.copy = copy
64-
self.replacement = replacement
67+
self.fill_value = fill_value
6568
self.strategy = strategy
6669

67-
strategies = ['fixed_value', 'mode']
70+
strategies = ['constant', 'most_frequent']
6871
if self.strategy not in strategies:
6972
raise ValueError(
7073
'Strategy {0} not in {1}'.format(self.strategy, strategies)
7174
)
7275

73-
if self.strategy == 'fixed_value' and self.replacement is None:
74-
raise ValueError(
75-
'Please specify a value for \'replacement\''
76-
'when using the fixed_value strategy.'
77-
)
78-
7976
def fit(self, X, y=None):
8077
"""
8178
@@ -95,10 +92,10 @@ def fit(self, X, y=None):
9592

9693
mask = _get_mask(X, self.missing_values)
9794
X = X[~mask]
98-
if self.strategy == 'mode':
95+
if self.strategy == 'most_frequent':
9996
modes = pd.Series(X).mode()
100-
elif self.strategy == 'fixed_value':
101-
modes = np.array([self.replacement])
97+
elif self.strategy == 'constant':
98+
modes = np.array([self.fill_value])
10299
if modes.shape[0] == 0:
103100
raise ValueError('Data is empty or all values are null')
104101
elif modes.shape[0] > 1:

sklearn_pandas/dataframe_mapper.py

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,16 @@ def __init__(self, features, default=False, sparse=False, df_out=False,
120120
if (df_out and (sparse or default)):
121121
raise ValueError("Can not use df_out with sparse or default")
122122

123+
def _build(self):
124+
"""
125+
Build attributes built_features and built_default.
126+
"""
127+
if isinstance(self.features, list):
128+
self.built_features = [_build_feature(*f) for f in self.features]
129+
else:
130+
self.built_features = self.features
131+
self.built_default = _build_transformer(self.default)
132+
123133
@property
124134
def _selected_columns(self):
125135
"""
@@ -204,12 +214,7 @@ def fit(self, X, y=None):
204214
y the target vector relative to X, optional
205215
206216
"""
207-
if isinstance(self.features, list):
208-
self.built_features = [_build_feature(*f) for f in self.features]
209-
else:
210-
self.built_features = self.features
211-
212-
self.built_default = _build_transformer(self.default)
217+
self._build()
213218

214219
for columns, transformers, options in self.built_features:
215220
input_df = options.get('input_df', self.input_df)
@@ -239,7 +244,7 @@ def get_names(self, columns, transformer, x, alias=None):
239244
if alias is not None:
240245
name = alias
241246
elif isinstance(columns, list):
242-
name = '_'.join(columns)
247+
name = '_'.join(map(str, columns))
243248
else:
244249
name = columns
245250
num_cols = x.shape[1] if len(x.shape) > 1 else 1
@@ -279,23 +284,32 @@ def get_dtype(self, ex):
279284
else:
280285
raise TypeError(type(ex))
281286

282-
def transform(self, X):
287+
def _transform(self, X, y=None, do_fit=False):
283288
"""
284-
Transform the given data. Assumes that fit has already been called.
285-
286-
X the data to transform
289+
Transform the given data, with the possibility to fit in advance.
290+
Avoids code duplication for implementation of transform and
291+
fit_transform.
287292
"""
293+
if do_fit:
294+
self._build()
295+
288296
extracted = []
289297
self.transformed_names_ = []
290298
for columns, transformers, options in self.built_features:
291299
input_df = options.get('input_df', self.input_df)
300+
292301
# columns could be a string or list of
293302
# strings; we don't care because pandas
294303
# will handle either.
295304
Xt = self._get_col_subset(X, columns, input_df)
296305
if transformers is not None:
297306
with add_column_names_to_exception(columns):
298-
Xt = transformers.transform(Xt)
307+
if do_fit and hasattr(transformers, 'fit_transform'):
308+
Xt = _call_fit(transformers.fit_transform, Xt, y)
309+
else:
310+
if do_fit:
311+
_call_fit(transformers.fit, Xt, y)
312+
Xt = transformers.transform(Xt)
299313
extracted.append(_handle_feature(Xt))
300314

301315
alias = options.get('alias')
@@ -308,7 +322,12 @@ def transform(self, X):
308322
Xt = self._get_col_subset(X, unsel_cols, self.input_df)
309323
if self.built_default is not None:
310324
with add_column_names_to_exception(unsel_cols):
311-
Xt = self.built_default.transform(Xt)
325+
if do_fit and hasattr(self.built_default, 'fit_transform'):
326+
Xt = _call_fit(self.built_default.fit_transform, Xt, y)
327+
else:
328+
if do_fit:
329+
_call_fit(self.built_default.fit, Xt, y)
330+
Xt = self.built_default.transform(Xt)
312331
self.transformed_names_ += self.get_names(
313332
unsel_cols, self.built_default, Xt)
314333
else:
@@ -355,6 +374,25 @@ def transform(self, X):
355374
else:
356375
return stacked
357376

377+
def transform(self, X):
378+
"""
379+
Transform the given data. Assumes that fit has already been called.
380+
381+
X the data to transform
382+
"""
383+
return self._transform(X)
384+
385+
def fit_transform(self, X, y=None):
386+
"""
387+
Fit a transformation from the pipeline and directly apply
388+
it to the given data.
389+
390+
X the data to fit
391+
392+
y the target vector relative to X, optional
393+
"""
394+
return self._transform(X, y, True)
395+
358396
def get_params(self, deep=True):
359397
out = super(DataFrameMapper, self).get_params(deep=False)
360398
if not deep:
@@ -404,3 +442,5 @@ def set_params(self, **params):
404442

405443
for instance in transformers_instances:
406444
instance.set_params(**assignment[id(instance)])
445+
446+

tests/test_categorical_imputer.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,26 +147,34 @@ def test_custom_replacement(replacement_value, input_type):
147147
Xc = X.copy()
148148

149149
Xt = CategoricalImputer(
150-
strategy='fixed_value',
151-
replacement=replacement_value
150+
strategy='constant',
151+
fill_value=replacement_value
152152
).fit_transform(X)
153153

154154
assert pd.core.common.array_equivalent(np.asarray(X), np.asarray(Xc))
155155
assert isinstance(Xt, np.ndarray)
156156
assert (Xt == ['a', replacement_value, 'b', 'b']).all()
157157

158158

159-
def test_missing_replacement():
160-
"""
161-
Raise error if no replacement value specified and strategy='fixed_value'
162-
"""
163-
with pytest.raises(ValueError):
164-
CategoricalImputer(strategy="fixed_value")
165-
166-
167159
def test_invalid_strategy():
168160
"""
169161
Raise an error if an invalid strategy is entered
170162
"""
171163
with pytest.raises(ValueError):
172164
CategoricalImputer(strategy="not_a_supported_strategy")
165+
166+
167+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
168+
def test_default_fill_value_for_constant_strategy(input_type):
169+
data = ['a', np.nan, 'b', 'b']
170+
171+
if input_type == 'pd':
172+
X = pd.Series(data)
173+
else:
174+
X = np.asarray(data, dtype=object)
175+
176+
imputer = CategoricalImputer(strategy='constant')
177+
Xt = imputer.fit_transform(X)
178+
179+
assert imputer.fill_ == '?'
180+
assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()

0 commit comments

Comments
 (0)