Merge with upstream

devforfu · devforfu · commit ae88b7188d18 · 2018-08-23T19:47:27.000+05:00
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,6 @@
 .tox/
 build/
 dist/
-.cache/
+.cache/
+.idea/
+.pytest_cache/
diff --git a/README.rst b/README.rst
@@ -401,23 +401,29 @@ Example: imputing with a fixed value:
 
     >>> from sklearn_pandas import CategoricalImputer
     >>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
-    >>> imputer = CategoricalImputer(strategy='fixed_value', replacement='a')
+    >>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
     >>> imputer.fit_transform(data)
     array(['a', 'b', 'b', 'a'], dtype=object)
 
 
 Changelog
 ---------
 
-Development
+Unreleased
+**********
+* Fix column names derivation for dataframes with multi-index or non-string
+  columns (#166).
+* Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformer's
+  native fit_transform if implemented (#150).
+
+1.7.0 (2018-08-15)
 ******************
 * Fix issues with unicode names in ``get_names`` (#160).
 * Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
-* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
-  with values other than the mode (#144).
+* Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
+  with values other than the mode (#144), (#161).
 * Preserve input data types when no transform is supplied (#138).
 
-
 1.6.0 (2017-10-28)
 ******************
 * Add column name to exception during fit/transform (#110).
@@ -497,16 +503,19 @@ Other contributors:
 * Ariel Rossanigo (@arielrossanigo)
 * Arnau Gil Amat (@arnau126)
 * Assaf Ben-David (@AssafBenDavid)
+* Brendan Herger (@bjherger)
 * Cal Paterson (@calpaterson)
 * @defvorfu
 * Gustavo Sena Mafra (@gsmafra)
 * Israel Saeta Pérez (@dukebody)
 * Jeremy Howard (@jph00)
 * Jimmy Wan (@jimmywan)
+* Kristof Van Engeland (@kristofve91)
 * Olivier Grisel (@ogrisel)
 * Paul Butler (@paulgb)
 * Richard Miller (@rwjmiller)
 * Ritesh Agrawal (@ragrawal)
+* @SandroCasagrande
 * Timothy Sweetser (@hacktuarial)
 * Vitaley Zaretskey (@vzaretsk)
 * Zac Stewart (@zacstewart)
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.6.0'
+__version__ = '1.7.0'
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
@@ -33,49 +33,46 @@ class CategoricalImputer(BaseEstimator, TransformerMixin):
     copy : boolean, optional (default=True)
         If True, a copy of X will be created.
 
-    strategy : string, optional (default = 'mode')
-        If set to 'mode', replace all instances of `missing_values`
-        with the modal value. Otherwise, replace with
-        the value specified via `replacement`.
+    strategy : string, optional (default = 'most_frequent')
+        The imputation strategy.
 
-    replacement : string, optional (default='?')
+        - If "most_frequent", then replace missing using the most frequent
+          value along each column. Can be used with strings or numeric data.
+        - If "constant", then replace missing values with fill_value. Can be
+          used with strings or numeric data.
+
+    fill_value : string, optional (default='?')
         The value that all instances of `missing_values` are replaced
-        with if `strategy` is not set to 'mode'. This is useful if
+        with if `strategy` is set to `constant`. This is useful if
         you don't want to impute with the mode, or if there are multiple
         modes in your data and you want to choose a particular one. If
-        `strategy` is set to `mode`, this parameter is ignored.
+        `strategy` is not set to `constant`, this parameter is ignored.
 
     Attributes
     ----------
     fill_ : str
-        Most frequent value of the training data.
+        The imputation fill value
 
     """
 
     def __init__(
         self,
         missing_values='NaN',
-        strategy='mode',
-        replacement=None,
+        strategy='most_frequent',
+        fill_value='?',
         copy=True
     ):
         self.missing_values = missing_values
         self.copy = copy
-        self.replacement = replacement
+        self.fill_value = fill_value
         self.strategy = strategy
 
-        strategies = ['fixed_value', 'mode']
+        strategies = ['constant', 'most_frequent']
         if self.strategy not in strategies:
             raise ValueError(
                 'Strategy {0} not in {1}'.format(self.strategy, strategies)
             )
 
-        if self.strategy == 'fixed_value' and self.replacement is None:
-            raise ValueError(
-                'Please specify a value for \'replacement\''
-                'when using the fixed_value strategy.'
-            )
-
     def fit(self, X, y=None):
         """
 
@@ -95,10 +92,10 @@ def fit(self, X, y=None):
 
         mask = _get_mask(X, self.missing_values)
         X = X[~mask]
-        if self.strategy == 'mode':
+        if self.strategy == 'most_frequent':
             modes = pd.Series(X).mode()
-        elif self.strategy == 'fixed_value':
-            modes = np.array([self.replacement])
+        elif self.strategy == 'constant':
+            modes = np.array([self.fill_value])
         if modes.shape[0] == 0:
             raise ValueError('Data is empty or all values are null')
         elif modes.shape[0] > 1:
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -120,6 +120,16 @@ def __init__(self, features, default=False, sparse=False, df_out=False,
         if (df_out and (sparse or default)):
             raise ValueError("Can not use df_out with sparse or default")
 
+    def _build(self):
+        """
+        Build attributes built_features and built_default.
+        """
+        if isinstance(self.features, list):
+            self.built_features = [_build_feature(*f) for f in self.features]
+        else:
+            self.built_features = self.features
+        self.built_default = _build_transformer(self.default)
+
     @property
     def _selected_columns(self):
         """
@@ -204,12 +214,7 @@ def fit(self, X, y=None):
         y       the target vector relative to X, optional
 
         """
-        if isinstance(self.features, list):
-            self.built_features = [_build_feature(*f) for f in self.features]
-        else:
-            self.built_features = self.features
-
-        self.built_default = _build_transformer(self.default)
+        self._build()
 
         for columns, transformers, options in self.built_features:
             input_df = options.get('input_df', self.input_df)
@@ -239,7 +244,7 @@ def get_names(self, columns, transformer, x, alias=None):
         if alias is not None:
             name = alias
         elif isinstance(columns, list):
-            name = '_'.join(columns)
+            name = '_'.join(map(str, columns))
         else:
             name = columns
         num_cols = x.shape[1] if len(x.shape) > 1 else 1
@@ -279,23 +284,32 @@ def get_dtype(self, ex):
         else:
             raise TypeError(type(ex))
 
-    def transform(self, X):
+    def _transform(self, X, y=None, do_fit=False):
         """
-        Transform the given data. Assumes that fit has already been called.
-
-        X       the data to transform
+        Transform the given data with possibility to fit in advance.
+        Avoids code duplication for implementation of transform and
+        fit_transform.
         """
+        if do_fit:
+            self._build()
+
         extracted = []
         self.transformed_names_ = []
         for columns, transformers, options in self.built_features:
             input_df = options.get('input_df', self.input_df)
+
             # columns could be a string or list of
             # strings; we don't care because pandas
             # will handle either.
             Xt = self._get_col_subset(X, columns, input_df)
             if transformers is not None:
                 with add_column_names_to_exception(columns):
-                    Xt = transformers.transform(Xt)
+                    if do_fit and hasattr(transformers, 'fit_transform'):
+                        Xt = _call_fit(transformers.fit_transform, Xt, y)
+                    else:
+                        if do_fit:
+                            _call_fit(transformers.fit, Xt, y)
+                        Xt = transformers.transform(Xt)
             extracted.append(_handle_feature(Xt))
 
             alias = options.get('alias')
@@ -308,7 +322,12 @@ def transform(self, X):
             Xt = self._get_col_subset(X, unsel_cols, self.input_df)
             if self.built_default is not None:
                 with add_column_names_to_exception(unsel_cols):
-                    Xt = self.built_default.transform(Xt)
+                    if do_fit and hasattr(self.built_default, 'fit_transform'):
+                        Xt = _call_fit(self.built_default.fit_transform, Xt, y)
+                    else:
+                        if do_fit:
+                            _call_fit(self.built_default.fit, Xt, y)
+                        Xt = self.built_default.transform(Xt)
                 self.transformed_names_ += self.get_names(
                     unsel_cols, self.built_default, Xt)
             else:
@@ -355,6 +374,25 @@ def transform(self, X):
         else:
             return stacked
 
+    def transform(self, X):
+        """
+        Transform the given data. Assumes that fit has already been called.
+
+        X       the data to transform
+        """
+        return self._transform(X)
+
+    def fit_transform(self, X, y=None):
+        """
+        Fit a transformation from the pipeline and directly apply
+        it to the given data.
+
+        X       the data to fit
+
+        y       the target vector relative to X, optional
+        """
+        return self._transform(X, y, True)
+
     def get_params(self, deep=True):
         out = super(DataFrameMapper, self).get_params(deep=False)
         if not deep:
@@ -404,3 +442,5 @@ def set_params(self, **params):
 
         for instance in transformers_instances:
             instance.set_params(**assignment[id(instance)])
+
+
diff --git a/tests/test_categorical_imputer.py b/tests/test_categorical_imputer.py
@@ -147,26 +147,34 @@ def test_custom_replacement(replacement_value, input_type):
     Xc = X.copy()
 
     Xt = CategoricalImputer(
-        strategy='fixed_value',
-        replacement=replacement_value
+        strategy='constant',
+        fill_value=replacement_value
     ).fit_transform(X)
 
     assert pd.core.common.array_equivalent(np.asarray(X), np.asarray(Xc))
     assert isinstance(Xt, np.ndarray)
     assert (Xt == ['a', replacement_value, 'b', 'b']).all()
 
 
-def test_missing_replacement():
-    """
-    Raise error if no replacement value specified and strategy='fixed_value'
-    """
-    with pytest.raises(ValueError):
-        CategoricalImputer(strategy="fixed_value")
-
-
 def test_invalid_strategy():
     """
     Raise an error if an invalid strategy is entered
     """
     with pytest.raises(ValueError):
         CategoricalImputer(strategy="not_a_supported_strategy")
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_default_fill_value_for_constant_strategy(input_type):
+    data = ['a', np.nan, 'b', 'b']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imputer = CategoricalImputer(strategy='constant')
+    Xt = imputer.fit_transform(X)
+
+    assert imputer.fill_ == '?'
+    assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py

-Original file line number
+Diff line change
 .tox/
 build/
 dist/
 -.cache/
 +.cache/
 +.idea/
 +.pytest_cache/
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = '1.6.0'`
	`1`	`+__version__ = '1.7.0'`
`2`	`2`
`3`	`3`	`from .dataframe_mapper import DataFrameMapper # NOQA`
`4`	`4`	`from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA`