From 3f21671f7ecb77667171138929d8381c336ad3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Israel=20Saeta=20P=C3=A9rez?= Date: Sun, 20 Mar 2016 12:44:52 +0100 Subject: [PATCH 1/3] Add self.feature_indices_ attribute to mapper that specifies the indexes of self.features in the extracted array --- sklearn_pandas/dataframe_mapper.py | 10 ++++++++-- tests/test_dataframe_mapper.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 9a59f6d..1cb017c 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -104,6 +104,8 @@ def transform(self, X): X the data to transform """ extracted = [] + self.feature_indices_ = [0] + for columns, transformers in self.features: # columns could be a string or list of # strings; we don't care because pandas @@ -111,7 +113,11 @@ def transform(self, X): Xt = self._get_col_subset(X, columns) if transformers is not None: Xt = transformers.transform(Xt) - extracted.append(_handle_feature(Xt)) + + feature = _handle_feature(Xt) + extracted.append(feature) + self.feature_indices_.append(self.feature_indices_[-1] + + feature.shape[1]) # combine the feature outputs into one array. # at this point we lose track of which features @@ -120,7 +126,7 @@ def transform(self, X): # If any of the extracted features is sparse, combine sparsely. # Otherwise, combine as normal arrays. 
- if any(sparse.issparse(fea) for fea in extracted): + if any(sparse.issparse(feature) for feature in extracted): stacked = sparse.hstack(extracted).tocsr() # return a sparse matrix only if the mapper was initialized # with sparse=True diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index fa697b1..c592232 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -17,7 +17,7 @@ from sklearn.pipeline import Pipeline from sklearn.svm import SVC from sklearn.feature_extraction.text import CountVectorizer -from sklearn.preprocessing import Imputer, StandardScaler +from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder from sklearn.base import BaseEstimator, TransformerMixin import numpy as np from numpy.testing import assert_array_equal @@ -147,6 +147,34 @@ def test_handle_feature_1dim(): assert_array_equal(_handle_feature(array), np.array([[1], [2]])) +def test_feature_indices_dense(): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 2, 7]}) + mapper = DataFrameMapper([ + (['a'], OneHotEncoder()), + ('b', None) + ]) + transformed = mapper.fit_transform(df) + + indices = mapper.feature_indices_ + assert len(indices) == len(mapper.features) + 1 + assert (transformed[:, indices[0]:indices[1]] == + OneHotEncoder(sparse=False).fit_transform(df[['a']])).all() + assert (transformed[:, indices[1]:indices[2]] == df[['b']].values).all() + + +def test_feature_indices_sparse(simple_dataframe): + mapper = DataFrameMapper([ + (['a'], OneHotEncoder()) + ], sparse=True) + transformed = mapper.fit_transform(simple_dataframe) + + indices = mapper.feature_indices_ + assert len(indices) == len(mapper.features) + 1 + # compare equality by checking that all elements in the difference are 0 + assert (transformed[:, indices[0]:indices[1]] - + OneHotEncoder().fit_transform(simple_dataframe[['a']])).nnz == 0 + + def test_build_transformers(): """ When a list of transformers is passed, return a pipeline with From 
034cab29036a81d424552576f84f1f2bb40bccd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Israel=20Saeta=20P=C3=A9rez?= Date: Sun, 20 Mar 2016 13:15:16 +0100 Subject: [PATCH 2/3] Add documentation of new feature to README. --- README.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.rst b/README.rst index a1e80b0..2075434 100644 --- a/README.rst +++ b/README.rst @@ -102,6 +102,17 @@ Now that the transformation is trained, we confirm that it works on new data:: >>> np.round(mapper.transform(sample), 2) array([[ 1. , 0. , 0. , 1.04]]) +After transformation, the ``feature_indices_`` attribute of the mapper +indicates which columns of the resulting output array correspond to which +input features. Input feature ``i`` is mapped to features from +``feature_indices_[i]`` to ``feature_indices_[i+1]`` in transformed output. +For example: + + >>> mapper.feature_indices_[0], mapper.feature_indices_[1] # pet + (0, 3) + >>> mapper.feature_indices_[1], mapper.feature_indices_[2] # children + (3, 4) + Transform Multiple Columns ************************** @@ -195,6 +206,8 @@ Development * Deprecate custom cross-validation shim classes. * Require ``scikit-learn>=0.15.0``. Resolves #49. +* Add ``feature_indices_`` attribute indicating the mapping between input and + output variables. 1.1.0 (2015-12-06) From f9bedeef7735302e72a35d37ffdc0a79ee9c71db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Israel=20Saeta=20P=C3=A9rez?= Date: Sun, 20 Mar 2016 13:16:28 +0100 Subject: [PATCH 3/3] Convert mapper documentation to sklearn style. 
--- sklearn_pandas/dataframe_mapper.py | 35 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 1cb017c..e8dad5e 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -29,21 +29,32 @@ def _build_transformer(transformers): class DataFrameMapper(BaseEstimator, TransformerMixin): """ - Map Pandas data frame column subsets to their own - sklearn transformation. + Map pandas DataFrame column subsets via sklearn transforms to feature + arrays. + + Parameters + ---------- + features : list of tuples of the form (column_selector, transform) + A column selector may be a string (for selecting a single column + as a 1-d array) or a list of strings (for selecting one or more + columns as a 2-d array). + A transform is an object which supports sklearn's transform + interface, or a list of such objects. + + sparse : bool, optional (default=False) + Return a sparse matrix if set True and any of the extracted + features are sparse. + + Attributes + ---------- + feature_indices_ : array of shape (len(self.features) + 1,) + Indices of self.features in the extracted array. + Feature ``i`` in self.features is mapped to features from + ``feature_indices_[i]`` to ``feature_indices_[i+1]`` in transformed + output. """ def __init__(self, features, sparse=False): - """ - Params: - - features a list of pairs. The first element is the pandas column - selector. This can be a string (for one column) or a list - of strings. The second element is an object that supports - sklearn's transform interface, or a list of such objects. - sparse will return sparse matrix if set True and any of the - extracted features is sparse. Defaults to False. - """ if isinstance(features, list): features = [(columns, _build_transformer(transformers)) for (columns, transformers) in features]