diff --git a/examples/01 Flowers and Forests - A Simple Pipeline.ipynb b/examples/01 Flowers and Forests - A Simple Pipeline.ipynb
new file mode 100644
index 0000000..f85c2ba
--- /dev/null
+++ b/examples/01 Flowers and Forests - A Simple Pipeline.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import sklearn.datasets\n",
+    "import pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn_pandas import DataFrameMapper, make_dataframe_pipeline\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.cross_validation import cross_val_score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gather a Tidy Dataframe\n",
+    "----"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)class
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " class \n", + "0 setosa \n", + "1 setosa \n", + "2 setosa \n", + "3 setosa \n", + "4 setosa " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_data = sklearn.datasets.load_iris()\n", + "iris = pandas.DataFrame(data = iris_data[\"data\"], columns=iris_data[\"feature_names\"])\n", + "iris[\"class\"] = iris_data[\"target_names\"][iris_data[\"target\"]]\n", + "\n", + "iris.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assemble a Simple Learning Pipeline\n", + "--------\n", + "\n", + "A DataFramePipeline begins with a DataFrameMapper, specify how features **`X`** and targets **`y`** are extracted from an input frame. It ends with an estimator object.\n", + "\n", + "In this case, extract each available feature without transformation and specify the class label as the target." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "forest_pipeline = make_dataframe_pipeline([\n", + " DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n", + " RandomForestClassifier(n_estimators=200)\n", + " ])\n", + "\n", + "logistic_pipeline = make_dataframe_pipeline([\n", + " DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n", + " LogisticRegression()\n", + " ])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cross Validate\n", + "-----\n", + "\n", + "Cross validation requires the target **`y`** to perform train-test splits. Use the pipeline's DataFrameMapper to extract the target feature array from input data. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
forestlogistic
count5.0000005.000000
mean0.9600000.960000
std0.0278890.043461
min0.9333330.900000
25%0.9333330.933333
50%0.9666670.966667
75%0.9666671.000000
max1.0000001.000000
\n", + "
" + ], + "text/plain": [ + " forest logistic\n", + "count 5.000000 5.000000\n", + "mean 0.960000 0.960000\n", + "std 0.027889 0.043461\n", + "min 0.933333 0.900000\n", + "25% 0.933333 0.933333\n", + "50% 0.966667 0.966667\n", + "75% 0.966667 1.000000\n", + "max 1.000000 1.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cross_val_result = pandas.DataFrame.from_dict({\n", + " \"forest\" : cross_val_score(\n", + " estimator = forest_pipeline,\n", + " X = iris, y = forest_pipeline._dataframe_mapper.extract_y(iris),\n", + " cv = 5, scoring=\"accuracy\"),\n", + " \"logistic\" : cross_val_score(\n", + " estimator = logistic_pipeline,\n", + " X = iris, y = logistic_pipeline._dataframe_mapper.extract_y(iris),\n", + " cv = 5, scoring=\"accuracy\")\n", + " })\n", + "\n", + "cross_val_result.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Extract Feature Metadata\n", + "----\n", + "\n", + "The DataFrameMapper may be used to associate estimator metadata with feature source information. In this case, the `feature_importances_` vector is associated with the source column name." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal length (cm) 0.111118\n", + "sepal width (cm) 0.028009\n", + "petal length (cm) 0.455807\n", + "petal width (cm) 0.405066\n", + "Name: feature_importances, dtype: float64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest_pipeline.fit(iris)\n", + "pandas.Series(\n", + " data = forest_pipeline._final_estimator.feature_importances_,\n", + " index = forest_pipeline._dataframe_mapper.X_columns_,\n", + " name=\"feature_importances\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py index 537ab56..b576d40 100644 --- a/sklearn_pandas/__init__.py +++ b/sklearn_pandas/__init__.py @@ -2,3 +2,4 @@ from .dataframe_mapper import DataFrameMapper # NOQA from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA +from .dataframe_pipeline import DataFramePipeline, make_dataframe_pipeline diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 9a59f6d..bb755db 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -1,3 +1,4 @@ +import itertools import sys import pandas as pd import numpy as np @@ -29,27 +30,95 @@ def _build_transformer(transformers): class DataFrameMapper(BaseEstimator, TransformerMixin): """ - Map Pandas data frame column subsets to their own - sklearn transformation. + Map pandas DataFrame column subsets via sklearn transforms to feature + arrays. + + Parameters + ---------- + features : list of ((column_selector, [transform]) or column_selector) + Each feature is composed of a pandas column selector and an + optional transform. A column selector may be a string (for a single + column) or a list of strings. 
+        supports sklearn's transform interface, or a list of such objects.
+
+    y_feature : (column_selector, [transform]) or column_selector (optional)
+        A single feature, as per entries in features, used to extract a
+        single value from input frames during fitting.
+
+    sparse : bool, optional (default=False)
+        Return a sparse matrix if set True and any extracted features are
+        sparse.
+
+    Attributes
+    ----------
+    feature_widths_ : array of shape(len(self.features))
+        Widths of self.features in the extracted X array.
+        Feature 'i' in self.features produces output features of the given
+        width.
+
+    feature_indices_ : array of shape(len(self.features) + 1)
+        Indices of self.features in the extracted X array.
+        Feature 'i' in self.features is mapped to features from
+        'feature_indices_[i]' to 'feature_indices_[i+1]' in transformed
+        output.
+
+    X_n_features_ : int
+        Number of output features in extracted feature arrays.
+
+    X_features_ : array of shape(X_n_features_, 2)
+        Array of source feature pairs used to generate each output feature.
+
+    X_columns_ : array of shape(X_n_features_)
+        Array of column selectors used to generate each output feature.
     """
-    def __init__(self, features, sparse=False):
-        """
-        Params:
-
-        features    a list of pairs. The first element is the pandas column
-                    selector. This can be a string (for one column) or a list
-                    of strings. The second element is an object that supports
-                    sklearn's transform interface, or a list of such objects.
-        sparse      will return sparse matrix if set True and any of the
-                    extracted features is sparse. Defaults to False.
-        """
-        if isinstance(features, list):
-            features = [(columns, _build_transformer(transformers))
-                        for (columns, transformers) in features]
-        self.features = features
+    @property
+    def feature_indices_(self):
+        """Indices of self.features in the extracted X array."""
+        return np.cumsum([0] + list(self.feature_widths_))
+
+    @property
+    def X_n_features_(self):
+        """Number of output features in the extracted feature array."""
+        return np.sum(self.feature_widths_)
+
+    @property
+    def X_features_(self):
+        """Array of source feature pairs used to generate each output feature."""
+        rs = []
+        for width, feature in zip(self.feature_widths_, self.features):
+            for _ in range(width):
+                rs.append(feature)
+        return np.array(rs, dtype=object)
+
+    @property
+    def X_columns_(self):
+        """Array of column selectors used to generate each output feature."""
+        return self.X_features_[:, 0]
+
+    def __init__(self, features, y_feature=None, sparse=False):
+        if isinstance(features, string_types):
+            features = [features]
+
+        self.features = []
+        for f in features:
+            if isinstance(f, string_types):
+                self.features.append((f, None))
+            else:
+                columns, transformers = f
+                self.features.append((columns, _build_transformer(transformers)))
+
+        self.feature_widths_ = None
+
+        if y_feature is None:
+            self.y_feature = None
+        elif isinstance(y_feature, string_types):
+            self.y_feature = (y_feature, None)
+        else:
+            y_columns, y_transformers = y_feature
+            self.y_feature = (y_columns, _build_transformer(y_transformers))
+
+        self.sparse = sparse
 
     def __setstate__(self, state):
         # compatibility shim for pickles created with sklearn-pandas<1.0.0
         self.features = [(columns, _build_transformer(transformers))
@@ -86,15 +155,82 @@ def _get_col_subset(self, X, cols):
 
         return t
 
+    def extract_Xy(self, X, y=None):
+        """Extract X and y values for pipeline from inputs 'X' and 'y'."""
+        return self.extract_X(X), self.extract_y(X, y)
+
+    def extract_X(self, X):
+        """Extract X values for pipeline; equivalent to self.transform."""
+        return self.transform(X)
+
+    def extract_y(self, X, y=None):
+        """Extract y values for pipeline from input 'y' or 'X'.
+
+        Extract self.y_feature from dataframe 'y'. Fall back to extraction
+        from dataframe 'X' if 'y' is not a DataFrame or Series.
+        """
+        if y is None and self.y_feature is None:
+            return None
+
+        if self.y_feature is None:
+            raise ValueError(
+                "DataFrameMapper does not support extract_y, "
+                "self.y_feature is None.")
+
+        if isinstance(y, pd.DataFrame):
+            df = y
+        elif isinstance(y, pd.Series):
+            df = y.to_frame()
+        else:
+            assert isinstance(X, pd.DataFrame)
+            df = X
+
+        y_columns, y_transformers = self.y_feature
+        # columns could be a string or list of
+        # strings; we don't care because pandas
+        # will handle either.
+        yt = self._get_col_subset(df, y_columns)
+        if y_transformers is not None:
+            yt = y_transformers.transform(yt)
+
+        return yt
+
     def fit(self, X, y=None):
         """
         Fit a transformation from the pipeline
-        X       the data to fit
+        X       the dataframe to fit
+        y       DataFrame from which to extract 'target' columns; 'X' is
+                used as the 'target' column source if 'y' is None.
         """
+        feature_widths = []
+        cur_index = 0
         for columns, transformers in self.features:
+            # columns could be a string or list of
+            # strings; we don't care because pandas
+            # will handle either.
+            Xt = self._get_col_subset(X, columns)
             if transformers is not None:
-                transformers.fit(self._get_col_subset(X, columns))
+                transformers.fit(Xt)
+                Xt = transformers.transform(Xt)
+            feature_widths.append(_handle_feature(Xt).shape[1])
+        self.feature_widths_ = np.array(feature_widths)
+
+        if self.y_feature is not None:
+            if isinstance(y, pd.DataFrame):
+                df = y
+            elif isinstance(y, pd.Series):
+                df = y.to_frame()
+            else:
+                assert isinstance(X, pd.DataFrame)
+                df = X
+
+            y_columns, y_transformers = self.y_feature
+            if y_transformers is not None:
+                y_transformers.fit(self._get_col_subset(df, y_columns))
+
         return self
 
     def transform(self, X):
diff --git a/sklearn_pandas/dataframe_pipeline.py b/sklearn_pandas/dataframe_pipeline.py
new file mode 100644
index 0000000..ed75cdb
--- /dev/null
+++ b/sklearn_pandas/dataframe_pipeline.py
@@ -0,0 +1,148 @@
+import six
+from sklearn.pipeline import _name_estimators, Pipeline
+from sklearn.utils.metaestimators import if_delegate_has_method
+
+from sklearn_pandas.dataframe_mapper import DataFrameMapper
+
+class DataFramePipeline(Pipeline):
+    """Pipeline of transforms with a final estimator supporting DataFrame input.
+
+    Sequentially applies a list of transforms and a final estimator,
+    extracting 'X' and 'y' data for the pipeline via an initial
+    DataFrameMapper. The first step of the pipeline must be a
+    DataFrameMapper. Intermediate steps of the pipeline must be 'transforms'
+    implementing the 'fit' and 'transform' methods. The final estimator only
+    needs to implement fit.
+
+    Parameters
+    ----------
+    steps : list
+        List of (name, transform) tuples (implementing fit/transform) that
+        are chained, in the order in which they are chained, with the last
+        object an estimator.
+    """
+
+    def __init__(self, steps):
+        Pipeline.__init__(self, steps)
+
+        if not isinstance(self._dataframe_mapper, DataFrameMapper):
+            raise TypeError(
+                "First step of a DataFramePipeline must be a DataFrameMapper, "
+                "'%s' (type %s) is not." %
+                (self._dataframe_mapper, type(self._dataframe_mapper))
+            )
+
+    @property
+    def _dataframe_mapper(self):
+        """Return the DataFrameMapper at the head of the pipeline."""
+        return self.steps[0][1]
+
+    @property
+    def _dataframe_mapper_name(self):
+        """Return the name of the DataFrameMapper at the head of the pipeline."""
+        return self.steps[0][0]
+
+    def _pre_transform(self, X, y=None, **fit_params):
+        fit_params_steps = dict((step, {}) for step, _ in self.steps)
+        for pname, pval in six.iteritems(fit_params):
+            step, param = pname.split('__', 1)
+            fit_params_steps[step][param] = pval
+
+        Xt = self._dataframe_mapper.fit_transform(
+            X, y, **fit_params_steps[self._dataframe_mapper_name])
+        yt = self._dataframe_mapper.extract_y(X, y)
+
+        for name, transform in self.steps[1:-1]:
+            if hasattr(transform, "fit_transform"):
+                Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
+            else:
+                Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
+                              .transform(Xt)
+
+        return Xt, yt, fit_params_steps[self.steps[-1][0]]
+
+    def fit(self, X, y=None, **fit_params):
+        """Fit all the transforms one after the other and transform the
+        data, then fit the transformed data using the final estimator.
+
+        Parameters
+        ----------
+        X : (DataFrame)
+            Training data. Must fulfill input requirements of first step of
+            the pipeline.
+        y : (DataFrame or Series), default=None
+            Training targets. Must fulfill label requirements for all steps
+            of the pipeline. Training targets are extracted via the mapper
+            from 'X' if 'y' is None.
+        """
+        Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+        self.steps[-1][-1].fit(Xt, yt, **fit_params)
+        return self
+
+    def fit_transform(self, X, y=None, **fit_params):
+        """Fit all the transforms one after the other and transform the
+        data, then use fit_transform on the transformed data with the final
+        estimator.
+
+        Parameters
+        ----------
+        X : (DataFrame)
+            Training data. Must fulfill input requirements of first step of
+            the pipeline.
+        y : (DataFrame or Series), default=None
+            Training targets. Must fulfill label requirements for all steps
+            of the pipeline. Training targets are extracted via the mapper
+            from 'X' if 'y' is None.
+        """
+        Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+        if hasattr(self.steps[-1][-1], 'fit_transform'):
+            return self.steps[-1][-1].fit_transform(Xt, yt, **fit_params)
+        else:
+            return self.steps[-1][-1].fit(Xt, yt, **fit_params).transform(Xt)
+
+    @if_delegate_has_method(delegate='_final_estimator')
+    def fit_predict(self, X, y=None, **fit_params):
+        """Applies fit_predict of last step in pipeline after transforms.
+
+        Applies fit_transforms of a pipeline to the data, followed by the
+        fit_predict method of the final estimator in the pipeline. Valid
+        only if the final estimator implements fit_predict.
+
+        Parameters
+        ----------
+        X : (DataFrame)
+            Training data. Must fulfill input requirements of first step of
+            the pipeline.
+        y : (DataFrame or Series), default=None
+            Training targets. Must fulfill label requirements for all steps
+            of the pipeline. Training targets are extracted via the mapper
+            from 'X' if 'y' is None.
+        """
+        Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+        return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
+
+    @if_delegate_has_method(delegate='_final_estimator')
+    def score(self, X, y=None):
+        """Applies transforms to the data, and the score method of the
+        final estimator. Valid only if the final estimator implements
+        score.
+
+        Parameters
+        ----------
+        X : (DataFrame)
+            Data to score. Must fulfill input requirements of first step of
+            the pipeline.
+        y : (DataFrame or Series), default=None
+            Targets used for scoring. Must fulfill label requirements for
+            all steps of the pipeline. Targets are extracted via the mapper
+            from 'X' if 'y' is None.
+        """
+        Xt = X
+        for name, transform in self.steps[:-1]:
+            Xt = transform.transform(Xt)
+
+        yt = self._dataframe_mapper.extract_y(X, y)
+        return self.steps[-1][-1].score(Xt, yt)
+
+def make_dataframe_pipeline(steps):
+    """Construct a DataFramePipeline from the given estimators."""
+    return DataFramePipeline(_name_estimators(steps))
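For quick reference, a minimal end-to-end sketch of the API this patch introduces. This is illustrative only: it assumes this branch is installed, `StandardScaler` stands in for any per-column transform (a bare column name would mean "no transform"), and the iris frame is built as in the example notebook.

    import pandas
    import sklearn.datasets
    from sklearn.preprocessing import StandardScaler  # illustrative transform, not part of this patch
    from sklearn.linear_model import LogisticRegression

    from sklearn_pandas import DataFrameMapper, make_dataframe_pipeline

    # Build the tidy frame as in the example notebook.
    iris_data = sklearn.datasets.load_iris()
    iris = pandas.DataFrame(data=iris_data["data"], columns=iris_data["feature_names"])
    iris["class"] = iris_data["target_names"][iris_data["target"]]

    # The leading DataFrameMapper declares the X features (here, each column
    # scaled) and the y column to pull from input frames.
    pipeline = make_dataframe_pipeline([
        DataFrameMapper(
            [(name, StandardScaler()) for name in iris_data["feature_names"]],
            y_feature="class"),
        LogisticRegression(),
    ])

    # fit() takes the raw frame; the mapper extracts X and y internally.
    pipeline.fit(iris)

    # score() likewise extracts the target from the frame when y is omitted.
    print(pipeline.score(iris))

    # Per-output-feature source columns, as used in the notebook's
    # feature_importances example.
    print(pipeline._dataframe_mapper.X_columns_)

Only the leading DataFrameMapper ever sees the DataFrame; every later step receives the extracted feature array, which is why stock sklearn transforms and estimators drop in unchanged.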