diff --git a/examples/01 Flowers and Forests - A Simple Pipeline.ipynb b/examples/01 Flowers and Forests - A Simple Pipeline.ipynb
new file mode 100644
index 0000000..f85c2ba
--- /dev/null
+++ b/examples/01 Flowers and Forests - A Simple Pipeline.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import sklearn.datasets\n",
+ "import pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn_pandas import DataFrameMapper, make_dataframe_pipeline\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.cross_validation import cross_val_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Gather a Tidy Dataframe\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sepal length (cm) | \n",
+ " sepal width (cm) | \n",
+ " petal length (cm) | \n",
+ " petal width (cm) | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5.1 | \n",
+ " 3.5 | \n",
+ " 1.4 | \n",
+ " 0.2 | \n",
+ " setosa | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4.9 | \n",
+ " 3.0 | \n",
+ " 1.4 | \n",
+ " 0.2 | \n",
+ " setosa | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4.7 | \n",
+ " 3.2 | \n",
+ " 1.3 | \n",
+ " 0.2 | \n",
+ " setosa | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4.6 | \n",
+ " 3.1 | \n",
+ " 1.5 | \n",
+ " 0.2 | \n",
+ " setosa | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5.0 | \n",
+ " 3.6 | \n",
+ " 1.4 | \n",
+ " 0.2 | \n",
+ " setosa | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 5.1 3.5 1.4 0.2 \n",
+ "1 4.9 3.0 1.4 0.2 \n",
+ "2 4.7 3.2 1.3 0.2 \n",
+ "3 4.6 3.1 1.5 0.2 \n",
+ "4 5.0 3.6 1.4 0.2 \n",
+ "\n",
+ " class \n",
+ "0 setosa \n",
+ "1 setosa \n",
+ "2 setosa \n",
+ "3 setosa \n",
+ "4 setosa "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "iris_data = sklearn.datasets.load_iris()\n",
+ "iris = pandas.DataFrame(data = iris_data[\"data\"], columns=iris_data[\"feature_names\"])\n",
+ "iris[\"class\"] = iris_data[\"target_names\"][iris_data[\"target\"]]\n",
+ "\n",
+ "iris.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Assemble a Simple Learning Pipeline\n",
+ "--------\n",
+ "\n",
+ "A DataFramePipeline begins with a DataFrameMapper, specify how features **`X`** and targets **`y`** are extracted from an input frame. It ends with an estimator object.\n",
+ "\n",
+ "In this case, extract each available feature without transformation and specify the class label as the target."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "forest_pipeline = make_dataframe_pipeline([\n",
+ " DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n",
+ " RandomForestClassifier(n_estimators=200)\n",
+ " ])\n",
+ "\n",
+ "logistic_pipeline = make_dataframe_pipeline([\n",
+ " DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n",
+ " LogisticRegression()\n",
+ " ])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Cross Validate\n",
+ "-----\n",
+ "\n",
+ "Cross validation requires the target **`y`** to perform train-test splits. Use the pipeline's DataFrameMapper to extract the target feature array from input data. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " forest | \n",
+ " logistic | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5.000000 | \n",
+ " 5.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 0.960000 | \n",
+ " 0.960000 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 0.027889 | \n",
+ " 0.043461 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.933333 | \n",
+ " 0.900000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 0.933333 | \n",
+ " 0.933333 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 0.966667 | \n",
+ " 0.966667 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 0.966667 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " forest logistic\n",
+ "count 5.000000 5.000000\n",
+ "mean 0.960000 0.960000\n",
+ "std 0.027889 0.043461\n",
+ "min 0.933333 0.900000\n",
+ "25% 0.933333 0.933333\n",
+ "50% 0.966667 0.966667\n",
+ "75% 0.966667 1.000000\n",
+ "max 1.000000 1.000000"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cross_val_result = pandas.DataFrame.from_dict({\n",
+ " \"forest\" : cross_val_score(\n",
+ " estimator = forest_pipeline,\n",
+ " X = iris, y = forest_pipeline._dataframe_mapper.extract_y(iris),\n",
+ " cv = 5, scoring=\"accuracy\"),\n",
+ " \"logistic\" : cross_val_score(\n",
+ " estimator = logistic_pipeline,\n",
+ " X = iris, y = logistic_pipeline._dataframe_mapper.extract_y(iris),\n",
+ " cv = 5, scoring=\"accuracy\")\n",
+ " })\n",
+ "\n",
+ "cross_val_result.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Extract Feature Metadata\n",
+ "----\n",
+ "\n",
+ "The DataFrameMapper may be used to associate estimator metadata with feature source information. In this case, the `feature_importances_` vector is associated with the source column name."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sepal length (cm) 0.111118\n",
+ "sepal width (cm) 0.028009\n",
+ "petal length (cm) 0.455807\n",
+ "petal width (cm) 0.405066\n",
+ "Name: feature_importances, dtype: float64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "forest_pipeline.fit(iris)\n",
+ "pandas.Series(\n",
+ " data = forest_pipeline._final_estimator.feature_importances_,\n",
+ " index = forest_pipeline._dataframe_mapper.X_columns_,\n",
+ " name=\"feature_importances\"\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 537ab56..b576d40 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -2,3 +2,4 @@
from .dataframe_mapper import DataFrameMapper # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
+from .dataframe_pipeline import DataFramePipeline, make_dataframe_pipeline  # NOQA
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
index 9a59f6d..bb755db 100644
--- a/sklearn_pandas/dataframe_mapper.py
+++ b/sklearn_pandas/dataframe_mapper.py
@@ -1,3 +1,4 @@
+import itertools
import sys
import pandas as pd
import numpy as np
@@ -29,27 +30,95 @@ def _build_transformer(transformers):
class DataFrameMapper(BaseEstimator, TransformerMixin):
"""
- Map Pandas data frame column subsets to their own
- sklearn transformation.
+ Map pandas DataFrame column subsets via sklearn transforms to feature
+ arrays.
+
+ Parameters
+ ----------
+ features : list of ((column_selector, [transform]) or column_selector)
+ Each feature is composed of a pandas column selector and an
+ optional transform. A column selector may be a string (for a single
+ column) or a list of strings. A transform is an object which
+        supports sklearn's transform interface, or a list of such objects.
+
+    y_feature : (column_selector, [transform]) or column_selector, optional
+ A single feature, as per entries in features, used to extract a
+ single value from input frames during fitting.
+
+ sparse : bool, optional (default=False)
+ Return a sparse matrix if set True and any extracted features are
+ sparse.
+
+ Attributes
+ ----------
+    feature_widths_ : array of shape (len(features),)
+        Widths of self.features in the extracted X array.
+        Feature 'i' in self.features produces output features of the given width.
+
+    feature_indices_ : array of shape (len(features) + 1,)
+        Indices of self.features in the extracted X array.
+        Feature 'i' in self.features is mapped to features from
+        'feature_indices_[i]' to 'feature_indices_[i+1]' in the transformed output.
+
+    X_n_features_ : int
+        Number of output features in extracted feature arrays.
+
+    X_features_ : array of shape (X_n_features_, 2)
+        Array of source feature pairs used to generate each output feature.
+
+    X_columns_ : array of shape (X_n_features_,)
+        Array of column selectors used to generate each output feature.
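+
+    Examples
+    --------
+    A minimal illustrative sketch (assumes a DataFrame 'df' with feature
+    columns "a", "b" and a target column "t")::
+
+        mapper = DataFrameMapper(["a", "b"], y_feature="t")
+        X = mapper.fit_transform(df)  # feature array of shape (len(df), 2)
+        y = mapper.extract_y(df)      # target values taken from df["t"]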
"""
- def __init__(self, features, sparse=False):
- """
- Params:
-
- features a list of pairs. The first element is the pandas column
- selector. This can be a string (for one column) or a list
- of strings. The second element is an object that supports
- sklearn's transform interface, or a list of such objects.
- sparse will return sparse matrix if set True and any of the
- extracted features is sparse. Defaults to False.
- """
- if isinstance(features, list):
- features = [(columns, _build_transformer(transformers))
- for (columns, transformers) in features]
- self.features = features
+ @property
+ def feature_indices_(self):
+ """Indicies of self.features in extracted X array."""
+ return np.cumsum([0] + list(self.feature_widths_))
+
+ @property
+ def X_n_features_(self):
+ """Number of output features in extracted feature array."""
+ return np.sum(self.feature_widths_)
+
+ @property
+ def X_features_(self):
+ """Array of source feature pairs used to generate each output feature."""
+ rs = []
+ for width, feature in zip(self.feature_widths_, self.features):
+ for _ in range(width):
+ rs.append(feature)
+ return np.array(rs, dtype=object)
+
+ @property
+ def X_columns_(self):
+ """Array of column selectors used to generate each output feature."""
+        return self.X_features_[:, 0]
+
+    def __init__(self, features, y_feature=None, sparse=False):
+ if isinstance(features, string_types):
+ features = [features]
+
+ self.features = []
+ for f in features:
+ if isinstance(f, string_types):
+ self.features.append((f, None))
+ else:
+ columns, transformers = f
+ self.features.append((columns, _build_transformer(transformers)))
+
+ self.feature_widths_ = None
+
+ if y_feature is None:
+ self.y_feature = None
+ elif isinstance(y_feature, string_types):
+ self.y_feature = (y_feature, None)
+ else:
+ y_columns, y_transformers = y_feature
+ self.y_feature = (y_columns, _build_transformer(y_transformers))
+
self.sparse = sparse
+
def __setstate__(self, state):
# compatibility shim for pickles created with sklearn-pandas<1.0.0
self.features = [(columns, _build_transformer(transformers))
@@ -86,15 +155,82 @@ def _get_col_subset(self, X, cols):
return t
+ def extract_Xy(self, X, y=None):
+ """Extract X and y values for pipeline from inputs 'X' and 'y'."""
+ return self.extract_X(X), self.extract_y(X, y)
+
+ def extract_X(self, X):
+ """Extract X values for pipeline, equivalent to self.transform."""
+ return self.transform(X)
+
+ def extract_y(self, X, y=None):
+ """Extract y values for pipeline from input 'y' or 'X'.
+
+        Extract self.y_feature from dataframe 'y'. Falls back to extraction
+        from dataframe 'X' when 'y' is not a DataFrame or Series.
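+
+        A minimal illustrative sketch (assumes a DataFrame 'df' with a
+        feature column "a" and a target column "t")::
+
+            mapper = DataFrameMapper(["a"], y_feature="t")
+            mapper.fit(df)
+            y = mapper.extract_y(df)  # extracts the values of df["t"]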
+ """
+
+ if y is None and self.y_feature is None:
+ return None
+
+ if self.y_feature is None:
+ raise ValueError("DataFrameMapper does not support extract_y, self.y_feature is None.")
+
+ if isinstance(y, pd.DataFrame):
+ df = y
+ elif isinstance(y, pd.Series):
+ df = y.to_frame()
+ else:
+ assert isinstance(X, pd.DataFrame)
+ df = X
+
+ y_columns, y_transformers = self.y_feature
+ # columns could be a string or list of
+ # strings; we don't care because pandas
+ # will handle either.
+ yt = self._get_col_subset(df, y_columns)
+ if y_transformers is not None:
+ yt = y_transformers.transform(yt)
+
+ return yt
+
def fit(self, X, y=None):
"""
Fit a transformation from the pipeline
- X the data to fit
+ X the dataframe to fit
+        y    DataFrame from which to extract target columns; 'X' is used as
+             the target column source if 'y' is None.
"""
+
+ feature_widths = []
for columns, transformers in self.features:
+ # columns could be a string or list of
+ # strings; we don't care because pandas
+ # will handle either.
+ Xt = self._get_col_subset(X, columns)
if transformers is not None:
- transformers.fit(self._get_col_subset(X, columns))
+ transformers.fit(Xt)
+ Xt = transformers.transform(Xt)
+ feature_widths.append(_handle_feature(Xt).shape[1])
+ self.feature_widths_ = np.array(feature_widths)
+
+ if self.y_feature is not None:
+ if isinstance(y, pd.DataFrame):
+ df = y
+ elif isinstance(y, pd.Series):
+ df = y.to_frame()
+ else:
+ assert isinstance(X, pd.DataFrame)
+ df = X
+
+ y_columns, y_transformers = self.y_feature
+ if y_transformers is not None:
+ y_transformers.fit(self._get_col_subset(df, y_columns))
+
return self
def transform(self, X):
diff --git a/sklearn_pandas/dataframe_pipeline.py b/sklearn_pandas/dataframe_pipeline.py
new file mode 100644
index 0000000..ed75cdb
--- /dev/null
+++ b/sklearn_pandas/dataframe_pipeline.py
@@ -0,0 +1,148 @@
+import six
+from sklearn.pipeline import _name_estimators, Pipeline
+from sklearn.utils.metaestimators import if_delegate_has_method
+
+from sklearn_pandas.dataframe_mapper import DataFrameMapper
+
+class DataFramePipeline(Pipeline):
+ """Pipeline of transforms with a final estimator supporting DataFrame input.
+
+ Sequentially applies a list of transforms and a final estimator, extracting
+ 'X' and 'y' data for the pipeline via an initial DataFrameMapper. The first
+ step of the pipeline must be a DataFrameMapper. Intermediate steps of the
+ pipeline must be 'transforms' implementing the 'fit' and 'transform' methods.
+ The final estimator only needs to implement fit.
+
+ Parameters
+ ----------
+ steps : list
+ List of (name, transform) tuples (implementing fit/transform) that are
+ chained, in the order in which they are chained, with the last object
+ an estimator.
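+
+    Examples
+    --------
+    A minimal illustrative sketch (assumes a DataFrame 'df' with feature
+    columns "a", "b" and a target column "t")::
+
+        from sklearn.linear_model import LogisticRegression
+
+        pipeline = DataFramePipeline([
+            ("mapper", DataFrameMapper(["a", "b"], "t")),
+            ("estimator", LogisticRegression()),
+            ])
+        pipeline.fit(df)
+        predictions = pipeline.predict(df)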
+ """
+
+ def __init__(self, steps):
+ Pipeline.__init__(self, steps)
+
+ if not isinstance(self._dataframe_mapper, DataFrameMapper):
+ raise TypeError(
+ "First step of a DataFramePipeline must be a DataFrameMapper, "
+ "'%s' (type %s) is not." %
+ (self._dataframe_mapper, type(self._dataframe_mapper))
+ )
+
+ @property
+ def _dataframe_mapper(self):
+ """Return DataFrameMapper at head of pipeline."""
+ return self.steps[0][1]
+
+ @property
+ def _dataframe_mapper_name(self):
+ """Return name of DataFrameMapper at head of pipeline."""
+ return self.steps[0][0]
+
+ def _pre_transform(self, X, y=None, **fit_params):
+ fit_params_steps = dict((step, {}) for step, _ in self.steps)
+ for pname, pval in six.iteritems(fit_params):
+ step, param = pname.split('__', 1)
+ fit_params_steps[step][param] = pval
+
+ Xt = self._dataframe_mapper.fit_transform(
+ X, y, **fit_params_steps[self._dataframe_mapper_name])
+ yt = self._dataframe_mapper.extract_y(X, y)
+
+        # Intermediate steps receive the extracted target 'yt', matching what
+        # the final estimator is fit against.
+        for name, transform in self.steps[1:-1]:
+            if hasattr(transform, "fit_transform"):
+                Xt = transform.fit_transform(Xt, yt, **fit_params_steps[name])
+            else:
+                Xt = transform.fit(Xt, yt, **fit_params_steps[name]) \
+                              .transform(Xt)
+
+ return Xt, yt, fit_params_steps[self.steps[-1][0]]
+
+ def fit(self, X, y=None, **fit_params):
+ """Fit all the transforms one after the other and transform the
+ data, then fit the transformed data using the final estimator.
+
+ Parameters
+ ----------
+ X : (DataFrame)
+ Training data. Must fulfill input requirements of first step of the
+ pipeline.
+ y : (DataFrame or Series), default=None
+ Training targets. Must fulfill label requirements for all steps of
+            the pipeline. Training targets are extracted via the mapper from
+            'X' if 'y' is None.
+ """
+ Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+ self.steps[-1][-1].fit(Xt, yt, **fit_params)
+ return self
+
+ def fit_transform(self, X, y=None, **fit_params):
+ """Fit all the transforms one after the other and transform the
+ data, then use fit_transform on transformed data using the final
+ estimator.
+
+ Parameters
+ ----------
+ X : (DataFrame)
+ Training data. Must fulfill input requirements of first step of the
+ pipeline.
+ y : (DataFrame or Series), default=None
+ Training targets. Must fulfill label requirements for all steps of
+            the pipeline. Training targets are extracted via the mapper from
+            'X' if 'y' is None.
+ """
+ Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+ if hasattr(self.steps[-1][-1], 'fit_transform'):
+ return self.steps[-1][-1].fit_transform(Xt, yt, **fit_params)
+ else:
+ return self.steps[-1][-1].fit(Xt, yt, **fit_params).transform(Xt)
+
+ @if_delegate_has_method(delegate='_final_estimator')
+ def fit_predict(self, X, y=None, **fit_params):
+ """Applies fit_predict of last step in pipeline after transforms.
+
+ Applies fit_transforms of a pipeline to the data, followed by the
+ fit_predict method of the final estimator in the pipeline. Valid
+ only if the final estimator implements fit_predict.
+
+ Parameters
+ ----------
+ X : (DataFrame)
+ Training data. Must fulfill input requirements of first step of the
+ pipeline.
+ y : (DataFrame or Series), default=None
+ Training targets. Must fulfill label requirements for all steps of
+            the pipeline. Training targets are extracted via the mapper from
+            'X' if 'y' is None.
+ """
+ Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
+ return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
+
+ @if_delegate_has_method(delegate='_final_estimator')
+ def score(self, X, y=None):
+ """Applies transforms to the data, and the score method of the
+ final estimator. Valid only if the final estimator implements
+ score.
+
+ Parameters
+ ----------
+ X : (DataFrame)
+ Training data. Must fulfill input requirements of first step of the
+ pipeline.
+ y : (DataFrame or Series), default=None
+ Training targets. Must fulfill label requirements for all steps of
+            the pipeline. Training targets are extracted via the mapper from
+            'X' if 'y' is None.
+ """
+ Xt = X
+ for name, transform in self.steps[:-1]:
+ Xt = transform.transform(Xt)
+
+ yt = self._dataframe_mapper.extract_y(X, y)
+ return self.steps[-1][-1].score(Xt, yt)
+
+def make_dataframe_pipeline(steps):
+ """Construct a DataFramePipeline from the given estimators."""
+ return DataFramePipeline(_name_estimators(steps))