Enable regex and other ways to dynamically select columns. (#246)

ragrawal · kitmonisit · ragrawal · web-flow · commit 65538c4322f2 · 2021-05-08T00:24:57.000-07:00
Closes #239 #137 Co-authored-by: Kit Monisit <kmonisit@gmail.com> Co-authored-by: ragrawal <ragrawal@varomoney.com>
diff --git a/README.rst b/README.rst
@@ -30,9 +30,11 @@ The examples in this file double as basic sanity tests. To run them, use ``docte
 
     # python -m doctest README.rst
 
+
 Usage
 -----
 
+
 Import
 ******
 
@@ -50,25 +52,33 @@ For these examples, we'll also use pandas, numpy, and sklearn::
     >>> import pandas as pd
     >>> import numpy as np
     >>> import sklearn.preprocessing, sklearn.decomposition, \
-    ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics
+    ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics, \
+    ...     sklearn.compose
     >>> from sklearn.feature_extraction.text import CountVectorizer
 
+
 Load some Data
 **************
 
+
 Normally you'll read the data from a file, but for demonstration purposes we'll create a data frame from a Python dict::
 
     >>> data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
     ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
     ...                      'salary':   [90., 24, 44, 27, 32, 59, 36, 27]})
 
+
 Transformation Mapping
 ----------------------
 
+
 Map the Columns to Transformations
 **********************************
 
-The mapper takes a list of tuples. The first element of each tuple is a column name from the pandas DataFrame, or a list containing one or multiple columns (we will see an example with multiple columns later). The second element is an object which will perform the transformation which will be applied to that column. The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
+The mapper takes a list of tuples. Each tuple has three elements:
+  1. column name(s): The first element is a column name from the pandas DataFrame, a list containing one or multiple columns (we will see an example with multiple columns later), or a callable such as `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`_.
+  2. transformer(s): The second element is an object which will perform the transformation which will be applied to that column. 
+  3. attributes: The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
 
 Let's see an example::
 
@@ -77,7 +87,7 @@ Let's see an example::
     ...     (['children'], sklearn.preprocessing.StandardScaler())
     ... ])
 
-The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
+The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector. 
 
 This behaviour mimics the same pattern as pandas' dataframes ``__getitem__``  indexing:
 
@@ -88,6 +98,7 @@ This behaviour mimics the same pattern as pandas' dataframes ``__getitem__``  in
 
 Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like ``OneHotEncoder`` or ``Imputer``, expect 2-dimensional input, with the shape ``[n_samples, n_features]``.
 
+
 Test the Transformation
 ***********************
 
@@ -150,6 +161,46 @@ Alternatively, you can also specify prefix and/or suffix to add to the column na
   >>> mapper_alias.transformed_names_
   ['standard_scaled_children', 'children_raw']
 
+
+Dynamic Columns
+***********************
+In some situations the columns are not known beforehand and we would like to select them dynamically during the fit operation. In such situations you can provide either a custom callable or use `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`_, as shown below.
+
+
+    >>> class GetColumnsStartingWith:
+    ...   def __init__(self, start_str):
+    ...     self.pattern = start_str
+    ...
+    ...   def __call__(self, X:pd.DataFrame=None):
+    ...     return [c for c in X.columns if c.startswith(self.pattern)]
+    ...
+    >>> df = pd.DataFrame({
+    ...    'sepal length (cm)': [1.0, 2.0, 3.0],
+    ...    'sepal width (cm)': [1.0, 2.0, 3.0],
+    ...    'petal length (cm)': [1.0, 2.0, 3.0],
+    ...    'petal width (cm)': [1.0, 2.0, 3.0]
+    ... })
+    >>> t = DataFrameMapper([
+    ...     (
+    ...       sklearn.compose.make_column_selector(dtype_include=float),
+    ...       sklearn.preprocessing.StandardScaler(),
+    ...       {'alias': 'x'}
+    ...     ),
+    ...     (
+    ...       GetColumnsStartingWith('petal'),
+    ...       None,
+    ...       {'alias': 'petal'}
+    ...     )], df_out=True, default=False)
+    >>> t.fit(df).transform(df).shape
+    (3, 6)
+    >>> t.transformed_names_
+    ['x_0', 'x_1', 'x_2', 'x_3', 'petal_0', 'petal_1']
+
+
+
+Above we use ``make_column_selector`` to select all columns of type float, and a custom callable to select the columns whose names start with the word 'petal'.
+
+
 Passing Series/DataFrames to the transformers
 *********************************************
 
@@ -463,6 +514,11 @@ Changelog
 ---------
 
 
+2.2.0 (2021-05-07)
+******************
+* Added the ability to provide callable functions instead of a static column list.
+
+
 2.1.0 (2021-02-26)
 ******************
 * Removed test for Python 3.6 and added Python 3.9
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '2.1.0'
+__version__ = '2.2.0'
 
 import logging
 logger = logging.getLogger(__name__)
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -1,11 +1,9 @@
 import contextlib
-
 from datetime import datetime
 import pandas as pd
 import numpy as np
 from scipy import sparse
 from sklearn.base import BaseEstimator, TransformerMixin
-
 from .cross_validation import DataWrapper
 from .pipeline import make_transformer_pipeline, _call_fit, TransformerPipeline
 from . import logger
@@ -29,8 +27,14 @@ def _build_transformer(transformers):
     return transformers
 
 
-def _build_feature(columns, transformers, options={}):
-    return (columns, _build_transformer(transformers), options)
+def _build_feature(columns, transformers, options={}, X=None):
+    if X is None:
+        return (columns, _build_transformer(transformers), options)
+    return (
+        columns(X) if callable(columns) else columns,
+        _build_transformer(transformers),
+        options
+    )
 
 
 def _elapsed_secs(t1):
@@ -116,14 +120,16 @@ def __init__(self, features, default=False, sparse=False, df_out=False,
         if (df_out and (sparse or default)):
             raise ValueError("Can not use df_out with sparse or default")
 
-    def _build(self):
+    def _build(self, X=None):
         """
         Build attributes built_features and built_default.
         """
         if isinstance(self.features, list):
-            self.built_features = [_build_feature(*f) for f in self.features]
+            self.built_features = [
+                _build_feature(*f, X=X) for f in self.features
+            ]
         else:
-            self.built_features = self.features
+            self.built_features = _build_feature(*self.features, X=X)
         self.built_default = _build_transformer(self.default)
 
     @property
@@ -185,11 +191,13 @@ def _get_col_subset(self, X, cols, input_df=False):
         Get a subset of columns from the given table X.
 
         X       a Pandas dataframe; the table to select columns from
-        cols    a string or list of strings representing the columns
-                to select
+        cols    a string or list of strings representing the columns to select.
+                Callable selectors are not filter-style predicates: they are
+                called with the dataframe in _build_feature and must return
+                the column(s) to select.
 
         Returns a numpy array with the data from the selected columns
         """
+
         if isinstance(cols, string_types):
             return_vector = True
             cols = [cols]
@@ -226,7 +234,7 @@ def fit(self, X, y=None):
         y       the target vector relative to X, optional
 
         """
-        self._build()
+        self._build(X=X)
 
         for columns, transformers, options in self.built_features:
             t1 = datetime.now()
@@ -315,7 +323,7 @@ def _transform(self, X, y=None, do_fit=False):
         fit_transform.
         """
         if do_fit:
-            self._build()
+            self._build(X=X)
 
         extracted = []
         transformed_names_ = []
diff --git a/sklearn_pandas/transformers.py b/sklearn_pandas/transformers.py
@@ -33,7 +33,7 @@ def __init__(self, func):
         """
 
         warnings.warn("""
-            NumericalTransformer will be deprecated in 2.2 version.
+            NumericalTransformer will be deprecated in 3.0 version.
             Please use Sklearn.base.TransformerMixin to write
             customer transformers
             """, DeprecationWarning)
diff --git a/test.py b/test.py
@@ -0,0 +1,30 @@
+import pytest
+from unittest.mock import Mock
+import numpy as np
+import pandas as pd
+from sklearn_pandas import DataFrameMapper
+from sklearn.compose import make_column_selector
+from sklearn.preprocessing import StandardScaler
+
+
+class GetStartWith:
+    def __init__(self, start_str):
+        self.start_str = start_str
+
+    def __call__(self, X: pd.DataFrame) -> list:
+        return [c for c in X.columns if c.startswith(self.start_str)]
+
+
+df = pd.DataFrame({
+    'sepal length (cm)': [1.0, 2.0, 3.0],
+    'sepal width (cm)': [1.0, 2.0, 3.0],
+    'petal length (cm)': [1.0, 2.0, 3.0],
+    'petal width (cm)': [1.0, 2.0, 3.0]
+})
+t = DataFrameMapper([
+    (make_column_selector(dtype_include=float), StandardScaler(), {'alias': 'x'}),
+    (GetStartWith('petal'), None, {'alias': 'petal'})
+], df_out=True, default=False)
+
+t.fit(df)
+print(t.transform(df).shape)
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -20,6 +20,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 import pickle
+from sklearn.compose import make_column_selector
 
 from sklearn_pandas import DataFrameMapper
 from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer
@@ -969,3 +970,19 @@ def test_heterogeneous_output_types_input_df():
     dft = M.fit_transform(df)
     assert dft['feat1'].dtype == np.dtype('int64')
     assert dft['feat2'].dtype == np.dtype('float64')
+
+
+def test_make_column_selector(iris_dataframe):
+    t = DataFrameMapper([
+        (make_column_selector(dtype_include=float), None, {'alias': 'x'}),
+        ('sepal length (cm)', None),
+    ], df_out=True, default=False)
+
+    xt = t.fit(iris_dataframe).transform(iris_dataframe)
+    expected = ['x_0', 'x_1', 'x_2', 'x_3', 'sepal length (cm)']
+    assert list(xt.columns) == expected
+
+    pickled = pickle.dumps(t)
+    t2 = pickle.loads(pickled)
+    xt2 = t2.transform(iris_dataframe)
+    assert np.array_equal(xt.values, xt2.values)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = '2.1.0'`
	`1`	`+__version__ = '2.2.0'`
`2`	`2`
`3`	`3`	`import logging`
`4`	`4`	`logger = logging.getLogger(__name__)`