
Commit e78846f

Adam Cooper authored and rasbt committed
add groups parameter to SFS and EFS fit(), for forwarding to sklearn cv (#537)

* add groups parameter to SFS and EFS fit(), for forwarding to sklearn cv
* pep8 fix and bumping up the changelog note to the 0.17 release
1 parent b276396 commit e78846f
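
In practice, the new parameter lets group labels flow from the selector's `fit()` into a group-aware splitter. A minimal usage sketch (illustrative only, not part of the diff; the group labels here are made up):

# Sketch of the new API, assuming mlxtend >= 0.17: group labels
# passed to fit() are forwarded to the sklearn CV splitter.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)
groups = np.random.randint(0, 6, size=len(y))  # hypothetical group labels

sfs = SFS(KNeighborsClassifier(n_neighbors=4),
          k_features=3,
          cv=GroupKFold(n_splits=3))
sfs = sfs.fit(X, y, groups=groups)  # new: groups is forwarded to the splitter
print(sfs.k_feature_idx_)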

File tree

5 files changed: +101 -18 lines changed

docs/sources/CHANGELOG.md (+1 -1)

@@ -16,7 +16,7 @@ The CHANGELOG for the current development version is available at
 
 ##### New Features
 
-- -
+- Add optional `groups` parameter to `SequentialFeatureSelector` and `ExhaustiveFeatureSelector` `fit()` methods for forwarding to sklearn CV ([#537](https://github.com/rasbt/mlxtend/pull/537) via [arc12](https://github.com/arc12))
 
 ##### Changes
 
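
For context (not part of the diff): passing `groups` only changes behavior when the CV object is group-aware. sklearn splitters receive the labels via `split(X, y, groups)`; a small sketch of that contract:

# A group-aware splitter such as GroupKFold keeps every group
# entirely within one side of each train/test split.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])

for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups):
    # no group appears on both sides of a split
    assert not set(groups[train_idx]) & set(groups[test_idx])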

mlxtend/feature_selection/exhaustive_feature_selector.py (+12 -5)

@@ -25,10 +25,11 @@
 from sklearn.externals.joblib import Parallel, delayed
 
 
-def _calc_score(selector, X, y, indices, **fit_params):
+def _calc_score(selector, X, y, indices, groups=None, **fit_params):
     if selector.cv:
         scores = cross_val_score(selector.est_,
                                  X[:, indices], y,
+                                 groups=groups,
                                  cv=selector.cv,
                                  scoring=selector.scorer,
                                  n_jobs=1,
@@ -175,7 +176,7 @@ def __init__(self, estimator, min_features=1, max_features=1,
         # don't mess with this unless testing
         self._TESTING_INTERRUPT_MODE = False
 
-    def fit(self, X, y, custom_feature_names=None, **fit_params):
+    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
         """Perform feature selection and learn model from training data.
 
         Parameters
@@ -191,6 +192,9 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
             Custom feature names for `self.k_feature_names` and
             `self.subsets_[i]['feature_names']`.
             (new in v 0.13.0)
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to to the fit method of classifier.
 
@@ -268,7 +272,7 @@ def ncr(n, r):
         n_jobs = min(self.n_jobs, all_comb)
         parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
         work = enumerate(parallel(delayed(_calc_score)
-                                  (self, X_, y, c, **fit_params)
+                                  (self, X_, y, c, groups=groups, **fit_params)
                                   for c in candidates))
 
         try:
@@ -336,7 +340,7 @@ def transform(self, X):
             X_ = X
         return X_[:, self.best_idx_]
 
-    def fit_transform(self, X, y, **fit_params):
+    def fit_transform(self, X, y, groups=None, **fit_params):
         """Fit to training data and return the best selected features from X.
 
         Parameters
@@ -348,6 +352,9 @@ def fit_transform(self, X, y, **fit_params):
             argument for X.
         y : array-like, shape = [n_samples]
             Target values.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to to the fit method of classifier.
 
@@ -356,7 +363,7 @@ def fit_transform(self, X, y, **fit_params):
             Feature subset of X, shape={n_samples, k_features}
 
         """
-        self.fit(X, y, **fit_params)
+        self.fit(X, y, groups=groups, **fit_params)
         return self.transform(X)
 
     def get_metric_dict(self, confidence_interval=0.95):
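
The EFS change is mechanical: `_calc_score` accepts `groups` and hands it straight to `cross_val_score`, which forwards it to the splitter. A standalone sketch of that underlying sklearn call (illustrative; the feature subset is arbitrary, not code from this commit):

# What _calc_score now effectively does for one candidate subset:
# cross_val_score receives groups and passes it to cv.split().
import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GroupKFold

X, y = load_iris(return_X_y=True)
groups = np.random.randint(0, 6, size=len(y))
indices = (0, 1, 2)  # one candidate feature subset

scores = cross_val_score(KNeighborsClassifier(n_neighbors=4),
                         X[:, indices], y,
                         groups=groups,
                         cv=GroupKFold(n_splits=3),
                         scoring='accuracy',
                         n_jobs=1)
print(scores.mean())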

mlxtend/feature_selection/sequential_feature_selector.py (+26 -12)

@@ -23,10 +23,11 @@
 from sklearn.externals.joblib import Parallel, delayed
 
 
-def _calc_score(selector, X, y, indices, **fit_params):
+def _calc_score(selector, X, y, indices, groups=None, **fit_params):
     if selector.cv:
         scores = cross_val_score(selector.est_,
                                  X[:, indices], y,
+                                 groups=groups,
                                  cv=selector.cv,
                                  scoring=selector.scorer,
                                  n_jobs=1,
@@ -242,7 +243,7 @@ def set_params(self, **params):
         self._set_params('estimator', 'named_estimators', **params)
         return self
 
-    def fit(self, X, y, custom_feature_names=None, **fit_params):
+    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
         """Perform feature selection and learn model from training data.
 
         Parameters
@@ -260,6 +261,9 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
             Custom feature names for `self.k_feature_names` and
             `self.subsets_[i]['feature_names']`.
             (new in v 0.13.0)
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to to the fit method of classifier.
 
@@ -291,8 +295,8 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
         if not isinstance(self.k_features, int) and\
                 not isinstance(self.k_features, tuple)\
                 and not isinstance(self.k_features, str):
-            raise AttributeError('k_features must be a positive integer'
-                                 ', tuple, or string')
+            raise AttributeError('k_features must be a positive integer'
+                                 ', tuple, or string')
 
         if (isinstance(self.k_features, int) and (
                 self.k_features < 1 or self.k_features > X_.shape[1])):
@@ -351,7 +355,8 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
             k_to_select = min_k
             k_idx = tuple(range(X_.shape[1]))
             k = len(k_idx)
-            k_idx, k_score = _calc_score(self, X_, y, k_idx, **fit_params)
+            k_idx, k_score = _calc_score(self, X_, y, k_idx,
+                                         groups=groups, **fit_params)
             self.subsets_[k] = {
                 'feature_idx': k_idx,
                 'cv_scores': k_score,
@@ -370,6 +375,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                     subset=prev_subset,
                     X=X_,
                     y=y,
+                    groups=groups,
                     **fit_params
                 )
             else:
@@ -378,6 +384,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                     feature_set=prev_subset,
                     X=X_,
                     y=y,
+                    groups=groups,
                     **fit_params
                 )
 
@@ -404,6 +411,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                             fixed_feature=new_feature,
                             X=X_,
                             y=y,
+                            groups=groups,
                             **fit_params
                         )
 
@@ -413,6 +421,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                             subset=set(k_idx),
                             X=X_,
                             y=y,
+                            groups=groups,
                             **fit_params
                         )
 
@@ -472,7 +481,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
                                 X)
                     raise KeyboardInterrupt
 
-        except KeyboardInterrupt as e:
+        except KeyboardInterrupt:
             self.interrupted_ = True
             sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
 
@@ -512,7 +521,7 @@ def fit(self, X, y, custom_feature_names=None, **fit_params):
         return self
 
     def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
-                   **fit_params):
+                   groups=None, **fit_params):
         all_avg_scores = []
         all_cv_scores = []
         all_subsets = []
@@ -526,7 +535,7 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
             work = parallel(delayed(_calc_score)
                             (self, X, y,
                              tuple(subset | {feature}),
-                             **fit_params)
+                             groups=groups, **fit_params)
                             for feature in remaining
                             if feature != ignore_feature)
 
@@ -541,7 +550,8 @@ def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
                    all_cv_scores[best])
             return res
 
-    def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
+    def _exclusion(self, feature_set, X, y, fixed_feature=None,
+                   groups=None, **fit_params):
         n = len(feature_set)
         res = (None, None, None)
         if n > 1:
@@ -552,7 +562,8 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None, **fit_params):
             n_jobs = min(self.n_jobs, features)
             parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                 pre_dispatch=self.pre_dispatch)
-            work = parallel(delayed(_calc_score)(self, X, y, p, **fit_params)
+            work = parallel(delayed(_calc_score)(self, X, y, p,
+                                                 groups=groups, **fit_params)
                             for p in combinations(feature_set, r=n - 1)
                             if not fixed_feature or fixed_feature in set(p))
 
@@ -591,7 +602,7 @@ def transform(self, X):
             X_ = X
         return X_[:, self.k_feature_idx_]
 
-    def fit_transform(self, X, y, **fit_params):
+    def fit_transform(self, X, y, groups=None, **fit_params):
         """Fit to training data then reduce X to its most important features.
 
         Parameters
@@ -605,6 +616,9 @@ def fit_transform(self, X, y, **fit_params):
             Target values.
             New in v 0.13.0: a pandas Series are now also accepted as
             argument for y.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set. Passed to the fit method of the cross-validator.
         fit_params : dict of string -> object, optional
             Parameters to pass to to the fit method of classifier.
 
@@ -613,7 +627,7 @@ def fit_transform(self, X, y, **fit_params):
             Reduced feature subset of X, shape={n_samples, k_features}
 
         """
-        self.fit(X, y, **fit_params)
+        self.fit(X, y, groups=groups, **fit_params)
         return self.transform(X)
 
     def get_metric_dict(self, confidence_interval=0.95):
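
Because `groups=None` is the default at every level (`fit`, `fit_transform`, `_inclusion`, `_exclusion`, `_calc_score`), existing call sites are unchanged, and `groups` composes with the existing `**fit_params` forwarding. A hedged sketch of both points (assuming an estimator that accepts `sample_weight`; not code from this commit):

# groups defaults to None throughout, so the old signature still
# works; it can also be combined with estimator fit_params.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X, y = load_iris(return_X_y=True)
groups = np.random.randint(0, 6, size=len(y))  # hypothetical group labels
weights = np.ones(len(y))                      # hypothetical sample weights

sfs_plain = SFS(LogisticRegression(), k_features=2, cv=3)
sfs_plain.fit(X, y)  # pre-0.17 call sites keep working unchanged

sfs_grouped = SFS(LogisticRegression(), k_features=2,
                  cv=GroupKFold(n_splits=3))
# groups goes to the splitter; sample_weight rides along via **fit_params
sfs_grouped.fit(X, y, groups=groups, sample_weight=weights)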

mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py (+35)

@@ -16,6 +16,7 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.datasets import load_boston
 from mlxtend.utils import assert_raises
+from sklearn.model_selection import GroupKFold
 
 
 def dict_compare_utility(d1, d2):
@@ -183,6 +184,40 @@ def test_knn_cv3():
     assert round(efs1.best_score_, 4) == 0.9728
 
 
+def test_knn_cv3_groups():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    knn = KNeighborsClassifier(n_neighbors=4)
+    efs1 = EFS(knn,
+               min_features=3,
+               max_features=3,
+               scoring='accuracy',
+               cv=GroupKFold(n_splits=3),
+               print_progress=False)
+    np.random.seed(1630672634)
+    groups = np.random.randint(0, 6, size=len(y))
+    efs1 = efs1.fit(X, y, groups=groups)
+    # print(efs1.subsets_)
+    expect = {0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
+                  'feature_idx': (0, 1, 2),
+                  'avg_score': 0.9474901595858469,
+                  'feature_names': ('0', '1', '2')},
+              1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
+                  'feature_idx': (0, 1, 3),
+                  'avg_score': 0.9544346040302915,
+                  'feature_names': ('0', '1', '3')},
+              2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
+                  'feature_idx': (0, 2, 3),
+                  'avg_score': 0.9542928806742822,
+                  'feature_names': ('0', '2', '3')},
+              3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
+                  'feature_idx': (1, 2, 3),
+                  'avg_score': 0.9605821888503829,
+                  'feature_names': ('1', '2', '3')}}
+    dict_compare_utility(d1=expect, d2=efs1.subsets_)
+
+
 def test_fit_params():
     iris = load_iris()
     X = iris.data

mlxtend/feature_selection/tests/test_sequential_feature_selector.py (+27)

@@ -219,6 +219,33 @@ def test_knn_cv3():
     dict_compare_utility(d1=expect, d2=sfs1.subsets_)
 
 
+def test_knn_cv3_groups():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    knn = KNeighborsClassifier(n_neighbors=4)
+    sfs1 = SFS(knn,
+               k_features=3,
+               forward=True,
+               floating=False,
+               cv=GroupKFold(n_splits=3),
+               verbose=0)
+    np.random.seed(1630672634)
+    groups = np.random.randint(0, 6, size=len(y))
+    sfs1 = sfs1.fit(X, y, groups=groups)
+    # print(sfs1.subsets_)
+    expect = {
+        1: {'cv_scores': np.array([0.97916667, 0.93877551, 0.96226415]),
+            'feature_idx': (3,),
+            'avg_score': 0.9600687759380482},
+        2: {'cv_scores': np.array([0.95833333, 0.93877551, 0.98113208]),
+            'feature_idx': (1, 3),
+            'avg_score': 0.9594136396697044},
+        3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
+            'feature_idx': (1, 2, 3),
+            'avg_score': 0.9605821888503829}}
+    dict_compare_utility(d1=expect, d2=sfs1.subsets_, decimal=3)
+
 def test_knn_rbf_groupkfold():
     nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
     rng = np.random.RandomState(123)
