Skip to content

RFECV is much slower than Sklearn's implementation #1047

@jc639

Description

@jc639

I am aware that yellowbrick is using RFE and CV separately to produce the visualiser but the approach is several times slower than sklearn's implementation of RFECV.

Running the following in a jupyter notebook:

import yellowbrick
print('yellowbrick version: ', yellowbrick.__version__)
import sklearn
(print('sklearn version: ', sklearn.__version__))

yellowbrick version: 1.1
sklearn version: 0.22.1

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV as skrfecv
from yellowbrick.model_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Build a classification task with 4 informative features out of 50 total
X, y = make_classification(n_samples=200, n_features=50, n_informative=4,
                           n_redundant=2, n_repeated=0, n_classes=4,
                           n_clusters_per_class=1, random_state=0)

log_reg = LogisticRegression()


def rfe_time_test(yb=True):
    """Fit an RFECV selector on the module-level X, y.

    Parameters
    ----------
    yb : bool, default True
        When True use yellowbrick's RFECV visualizer; otherwise use
        sklearn's RFECV. Both are configured identically so the two
        fits are directly comparable for timing purposes.
    """
    # Pick the implementation under test; everything else is held constant.
    selector_cls = RFECV if yb else skrfecv
    selector = selector_cls(log_reg, step=1, cv=StratifiedKFold(5),
                            scoring='accuracy')
    selector.fit(X, y)

%timeit rfe_time_test(yb=True)

1min 23s ± 8.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit rfe_time_test(yb=False)

3.73 s ± 430 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

If this is unavoidable due to using CV separately to get the full scores, then it would be nice to note in the documentation, so that you could use sklearn's RFECV to drop the bottom ~50% of features before running the visualiser.

This got me interested, so I did some digging into what might affect the difference between sklearn's and yellowbrick's RFECV:

import matplotlib.pyplot as plt
import numpy as np

def plot_timings(x_range, yb_timings, sk_timings, x_axis, titles):
    """Plot absolute timings and the yellowbrick/sklearn ratio side by side.

    Parameters
    ----------
    x_range : sequence
        X-axis values (e.g. number of observations, features, or folds).
    yb_timings, sk_timings : sequences of %timeit TimeitResult objects
        Only the `.average` attribute of each entry is used.
    x_axis : str
        Label applied to both subplots' x-axes.
    titles : sequence of str
        One title per subplot (left: timings, right: ratio).

    Returns
    -------
    matplotlib Figure containing the two subplots.
    """
    fig, axes = plt.subplots(1, 2)
    sk_avg = np.array([timing.average for timing in sk_timings])
    yb_avg = np.array([timing.average for timing in yb_timings])

    # Left panel: absolute mean run times for both implementations.
    axes[0].plot(x_range, yb_avg, 'ro-')
    axes[0].plot(x_range, sk_avg, 'bo-')
    axes[0].legend(['yellowbrick', 'sklearn'])
    axes[0].set_ylabel('Time (seconds)')
    axes[1].set_ylabel('YB time / SK time')

    # Right panel: how many times slower yellowbrick is than sklearn.
    axes[1].plot(x_range, yb_avg / sk_avg, 'og-')
    for axis, title in zip(axes, titles):
        axis.set_title(title)
        axis.set_xlabel(x_axis)

    fig.subplots_adjust(wspace=0.25)
    fig.set_size_inches(10, 6)

    plt.show()

    return fig

# Benchmark sweep: vary the number of observations (200..1000, step 100)
# with the feature count fixed at 10, timing both implementations.
yb_timings = []
sk_timings = []
n_obs = [i for i in range(200, 1001, 100)]
for i in n_obs:
    # Build a classification task using 4 informative features
    X, y = make_classification(n_samples=i, n_features=10, n_informative=4,
                               n_redundant=2, n_repeated=0, n_classes=4,
                               n_clusters_per_class=1, random_state=0)
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
obs = plot_timings(n_obs, yb_timings, 
                   sk_timings, x_axis='Number of observations', 
                   titles=['Timings', 'Ratio'])

Timings and observations

The ratio of yellowbrick's run time to sklearn's is fairly stable as the number of observations grows.

# Benchmark sweep: vary the number of input features (10..50, step 10)
# with the sample count fixed at 200, timing both implementations.
yb_timings = []
sk_timings = []
n_feats = [i for i in range(10, 51, 10)]
for i in n_feats:
    # Build a classification task using 4 informative features
    X, y = make_classification(n_samples=200, n_features=i, n_informative=4,
                               n_redundant=2, n_repeated=0, n_classes=4,
                               n_clusters_per_class=1, random_state=0)
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
feats = plot_timings(n_feats, yb_timings, 
                     sk_timings, x_axis='Number of input features', 
                     titles=['Timings', 'Ratio'])

Timings and features

As the number of starting features increases, YB becomes even slower relative to sklearn.

# Benchmark sweep: fix the dataset (200 samples, 10 features) and vary
# the number of CV folds (2..10, step 2), timing both implementations.
# Build a classification task using 4 informative features
X, y = make_classification(n_samples=200, n_features=10, n_informative=4,
                           n_redundant=2, n_repeated=0, n_classes=4,
                           n_clusters_per_class=1, random_state=0)

log_reg = LogisticRegression()

yb_timings = []
sk_timings = []
cvs = [i for i in range(2, 11, 2)]
for i in cvs:
    # Redefine the helper each iteration so it closes over the current
    # fold count i (late-binding closure would otherwise not matter here,
    # since the function is called before i changes).
    def rfe_time_test(yb=True):
        if yb:
            rfecv = RFECV(log_reg, step=1, cv=StratifiedKFold(i),
                          scoring='accuracy')
        else:
            rfecv = skrfecv(log_reg, step=1, cv=StratifiedKFold(i),
                          scoring='accuracy')
        _ = rfecv.fit(X, y)
        
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
cv = plot_timings(cvs, yb_timings, 
                     sk_timings, x_axis='Number of CV folds', 
                     titles=['Timings', 'Ratio'])

Timings and CV folds

YB becomes slower with increasing number of folds too!

Metadata

Metadata

Assignees

No one assigned

    Labels

    priority: high — should be done before next release; type: technical debt — work to optimize or generalize code

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions