Skip to content

RFECV is much slower than Sklearn's implementation #1047

@jc639

Description

@jc639

I am aware that yellowbrick is using RFE and CV separately to produce the visualiser but the approach is several times slower than sklearn's implementation of RFECV.

Running the following in a jupyter notebook:

import yellowbrick
print('yellowbrick version: ', yellowbrick.__version__)
import sklearn
(print('sklearn version: ', sklearn.__version__))

yellowbrick version: 1.1
sklearn version: 0.22.1

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV as skrfecv
from yellowbrick.model_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Build a classification task with 4 informative features out of 50 total
X, y = make_classification(n_samples=200, n_features=50, n_informative=4,
                           n_redundant=2, n_repeated=0, n_classes=4,
                           n_clusters_per_class=1, random_state=0)

log_reg = LogisticRegression()


def rfe_time_test(yb=True):
    """Fit an RFECV selector on the module-level X, y.

    Parameters
    ----------
    yb : bool, default True
        When True use yellowbrick's RFECV visualizer; otherwise use
        sklearn's RFECV. Both are configured identically so the two
        fits are directly comparable for timing purposes.
    """
    # Pick the implementation under test; everything else is held constant.
    selector_cls = RFECV if yb else skrfecv
    selector = selector_cls(log_reg, step=1, cv=StratifiedKFold(5),
                            scoring='accuracy')
    selector.fit(X, y)

%timeit rfe_time_test(yb=True)

1min 23s ± 8.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit rfe_time_test(yb=False)

3.73 s ± 430 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

If this is unavoidable due to using CV separately to get the full scores, then it would be nice to note in the documentation, so that you could use sklearn's RFECV to drop the bottom ~50% of features before running the visualiser.

This got me interested, so I did some digging into what might affect the difference between sklearn's and yellowbrick's RFECV:

import matplotlib.pyplot as plt
import numpy as np

def plot_timings(x_range, yb_timings, sk_timings, x_axis, titles):
    """Plot absolute timings and the yellowbrick/sklearn ratio side by side.

    Parameters
    ----------
    x_range : sequence
        X-axis values (e.g. number of observations, features, or folds).
    yb_timings, sk_timings : sequences of %timeit TimeitResult objects
        Only the `.average` attribute of each entry is used.
    x_axis : str
        Label applied to both subplots' x-axes.
    titles : sequence of str
        One title per subplot (left: timings, right: ratio).

    Returns
    -------
    matplotlib Figure containing the two subplots.
    """
    fig, axes = plt.subplots(1, 2)
    sk_avg = np.array([timing.average for timing in sk_timings])
    yb_avg = np.array([timing.average for timing in yb_timings])

    # Left panel: absolute mean run times for both implementations.
    axes[0].plot(x_range, yb_avg, 'ro-')
    axes[0].plot(x_range, sk_avg, 'bo-')
    axes[0].legend(['yellowbrick', 'sklearn'])
    axes[0].set_ylabel('Time (seconds)')
    axes[1].set_ylabel('YB time / SK time')

    # Right panel: how many times slower yellowbrick is than sklearn.
    axes[1].plot(x_range, yb_avg / sk_avg, 'og-')
    for axis, title in zip(axes, titles):
        axis.set_title(title)
        axis.set_xlabel(x_axis)

    fig.subplots_adjust(wspace=0.25)
    fig.set_size_inches(10, 6)

    plt.show()

    return fig

# Benchmark sweep: vary the number of observations (200..1000, step 100)
# with the feature count fixed at 10, timing both implementations.
yb_timings = []
sk_timings = []
n_obs = [i for i in range(200, 1001, 100)]
for i in n_obs:
    # Build a classification task using 4 informative features
    X, y = make_classification(n_samples=i, n_features=10, n_informative=4,
                               n_redundant=2, n_repeated=0, n_classes=4,
                               n_clusters_per_class=1, random_state=0)
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
obs = plot_timings(n_obs, yb_timings, 
                   sk_timings, x_axis='Number of observations', 
                   titles=['Timings', 'Ratio'])

Timings and observations

The ratio of yellowbrick's run time to sklearn's is fairly stable as the number of observations grows.

# Benchmark sweep: vary the number of input features (10..50, step 10)
# with the sample count fixed at 200, timing both implementations.
yb_timings = []
sk_timings = []
n_feats = [i for i in range(10, 51, 10)]
for i in n_feats:
    # Build a classification task using 4 informative features
    X, y = make_classification(n_samples=200, n_features=i, n_informative=4,
                               n_redundant=2, n_repeated=0, n_classes=4,
                               n_clusters_per_class=1, random_state=0)
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
feats = plot_timings(n_feats, yb_timings, 
                     sk_timings, x_axis='Number of input features', 
                     titles=['Timings', 'Ratio'])

Timings and features

As the number of starting features increases, YB becomes even slower relative to sklearn.

# Benchmark sweep: fix the dataset (200 samples, 10 features) and vary
# the number of CV folds (2..10, step 2), timing both implementations.
# Build a classification task using 4 informative features
X, y = make_classification(n_samples=200, n_features=10, n_informative=4,
                           n_redundant=2, n_repeated=0, n_classes=4,
                           n_clusters_per_class=1, random_state=0)

log_reg = LogisticRegression()

yb_timings = []
sk_timings = []
cvs = [i for i in range(2, 11, 2)]
for i in cvs:
    # Redefine the helper each iteration so it closes over the current
    # fold count i (late-binding closure would otherwise not matter here,
    # since the function is called before i changes).
    def rfe_time_test(yb=True):
        if yb:
            rfecv = RFECV(log_reg, step=1, cv=StratifiedKFold(i),
                          scoring='accuracy')
        else:
            rfecv = skrfecv(log_reg, step=1, cv=StratifiedKFold(i),
                          scoring='accuracy')
        _ = rfecv.fit(X, y)
        
    # IPython magic: %timeit -o returns a TimeitResult whose .average
    # is read later by plot_timings.
    yb_time = %timeit -o rfe_time_test(yb=True)
    yb_timings.append(yb_time)
    
    sk_time = %timeit -o rfe_time_test(yb=False)
    sk_timings.append(sk_time)
    
cv = plot_timings(cvs, yb_timings, 
                     sk_timings, x_axis='Number of CV folds', 
                     titles=['Timings', 'Ratio'])

Timings and CV folds

YB becomes slower with increasing number of folds too!

Metadata

Metadata

Assignees

No one assigned

    Labels

    priority: high — should be done before next release; type: technical debt — work to optimize or generalize code

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions