-
-
Notifications
You must be signed in to change notification settings - Fork 562
Description
I am aware that yellowbrick is using RFE and CV separately to produce the visualiser but the approach is several times slower than sklearn's implementation of RFECV.
Running the following in a jupyter notebook:
# Report the library versions used for the benchmark below.
import yellowbrick
print('yellowbrick version: ', yellowbrick.__version__)
import sklearn
# Fixed: dropped the redundant outer parentheses that wrapped this call,
# making it consistent with the yellowbrick print above.
print('sklearn version: ', sklearn.__version__)
yellowbrick version: 1.1
sklearn version: 0.22.1
# Imports for the benchmark: sklearn's RFECV is aliased to `skrfecv` to
# avoid clashing with yellowbrick's RFECV visualizer of the same name.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV as skrfecv
from yellowbrick.model_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
# Build a classification task using 4 out of 50 informative features
# (random_state pinned so both implementations see identical data).
X, y = make_classification(n_samples=200, n_features=50, n_informative=4,
n_redundant=2, n_repeated=0, n_classes=4,
n_clusters_per_class=1, random_state=0)
# Shared estimator passed to both RFECV implementations.
log_reg = LogisticRegression()
def rfe_time_test(yb=True):
    """Fit recursive feature elimination with CV on the module-level X, y.

    When *yb* is True the yellowbrick RFECV visualizer is used; otherwise
    sklearn's RFECV is used. Both are configured identically so the two
    fits are directly comparable for timing.
    """
    # Pick the implementation under test, then build it with the same
    # estimator, step size, CV splitter, and scorer either way.
    selector_cls = RFECV if yb else skrfecv
    rfecv = selector_cls(log_reg, step=1, cv=StratifiedKFold(5),
                         scoring='accuracy')
    _ = rfecv.fit(X, y)
%timeit rfe_time_test(yb=True)
1min 23s ± 8.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit rfe_time_test(yb=False)
3.73 s ± 430 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
If this is unavoidable due to using CV separately to get the full scores, then it would be nice to note in the documentation, so that you could use sklearn's RFECV to drop the bottom ~50% of features before running the visualiser.
This got me interested, so I did some digging into what might affect the difference between sklearn's and yellowbrick's RFECV:
import matplotlib.pyplot as plt
import numpy as np
def plot_timings(x_range, yb_timings, sk_timings, x_axis, titles):
    """Plot absolute timings and the yellowbrick/sklearn ratio side by side.

    Parameters
    ----------
    x_range : sequence
        X values shared by both panels (e.g. sample counts).
    yb_timings, sk_timings : list
        ``%timeit -o`` results; only each result's ``.average`` is read.
    x_axis : str
        Label applied to both panels' x axes.
    titles : sequence of two str
        Panel titles, left panel first.

    Returns the created matplotlib figure.
    """
    fig, axes = plt.subplots(1, 2)
    sk_avg = np.array([timing.average for timing in sk_timings])
    yb_avg = np.array([timing.average for timing in yb_timings])

    # Left panel: absolute average times for each library.
    axes[0].plot(x_range, yb_avg, 'ro-')
    axes[0].plot(x_range, sk_avg, 'bo-')
    axes[0].legend(['yellowbrick', 'sklearn'])
    axes[0].set_ylabel('Time (seconds)')

    # Right panel: yellowbrick's slowdown relative to sklearn.
    axes[1].set_ylabel('YB time / SK time')
    axes[1].plot(x_range, yb_avg / sk_avg, 'og-')

    for idx, title in enumerate(titles):
        axes[idx].set_title(title)
        axes[idx].set_xlabel(x_axis)

    fig.subplots_adjust(wspace=0.25)
    fig.set_size_inches(10, 6)
    plt.show()
    return fig
# Benchmark 1: vary the number of observations (features fixed at 10).
# NOTE: indentation of this pasted snippet was flattened by the issue
# tracker; run it from the original notebook, not as-is.
yb_timings = []
sk_timings = []
n_obs = [i for i in range(200, 1001, 100)]
for i in n_obs:
# Build a classification task using 4 informative features
X, y = make_classification(n_samples=i, n_features=10, n_informative=4,
n_redundant=2, n_repeated=0, n_classes=4,
n_clusters_per_class=1, random_state=0)
# %timeit -o is IPython-only; it returns a TimeitResult whose .average
# is later read by plot_timings.
yb_time = %timeit -o rfe_time_test(yb=True)
yb_timings.append(yb_time)
sk_time = %timeit -o rfe_time_test(yb=False)
sk_timings.append(sk_time)
obs = plot_timings(n_obs, yb_timings,
sk_timings, x_axis='Number of observations',
titles=['Timings', 'Ratio'])
The ratio between the two timings is fairly stable as the number of observations increases.
# Benchmark 2: vary the number of input features (samples fixed at 200).
# NOTE: indentation of this pasted snippet was flattened by the issue
# tracker; run it from the original notebook, not as-is.
yb_timings = []
sk_timings = []
n_feats = [i for i in range(10, 51, 10)]
for i in n_feats:
# Build a classification task using 4 informative features
X, y = make_classification(n_samples=200, n_features=i, n_informative=4,
n_redundant=2, n_repeated=0, n_classes=4,
n_clusters_per_class=1, random_state=0)
# IPython magic: capture the TimeitResult for plotting.
yb_time = %timeit -o rfe_time_test(yb=True)
yb_timings.append(yb_time)
sk_time = %timeit -o rfe_time_test(yb=False)
sk_timings.append(sk_time)
feats = plot_timings(n_feats, yb_timings,
sk_timings, x_axis='Number of input features',
titles=['Timings', 'Ratio'])
As the number of starting features increases, YB becomes even slower relative to sklearn.
# Benchmark 3: vary the number of CV folds (data fixed at 200 x 10).
# Build a classification task using 4 informative features
X, y = make_classification(n_samples=200, n_features=10, n_informative=4,
n_redundant=2, n_repeated=0, n_classes=4,
n_clusters_per_class=1, random_state=0)
log_reg = LogisticRegression()
yb_timings = []
sk_timings = []
cvs = [i for i in range(2, 11, 2)]
for i in cvs:
# rfe_time_test is redefined per iteration so the closure picks up the
# current fold count `i`; it is called (via %timeit) before the next
# iteration rebinds `i`, so late binding is not a problem here.
# NOTE: indentation was flattened by the issue tracker paste.
def rfe_time_test(yb=True):
if yb:
rfecv = RFECV(log_reg, step=1, cv=StratifiedKFold(i),
scoring='accuracy')
else:
rfecv = skrfecv(log_reg, step=1, cv=StratifiedKFold(i),
scoring='accuracy')
_ = rfecv.fit(X, y)
yb_time = %timeit -o rfe_time_test(yb=True)
yb_timings.append(yb_time)
sk_time = %timeit -o rfe_time_test(yb=False)
sk_timings.append(sk_time)
cv = plot_timings(cvs, yb_timings,
sk_timings, x_axis='Number of CV folds',
titles=['Timings', 'Ratio'])
YB also becomes slower relative to sklearn as the number of CV folds increases!