#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: Daniel Homola <[email protected]>
Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/
License: BSD 3 clause
"""
from __future__ import print_function, division
import numpy as np
import scipy as sp
import scipy.stats  # explicit import so sp.stats is always available
from sklearn.utils import check_random_state, check_X_y
from sklearn.base import TransformerMixin, BaseEstimator
import warnings
class BorutaPy(BaseEstimator, TransformerMixin):
"""
Improved Python implementation of the Boruta R package.
The improvements of this implementation include:
- Faster run times:
Thanks to scikit-learn's fast implementation of the ensemble methods.
- Scikit-learn like interface:
        Use BorutaPy just like any other scikit-learn estimator: fit,
        fit_transform and transform are all implemented in the usual fashion.
    - Modularity:
        Any ensemble method can be used: random forest, extra trees
        classifier, even gradient boosted trees.
    - Two step correction:
        The original Boruta code corrects for multiple testing in an overly
        conservative way. In this implementation, the Benjamini Hochberg FDR is
        used to correct in each iteration across active features. This means
        only those features which are still in the selection process are
        included in the correction. Following this, each feature that passed
        goes through a regular Bonferroni correction to check for the repeated
        testing over the iterations.
- Percentile:
Instead of using the max values of the shadow features the user can
specify which percentile to use. This gives a finer control over this
crucial parameter. For more info, please read about the perc parameter.
    - Automatic tree number:
        Setting n_estimators to 'auto' will calculate the number of trees
        in each iteration based on the number of features under investigation.
        This way more trees are used when the training data has many features
        and fewer when most of the features have been rejected.
    - Ranking of features:
        After fitting, BorutaPy provides the user with a ranking of features.
        Confirmed ones are ranked 1, tentatives 2, and the rejected ones are
        ranked starting from 3, based on their feature importance history
        through the iterations.
    We highly recommend using pruned trees with a depth between 3 and 7.
For more, see the docs of these functions, and the examples below.
Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/
    Boruta is an all relevant feature selection method, while most others are
    minimal optimal; this means it tries to find all features carrying
    information usable for prediction, rather than finding a possibly compact
    subset of features on which some classifier has a minimal error.
    Why bother with all relevant feature selection?
    When you try to understand the phenomenon that generated your data, you
    should care about all the factors that contribute to it, not just the
    bluntest signs of it in the context of your methodology (yes, the minimal
    optimal set of features by definition depends on your classifier choice).
Parameters
----------
estimator : object
A supervised learning estimator, with a 'fit' method that returns the
feature_importances_ attribute. Important features must correspond to
high absolute values in the feature_importances_.
    n_estimators : int or string, default = 1000
        If int, sets the number of estimators in the chosen ensemble method.
        If 'auto', this is determined automatically based on the size of the
        dataset. The other parameters of the used estimator need to be set
        at initialisation.
    perc : int, default = 100
        Instead of the max we use a user-defined percentile of the shadow
        importances to pick the threshold for comparison between shadow and
        real features; the max tends to be too stringent. The lower perc is,
        the more false positives will be picked as relevant, but also the
        fewer relevant features will be left out. The usual trade-off.
        The default is essentially vanilla Boruta, corresponding to the max.
alpha : float, default = 0.05
Level at which the corrected p-values will get rejected in both
correction steps.
two_step : Boolean, default = True
If you want to use the original implementation of Boruta with Bonferroni
correction only set this to False.
max_iter : int, default = 100
The number of maximum iterations to perform.
random_state : int, RandomState instance or None; default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
verbose : int, default=0
Controls verbosity of output:
- 0: no output
- 1: displays iteration number
- 2: which features have been selected already
early_stopping : bool, default = False
Whether to use early stopping to terminate the selection process
before reaching `max_iter` iterations if the algorithm cannot
confirm a tentative feature for `n_iter_no_change` iterations.
        This speeds up the process at the cost of a possibly worse
        result.
n_iter_no_change : int, default = 20
        Ignored if `early_stopping` is False. The maximum number of
        iterations without confirming a tentative feature.
Attributes
----------
n_features_ : int
The number of selected features.
support_ : array of shape [n_features]
The mask of selected features - only confirmed ones are True.
support_weak_ : array of shape [n_features]
        The mask of selected tentative features, which haven't gained enough
        support during the max_iter number of iterations.
ranking_ : array of shape [n_features]
The feature ranking, such that ``ranking_[i]`` corresponds to the
ranking position of the i-th feature. Selected (i.e., estimated
best) features are assigned rank 1 and tentative features are assigned
rank 2.
importance_history_ : array-like, shape [n_features, n_iters]
The calculated importance values for each feature across all iterations.
Examples
--------
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
    # load X and y
    # NOTE BorutaPy works on numpy arrays; pandas inputs are converted
    # internally, but we pass the .values attribute here explicitly
X = pd.read_csv('examples/test_X.csv', index_col=0).values
y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
y = y.ravel()
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)
# check selected features - first 5 features are selected
feat_selector.support_
# check ranking of features
feat_selector.ranking_
# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)
References
----------
[1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package"
Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010
"""
def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
two_step=True, max_iter=100, random_state=None, verbose=0,
early_stopping=False, n_iter_no_change=20):
self.estimator = estimator
self.n_estimators = n_estimators
self.perc = perc
self.alpha = alpha
self.two_step = two_step
self.max_iter = max_iter
self.random_state = random_state
self.verbose = verbose
self.early_stopping = early_stopping
self.n_iter_no_change = n_iter_no_change
self.__version__ = '0.3'
self._is_lightgbm = 'lightgbm' in str(type(self.estimator))
def fit(self, X, y):
"""
Fits the Boruta feature selection with the provided estimator.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples]
The target values.
"""
return self._fit(X, y)
def transform(self, X, weak=False, return_df=False):
"""
Reduces the input X to the features selected by Boruta.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
        weak : boolean, default = False
            If set to True, the tentative features are also used to reduce X.
        return_df : boolean, default = False
            If ``X`` is a pandas dataframe and this parameter is set to True,
            the transformed data will also be a dataframe.
Returns
-------
X : array-like, shape = [n_samples, n_features_]
The input matrix X's columns are reduced to the features which were
selected by Boruta.
"""
return self._transform(X, weak, return_df)
def fit_transform(self, X, y, weak=False, return_df=False):
"""
Fits Boruta, then reduces the input X to the selected features.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples]
The target values.
        weak : boolean, default = False
            If set to True, the tentative features are also used to reduce X.
        return_df : boolean, default = False
            If ``X`` is a pandas dataframe and this parameter is set to True,
            the transformed data will also be a dataframe.
Returns
-------
X : array-like, shape = [n_samples, n_features_]
The input matrix X's columns are reduced to the features which were
selected by Boruta.
"""
self._fit(X, y)
return self._transform(X, weak, return_df)
def _validate_pandas_input(self, arg):
try:
return arg.values
except AttributeError:
raise ValueError(
"input needs to be a numpy array or pandas data frame."
)
def _fit(self, X, y):
# check input params
self._check_params(X, y)
if not isinstance(X, np.ndarray):
X = self._validate_pandas_input(X)
if not isinstance(y, np.ndarray):
y = self._validate_pandas_input(y)
self.random_state = check_random_state(self.random_state)
early_stopping = False
if self.early_stopping:
if self.n_iter_no_change >= self.max_iter:
if self.verbose > 0:
print(
f"n_iter_no_change is bigger or equal to max_iter"
f"({self.n_iter_no_change} >= {self.max_iter}), "
f"early stopping will not be used."
)
else:
early_stopping = True
# setup variables for Boruta
n_sample, n_feat = X.shape
_iter = 1
# early stopping vars
_same_iters = 1
_last_dec_reg = None
# holds the decision about each feature:
# 0 - default state = tentative in original code
# 1 - accepted in original code
# -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=float)
sha_max_history = []
# set n_estimators
if self.n_estimators != 'auto':
self.estimator.set_params(n_estimators=self.n_estimators)
# main feature selection loop
while np.any(dec_reg == 0) and _iter < self.max_iter:
# find optimal number of trees and depth
if self.n_estimators == 'auto':
# number of features that aren't rejected
not_rejected = np.where(dec_reg >= 0)[0].shape[0]
n_tree = self._get_tree_num(not_rejected)
self.estimator.set_params(n_estimators=n_tree)
# make sure we start with a new tree in each iteration
if self._is_lightgbm:
self.estimator.set_params(random_state=self.random_state.randint(0, 10000))
else:
self.estimator.set_params(random_state=self.random_state)
# add shadow attributes, shuffle them and train estimator, get imps
cur_imp = self._add_shadows_get_imps(X, y, dec_reg)
# get the threshold of shadow importances we will use for rejection
imp_sha_max = np.percentile(cur_imp[1], self.perc)
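            # e.g. perc = 100 reproduces vanilla Boruta (the max of the shadow
            # importances); perc = 80 uses the 80th percentile, a more
            # permissive threshold that admits more features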
# record importance history
sha_max_history.append(imp_sha_max)
imp_history = np.vstack((imp_history, cur_imp[0]))
# register which feature is more imp than the max of shadows
hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)
# based on hit_reg we check if a feature is doing better than
# expected by chance
dec_reg = self._do_tests(dec_reg, hit_reg, _iter)
# print out confirmed features
if self.verbose > 0 and _iter < self.max_iter:
self._print_results(dec_reg, _iter, 0)
if _iter < self.max_iter:
_iter += 1
# early stopping
if early_stopping:
if _last_dec_reg is not None and (_last_dec_reg == dec_reg).all():
_same_iters += 1
if self.verbose > 0:
print(
f"Early stopping: {_same_iters} out "
f"of {self.n_iter_no_change}"
)
else:
_same_iters = 1
_last_dec_reg = dec_reg.copy()
if _same_iters > self.n_iter_no_change:
break
# we automatically apply R package's rough fix for tentative ones
confirmed = np.where(dec_reg == 1)[0]
tentative = np.where(dec_reg == 0)[0]
# ignore the first row of zeros
tentative_median = np.median(imp_history[1:, tentative], axis=0)
# which tentative to keep
tentative_confirmed = np.where(tentative_median
> np.median(sha_max_history))[0]
tentative = tentative[tentative_confirmed]
# basic result variables
self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=bool)
        self.support_[confirmed] = True
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_[tentative] = True
        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=int)
# tentative variables are rank 2
self.ranking_[tentative] = 2
# selected = confirmed and tentative
selected = np.hstack((confirmed, tentative))
# all rejected features are sorted by importance history
not_selected = np.setdiff1d(np.arange(n_feat), selected)
# large importance values should rank higher = lower ranks -> *(-1)
imp_history_rejected = imp_history[1:, not_selected] * -1
# update rank for not_selected features
if not_selected.shape[0] > 0:
# calculate ranks in each iteration, then median of ranks across feats
iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
rank_medians = np.nanmedian(iter_ranks, axis=0)
ranks = self._nanrankdata(rank_medians, axis=0)
# set smallest rank to 3 if there are tentative feats
if tentative.shape[0] > 0:
ranks = ranks - np.min(ranks) + 3
else:
# and 2 otherwise
ranks = ranks - np.min(ranks) + 2
self.ranking_[not_selected] = ranks
else:
# all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=bool)
self.importance_history_ = imp_history
# notify user
if self.verbose > 0:
self._print_results(dec_reg, _iter, 1)
return self
def _transform(self, X, weak=False, return_df=False):
# sanity check
try:
self.ranking_
except AttributeError:
raise ValueError('You need to call the fit(X, y) method first.')
if weak:
indices = self.support_ + self.support_weak_
else:
indices = self.support_
if return_df:
X = X.iloc[:, indices]
else:
X = X[:, indices]
return X
def _get_tree_num(self, n_feat):
depth = None
try:
depth = self.estimator.get_params()['max_depth']
except KeyError:
warnings.warn(
"The estimator does not have a max_depth property, as a result "
" the number of trees to use cannot be estimated automatically."
)
        if depth is None:
depth = 10
# how many times a feature should be considered on average
f_repr = 100
# n_feat * 2 because the training matrix is extended with n shadow features
multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
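        # illustrative check of the heuristic above: with n_feat = 50 active
        # features and depth = 5, the extended matrix has 100 columns, so
        # multi = 100 / (sqrt(100) * 5) = 2 and n_estimators = 2 * 100 = 200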
n_estimators = int(multi * f_repr)
return n_estimators
def _get_imp(self, X, y):
try:
self.estimator.fit(X, y)
except Exception as e:
raise ValueError('Please check your X and y variable. The provided '
'estimator cannot be fitted to your data.\n' + str(e))
try:
imp = self.estimator.feature_importances_
except Exception:
            raise ValueError('Only methods with feature_importances_ attribute '
                             'are currently supported in BorutaPy.')
return imp
def _get_shuffle(self, seq):
self.random_state.shuffle(seq)
return seq
def _add_shadows_get_imps(self, X, y, dec_reg):
# find features that are tentative still
x_cur_ind = np.where(dec_reg >= 0)[0]
x_cur = np.copy(X[:, x_cur_ind])
x_cur_w = x_cur.shape[1]
# deep copy the matrix for the shadow matrix
x_sha = np.copy(x_cur)
        # make sure there are at least 5 columns in the shadow matrix,
        # duplicating its columns if needed
        while x_sha.shape[1] < 5:
            x_sha = np.hstack((x_sha, x_sha))
        # shuffle each shadow column independently
        x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
# get importance of the merged matrix
imp = self._get_imp(np.hstack((x_cur, x_sha)), y)
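        # e.g. with 10 original features of which 6 are still active, x_cur
        # has 6 columns and x_sha is a 6-column shuffled copy (it is only
        # doubled when fewer than 5 active columns remain), so imp[:6] holds
        # the real importances and imp[6:] the shadow ones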
# separate importances of real and shadow features
imp_sha = imp[x_cur_w:]
imp_real = np.zeros(X.shape[1])
imp_real[:] = np.nan
imp_real[x_cur_ind] = imp[:x_cur_w]
return imp_real, imp_sha
def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
# register hits for features that did better than the best of shadows
cur_imp_no_nan = cur_imp[0]
cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
hits = np.where(cur_imp_no_nan > imp_sha_max)[0]
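        # e.g. with cur_imp_no_nan = [0.12, 0.0, 0.03] and imp_sha_max = 0.05,
        # only feature 0 beats the shadow threshold, so hits = [0]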
hit_reg[hits] += 1
return hit_reg
def _do_tests(self, dec_reg, hit_reg, _iter):
active_features = np.where(dec_reg >= 0)[0]
hits = hit_reg[active_features]
# get uncorrected p values based on hit_reg
to_accept_ps = sp.stats.binom.sf(hits - 1, _iter, .5).flatten()
to_reject_ps = sp.stats.binom.cdf(hits, _iter, .5).flatten()
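        # e.g. a feature with 15 hits after _iter = 20 iterations gets
        # to_accept_p = P(X >= 15) for X ~ Binom(20, 0.5), which is ~0.021,
        # small enough to be confirmed at alpha = 0.05 before correction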
if self.two_step:
# two step multicor process
# first we correct for testing several features in each round using FDR
to_accept = self._fdrcorrection(to_accept_ps, alpha=self.alpha)[0]
to_reject = self._fdrcorrection(to_reject_ps, alpha=self.alpha)[0]
# second we correct for testing the same feature over and over again
# using bonferroni
to_accept2 = to_accept_ps <= self.alpha / float(_iter)
to_reject2 = to_reject_ps <= self.alpha / float(_iter)
# combine the two multi corrections, and get indexes
to_accept *= to_accept2
to_reject *= to_reject2
else:
            # as in the original Boruta, we simply do a Bonferroni correction
            # with the total n_feat in each iteration
to_accept = to_accept_ps <= self.alpha / float(len(dec_reg))
to_reject = to_reject_ps <= self.alpha / float(len(dec_reg))
# find features which are 0 and have been rejected or accepted
to_accept = np.where((dec_reg[active_features] == 0) * to_accept)[0]
to_reject = np.where((dec_reg[active_features] == 0) * to_reject)[0]
# updating dec_reg
dec_reg[active_features[to_accept]] = 1
dec_reg[active_features[to_reject]] = -1
return dec_reg
def _fdrcorrection(self, pvals, alpha=0.05):
"""
        Benjamini/Hochberg p-value correction for false discovery rate, taken
        from the statsmodels package. Included here to avoid a dependency on
        statsmodels.
Parameters
----------
pvals : array_like
set of p-values of the individual tests.
alpha : float
error rate
Returns
-------
rejected : array, bool
True if a hypothesis is rejected, False if not
pvalue-corrected : array
pvalues adjusted for multiple hypothesis testing to limit FDR
"""
pvals = np.asarray(pvals)
pvals_sortind = np.argsort(pvals)
pvals_sorted = np.take(pvals, pvals_sortind)
nobs = len(pvals_sorted)
ecdffactor = np.arange(1, nobs + 1) / float(nobs)
reject = pvals_sorted <= ecdffactor * alpha
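        # e.g. pvals = [0.01, 0.03, 0.04, 0.5] with alpha = 0.05 yields
        # thresholds [0.0125, 0.025, 0.0375, 0.05]; only 0.01 clears its
        # threshold, so exactly one hypothesis is rejected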
if reject.any():
rejectmax = max(np.nonzero(reject)[0])
reject[:rejectmax] = True
pvals_corrected_raw = pvals_sorted / ecdffactor
pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
pvals_corrected[pvals_corrected > 1] = 1
# reorder p-values and rejection mask to original order of pvals
pvals_corrected_ = np.empty_like(pvals_corrected)
pvals_corrected_[pvals_sortind] = pvals_corrected
reject_ = np.empty_like(reject)
reject_[pvals_sortind] = reject
return reject_, pvals_corrected_
def _nanrankdata(self, X, axis=1):
"""
Replaces bottleneck's nanrankdata with scipy and numpy alternative.
"""
ranks = sp.stats.mstats.rankdata(X, axis=axis)
ranks[np.isnan(X)] = np.nan
return ranks
def _check_params(self, X, y):
"""
Check hyperparameters as well as X and y before proceeding with fit.
"""
        # check that X and y have consistent lengths; X is a 2D array, y a vector
X, y = check_X_y(X, y)
if self.perc <= 0 or self.perc > 100:
raise ValueError('The percentile should be between 0 and 100.')
if self.alpha <= 0 or self.alpha > 1:
raise ValueError('Alpha should be between 0 and 1.')
def _print_results(self, dec_reg, _iter, flag):
n_iter = str(_iter) + ' / ' + str(self.max_iter)
n_confirmed = np.where(dec_reg == 1)[0].shape[0]
n_rejected = np.where(dec_reg == -1)[0].shape[0]
cols = ['Iteration: ', 'Confirmed: ', 'Tentative: ', 'Rejected: ']
# still in feature selection
if flag == 0:
n_tentative = np.where(dec_reg == 0)[0].shape[0]
content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
if self.verbose == 1:
output = cols[0] + n_iter
elif self.verbose > 1:
output = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)])
# Boruta finished running and tentatives have been filtered
else:
n_tentative = np.sum(self.support_weak_)
n_rejected = np.sum(~(self.support_|self.support_weak_))
content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)])
output = "\n\nBorutaPy finished running.\n\n" + result
print(output)
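

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): fit BorutaPy on
    # synthetic data where only the first five of twenty features carry
    # signal. Assumes scikit-learn is installed; make_classification and
    # RandomForestClassifier are standard scikit-learn APIs.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
                               n_redundant=0, shuffle=False, random_state=0)
    rf = RandomForestClassifier(n_jobs=-1, max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=1, random_state=1)
    feat_selector.fit(X, y)
    print(feat_selector.support_)   # boolean mask of confirmed features
    print(feat_selector.ranking_)   # 1 = confirmed, 2 = tentative, >= 3 rejected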