-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgreedyfs.py
112 lines (95 loc) · 3.18 KB
/
greedyfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import datetime
import time
from numpy import mean
from pandas import DataFrame
from sklearn.base import clone
def adjust_tuned_par(x, tp):
    """Clamp the 'max_features' hyperparameter grid to the current column count.

    :param x: candidate feature DataFrame (only ``len(x.columns)`` is used)
    :param tp: dict of hyperparameter grids, e.g. ``{'max_features': [...], ...}``
    :return: new dict in which 'max_features' candidates larger than the number
        of available columns are dropped; when any were dropped, the column
        count itself is appended so the top of the grid is still represented.
        All other keys are passed through unchanged.
    """
    tp2 = {}
    n_cols = len(x.columns)
    for p in tp:
        if p == 'max_features':
            kept = [c for c in tp[p] if c <= n_cols]
            # Largest candidate that had to be dropped (0 if none were).
            dropped_max = max((c for c in tp[p] if c > n_cols), default=0)
            # Bug fix: the original did max(n) on a possibly-empty list and
            # raised ValueError when every candidate exceeded the column
            # count. Appending n_cols keeps the original behavior when the
            # list is non-empty and repairs the empty-list case.
            if dropped_max and (not kept or dropped_max > max(kept)):
                kept.append(n_cols)
            tp2[p] = kept
        else:
            tp2[p] = tp[p]
    return tp2
def gfs_step_one_f(xc, est, x_train, y_train, tuned_parameters, c, bp, u, e):
    """Score one candidate feature *c* added to the current subset *xc*.

    Fits a fresh clone of the search estimator on xc + column c and records
    the result in place, keyed by c:
    bp[c] - best hyperparameters found by the search,
    u[c]  - training-set accuracy of the best estimator,
    e[c]  - per-sample boolean correctness vector.
    """
    candidate = xc.copy()
    candidate[c] = x_train[c]
    searcher = clone(est)  # unfitted copy so repeated evaluations stay independent
    # Shrink the grid so 'max_features' never exceeds the candidate width.
    searcher.param_grid = adjust_tuned_par(candidate, tuned_parameters)
    searcher.fit(candidate, y_train[0])
    hits = searcher.best_estimator_.predict(candidate) == y_train[0]
    bp[c] = searcher.best_params_
    u[c] = mean(hits)
    e[c] = hits
def gfs_step(xc, est, x_train, y_train, tuned_parameters, callback=None, tie_min_trees=True):
    # find one additional best feature
    #
    # Scores every column of x_train not yet in xc via gfs_step_one_f and
    # appends the winning column to xc IN PLACE. Accuracy ties are broken by
    # the candidate whose best params use the fewest 'n_estimators' when
    # tie_min_trees is True.
    #
    # Returns: (xc, chosen column, best-params dict, best accuracy,
    #           per-candidate accuracy dict, per-candidate correctness dict).
    bp = {}  # candidate column -> best hyperparameters found
    u = {}   # candidate column -> training accuracy
    e = {}   # candidate column -> per-sample correctness vector
    s = len(xc.columns)  # current subset size, reported to the callback as the step
    i = 0                # progress counter over ALL columns, incl. already-selected ones
    p = len(x_train.columns)
    for c in x_train.columns:
        i += 1
        if c not in xc.columns:
            gfs_step_one_f(xc, est, x_train, y_train, tuned_parameters, c, bp, u, e)
        if callback:
            callback(s, i, p)
            # Deliberate pacing so a progressbar UI has time to repaint;
            # only paid when a callback is supplied.
            time.sleep(0.1)
    m = max(u.values())  # best accuracy among candidates; assumes u is non-empty
    if len([c for c in u.keys() if u[c] == m]) == 1 or not tie_min_trees:
        c_max = max(u, key=u.get)
    else:
        # Tie-break: among equally-accurate candidates, prefer the one whose
        # tuned model uses the fewest trees.
        v = {}
        for c in u.keys():
            if u[c] == m:
                v[c] = bp[c]['n_estimators']
        c_max = min(v, key=v.get)
    # NOTE(review): c_max always holds a real column name at this point, so
    # this guard looks vestigial — confirm before removing.
    if c_max != '':
        xc[c_max] = x_train[c_max]
    return xc, c_max, bp, m, u, e
def greedy_feature_selection(est, x_train, y_train, margin, tuned_parameters, callback=None, tie_min_trees=True):
    """
    Main algorithm of GFS
    :param est:
        trained estimator, instance of RandomForestClassifier
        (NOTE(review): the step code reads .param_grid / .best_estimator_ /
        .best_params_, i.e. a GridSearchCV-style search object wrapping the
        forest, not a bare classifier — confirm against callers)
    :param x_train:
        input training dataset: DataFrame
    :param y_train:
        input class labels: DataFrame
    :param margin:
        lower limit as stop criterion for training: float
    :param tuned_parameters:
        dict of hyperparameters to be tuned during training: dict
    :param callback:
        progressbar callback function with three parameters: step, feature number, max features
    :param tie_min_trees:
        whether the least-trees-used criterion is applied: boolean
    :return:
        reduced dataset: DataFrame,
        algorithm details: list
    """
    xc = DataFrame()  # selected-feature subset, grown one column per iteration
    ld = []           # per-step log: [chosen feature, best params, best acc, acc dict, correctness dict]
    u = {}
    rez_stari = 0.0   # previous iteration's best accuracy ("old result")
    # Keep adding features while accuracy is non-decreasing but still below
    # the stopping margin, and columns remain to add. On the first pass
    # ld == [] short-circuits the condition, so max(u.values()) is never
    # evaluated on the initially-empty dict.
    while (ld == [] or (rez_stari <= max(u.values()) < margin)) and len(xc.columns) < len(x_train.columns):
        if ld:
            rez_stari = max(u.values())
        xc, c_max, bp, m, u, e = gfs_step(xc, est, x_train, y_train, tuned_parameters, callback, tie_min_trees)
        ld.append([c_max, bp, m, u, e])
    return xc, ld  # output dataset and algorithm details
if __name__ == '__main__':
    # Library module: running the file directly is deliberately a no-op.
    pass