adaboosting.py
# -*- coding: utf-8 -*-
"""
Script for the AdaBoost regression class.
"""
#from helper_functions import scaler, MSE, importData
from sklearn.model_selection import train_test_split
from sklearn import tree
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score


class AdaBoost:

    def __init__(self, iterations, depth, loss_func, X_train, y_train,
                 X_test, y_test, X_eval=None, y_eval=None):
        '''
        Initialise the parameters of the AdaBoost.
        Inputs:
            iterations : int
                The number of boosting iterations.
            depth : int
                The maximum depth of the trees one would like to grow.
            loss_func : string
                The loss function to use: 'linear', 'square' or 'exponential'.
            X_train, X_eval, X_test : (n_samples, n_features)
            y_train, y_eval, y_test : (n_samples, 1)
        Returns:
            Nothing
        '''
        self.iterations = iterations
        self.depth = depth
        self.X_train, self.y_train, self.X_test, self.y_test = X_train, y_train, X_test, y_test
        self.X_eval, self.y_eval = X_eval, y_eval
        self.n = self.X_train.shape[0]  # number of training samples
        self.loss_func = loss_func

    def main(self, best_mse, best_iteration, best_depth, best_function, best_params):
        '''
        Train the boost, evaluate it on the train and test sets, and update
        the running best-model bookkeeping if the test MSE improves.
        '''
        AdaBoost.training(self)
        train_predict, train_MSE, train_R2 = AdaBoost.evaluate(self, self.X_train, self.y_train)
        test_predict, test_MSE, test_R2 = AdaBoost.evaluate(self, self.X_test, self.y_test)
        if test_MSE < best_mse:
            best_mse = test_MSE
            best_trees = self.trees
            best_iteration_weight = self.iteration_weight
            best_iteration = self.iterations
            best_depth = self.depth
            best_function = self.loss_func
            best_params = zip(best_trees, best_iteration_weight)
        return train_MSE, train_R2, test_MSE, test_R2, best_mse, best_iteration, best_depth, best_function, best_params

    # Define the loss functions for AdaBoost: each maps the residuals onto [0, 1].
    def linear(self, y_predict, y):
        loss = np.absolute(y_predict - y)
        return loss / np.amax(loss)

    def exponential(self, y_predict, y):
        loss = np.absolute(y_predict - y)
        return 1 - np.exp(-loss / np.amax(loss))

    def square(self, y_predict, y):
        loss = (y_predict - y)**2
        return loss / np.amax(loss)
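
    # A quick illustrative sketch of the three mappings (made-up numbers, not
    # taken from the report): for absolute residuals [1, 2, 4] the linear loss
    # is [0.25, 0.5, 1.0], the square loss is [1/16, 4/16, 1.0], and the
    # exponential loss is 1 - exp(-[0.25, 0.5, 1.0]).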

    def prediction_loss(self, y, p):
        '''
        A function for finding the loss using one of the loss functions above.
        Inputs:
            y : (n_samples, 1)
                The y data, this could be train or test.
            p : (n_samples, 1)
                The prediction data
        Returns:
            loss : (n_samples, 1)
                The calculated loss
        '''
        if self.loss_func == 'linear':
            loss = AdaBoost.linear(self, p, y)
        elif self.loss_func == 'square':
            loss = AdaBoost.square(self, p, y)
        elif self.loss_func == 'exponential':
            loss = AdaBoost.exponential(self, p, y)
        return loss

    def training(self):
        '''
        Function to run the AdaBoost iterative process, using the loss
        function chosen at initialisation ('square', 'linear' or
        'exponential').
        Inputs:
            None (the training data and loss function are taken from self).
        Returns:
            Nothing. The fitted weak learners, their iteration weights and the
            per-iteration losses are stored on self (self.trees,
            self.iteration_weight, self.loss and self.test_loss).
        '''
        W = np.ones(self.n)  # initialise sample weights as 1.0
        self.test_predict_iter = np.zeros(len(self.y_test))
        self.train_predict_iter = np.zeros(len(self.y_train))
        if self.X_eval is not None:
            self.eval_predict = np.zeros(len(self.y_eval))
        self.iteration_weight = np.zeros(self.iterations)
        self.trees = []
        self.beta = np.zeros(self.iterations)
        self.loss = np.zeros((self.iterations, len(self.y_train)))
        self.test_loss = np.zeros((self.iterations, len(self.y_test)))
        train_mask = np.ones(self.iterations, dtype=bool)
        test_mask = np.ones(self.iterations, dtype=bool)
        for i in range(0, self.iterations):
            # normalise the sample weights
            W_norm = W / np.sum(W)
            # fit a weak decision tree
            reg_weak = tree.DecisionTreeRegressor(max_depth=self.depth)
            reg_weak.fit(self.X_train, self.y_train, sample_weight=W_norm)
            # predict on train and test
            train_predict = reg_weak.predict(self.X_train)
            test_predict = reg_weak.predict(self.X_test)
            loss = AdaBoost.prediction_loss(self, train_predict, self.y_train)
            test_loss = AdaBoost.prediction_loss(self, test_predict, self.y_test)
            self.loss[i] = loss
            self.test_loss[i] = test_loss
            # find the weighted average loss
            loss_ave = np.sum(loss * W_norm)
            # stop learning once the average loss reaches 0.5
            if loss_ave >= 0.5:
                print('breaking AdaBoost: average loss >= 0.5')
                # drop the aborted iteration and all iterations that never ran
                test_mask[i:] = False
                train_mask[i:] = False
                self.iterations = i  # i iterations completed before the break
                self.loss = self.loss[train_mask]
                self.test_loss = self.test_loss[test_mask]
                return
            beta = loss_ave / (1.0 - loss_ave)
            self.iteration_weight[i] = np.log(1.0 / beta)  # in the report this is called alpha
            self.beta[i] = beta
            self.test_predict_iter += beta * test_predict
            self.train_predict_iter += beta * train_predict
            # update the sample weights: well-predicted samples (low loss) are
            # down-weighted more strongly, since beta < 1
            W = W_norm * (beta**(1 - loss))
            self.trees.append(reg_weak)

    def evaluate(self, X, y, in_trees=None, in_iteration_weight=None):
        '''
        This function finds the ensemble (weighted-median) prediction and
        returns the MSE and R2 score of this prediction.
        Inputs:
            X : (n_samples, n_features)
                The X matrix to be used in the ensemble prediction. Usually
                the training or test X would be passed here, or the
                evaluation X if used.
            y : (n_samples, 1)
            in_trees, in_iteration_weight : optional
                A previously fitted list of trees and their iteration weights;
                if omitted, the ones fitted by training() are used.
        Returns:
            median_predict : (n_samples, 1)
                The ensemble prediction
            MSE : float
            R2 : float
        '''
        prediction = []
        if in_trees is None:
            trees = self.trees
            iteration_weight = self.iteration_weight
        else:
            trees = in_trees
            iteration_weight = in_iteration_weight
        # collect every weak learner's prediction: one column per tree
        for i in range(0, len(trees)):
            prediction.append(trees[i].predict(X))
        prediction = np.array(prediction).T
        # weighted median: sort each sample's predictions, accumulate the
        # corresponding iteration weights, and take the first prediction at
        # which the cumulative weight reaches half of the total
        ordered_matrix_idx = np.argsort(prediction, axis=1)
        iteration_weight_cumu = np.cumsum(iteration_weight[ordered_matrix_idx], axis=1)
        max_cumu = iteration_weight_cumu[:, -1][:, np.newaxis]
        median_true = iteration_weight_cumu >= 0.5 * max_cumu
        median_idx = median_true.argmax(axis=1)
        median_iteration = ordered_matrix_idx[np.arange(X.shape[0]), median_idx]
        median_predict = prediction[np.arange(X.shape[0]), median_iteration]
        MSE, R2 = AdaBoost.calcMSE_R2(self, median_predict, y)
        return median_predict, MSE, R2
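
    # An illustrative trace of the weighted median above (made-up numbers):
    # suppose one sample gets tree predictions [3.0, 1.0, 2.0] with iteration
    # weights [0.2, 0.5, 0.3]. Sorting the predictions gives [1.0, 2.0, 3.0]
    # with weights [0.5, 0.3, 0.2] and cumulative weights [0.5, 0.8, 1.0];
    # the first cumulative weight >= 0.5 * 1.0 is the first entry, so the
    # ensemble prediction for that sample is 1.0.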

    def calcMSE_R2(self, p, y):
        '''
        A function to calculate the mean squared error and R2 score.
        Inputs:
            p : (n_samples, 1)
                the prediction
            y : (n_samples, 1)
                the y values
        Returns:
            MSE : float
            R2 : float
        '''
        MSE = mean_squared_error(y, p)
        R2 = r2_score(y, p)
        return MSE, R2

    def shuffleAndsplit(self, X, y):
        '''
        A function for shuffling and splitting data. This function is no
        longer used; the data is now split in the Regression class in
        methods.py.
        '''
        curr_seed = 0
        # reseeding with the same seed shuffles X and y with the same permutation
        np.random.seed(curr_seed)
        np.random.shuffle(X)
        np.random.seed(curr_seed)
        np.random.shuffle(y)
        X = X[0:1000]  # algorithm testing with smaller samples than the full data
        y = y[0:1000]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        return X_train, X_test, y_train, y_test
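

# A minimal usage sketch, not part of the original script: it only illustrates
# how the class above might be driven. The synthetic data, the hyperparameter
# values and starting best_mse at np.inf are assumptions made purely for this
# example.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 5))
    y = X[:, 0] + 0.5 * X[:, 1]**2 + rng.normal(scale=0.1, size=500)  # toy target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    booster = AdaBoost(iterations=20, depth=3, loss_func='linear',
                       X_train=X_train, y_train=y_train,
                       X_test=X_test, y_test=y_test)
    # starting best_mse at infinity means the first run always becomes the best
    results = booster.main(best_mse=np.inf, best_iteration=None,
                           best_depth=None, best_function=None,
                           best_params=None)
    train_MSE, train_R2, test_MSE, test_R2 = results[:4]
    print('train MSE = {:.4f}, R2 = {:.4f}'.format(train_MSE, train_R2))
    print('test  MSE = {:.4f}, R2 = {:.4f}'.format(test_MSE, test_R2))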