# ada.py
# Bill Waldrep, November 2012
#
# Implementation of AdaBoost with decision stumps
# as weak learners.

import numpy as np
import math as m
import matplotlib
import matplotlib.pyplot as pl
import matplotlib.cm as cm

def load_data(suffix='train.csv'):
data_file = 'data/ada_x_' + suffix
label_file = 'data/ada_y_' + suffix
delim = ','
data = np.loadtxt(data_file, delimiter=delim)
labels = np.loadtxt(label_file, delimiter=delim)
return data, labels

def show_image(img):
    """Plot the input as a greyscale
    image for sanity checking."""
    temp = np.reshape(img, (28, 28))
    pl.imshow(temp, cmap=cm.Greys_r)
    pl.show()

class DecisionStump:
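    """A weak learner: predicts +1 everywhere except on one (inclusive)
    side of a single threshold in a single feature dimension, where it
    predicts -1."""
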
def __init__(self, dimension=-1, thresh=-1, inequal=-1):
self.dim = dimension
self.t = thresh
self.iq = inequal

    def classify(self, data):
# start by labeling everything 1
results = np.ones(len(data),'int')
if self.iq == "gte":
# flip the labels of everything greater than or equal to the threshold
results[data[:,self.dim] >= self.t] = -1
else:
# flip the labels of everything less than or equal to the threshold
results[data[:,self.dim] <= self.t] = -1
return results

    def train(self, data, label, D):
rows, cols = np.shape(data)
min_error = np.inf
        for dim in range(cols):
# get all possible thresholds we could split on
vals = np.unique(data[:,dim])
for t in vals:
for flip in ["gte", "lte"]:
candidate = DecisionStump(dim,t,flip)
guess = candidate.classify(data)
# initialize to zero error
error = np.zeros(rows,'int')
error[guess != label] = 1
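                    # weighted error: eps = sum_i D(i) * [h(x_i) != y_i]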
weighted = np.dot(error,D)
                    # print("dim %d, t %d, iq %s, w %f, min %f" % (dim, t, flip, weighted, min_error))
if weighted < min_error:
min_error = weighted
self.dim = dim
self.t = t
self.iq = flip
return self.classify(data), min_error

    def debug(self):
        print(self.dim, self.t, self.iq)

class AdaBoost:
def train(self, T=500, suffix='train.csv'):
data, labels = load_data(suffix)
tdat, tlab = load_data('test.csv')
rows, cols = np.shape(data)
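        # initial distribution over examples: D_1(i) = 1/n (uniform)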
weights = np.ones(rows, 'float')/rows
self.alphas = []
self.classifiers = []
trainerr = []
testerr = []
        for i in range(T):
h = DecisionStump()
guess, herr = h.train(data,labels,weights)
            # print(i, herr)
# add new classifier and coefficient
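            # alpha_t = 0.5 * ln((1 - err_t) / err_t); max(herr, 1e-16)
            # guards against division by zero for a perfect stump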
self.alphas.append(m.log((1-herr)/max(herr,1e-16))/2)
self.classifiers.append(h)
# normalization factor
z = 2 * m.sqrt(max(herr,1e-16) * (1 - herr))
            # re-weight the training examples
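            # update: D_{t+1}(i) = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t,
            # so misclassified examples gain weight and correct ones lose it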
temp = np.multiply(-self.alphas[i] * labels, guess)
weights = np.multiply(weights, np.exp(temp))/z
# compute and save training and testing error
trainerr.append(self.check(data, labels))
testerr.append(self.check(tdat, tlab))
# We need to plot this stuff
            print(i, self.alphas[i], trainerr[i], testerr[i])
return trainerr, testerr

    def evaluate(self, data):
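        # strong classifier: H(x) = sign(sum_t alpha_t * h_t(x))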
n = len(data)
# start with a neutral prediction
result = np.zeros(n, 'float')
# add the weighted vote of each weak classifier
for i in range(len(self.alphas)):
h = self.classifiers[i]
result += h.classify(data) * self.alphas[i]
# take the sign to get the final classification
return np.sign(result)

    def check(self, data, labels):
guess = self.evaluate(data)
error = np.zeros(len(guess), 'int')
error[guess != labels] = 1
return error.sum()/float(len(error))

# Debugging
if __name__ == '__main__':
    a = AdaBoost()
    train, test = a.train()
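
    # A plotting sketch (not in the original code): the train() loop notes
    # that the per-round errors still need plotting, so this graphs the
    # training and test error curves returned above. Assumes matplotlib
    # can open a display window.
    rounds = range(1, len(train) + 1)
    pl.plot(rounds, train, label='training error')
    pl.plot(rounds, test, label='test error')
    pl.xlabel('boosting round')
    pl.ylabel('error rate')
    pl.legend()
    pl.show()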