# -*- coding: utf-8 -*-
"""
Created on Sat Jan 02 12:40:14 2016
@author: Pavitrakumar
"""
from __future__ import division
import numpy as np
from sklearn import datasets
import cvxopt
import cvxopt.solvers
from sklearn.model_selection import train_test_split
#generating a linearly separable data set.
#Target values are -1 or +1.
mean1 = np.array([0, 2])
mean2 = np.array([2, 0])
cov = np.array([[0.8, 0.6], [0.6, 0.8]])
X1 = np.random.multivariate_normal(mean1, cov, 100)
y1 = np.ones(len(X1))
X2 = np.random.multivariate_normal(mean2, cov, 100)
y2 = np.ones(len(X2)) * -1
X = np.vstack((X1, X2))
y = np.hstack((y1, y2))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#X is a 200x2 matrix (2 features, 200 data points) and y is a length-200 vector
#after the split, X_train is a 150x2 matrix and y_train has 150 entries
n_samples = X_train.shape[0]
n_features = X_train.shape[1]
def kernel(x, y):
    #linear kernel: a plain dot product
    return np.dot(x, y)
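#other kernels could be dropped in here instead; for example, an RBF kernel
#(a sketch only - the gamma value is an assumption, and the explicit weight
# vector computed further below only makes sense for the linear kernel):
#def rbf_kernel(x, y, gamma=0.5):
#    return np.exp(-gamma * np.linalg.norm(x - y) ** 2)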
K = np.zeros((n_samples, n_samples))
for i in range(n_samples):
    for j in range(n_samples):
        K[i, j] = kernel(X_train[i], X_train[j])
#K is (150,150) matrix
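#the double loop above can also be written in vectorized form since the kernel is a plain dot product;
#the check below is just a sanity test of that equivalence:
K_check = np.dot(X_train, X_train.T)
assert np.allclose(K, K_check)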
#look at lecture notes: each term of the P matrix looks like yi*yj*(Xi^T*Xj)...
#we have already calculated the (Xi^T*Xj) part in the matrix K.
#now we need the yi*yj part - since each yi is just a scalar, we simply compute the outer product.
Y = np.outer(y_train,y_train)
#now to combine K and Y into P
P = cvxopt.matrix(Y * K)
#the quadratic objective we are trying to minimize is:
# (1/2) x^T*P*x + Q^T*x
#we have computed P already, now for Q:
#according to the final minimization equation, the coefficient of each alpha_n in the linear term is (-1)
#so, we pass a column vector of -1s as Q
Q = cvxopt.matrix(-1 * np.ones(n_samples))
#now, we have the solver which looks like this:
#cvxopt.solvers.qp(P, Q, G, h, A, b)
#We also need to pass in the constraint matrices G, h, A, b
A = cvxopt.matrix(y_train, (1,n_samples),tc='d') #reshaping (1x150 matrix)
b = cvxopt.matrix(0.0)
#The above A and b correspond to the general form Ax=b, which encodes the constraint y^T*alpha = 0 in our problem.
#for general Qp using cvxopt refer: #https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf
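#for reference, cvxopt.solvers.qp(P, Q, G, h, A, b) solves the standard quadratic program:
#   minimize    (1/2) x^T P x + Q^T x
#   subject to  G x <= h   (componentwise)
#               A x  = b
#here x is the vector of Lagrange multipliers alpha_n, one per training point.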
#G and h general form: Gx <= h. With a soft margin the constraint is 0 <= alpha_n <= C; for the hard-margin case it is just 0 <= alpha_n.
#for the hard-margin case, 0 <= alpha_n is not yet in the form Gx <= h. To convert it, we rewrite it as:
# (-1)*alpha_n <= 0
# so, G will be a matrix with -1s on the diagonal and h will be a 150x1 vector of 0s
# (the RHS of every inequality is 0, which is why h is simply a vector of zeros).
# Why does the G matrix have a diagonal of -1s?
# Our goal is to find the "support vectors", and these support vectors are a subset of the existing data points
# (the support vectors define the margin of the hyperplane we are trying to find).
# So each data point gets its own unknown multiplier alpha_n, and the resulting value decides whether that point is a support vector or not.
# If we have 5 data points, the inequalities are: (-1)*alpha_1 + 0*alpha_2 + 0*alpha_3 + 0*alpha_4 + 0*alpha_5 <= 0, i.e. G row = [-1,0,0,0,0] and h entry = 0, .... through 0*alpha_1 + 0*alpha_2 + 0*alpha_3 + 0*alpha_4 + (-1)*alpha_5 <= 0, i.e. G row = [0,0,0,0,-1] and h entry = 0
G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1)) # 150x150
h = cvxopt.matrix(np.zeros(n_samples)) # 150x1
#for the soft-margin case, alpha_n lies in a range, i.e. 0 <= alpha_n <= C
#similar to before where we had only one inequality per point to take care of, here we have 2.
#that is, 0 <= alpha_n <= C can be split in 2 (we need to split because constraints to the QP solver have to be in the form Gx <= h):
# 0 <= alpha_n  ==>  (-1)*alpha_n <= 0  (is in the form Gx <= h)  .....(1)
# alpha_n <= C  ==>  (1)*alpha_n <= C   (is in the form Gx <= h, no need to change signs)  .....(2)
#So, the matrix G is the previous matrix stacked (vertically) with an identity matrix of 1s
tmp1 = np.diag(np.ones(n_samples) * -1)  # = G for the hard-margin case (only constraint (1))
tmp2 = np.identity(n_samples)            # one row (1)*alpha_n per point, from constraint (2)
G = cvxopt.matrix(np.vstack((tmp1, tmp2))) # vertically stack the 2 matrices above
#G is now a 300x150 matrix
#Similarly, h has to take care of the upper bound alpha_n <= C:
#it is the previous length-150 vector of zeros stacked (horizontally) with a length-150 vector filled with the value C.
C = 2
tmp1 = np.zeros(n_samples)       # = h for the hard-margin case (only constraint (1))
tmp2 = np.ones(n_samples) * C    # upper bounds from constraint (2)
h = cvxopt.matrix(np.hstack((tmp1, tmp2)))  # length-300 vector
#h is now [0,0,0,....0,0,C,C,....,C,C,C]
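#to make the stacked system concrete, a tiny sketch with only 3 training points would give:
#   G = [[-1, 0, 0],        h = [0,
#        [ 0,-1, 0],             0,
#        [ 0, 0,-1],             0,
#        [ 1, 0, 0],             C,
#        [ 0, 1, 0],             C,
#        [ 0, 0, 1]]             C]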
#Now we have all the required variables needed to pass to the QP solver and get the xns (the support vectors)
solution = cvxopt.solvers.qp(P, Q, G, h, A, b)
a = np.ravel(solution['x'])
#in the above vector, the entries that are non-zero (above a small cut-off value - we have chosen 1e-5) correspond to the support vectors.
sv = a > 1e-5 # [true if condition satisfies, else false]
ind = np.arange(len(a))[sv] #getting the indices of the support vectors in the training set
#ex: ind = [1,4,12,59,30] <- 1st,4th,12th etc.. are all support vectors.
a = a[sv]  # alphas of the support vectors
sv_x = X_train[sv]  #support-vector points (from the training set, since the alphas are aligned with X_train)
sv_y = y_train[sv]  #support-vector labels
#now we need to compute intercept (b)
#b = ym - sigma_n a_n*y_n*K(xn,xm), where a_n > cutoff and K(xn,xm) is the kernelized input
#basically, it is b = actual - predicted (the error), averaged over the support vectors
#since we are only using a linear kernel, K(xn,xm) is just Xn^T.Xm (already stored in the K matrix)
b = 0
for i in range(len(a)):
    ym = sv_y[i]
    xm = sv_x[i]
    b += ym
    b -= np.sum(a * sv_y * K[sv, ind[i]])
b /= len(a)
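#a vectorized equivalent of the intercept loop above (a sketch; K[np.ix_(ind, ind)] is the kernel matrix restricted to the support vectors):
#b = np.mean(sv_y - np.dot(a * sv_y, K[np.ix_(ind, ind)]))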
#computing weight vector:
w = np.zeros(n_features)
#w = sigma_n a_n * Xn * yn, where [Xn,yn] are the support-vector data points and a_n is the alpha of that support vector.
#since we are using a linear kernel, we can recover an explicit weight vector.
for i in range(len(a)):
    w += a[i] * sv_x[i] * sv_y[i]
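#a vectorized equivalent of the weight loop above (a sketch):
#w = np.dot(a * sv_y, sv_x)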
y_predict = np.zeros(len(X_test))
for i in range(len(X_test)):
    #similar to the formula for b; just re-arrange it and compute the kernel against the test point this time.
    s = 0
    for a_val, xm, ym in zip(a, sv_x, sv_y):
        #print(a_val, xm, kernel(X_test[i], xm))  #optional debug output
        s += a_val * ym * kernel(X_test[i], xm)
    y_predict[i] = s
y_predict = np.sign(y_predict + b)
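#with a linear kernel, the same decision values can be obtained directly from the primal weight vector (a sketch):
#y_predict_primal = np.sign(np.dot(X_test, w) + b)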
print "accuracy is ",(sum(y_predict==y_test)/len(y_predict))
#right now, this only works on binary classification tasks, but the one-vs-all technique can be used to build a multi-class classifier.
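#as a rough sanity check (a sketch, assuming scikit-learn is available), a linear-kernel SVC with the same C
#should reach a comparable accuracy on this data:
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=C)
clf.fit(X_train, y_train)
print("sklearn SVC accuracy:", clf.score(X_test, y_test))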