#!/usr/bin/python
import random
##################################################################
###################### Testing Tools #############################
##################################################################
'''
Generates inputs for gradient descent.
Inputs:
    voters: the number of voters, or n from the pdf
    demographic: the number of demographic features, or m from the pdf
    error: how well our model will fit the data. Below 10 it's pretty good; above 50 it's pretty bad.
Output:
    Theta: n by m matrix from the pdf
    Y: length n vector of preferences
    true_x: the x from which we generated Y. If error is low, this will be quite close to optimal.
            When testing, you should check whether the final x you get has a low mean_square_diff with it.
    initial_x: a perturbed version of true_x. Useful as a starting point.
'''
# Uncomment the block below to generate random parameters. When you submit, comment it out again:
# you won't be able to use numpy on alg.
'''
import numpy as np

def generate_parameters(voters, demographic, error):
    # Randomly generate true params Theta, true_x, and Y
    Theta = 100 * np.random.rand(voters, demographic)
    true_x = 10 * np.random.rand(1, demographic) - 5
    Y = Theta.dot(true_x.transpose())
    Y = Y.transpose()
    Y = Y + np.random.normal(scale=error, size=voters)
    # Perturb the true x to get something close
    scaling = 0.5 * np.ones((1, demographic)) + np.random.rand(1, demographic)
    initial_x = np.multiply(true_x, scaling)
    # Somewhat hacky way to convert away from np arrays to lists
    Theta = Theta.tolist()
    Y = [Y[0][i] for i in range(voters)]
    true_x = [true_x[0][i] for i in range(demographic)]
    initial_x = [initial_x[0][i] for i in range(demographic)]
    return Theta, Y, true_x, initial_x
'''
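# As a sketch (not part of the original template), here is a numpy-free variant of
# generate_parameters built only on the random module from the standard library, so it
# follows the same spec as the docstring above without tripping the numpy restriction.
# The name generate_parameters_purepython is ours; whether alg accepts extra helpers is
# an assumption.
def generate_parameters_purepython(voters, demographic, error):
    # Randomly generate true params Theta, true_x, and a noisy Y = Theta * true_x + noise
    Theta = [[100 * random.random() for _ in range(demographic)] for _ in range(voters)]
    true_x = [10 * random.random() - 5 for _ in range(demographic)]
    Y = [sum(Theta[i][j] * true_x[j] for j in range(demographic)) + random.gauss(0, error)
         for i in range(voters)]
    # Perturb the true x to get a nearby starting point, mirroring the numpy version
    initial_x = [true_x[j] * (0.5 + random.random()) for j in range(demographic)]
    return Theta, Y, true_x, initial_x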
'''
This function is used by the tests and may be useful when deciding whether you should stop.
It takes two vectors as input and computes the mean of their squared differences.
Inputs:
    v1, a length k vector
    v2, a length k vector
Output:
    mse, a float, their mean squared error
'''
def mean_square_diff(v1, v2):
    diff_vector = [v1[i] - v2[i] for i in range(len(v1))]
    mse = 1.0 / len(v1) * sum(difference ** 2 for difference in diff_vector)
    return mse
##################################################################
#################### Part B: Gradient Descent ####################
##################################################################
# GRADIENT DESCENT SPECIFICS
# The stopping condition is given below, namely, when the mean squared diff of the x's
# between iterations is less than some constant. Note, this is not the mean squared diff
# of f(x) but of the vector x itself! For instance:
#   x_at_iteration_k   = [1, 2, 4, 5]
#   x_at_iteration_k+1 = [1, 4, 2, 6]
#   mean_square_change = mean_square_diff(x_at_iteration_k, x_at_iteration_k+1)
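# Worked out for the example above (our own arithmetic, not from the handout): the
# elementwise differences are [0, -2, 2, -1], their squares are [0, 4, 4, 1], so
# mean_square_change = (0 + 4 + 4 + 1) / 4 = 2.25.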
'''
Compute a 'sufficiently close to optimal' x using gradient descent
Inputs:
    Theta - the voting data as an n by m array
    Y - the favorability scores of the voters
    initial_x - an initial guess for the optimal parameters, provided to you
    eta - the learning rate, which will be given to you
Output:
    A nearly optimal x.
'''
def gradient_descent(Theta, Y, initial_x, eta):
    # We've initialized some variables for you
    n, m = len(Theta), len(Theta[0])
    current_x = initial_x
    mean_square_change = 1
    while mean_square_change > 0.0000000001:
        # Compute the predictions Theta * x for every voter
        theta_x = []
        for i in range(0, n):
            curr = 0
            for j in range(0, m):
                curr += current_x[j] * Theta[i][j]
            theta_x.append(curr)
        # Gradient step, already scaled by the learning rate: (2 * eta / n) * Theta^T (Y - Theta x)
        gradient = []
        for i in range(0, m):
            curr = 0
            for j in range(0, n):
                curr += Theta[j][i] * (Y[j] - theta_x[j]) * (2.0 * eta / n)
            gradient.append(curr)
        # Take the step and measure how much x moved
        prev_x = current_x
        current_x = []
        for i in range(0, m):
            current_x.append(prev_x[i] + gradient[i])
        mean_square_change = mean_square_diff(prev_x, current_x)
    return current_x
##################################################################
############### Part C: Minibatch Gradient Descent ###############
##################################################################
################################## ALGORITHM OVERVIEW ###########################################
# Very similar to the above, but now we take a subset of 10 voters on which to perform each
# gradient update. We could pick a random set of 10 each time, but we're going to do this
# semi-randomly as follows:
#   - Generate a random permutation of [0, 1, ..., n-1] (say, [5, 11, 2, 8, ...]).
#     This permutation lets us choose which subset of 10 voters to focus on.
#   - Slide a window of size 10 over the permutation: first the first 10 elements,
#     then the next 10, and so on, cycling once we reach the end of the permutation.
#   - For each group of ten, perform a subgradient update on x.
#     You can derive this from J(x)^mini.
#   - Lastly, we only update our stopping condition, mean_square_change, once we have
#     iterated through all n voters; counter keeps track of this. A sketch of the window
#     bookkeeping follows below.
#################################################################################################
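# A minimal sketch (our own illustration, not part of the original template) of the window
# bookkeeping described above. The helper name minibatch_windows is hypothetical and is not
# used by the functions below; it only shows how a window of 10 indices slides over the
# permutation and starts over after a full pass.
def minibatch_windows(ordering, batch_size=10):
    # Yield ordering[0:10], ordering[10:20], ..., restarting from the beginning once a
    # full pass over the permutation has been made (assumes len(ordering) is a multiple
    # of batch_size, as minibatch_gradient_descent below does).
    position = 0
    while True:
        yield ordering[position:position + batch_size]
        position += batch_size
        if position >= len(ordering):
            position = 0
# Example: next(minibatch_windows(list(range(20)))) yields the first window [0, ..., 9].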
'''
Minibatch Gradient Descent
Compute a 'sufficiently close to optimal' x using gradient descent with small batches
Inputs:
    Theta - the voting data as an n by m array
    Y - the favorability scores of the voters
    initial_x - an initial guess for the optimal parameters, provided to you
    eta - the learning rate, which will be given to you
Output:
    A nearly optimal x.
'''
def minibatch_gradient_descent(Theta, Y, initial_x, eta):
    # We've gotten you started. voter_ordering is a random permutation of the voter indices;
    # counter tracks how many windows we've consumed in the current pass.
    n, m = len(Theta), len(Theta[0])
    current_x = initial_x
    voter_ordering = list(range(n))
    random.shuffle(voter_ordering)
    mean_square_change = 1
    counter = 0
    # temp_x remembers x at the start of each full pass so we can measure the change
    temp_x = current_x[:]
    while mean_square_change > 0.000000001:
        # Current window of 10 voters from the permutation
        start = int(counter * 10)
        curr_iter = voter_ordering[start:start + 10]
        # Residuals Y_i - Theta_i . x for the voters in this batch (length 10)
        error_vec = [Y[i] - dot_product(current_x, Theta[i]) for i in curr_iter]
        # Subgradient over the batch: Theta_batch^T * error_vec
        sub_grad = []
        for i in range(m):
            temp = 0
            for j in range(10):
                temp += error_vec[j] * Theta[curr_iter[j]][i]
            sub_grad.append(temp)
        # Update x using only this batch
        for k in range(0, m):
            current_x[k] += eta * (2.0 / 10) * sub_grad[k]
        counter += 1
        # Only update the stopping condition after a full pass over all n voters
        # (assumes n is a multiple of 10)
        if counter == n // 10:
            mean_square_change = mean_square_diff(temp_x, current_x)
            temp_x = current_x[:]
            counter = 0
    return current_x
##################################################################
############## Part D: Line Search Gradient Descent ##############
##################################################################
'''
Compute the mean squared error between the prediction for Y given Theta and the current
parameters x, and the actual voter desires, Y.
Inputs:
    Theta - the voting data as an n by m array
    Y - the favorability scores of the voters. Length n.
    x - the current guess for the optimal parameters. Length m.
Output:
    A float for the prediction error.
'''
def prediction_error(Theta, Y, x):
    # Compute the MSE between the prediction Theta * x and Y
    n = len(Theta)
    prediction_error = 0
    for i in range(0, n):
        err = Y[i] - dot_product(x, Theta[i])
        prediction_error += err * err
    return prediction_error / n
'''
This function should return the next current_x, obtained by doubling the learning rate
until we hit the max rate or the prediction error increases.
Inputs:
    Theta       The voting data as an n by m array.
    Y           The favorability scores of the voters. Length n.
    current_x   Current guess for x. Length m.
    gradient    Gradient at current_x. Length m.
    min_rate    Fixed minimum learning rate.
    max_rate    Fixed maximum learning rate.
Output:
    updated_x   Check the pseudocode.
'''
def line_search(Theta, Y, current_x, gradient, min_rate=0.0000001, max_rate=0.1):
    n = len(Theta)
    m = len(Theta[0])
    # Start from the minimum rate and take one step
    eta_cur = min_rate
    grad = compute_gradient(Theta, Y, current_x)
    current_x = [current_x[k] - eta_cur * grad[k] for k in range(0, m)]
    # Keep doubling the rate while the prediction error does not get worse
    while eta_cur < max_rate:
        grad = compute_gradient(Theta, Y, current_x)
        x_temp = [current_x[k] - eta_cur * grad[k] for k in range(0, m)]
        if prediction_error(Theta, Y, x_temp) <= prediction_error(Theta, Y, current_x):
            current_x = x_temp
            eta_cur = 2 * eta_cur
        else:
            break
    return current_x
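# Design note (our reading of the code, not from the handout): each call to line_search
# restarts at min_rate and doubles the rate geometrically, so the step size is re-tuned
# on every call rather than carried over, and the passed-in gradient argument is unused
# because the gradient is recomputed after each accepted step.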
# Small helpers for working with plain Python lists as vectors
def dot_product(a, b):
    assert len(a) == len(b)
    res = 0
    for i in range(len(a)):
        res += a[i] * b[i]
    return res

def vector_with_scalar(v, s):
    return [v[i] * s for i in range(len(v))]

'''
Inputs:
    Theta   The voting data as an n by m array.
    Y       The favorability scores of the voters. Length n.
    x       The current guess for the optimal parameters. Length m.
Output:
    gradient    Length m vector of the gradient.
'''
def compute_gradient(Theta, Y, current_x):
    # Same computation as in part B, but without the learning rate folded in
    n = len(Theta)
    m = len(Theta[0])
    # Predictions Theta * x for every voter
    theta_x = []
    for i in range(0, n):
        curr = 0
        for j in range(0, m):
            curr += current_x[j] * Theta[i][j]
        theta_x.append(curr)
    # Gradient of the MSE: -(2 / n) * Theta^T (Y - Theta x)
    gradient = []
    for i in range(0, m):
        curr = 0
        for j in range(0, n):
            curr -= Theta[j][i] * (Y[j] - theta_x[j]) * (2.0 / n)
        gradient.append(curr)
    return gradient
def gradient_descent_complete(Theta, Y, initial_x):
    # Gradient descent with line search: keep going while the prediction error
    # still drops by more than 0.1 per iteration.
    n, m = len(Theta), len(Theta[0])
    delta = 1
    current_x = initial_x
    last_error = prediction_error(Theta, Y, current_x)
    while delta > 0.1:
        gradient = compute_gradient(Theta, Y, current_x)
        current_x = line_search(Theta, Y, current_x, gradient, 0.0000005, 0.1)
        current_error = prediction_error(Theta, Y, current_x)
        delta = last_error - current_error
        last_error = current_error
    return current_x
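# A minimal smoke test (our own sketch, not part of the assignment scaffolding). It uses the
# hypothetical numpy-free generate_parameters_purepython defined above; if you uncomment the
# numpy block instead, swap in generate_parameters. The sizes, eta, and zero noise are
# arbitrary choices: with error=0 the data are exactly linear, so the minibatch stopping
# condition (x no longer moving between passes) is reachable. Per the docstring at the top,
# a good solution should have a low mean_square_diff with true_x.
if __name__ == "__main__":
    Theta, Y, true_x, initial_x = generate_parameters_purepython(200, 5, 0)
    x_gd = gradient_descent(Theta, Y, initial_x[:], 0.00001)
    x_mb = minibatch_gradient_descent(Theta, Y, initial_x[:], 0.00001)
    x_ls = gradient_descent_complete(Theta, Y, initial_x[:])
    print("gradient descent    : " + str(mean_square_diff(x_gd, true_x)))
    print("minibatch descent   : " + str(mean_square_diff(x_mb, true_x)))
    print("line-search descent : " + str(mean_square_diff(x_ls, true_x)))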