-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNeuralNetwork.py
223 lines (164 loc) · 8.19 KB
/
NeuralNetwork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 18 12:14:35 2015
@author: Pavitrakumar
"""
from __future__ import division
import numpy as np
from scipy import optimize
"""
A simple neural netowork with 3 layers.
No. of units in Input layer - taken from dataset (X)
No. of units in Hidden layer - given as user input
No. of units in Output layer - taken from dataset(y)
So, there will be 2 theta matrices
Theta1 for input->hidden
Theta2 for hidden->output
X is the features matrix - the features
y is the response vector -the output
"""
"""
Size of Theta1 is no.of hiddenlayers(in hidden layer1) x no. of input layers
i.e simply the [A]x[B] where A on the left side corresponds to the no.of units
in the layer on the left, similarly B on the right side(of the dim) corrensponds to
the no.of units in the layer on the right side
So, our Theta1's dim is hidden_layer_size x input_layer_size + 1
Similarly the Theta2 it is output_layer_size x hidden_layer_size + 1
NOTE: +1 is for the bias term
"""
"""
Now, we implement the feedforward function to get the cost
cost is h(x) the hypothesis calculated by feeding the input through the
network using the respective weights.
After feed-forward we back propogate to find the delta values,
using this, we find the theta gradients which will be the input to an optimization algorithm
[optimization algorithm used is from scikit]
"""
def sigmoid(z):
g = 1.0 / (1.0 + np.exp(-z))
return g
def gradient_of_sigmoid(z):
sig = sigmoid(z)
g = sig * (1 - sig)
return g
def pack_thetas(Theta1, Theta2):
#reshaping a matrix here return a vector of size (no. of rows x no. of columns)
#i.e each row is appended one after the other to make a huge vector from a matirx
return np.concatenate((Theta1.reshape(-1), Theta2.reshape(-1)))
def unpack_thetas(Theta_combi, input_layer_size, hidden_layer_size, output_layer_size):
Theta1_start = 0
Theta1_end = hidden_layer_size * (input_layer_size + 1)
#Using the total size (which we already know because of the dimensions of the matirx)
#We can restore the vector into a matrix by simply reshaping it as per its dimensions.
Theta1 = Theta_combi[Theta1_start:Theta1_end].reshape((hidden_layer_size, input_layer_size + 1))
Theta2 = Theta_combi[Theta1_end:].reshape((output_layer_size, hidden_layer_size + 1))
return Theta1, Theta2
def rand_init(l_in, l_out,e):
#randomly initialize the weights for theta matrices in the range [-e,e]
return np.random.rand(l_out, l_in) * 2 * e - e
def feed_forward_back_prop(Theta1,Theta2,input_layer_size,hidden_layer_size,output_layer_size,X,y,lam,returnJ = False):
J = 0
print "Shape of y is: " , np.shape(y),"\n"
print "Shape of X is: " , np.shape(X),"\n"
print "output layer size is: " , output_layer_size
Theta1_gradient = np.zeros(np.shape(Theta1))
Theta2_gradient = np.zeros(np.shape(Theta2))
#for a multi-layer implementation, take the theta values in a single vector and unroll them
no_training_samples = np.shape(X)[0]
#since its a classification problem, we build a new Y matrix(binary) from vector of Y
#i.e it now has output_layer_size columns - each column represents the probability of that class being the result(or output)
new_Y = np.eye(output_layer_size)[y]
#adding a vector of ones to the X matrix(as the first column) - bias terms for each training example
X = np.insert(X,0,1,axis=1)
print "head of X is:\n"
print X[1:5,:]
#Start the feed-forward steps
a1 = X
#a1 = [no. of training examples] x [input_layer_size + 1]
#theta1 = [hidden_layer_size] x [input_layer_size + 1]
#theta1 transpose = [input_layer_size + 1] x [hidden_layer_size]
z2 = np.dot(a1,Theta1.T)
#z2 = [no. of training examples] x [hidden_layer_size]
a2 = sigmoid(z2)
#For each hidden layer(if it exists), this process continues
#Now, a2 is the new treated as a new X and a similar process to above is carried out
a2 = np.insert(a2,0,1,axis=1)
z3 = np.dot(a2,Theta2.T)
a3 = sigmoid(z3)
#Now we compute the cost function J(theta)
#f = positive cost - negative cost
f = np.sum(np.multiply(-new_Y,np.log(a3))-np.multiply((1-new_Y),np.log(1-a3)),axis = 0) # columnwise sum
f = (1.0/no_training_samples)*f
f = sum(f)
J = f
print "J is: ",J
reg_term = (lam/(2.0*no_training_samples))*(np.sum(np.sum(np.power(Theta1[:,1:],2),axis=0))+np.sum(np.sum(np.power(Theta2[:,1:],2),axis=0)))
print "Regularization term is: ",reg_term
J = J + reg_term
print "J after regularization: ",J,"\n"
if returnJ is True:
return J
#Now we need to calculate the gradients by backpropagating
#for layer 3 (output layer)
s_delta_3 = a3-new_Y
#we are splicing theta2 to get rid of the bias terms
#for layer 2 (hidden layer)
s_delta_2 = np.multiply(np.dot(s_delta_3,Theta2[:,1:]),gradient_of_sigmoid(z2))
#nothing for input layer because we dont aassociate error terms with input layer.
#now,for accumulating the s_deltas, we use delta
delta_2 = np.dot(a2.T,s_delta_3)
delta_1 = np.dot(a1.T,s_delta_2)
Theta1_gradient = (1/no_training_samples)*delta_1.T
Theta2_gradient = (1/no_training_samples)*delta_2.T
#Regularizing the theta terms
reg_term_T1 = (lam/no_training_samples)*Theta1[:,1:]
reg_term_T2 = (lam/no_training_samples)*Theta2[:,1:]
Theta1_gradient[:,1:] += reg_term_T1
Theta2_gradient[:,1:] += reg_term_T2
return Theta1_gradient,Theta2_gradient
def returnsJ(Theta_combi,input_layer_size,hidden_layer_size,output_layer_size,X,y,lam):
Theta1, Theta2 = unpack_thetas(Theta_combi, input_layer_size, hidden_layer_size, output_layer_size)
J = feed_forward_back_prop(Theta1,Theta2,input_layer_size,hidden_layer_size,output_layer_size,X,y,lam,True)
return J
def returnThetaGrad(Theta_combi,input_layer_size,hidden_layer_size,output_layer_size,X,y,lam):
Theta1, Theta2 = unpack_thetas(Theta_combi, input_layer_size, hidden_layer_size, output_layer_size)
Theta1_grad, Theta2_grad = feed_forward_back_prop(Theta1,Theta2,input_layer_size,hidden_layer_size,output_layer_size,X,y,lam)
return pack_thetas(Theta1_grad,Theta2_grad)
def fit(X,y,maxiter = 750,method = 'TNC',hidden_layer_size = 25,lam = 0):
input_layer_size = np.shape(X)[1] #dependant on dataset
#hidden_layer_size = 50 # user input
output_layer_size = len(set(y))
Theta1 = rand_init(hidden_layer_size, input_layer_size + 1,0.12)
Theta2 = rand_init(output_layer_size, hidden_layer_size + 1,0.12)
print "dim of theta1: ",Theta1.shape
print "dim of theta2: ",Theta2.shape
Theta_combi = pack_thetas(Theta1,Theta2)
#Optimization algorithms which returs the optimized values of theta given the gradients and cost function result
options = {'maxiter': maxiter}
_res = optimize.minimize(returnsJ, Theta_combi, jac=returnThetaGrad, method=method,args=(input_layer_size, hidden_layer_size, output_layer_size, X, y,lam), options=options)
t1, t2 = unpack_thetas(_res.x, input_layer_size, hidden_layer_size, output_layer_size)
return t1,t2
def predict(t1,t2,X):
#here, we just perform a feed-forward step
h1 = sigmoid(np.dot(np.insert(X,0,1,axis=1),t1.T))
h2 = sigmoid(np.dot(np.insert(h1,0,1,axis=1),t2.T))
return h2
"""
testing using IRIS data set
"""
import sklearn.datasets as datasets
from sklearn import cross_validation
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)
#The optimized theta values after training using the training data
t1,t2 = fit(X_train,y_train)
#Predicting using the optimized theta values
pred = predict(t1,t2,X_test)
#predict function returns the matrix of probability of each class for each test data point
pred = pred.argmax(1)
#pred matrix is convertex to a vector where each element represents the index of maximum probability in the row
#i.e it now holds the vector of classes where each element represents the classification result of a test data point
from sklearn.metrics import accuracy_score
print "Prediction accuracy: ",accuracy_score(y_test,pred)