# RNN_v1.py
import numpy as np
import pylab as pl
from tensorflow.examples.tutorials.mnist import input_data # tf is used just for MNIST dataset
"""
Single hidden layer recurent neural network. Each image of MNIST data set is interpreted as
sequence of X mini-frames (time_steps). e.g if image width is 28 pixels, it could be split into
4 subsequent mini-frames (time_steps) of 7 pixels in width. In such case the net would cycle
each mini-frame of single data point for 4 times before giving the final answer.
"""
def activation(z):
    return z * (z > 0)  # ReLU; if you change this one, be sure to change activation_derivative as well
def activation_derivative(z):
    return 1 * (z > 0)
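# The activation above is ReLU, max(0, z); its (sub)gradient is 1 for positive inputs and 0
# otherwise. A quick illustrative check (made-up values):
# >>> activation(np.array([-2.0, 3.0]))             # -> array([0., 3.])
# >>> activation_derivative(np.array([-2.0, 3.0]))  # -> array([0, 1])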
def softmax(z):
    e = np.exp(z - np.max(z, axis=1, keepdims=True))  # shift by the row max for numerical stability
    return e / np.sum(e, axis=1, keepdims=True)  # this activation is applied to the final layer
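# Softmax maps each row of logits to a probability distribution, so every row sums to 1.
# Illustrative check (made-up logits):
# >>> p = softmax(np.array([[1.0, 2.0, 3.0]]))
# >>> np.round(p, 3)  # -> array([[0.09 , 0.245, 0.665]]); p.sum(axis=1) -> array([1.])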
def forward_prop(x, y, w1, w2, rw1):
    """
    Forward-propagates the input (x) through the weights (w1, w2, rw1); the labels (y)
    are only used to form the last-layer errors.
    Returns a tuple of:
        hidden state values (a1_values),
        last-layer values (a2_values),
        errors of the last layer (a2_deltas),
        and the net's output values (a2_values[-1]).
    """
    _, time_steps, _ = np.shape(x)
    a1_values = [np.zeros((len(x), len(w1.T)))]  # initial hidden state is all zeros
    a2_values = []
    a2_deltas = []
    for time_step in range(time_steps):
        a1 = activation(np.dot(x[:, time_step, :], w1) + np.dot(a1_values[-1], rw1))
        a2 = softmax(np.dot(a1, w2))
        a2_deltas.append(a2 - y)
        a1_values.append(a1)
        a2_values.append(a2)
    return a1_values, a2_values, a2_deltas, a2_values[-1]
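# In equation form, each time step computes (shapes shown for the defaults used below:
# batch_size=200, 196 inputs, 50 hidden units, 10 classes):
#   a1_t = relu(x_t @ w1 + a1_{t-1} @ rw1)  # (200, 196) @ (196, 50) + (200, 50) @ (50, 50) -> (200, 50)
#   a2_t = softmax(a1_t @ w2)               # (200, 50) @ (50, 10) -> (200, 10)
# Only a2 at the final time step is used as the prediction, but the per-step errors (a2 - y)
# are all kept so that backward_prop can inject the output error at every time step.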
def backward_prop(x, w1, w2, rw1, a1_values, a2_deltas):
    """
    Performs backpropagation through time using the inputs (x), weights (w1, w2, rw1),
    hidden state values (a1_values) and last-layer errors (a2_deltas).
    Hidden state values and last-layer errors are computed during forward prop.
    Returns a tuple of gradients (dw1, dw2, drw1).
    Credits: https://github.com/km1414/Deep-stuff/blob/master/Neural%20networks/RNN_np.py
    I've modified parts of it.
    """
    data_points, time_steps, _ = np.shape(x)
    next_a1_delta = np.zeros_like(a1_values[-1])  # no error flows in from beyond the last time step
    dw2 = 0
    dw1 = 0
    drw1 = 0
    for time_step in reversed(range(time_steps)):
        a0 = x[:, time_step, :]
        a1_previous = a1_values[time_step]
        a1 = a1_values[time_step + 1]
        a2_delta = a2_deltas[time_step]
        a1_delta = (np.dot(next_a1_delta, rw1.T) + np.dot(a2_delta, w2.T)) * activation_derivative(a1)
        dw1 += np.dot(a0.T, a1_delta) / data_points
        dw2 += np.dot(a1.T, a2_delta) / data_points
        drw1 += np.dot(a1_previous.T, a1_delta) / data_points
        next_a1_delta = a1_delta
    return dw1, dw2, drw1
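# This is backpropagation through time (BPTT): the hidden-state error at step t combines the
# error arriving from step t+1 through rw1 with the output error of step t through w2, gated
# by the ReLU derivative. Gradients are accumulated (+=) across all time steps because the
# same three weight matrices are reused at every step.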
def get_cost(y_, y):
    # element-wise binary cross-entropy, averaged over the batch
    return np.sum(-y * np.log(y_) - (1 - y) * np.log(1 - y_)) / len(y_)
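# Note: this treats each of the 10 softmax outputs as an independent binary target rather
# than using the usual categorical cross-entropy, which for one-hot labels would be
# np.sum(-y * np.log(y_)) / len(y_). Either way, the value is only used for reporting;
# the gradients come directly from the per-step errors (a2 - y) in forward_prop.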
def get_accuracy(y_, y):
    y_ = np.argmax(y_, axis=1)
    y = np.argmax(y, axis=1)
    return np.mean(np.equal(y_, y))
def one_adam_step(parameter, gradient, cache, beta1=0.9, beta2=0.999, alpha=0.001, epsilon=1e-8):
    m, v, t = cache
    # Adam optimization as formulated in the paper (Algorithm 1, page 2):
    # https://arxiv.org/pdf/1412.6980.pdf
    t += 1
    m = beta1 * m + (1 - beta1) * gradient       # update biased first moment estimate
    v = beta2 * v + (1 - beta2) * gradient ** 2  # update biased second raw moment estimate
    m_ = m / (1 - beta1 ** t)                    # compute bias-corrected first moment estimate
    v_ = v / (1 - beta2 ** t)                    # compute bias-corrected second raw moment estimate
    updated_parameter = parameter - alpha * m_ / (v_ ** 0.5 + epsilon)  # update parameters
    return updated_parameter, (m, v, t)
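# Usage sketch: each parameter keeps its own cache, threaded through successive calls, e.g.
# >>> w, w_cache = np.zeros((2, 2)), (0, 0, 0)
# >>> w, w_cache = one_adam_step(parameter=w, gradient=np.ones((2, 2)), cache=w_cache)
# Starting the cache at (0, 0, 0) works because NumPy broadcasts the scalar moments
# against the first gradient array.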
def get_data_batch(x, y, batch_size):
    # picks a contiguous slice of batch_size examples starting at a random offset
    start = np.random.choice(len(y) - (batch_size + 1))
    x_batch = x[start:start + batch_size, :, :]
    y_batch = y[start:start + batch_size, :]
    return x_batch, y_batch
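# Because the slice is contiguous, examples within a batch are not independent draws.
# A drop-in alternative (a sketch, not the original author's choice) would sample indices
# without replacement:
# >>> idx = np.random.choice(len(y), size=batch_size, replace=False)
# >>> x_batch, y_batch = x[idx], y[idx]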
# hyper-parameters
time_steps = 4 # an image is separated into time_steps parts for recurrence
batch_size = 200
# load data (the tensorflow.examples.tutorials module ships with TF 1.x; it was removed in TF 2)
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x_train = mnist.train.images.reshape(-1, time_steps, 784//time_steps) # shape=(data_points, time_steps, pixels_per_step)
y_train = mnist.train.labels
x_test = mnist.validation.images.reshape(-1, time_steps, 784//time_steps)
y_test = mnist.validation.labels
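# With the TF tutorial splits, the resulting shapes should be roughly:
#   x_train: (55000, 4, 196), y_train: (55000, 10)
#   x_test:  (5000, 4, 196),  y_test:  (5000, 10)  (the validation split serves as the test set here)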
# reset training
w1 = np.random.uniform(-0.1, 0.1, (784//time_steps, 50))
w2 = np.random.uniform(-0.1, 0.1, (50, 10))
rw1 = np.random.uniform(-0.1, 0.1, (50, 50))
w1_cache, w2_cache, rw1_cache = (0, 0, 0), (0, 0, 0), (0, 0, 0) # Adam caches: (m, v, t)
accuracy_hist = []
# training loop
for i in range(100):
    # sample a training batch
    x_batch, y_batch = get_data_batch(x_train, y_train, batch_size)
    # forward and backward prop
    a1_values, a2_values, a2_deltas, y_ = forward_prop(x_batch, y_batch, w1, w2, rw1)
    dw1, dw2, drw1 = backward_prop(x_batch, w1, w2, rw1, a1_values, a2_deltas)
    # update parameters
    w1, w1_cache = one_adam_step(parameter=w1, gradient=dw1, cache=w1_cache)
    w2, w2_cache = one_adam_step(parameter=w2, gradient=dw2, cache=w2_cache)
    rw1, rw1_cache = one_adam_step(parameter=rw1, gradient=drw1, cache=rw1_cache)
    # plot and print
    accuracy_hist.append(get_accuracy(forward_prop(x_test, y_test, w1, w2, rw1)[3], y_test))
    pl.plot(accuracy_hist)
    pl.pause(1e-10)
    print(get_cost(y_, y_batch), get_accuracy(y_, y_batch))
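# Optionally keep the final accuracy plot open once training finishes
# (assumes an interactive matplotlib backend).
pl.show()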