# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import pandas as pd
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from datetime import datetime


# is it possible to one-hot encode the data prior to feeding it
# into the neural network, so that we don't have to do it on the fly?
# yes, but:
# 1) scipy sparse doesn't support N-D matrices
# 2) you can use the 'sparse' standalone package, but it takes very long
#    and you will run out of RAM
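# (rough arithmetic, assuming a modest dataset of ~100,000 users x 2,000 movies
#  with K = 10 rating levels: a dense float32 one-hot tensor would already be
#  100,000 * 2,000 * 10 * 4 bytes ~= 8 GB, hence the per-batch encoding below)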


def one_hot_encode(X, K):
    # input is N x D
    # output is N x D x K
    N, D = X.shape
    Y = np.zeros((N, D, K))
    for n, d in zip(*X.nonzero()):
        # 0.5...5 --> 1..10 --> 0..9
        k = int(X[n,d]*2 - 1)
        Y[n,d,k] = 1
    return Y
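# example (hypothetical input): one_hot_encode(np.array([[3.0, 0.5]]), 10)
# returns a 1 x 2 x 10 array with Y[0,0,5] = 1 and Y[0,1,0] = 1;
# an entry of exactly 0 (missing rating) stays all zeros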

def one_hot_mask(X, K):
    # input is N x D
    # output is N x D x K
    N, D = X.shape
    Y = np.zeros((N, D, K))
    # if X[n,d] == 0, there's a missing rating
    # so the mask should be all zeros
    # else, it should be all ones
    for n, d in zip(*X.nonzero()):
        Y[n,d,:] = 1
    return Y

# ratings go 0.5, 1.0, ..., 5.0, i.e. K = 10 possible values
K = 10
one_to_ten = np.arange(K) + 1  # [1, 2, 3, ..., 10]

def convert_probs_to_ratings(probs):
    # probs is N x D x K
    # output is N x D matrix of predicted ratings
    # N, D, K = probs.shape
    # out = np.zeros((N, D))
    # each predicted rating is a weighted average using the probabilities
    # for n in range(N):
    #     for d in range(D):
    #         out[n,d] = probs[n,d].dot(one_to_ten) / 2
    # return out
    return probs.dot(one_to_ten) / 2
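# example: if probs[n,d] puts all its mass on index 9 (the "10" bucket),
# the predicted rating is 10 / 2 = 5.0; spread-out probabilities give an
# expected rating somewhere in between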


def dot1(V, W):
    # V is N x D x K (batch of visible units)
    # W is D x K x M (weights)
    # returns N x M (hidden layer size)
    return tf.tensordot(V, W, axes=[[1,2], [0,1]])

def dot2(H, W):
    # H is N x M (batch of hiddens)
    # W is D x K x M (weights transposed)
    # returns N x D x K (visible)
    return tf.tensordot(H, W, axes=[[1], [2]])
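# equivalently, the same contractions could be written with einsum:
#   dot1(V, W) == tf.einsum('ndk,dkm->nm', V, W)
#   dot2(H, W) == tf.einsum('nm,dkm->ndk', H, W)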


class RBM(object):
    def __init__(self, D, M, K):
        self.D = D  # input feature size
        self.M = M  # hidden size
        self.K = K  # number of ratings
        self.build(D, M, K)

    def build(self, D, M, K):
        # params
        self.W = tf.Variable(tf.random_normal(shape=(D, K, M)) * np.sqrt(2.0 / M))
        self.c = tf.Variable(np.zeros(M).astype(np.float32))
        self.b = tf.Variable(np.zeros((D, K)).astype(np.float32))

        # data
        self.X_in = tf.placeholder(tf.float32, shape=(None, D, K))
        self.mask = tf.placeholder(tf.float32, shape=(None, D, K))

        # conditional probabilities
        # NOTE: tf.contrib.distributions.Bernoulli API has changed in Tensorflow v1.2
        V = self.X_in
        p_h_given_v = tf.nn.sigmoid(dot1(V, self.W) + self.c)
        self.p_h_given_v = p_h_given_v  # save for later

        # draw a sample from p(h | v)
        r = tf.random_uniform(shape=tf.shape(p_h_given_v))
        H = tf.to_float(r < p_h_given_v)

        # draw a sample from p(v | h)
        # note: we don't have to actually do the softmax
        logits = dot2(H, self.W) + self.b
        cdist = tf.distributions.Categorical(logits=logits)
        X_sample = cdist.sample()  # shape is (N, D)
        X_sample = tf.one_hot(X_sample, depth=self.K)  # turn it into (N, D, K)
        X_sample = X_sample * self.mask  # missing ratings shouldn't contribute to objective
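        # NOTE: V -> H -> X_sample above is one step of Gibbs sampling, so the
        # objective below is (approximate) contrastive divergence (CD-1):
        # the free-energy gap between the data and its one-step reconstruction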

        # build the objective
        objective = tf.reduce_mean(self.free_energy(self.X_in)) - tf.reduce_mean(self.free_energy(X_sample))
        self.train_op = tf.train.AdamOptimizer(1e-2).minimize(objective)
        # self.train_op = tf.train.GradientDescentOptimizer(1e-3).minimize(objective)

        # build the cost
        # we won't use this to optimize the model parameters
        # just to observe what happens during training
        logits = self.forward_logits(self.X_in)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=self.X_in,
                logits=logits,
            )
        )

        # to get the output
        self.output_visible = self.forward_output(self.X_in)

        initop = tf.global_variables_initializer()
        self.session = tf.Session()
        self.session.run(initop)

    def fit(self, X, mask, X_test, mask_test, epochs=10, batch_sz=256, show_fig=True):
        N, D = X.shape
        n_batches = N // batch_sz

        costs = []
        test_costs = []
        for i in range(epochs):
            t0 = datetime.now()
            print("epoch:", i)
            X, mask, X_test, mask_test = shuffle(X, mask, X_test, mask_test)  # everything has to be shuffled accordingly
            for j in range(n_batches):
                x = X[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
                m = mask[j*batch_sz:(j*batch_sz + batch_sz)].toarray()

                # both visible units and mask have to be in one-hot form
                # N x D --> N x D x K
                batch_one_hot = one_hot_encode(x, self.K)
                m = one_hot_mask(m, self.K)

                _, c = self.session.run(
                    (self.train_op, self.cost),
                    feed_dict={self.X_in: batch_one_hot, self.mask: m}
                )

                if j % 100 == 0:
                    print("j / n_batches:", j, "/", n_batches, "cost:", c)
            print("duration:", datetime.now() - t0)

            # calculate the true train and test cost
            t0 = datetime.now()
            sse = 0
            test_sse = 0
            n = 0
            test_n = 0
            for j in range(n_batches):
                x = X[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
                m = mask[j*batch_sz:(j*batch_sz + batch_sz)].toarray()

                # only visible input has to be in one-hot form
                xoh = one_hot_encode(x, self.K)

                probs = self.get_visible(xoh)
                xhat = convert_probs_to_ratings(probs)
                sse += (m * (xhat - x)*(xhat - x)).sum()
                n += m.sum()

                # the test PREDICTIONS come from the train data!
                # X_test and mask_test are only used for targets
                xt = X_test[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
                mt = mask_test[j*batch_sz:(j*batch_sz + batch_sz)].toarray()

                test_sse += (mt * (xhat - xt) * (xhat - xt)).sum()
                test_n += mt.sum()
            c = sse/n
            ct = test_sse/test_n
            print("train mse:", c)
            print("test mse:", ct)
            print("calculate cost duration:", datetime.now() - t0)
            costs.append(c)
            test_costs.append(ct)
        if show_fig:
            plt.plot(costs, label='train mse')
            plt.plot(test_costs, label='test mse')
            plt.legend()
            plt.show()

    def free_energy(self, V):
        # per-sample free energy: F(v) = -b.v - sum_j log(1 + exp(w_j.v + c_j))
        first_term = -dot1(V, self.b)  # shape (N,)
        second_term = -tf.reduce_sum(
            # tf.log(1 + tf.exp(dot1(V, self.W) + self.c)),
            tf.nn.softplus(dot1(V, self.W) + self.c),
            axis=1
        )
        return first_term + second_term

    def forward_hidden(self, X):
        return tf.nn.sigmoid(dot1(X, self.W) + self.c)

    def forward_logits(self, X):
        Z = self.forward_hidden(X)
        return dot2(Z, self.W) + self.b

    def forward_output(self, X):
        return tf.nn.softmax(self.forward_logits(X))

    def transform(self, X):
        # accepts and returns a real numpy array
        # unlike forward_hidden and forward_output
        # which deal with tensorflow variables
        return self.session.run(self.p_h_given_v, feed_dict={self.X_in: X})

    def get_visible(self, X):
        return self.session.run(self.output_visible, feed_dict={self.X_in: X})


def main():
    A = load_npz("Atrain.npz")
    A_test = load_npz("Atest.npz")
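    # note: Atrain.npz / Atest.npz are assumed to be sparse user-by-movie rating
    # matrices (0 = missing rating) saved with scipy.sparse.save_npz by a
    # separate preprocessing step; they are not created by this script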
    mask = (A > 0) * 1.0
    mask_test = (A_test > 0) * 1.0

    N, M = A.shape
    rbm = RBM(M, 50, 10)
    rbm.fit(A, mask, A_test, mask_test)


if __name__ == '__main__':
    main()