# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

# load in the data
import os
if not os.path.exists('user2movie.json') or \
   not os.path.exists('movie2user.json') or \
   not os.path.exists('usermovie2rating.json') or \
   not os.path.exists('usermovie2rating_test.json'):
  import preprocess2dict


# note: despite the .json extension, these files were saved with pickle,
# so we load them with pickle as well
with open('user2movie.json', 'rb') as f:
  user2movie = pickle.load(f)

with open('movie2user.json', 'rb') as f:
  movie2user = pickle.load(f)

with open('usermovie2rating.json', 'rb') as f:
  usermovie2rating = pickle.load(f)

with open('usermovie2rating_test.json', 'rb') as f:
  usermovie2rating_test = pickle.load(f)

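# data structures (as built by preprocess2dict):
#   user2movie: user_id -> list of movie_ids that user rated
#   movie2user: movie_id -> list of user_ids who rated that movie
#   usermovie2rating: (user_id, movie_id) -> rating (train)
#   usermovie2rating_test: (user_id, movie_id) -> rating (test)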

N = np.max(list(user2movie.keys())) + 1
# the test set may contain movies the train set doesn't have data on
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for (u, m), r in usermovie2rating_test.items()])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

if M > 2000:
  print("M =", M, "are you sure you want to continue?")
  print("Comment out these lines if so...")
  exit()


# to find the item similarities, you have to do O(M^2 * N) calculations!
# in the "real-world" you'd want to parallelize this
# note: we really only have to do half the calculations, since w_ij is symmetric
K = 20 # number of neighbors we'd like to consider
limit = 5 # number of users two movies must have in common in order to be considered neighbors
neighbors = [] # store each item's K nearest neighbors in this list
averages = [] # each item's average rating for later use
deviations = [] # each item's deviation for later use

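# for each movie i:
#   1. center its ratings by subtracting the movie's average (dev_i)
#   2. for every other movie j sharing more than `limit` common users,
#      compute a correlation-style weight w_ij from the dot product of the
#      two movies' deviations over the common users, normalized by the norms
#      of their full deviation vectors
#   3. keep only the K most similar movies in a SortedList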
for i in range(M):
  # find the K closest items to item i
  users_i = movie2user[i]
  users_i_set = set(users_i)

  # calculate avg and deviation
  ratings_i = { user:usermovie2rating[(user, i)] for user in users_i }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { user:(rating - avg_i) for user, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))
  sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

  # save these for later use
  averages.append(avg_i)
  deviations.append(dev_i)

  sl = SortedList()
  for j in range(M):
    # don't include yourself
    if j != i:
      users_j = movie2user[j]
      users_j_set = set(users_j)
      common_users = (users_i_set & users_j_set) # intersection
      if len(common_users) > limit:
        # calculate avg and deviation
        ratings_j = { user:usermovie2rating[(user, j)] for user in users_j }
        avg_j = np.mean(list(ratings_j.values()))
        dev_j = { user:(rating - avg_j) for user, rating in ratings_j.items() }
        dev_j_values = np.array(list(dev_j.values()))
        sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

        # calculate the correlation coefficient between movie i and movie j
        numerator = sum(dev_i[u]*dev_j[u] for u in common_users)
        w_ij = numerator / (sigma_i * sigma_j)

        # insert into sorted list and truncate
        # negate weight, because list is sorted ascending
        # maximum value (1) is "closest"
        sl.add((-w_ij, j))
        if len(sl) > K:
          del sl[-1]

  # store the neighbors
  neighbors.append(sl)

  # print progress
  print(i)



# using neighbors, calculate train and test MSE

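# predicted rating of user u for movie i:
#   prediction = avg_i + sum_j( w_ij * dev_j[u] ) / sum_j( |w_ij| )
# where j ranges over the stored neighbors of movie i that user u has rated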
def predict(i, u):
  # calculate the weighted sum of deviations
  numerator = 0
  denominator = 0
  for neg_w, j in neighbors[i]:
    # remember, the weight is stored as its negative
    # so the negative of the negative weight is the positive weight
    try:
      numerator += -neg_w * deviations[j][u]
      denominator += abs(neg_w)
    except KeyError:
      # the neighbor movie j may not have been rated by user u
      # we don't want to do the dictionary lookup twice,
      # so we just catch the exception and skip this neighbor
      pass

  if denominator == 0:
    prediction = averages[i]
  else:
    prediction = numerator / denominator + averages[i]
  prediction = min(5, prediction) # max rating is 5
  prediction = max(0.5, prediction) # min rating is 0.5
  return prediction


train_predictions = []
train_targets = []
for (u, m), target in usermovie2rating.items():
  # calculate the prediction for this user-movie pair
  prediction = predict(m, u)

  # save the prediction and target
  train_predictions.append(prediction)
  train_targets.append(target)

test_predictions = []
test_targets = []
# same thing for the test set
for (u, m), target in usermovie2rating_test.items():
  # calculate the prediction for this user-movie pair
  prediction = predict(m, u)

  # save the prediction and target
  test_predictions.append(prediction)
  test_targets.append(target)


# calculate the mean squared error
def mse(p, t):
  p = np.array(p)
  t = np.array(t)
  return np.mean((p - t)**2)

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))