
Commit 994c7fc
recommenders

1 parent 5badbf3 commit 994c7fc
13 files changed, +1353 -0 lines changed

recommenders/autorec.py (+126 lines)
@@ -0,0 +1,126 @@
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from scipy.sparse import save_npz, load_npz

import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

# config
batch_size = 128
epochs = 20
reg = 0.0001
# reg = 0

# load the sparse user-by-movie rating matrices
A = load_npz("Atrain.npz")
A_test = load_npz("Atest.npz")
mask = (A > 0) * 1.0          # 1 where a rating exists, 0 where missing
mask_test = (A_test > 0) * 1.0

# make copies since we will shuffle
A_copy = A.copy()
mask_copy = mask.copy()
A_test_copy = A_test.copy()
mask_test_copy = mask_test.copy()

N, M = A.shape
print("N:", N, "M:", M)
print("N // batch_size:", N // batch_size)

# center the data: mu is the global mean of the observed ratings only
mu = A.sum() / mask.sum()
print("mu:", mu)


# build the model - just a 1 hidden layer autoencoder
i = Input(shape=(M,))
# bigger hidden layer size seems to help!
x = Dropout(0.7)(i)
x = Dense(700, activation='tanh', kernel_regularizer=l2(reg))(x)
# x = Dropout(0.5)(x)
x = Dense(M, kernel_regularizer=l2(reg))(x)


def custom_loss(y_true, y_pred):
  # only score entries that were actually rated (nonzero after centering)
  mask = K.cast(K.not_equal(y_true, 0), dtype='float32')
  diff = y_pred - y_true
  sqdiff = diff * diff * mask
  sse = K.sum(K.sum(sqdiff))
  n = K.sum(K.sum(mask))
  return sse / n


def generator(A, M):
  # A is the rating matrix, M is the corresponding mask (shadows the global M)
  while True:
    A, M = shuffle(A, M)
    for i in range(A.shape[0] // batch_size + 1):
      upper = min((i+1)*batch_size, A.shape[0])
      a = A[i*batch_size:upper].toarray()
      m = M[i*batch_size:upper].toarray()
      a = a - mu * m  # center only the rated entries; must keep zeros at zero!
      # m2 = (np.random.random(a.shape) > 0.5)
      # noisy = a * m2
      noisy = a  # no noise
      yield noisy, a


def test_generator(A, M, A_test, M_test):
  # assumes A and A_test are in corresponding order
  # both of size N x M
  while True:
    for i in range(A.shape[0] // batch_size + 1):
      upper = min((i+1)*batch_size, A.shape[0])
      a = A[i*batch_size:upper].toarray()
      m = M[i*batch_size:upper].toarray()
      at = A_test[i*batch_size:upper].toarray()
      mt = M_test[i*batch_size:upper].toarray()
      a = a - mu * m
      at = at - mu * mt
      yield a, at


model = Model(i, x)
model.compile(
  loss=custom_loss,
  optimizer=SGD(lr=0.08, momentum=0.9),
  # optimizer='adam',
  metrics=[custom_loss],
)


r = model.fit_generator(
  generator(A, mask),
  validation_data=test_generator(A_copy, mask_copy, A_test_copy, mask_test_copy),
  epochs=epochs,
  steps_per_epoch=A.shape[0] // batch_size + 1,
  validation_steps=A_test.shape[0] // batch_size + 1,
)
print(r.history.keys())


# plot losses
plt.plot(r.history['loss'], label="train loss")
plt.plot(r.history['val_loss'], label="test loss")
plt.legend()
plt.show()

# plot mse
plt.plot(r.history['custom_loss'], label="train mse")
plt.plot(r.history['val_custom_loss'], label="test mse")
plt.legend()
plt.show()
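
The script expects Atrain.npz and Atest.npz to already exist; the preprocessing script that builds them is not part of this commit. A minimal sketch of producing compatible input, assuming a hypothetical ratings table with user_id, movie_id, and rating columns (names invented for illustration):

import pandas as pd
from scipy.sparse import lil_matrix, save_npz

# hypothetical toy ratings; the real data would come from a preprocessing step
df = pd.DataFrame({
  'user_id': [0, 0, 1, 2],
  'movie_id': [0, 1, 1, 2],
  'rating': [4.0, 2.0, 5.0, 3.5],
})

N = df['user_id'].max() + 1
M = df['movie_id'].max() + 1

A = lil_matrix((N, M))  # lil_matrix supports cheap incremental assignment
for row in df.itertuples():
  A[row.user_id, row.movie_id] = row.rating

save_npz("Atrain.npz", A.tocsr())  # load_npz("Atrain.npz") recovers the CSR matrix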
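
In custom_loss, zeros in y_true are treated as missing rather than as true ratings of 0; that is why the generator centers only the rated entries (a - mu * m). One consequence worth knowing: a rating exactly equal to mu becomes 0 after centering and is silently masked out too. A minimal NumPy sketch of the same masked-MSE computation, on invented toy numbers:

import numpy as np

# toy batch: 0 means "not rated" and must not contribute to the loss
y_true = np.array([[4.0, 0.0, 2.0],
                   [0.0, 5.0, 0.0]])
y_pred = np.array([[3.5, 1.0, 2.5],
                   [0.2, 4.0, 1.0]])

mask = (y_true != 0).astype(np.float32)  # 1 where a rating exists
sqdiff = (y_pred - y_true)**2 * mask     # squared error on rated entries only
print(sqdiff.sum() / mask.sum())         # (0.25 + 0.25 + 1.0) / 3 = 0.5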

recommenders/extra_reading.txt (+53 lines)
@@ -0,0 +1,53 @@
How Hacker News ranking really works: scoring, controversy, and penalties
http://www.righto.com/2013/11/how-hacker-news-ranking-really-works.html

The Evolution Of Hacker News
https://techcrunch.com/2013/05/18/the-evolution-of-hacker-news/

Reddit sorting code
https://github.com/reddit-archive/reddit/blob/master/r2/r2/lib/db/_sorts.pyx

Revealed: US spy operation that manipulates social media
https://www.theguardian.com/technology/2011/mar/17/us-spy-operation-social-networks

Learning to rank
https://en.wikipedia.org/wiki/Learning_to_rank#Evaluation_measures

How Not To Sort By Average Rating
https://www.evanmiller.org/how-not-to-sort-by-average-rating.html

Wilson score interval
https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval

reddit’s new comment sorting system
https://redditblog.com/2009/10/15/reddits-new-comment-sorting-system/

Markov Chains Explained Visually
http://setosa.io/ev/markov-chains/

An algorithmic framework for performing collaborative filtering
https://dl.acm.org/citation.cfm?id=312682

Item-based collaborative filtering recommendation algorithms
https://dl.acm.org/citation.cfm?id=372071

FunkSVD
http://sifter.org/~simon/journal/20061211.html

Probabilistic Matrix Factorization
https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf

Bayesian Probabilistic Matrix Factorization using Markov Chain Monte Carlo
https://www.cs.toronto.edu/~amnih/papers/bpmf.pdf

Algorithms for Non-negative Matrix Factorization
https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf

Learning the parts of objects by non-negative matrix factorization
http://www.columbia.edu/~jwp2128/Teaching/E4903/papers/nmf_nature.pdf

Restricted Boltzmann Machines for Collaborative Filtering
https://www.cs.toronto.edu/~rsalakhu/papers/rbmcf.pdf

AutoRec: Autoencoders Meet Collaborative Filtering
http://users.cecs.anu.edu.au/~u5098633/papers/www15.pdf

recommenders/itembased.py (+171 lines)
@@ -0,0 +1,171 @@
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

# load in the data
import os
if not os.path.exists('user2movie.json') or \
   not os.path.exists('movie2user.json') or \
   not os.path.exists('usermovie2rating.json') or \
   not os.path.exists('usermovie2rating_test.json'):
  import preprocess2dict


# note: despite the .json extension, these files are pickles
with open('user2movie.json', 'rb') as f:
  user2movie = pickle.load(f)

with open('movie2user.json', 'rb') as f:
  movie2user = pickle.load(f)

with open('usermovie2rating.json', 'rb') as f:
  usermovie2rating = pickle.load(f)

with open('usermovie2rating_test.json', 'rb') as f:
  usermovie2rating_test = pickle.load(f)


N = np.max(list(user2movie.keys())) + 1
# the test set may contain movies the train set doesn't have data on
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for (u, m), r in usermovie2rating_test.items()])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

if M > 2000:
  print("M =", M, "are you sure you want to continue?")
  print("Comment out these lines if so...")
  exit()


# to find the item similarities, you have to do O(M^2 * N) calculations!
# in the "real-world" you'd want to parallelize this
# note: we really only have to do half the calculations, since w_ij is symmetric
K = 20  # number of neighbors we'd like to consider
limit = 5  # number of common users two movies must share in order to be considered
neighbors = []  # each item's K nearest neighbors
averages = []   # each item's average rating, for later use
deviations = [] # each item's per-user deviations from its average, for later use

for i in range(M):
  # find the K closest items to item i
  users_i = movie2user[i]
  users_i_set = set(users_i)

  # calculate avg and deviation
  ratings_i = { user: usermovie2rating[(user, i)] for user in users_i }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { user: (rating - avg_i) for user, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))
  sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

  # save these for later use
  averages.append(avg_i)
  deviations.append(dev_i)

  sl = SortedList()
  for j in range(M):
    # don't include yourself
    if j != i:
      users_j = movie2user[j]
      users_j_set = set(users_j)
      common_users = (users_i_set & users_j_set)  # intersection
      if len(common_users) > limit:
        # calculate avg and deviation
        ratings_j = { user: usermovie2rating[(user, j)] for user in users_j }
        avg_j = np.mean(list(ratings_j.values()))
        dev_j = { user: (rating - avg_j) for user, rating in ratings_j.items() }
        dev_j_values = np.array(list(dev_j.values()))
        sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

        # calculate correlation coefficient
        numerator = sum(dev_i[m]*dev_j[m] for m in common_users)
        w_ij = numerator / (sigma_i * sigma_j)

        # insert into sorted list and truncate
        # negate weight, because list is sorted ascending
        # maximum value (1) is "closest"
        sl.add((-w_ij, j))
        if len(sl) > K:
          del sl[-1]

  # store the neighbors
  neighbors.append(sl)

  # print out progress
  print(i)


# using neighbors, calculate train and test MSE

def predict(i, u):
  # calculate the weighted sum of deviations
  numerator = 0
  denominator = 0
  for neg_w, j in neighbors[i]:
    # remember, the weight is stored as its negative
    # so the negative of the negative weight is the positive weight
    try:
      numerator += -neg_w * deviations[j][u]
      denominator += abs(neg_w)
    except KeyError:
      # neighbor may not have been rated by the same user
      # don't want to do dictionary lookup twice
      # so just throw exception
      pass

  if denominator == 0:
    prediction = averages[i]
  else:
    prediction = numerator / denominator + averages[i]
  prediction = min(5, prediction)
  prediction = max(0.5, prediction)  # min rating is 0.5
  return prediction


train_predictions = []
train_targets = []
for (u, m), target in usermovie2rating.items():
  # calculate the prediction for this movie
  prediction = predict(m, u)

  # save the prediction and target
  train_predictions.append(prediction)
  train_targets.append(target)

test_predictions = []
test_targets = []
# same thing for test set
for (u, m), target in usermovie2rating_test.items():
  # calculate the prediction for this movie
  prediction = predict(m, u)

  # save the prediction and target
  test_predictions.append(prediction)
  test_targets.append(target)


# calculate accuracy
def mse(p, t):
  p = np.array(p)
  t = np.array(t)
  return np.mean((p - t)**2)

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))
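
For concreteness, here is a tiny worked example of the deviation-weighted average that predict(i, u) computes above; the neighbor similarities and user deviations are invented for illustration:

# hypothetical: item i has average rating 3.0 and two neighbors rated by user u
avg_i = 3.0
neighbors_of_i = [(0.9, 1.0),   # w_ij = 0.9; u rated j one point above j's average
                  (0.4, -0.5)]  # w_ij = 0.4; u rated j half a point below j's average

numerator = sum(w * dev for w, dev in neighbors_of_i)  # 0.9*1.0 + 0.4*(-0.5) = 0.7
denominator = sum(abs(w) for w, _ in neighbors_of_i)   # 0.9 + 0.4 = 1.3
prediction = avg_i + numerator / denominator           # 3.0 + 0.7/1.3 ≈ 3.54
prediction = min(5, max(0.5, prediction))              # clip to the 0.5-5 rating scale
print(round(prediction, 2))  # 3.54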
