# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

# load in the data
import os
if not os.path.exists('user2movie.json') or \
   not os.path.exists('movie2user.json') or \
   not os.path.exists('usermovie2rating.json') or \
   not os.path.exists('usermovie2rating_test.json'):
  import preprocess2dict


# note: despite the .json extension, these files were saved with pickle,
# so we load them with pickle as well
with open('user2movie.json', 'rb') as f:
  user2movie = pickle.load(f)

with open('movie2user.json', 'rb') as f:
  movie2user = pickle.load(f)

with open('usermovie2rating.json', 'rb') as f:
  usermovie2rating = pickle.load(f)

with open('usermovie2rating_test.json', 'rb') as f:
  usermovie2rating_test = pickle.load(f)

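# data structures (as built by preprocess2dict):
#   user2movie: user_id -> list of movie_ids that user rated
#   movie2user: movie_id -> list of user_ids who rated that movie
#   usermovie2rating: (user_id, movie_id) -> rating (train)
#   usermovie2rating_test: (user_id, movie_id) -> rating (test)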

N = np.max(list(user2movie.keys())) + 1
# the test set may contain movies the train set doesn't have data on
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for (u, m), r in usermovie2rating_test.items()])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

if M > 2000:
  print("M =", M, "are you sure you want to continue?")
  print("Comment out these lines if so...")
  exit()


# to find the item similarities, you have to do O(M^2 * N) calculations!
# in the "real-world" you'd want to parallelize this
# note: we really only have to do half the calculations, since w_ij is symmetric
K = 20 # number of neighbors we'd like to consider
limit = 5 # number of users two movies must have in common in order to be considered neighbors
neighbors = [] # store each item's K nearest neighbors in this list
averages = [] # each item's average rating for later use
deviations = [] # each item's deviation for later use

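# for each movie i:
#   1. center its ratings by subtracting the movie's average (dev_i)
#   2. for every other movie j sharing more than `limit` common users,
#      compute a correlation-style weight w_ij from the dot product of the
#      two movies' deviations over the common users, normalized by the norms
#      of their full deviation vectors
#   3. keep only the K most similar movies in a SortedList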
for i in range(M):
  # find the K closest items to item i
  users_i = movie2user[i]
  users_i_set = set(users_i)

  # calculate avg and deviation
  ratings_i = { user:usermovie2rating[(user, i)] for user in users_i }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { user:(rating - avg_i) for user, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))
  sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

  # save these for later use
  averages.append(avg_i)
  deviations.append(dev_i)

  sl = SortedList()
  for j in range(M):
    # don't include yourself
    if j != i:
      users_j = movie2user[j]
      users_j_set = set(users_j)
      common_users = (users_i_set & users_j_set) # intersection
      if len(common_users) > limit:
        # calculate avg and deviation
        ratings_j = { user:usermovie2rating[(user, j)] for user in users_j }
        avg_j = np.mean(list(ratings_j.values()))
        dev_j = { user:(rating - avg_j) for user, rating in ratings_j.items() }
        dev_j_values = np.array(list(dev_j.values()))
        sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

        # calculate the correlation coefficient between movie i and movie j
        numerator = sum(dev_i[u]*dev_j[u] for u in common_users)
        w_ij = numerator / (sigma_i * sigma_j)

        # insert into sorted list and truncate
        # negate weight, because list is sorted ascending
        # maximum value (1) is "closest"
        sl.add((-w_ij, j))
        if len(sl) > K:
          del sl[-1]

  # store the neighbors
  neighbors.append(sl)

  # print progress
  print(i)



# using neighbors, calculate train and test MSE

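# predicted rating of user u for movie i:
#   prediction = avg_i + sum_j( w_ij * dev_j[u] ) / sum_j( |w_ij| )
# where j ranges over the stored neighbors of movie i that user u has rated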
def predict(i, u):
  # calculate the weighted sum of deviations
  numerator = 0
  denominator = 0
  for neg_w, j in neighbors[i]:
    # remember, the weight is stored as its negative
    # so the negative of the negative weight is the positive weight
    try:
      numerator += -neg_w * deviations[j][u]
      denominator += abs(neg_w)
    except KeyError:
      # the neighbor movie j may not have been rated by user u
      # we don't want to do the dictionary lookup twice,
      # so we just catch the exception and skip this neighbor
      pass

  if denominator == 0:
    prediction = averages[i]
  else:
    prediction = numerator / denominator + averages[i]
  prediction = min(5, prediction) # max rating is 5
  prediction = max(0.5, prediction) # min rating is 0.5
  return prediction


train_predictions = []
train_targets = []
for (u, m), target in usermovie2rating.items():
  # calculate the prediction for this user-movie pair
  prediction = predict(m, u)

  # save the prediction and target
  train_predictions.append(prediction)
  train_targets.append(target)

test_predictions = []
test_targets = []
# same thing for the test set
for (u, m), target in usermovie2rating_test.items():
  # calculate the prediction for this user-movie pair
  prediction = predict(m, u)

  # save the prediction and target
  test_predictions.append(prediction)
  test_targets.append(target)


# calculate the mean squared error
def mse(p, t):
  p = np.array(p)
  t = np.array(t)
  return np.mean((p - t)**2)

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))