
Commit 9cc8ec3

committed
update
1 parent 05d2580 commit 9cc8ec3

2 files changed: +61 -1 lines changed


recommenders/spark.py

+1 -1
@@ -12,7 +12,7 @@
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# load in the data
-data = sc.textFile("/Users/macuser/Code/machine_learning_examples/large_files/movielens-20m-dataset/small_rating.csv")
+data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv")

# filter out header
header = data.first() #extract header
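
For context on the new relative path: it resolves against the directory the script is launched from, so it assumes large_files/ sits one level above recommenders/ and that you start Spark from inside recommenders/. A minimal sanity check (hypothetical, not part of the commit):

import os
print(os.path.exists("../large_files/movielens-20m-dataset/small_rating.csv"))  # expect True when run from recommenders/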

recommenders/spark2.py

+60
@@ -0,0 +1,60 @@
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems

# notes:
# you may have trouble with full dataset on just your local machine
# if you want to know what's in an RDD, use .take(n), ex:
# tmp = p.take(5)
# print(tmp)

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SparkContext

# increase memory
# SparkContext.setSystemProperty('spark.driver.memory', '10g')
# SparkContext.setSystemProperty('spark.executor.memory', '10g')

sc = SparkContext("local", "Your App Name Here")


# load in the data
# data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv")
data = sc.textFile("../large_files/movielens-20m-dataset/rating.csv.gz")

# filter out header
header = data.first() #extract header
data = data.filter(lambda row: row != header)

# convert into a sequence of Rating objects
ratings = data.map(
  lambda l: l.split(',')
).map(
  lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))
)

# split into train and test
train, test = ratings.randomSplit([0.8, 0.2])

# train the model
K = 10
epochs = 10
model = ALS.train(train, K, epochs)

# evaluate the model

# train
x = train.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(p)
# joins on first item: (user_id, movie_id)
# each row of result is: ((user_id, movie_id), (rating, prediction))
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("***** train mse: %s *****" % mse)


# test
x = test.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(p)
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("***** test mse: %s *****" % mse)
