# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems

### meant to be pasted into console ###

# notes:
# you may have trouble with full dataset on just your local machine
# if you want to know what's in an RDD, use .take(n), ex:
# tmp = p.take(5)
# print(tmp)
12
+ from pyspark .mllib .recommendation import ALS , MatrixFactorizationModel , Rating
13
+
14
+ # load in the data
15
+ data = sc .textFile ("/Users/macuser/Code/machine_learning_examples/large_files/movielens-20m-dataset/small_rating.csv" )
16
+
17
+ # filter out header
18
+ header = data .first () #extract header
19
+ data = data .filter (lambda row : row != header )
20
+
21
+ # convert into a sequence of Rating objects
22
+ ratings = data .map (
23
+ lambda l : l .split (',' )
24
+ ).map (
25
+ lambda l : Rating (int (l [0 ]), int (l [1 ]), float (l [2 ]))
26
+ )
27
+
28
+ # split into train and test
29
+ train , test = ratings .randomSplit ([0.8 , 0.2 ])
30
+
31
+ # train the model
32
+ K = 10
33
+ epochs = 10
34
+ model = ALS .train (train , K , epochs )
35
+
36
+ # evaluate the model
37
+
38
+ # train
39
+ x = train .map (lambda p : (p [0 ], p [1 ]))
40
+ p = model .predictAll (x ).map (lambda r : ((r [0 ], r [1 ]), r [2 ]))
41
+ ratesAndPreds = train .map (lambda r : ((r [0 ], r [1 ]), r [2 ])).join (p )
42
+ # joins on first item: (user_id, movie_id)
43
+ # each row of result is: ((user_id, movie_id), (rating, prediction))
44
+ mse = ratesAndPreds .map (lambda r : (r [1 ][0 ] - r [1 ][1 ])** 2 ).mean ()
45
+ print ("train mse:" , mse )
46
+
47
+
48
+ # test
49
+ x = test .map (lambda p : (p [0 ], p [1 ]))
50
+ p = model .predictAll (x ).map (lambda r : ((r [0 ], r [1 ]), r [2 ]))
51
+ ratesAndPreds = test .map (lambda r : ((r [0 ], r [1 ]), r [2 ])).join (p )
52
+ mse = ratesAndPreds .map (lambda r : (r [1 ][0 ] - r [1 ][1 ])** 2 ).mean ()
53
+ print ("test mse:" , mse )
0 commit comments