1+ # https://udemy.com/recommender-systems
2+ # https://deeplearningcourses.com/recommender-systems
3+
4+ ### meant to be pasted into console ###
5+
6+ # notes:
# you may have trouble with the full dataset on just your local machine
8+ # if you want to know what's in an RDD, use .take(n), ex:
9+ # tmp = p.take(5)
10+ # print(tmp)
11+
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
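
# this script assumes the pyspark console, where `sc` (the SparkContext) already exists.
# if you run it as a standalone script instead, you would need to create one yourself --
# a minimal sketch (the master and app name below are arbitrary choices, not from the original):
# from pyspark import SparkContext
# sc = SparkContext(master="local[*]", appName="als-example")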
13+
14+ # load in the data
data = sc.textFile("/Users/macuser/Code/machine_learning_examples/large_files/movielens-20m-dataset/small_rating.csv")
16+
17+ # filter out header
header = data.first()  # extract header
data = data.filter(lambda row: row != header)
20+
21+ # convert into a sequence of Rating objects
ratings = data.map(
  lambda l: l.split(',')
).map(
  lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))
)
27+
28+ # split into train and test
train, test = ratings.randomSplit([0.8, 0.2])
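# note: randomSplit is random; it also accepts a seed if you want a reproducible split,
# e.g. ratings.randomSplit([0.8, 0.2], seed=42)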
30+
31+ # train the model
32+ K = 10
33+ epochs = 10
model = ALS.train(train, K, epochs)
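
# MatrixFactorizationModel is imported above mainly so a trained model can be saved and
# reloaded later -- a minimal sketch, assuming a writable location (the path is hypothetical):
# model.save(sc, "als_model")
# model = MatrixFactorizationModel.load(sc, "als_model")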
35+
36+ # evaluate the model
37+
38+ # train
x = train.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(p)
42+ # joins on first item: (user_id, movie_id)
43+ # each row of result is: ((user_id, movie_id), (rating, prediction))
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("train mse:", mse)
46+
47+
48+ # test
x = test.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(p)
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("test mse:", mse)