-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrecommender.py
94 lines (70 loc) · 2.87 KB
/
recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
import numpy as np
import math
# The ml-100k files are read without a header row, so the column names
# (taken from the dataset's readme file) are supplied to pandas explicitly.


def _load_table(path, sep, names):
    """Read one latin-1 encoded, `sep`-delimited file into a DataFrame."""
    return pd.read_csv(path, sep=sep, names=names, encoding='latin-1')


# Users file: one row per user.
user_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = _load_table('Desktop/ml-100k/u.user', '|', user_cols)

# Ratings file: one row per (user, movie) rating, tab separated.
data_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
data = _load_table('Desktop/ml-100k/u.data', '\t', data_cols)

# Items file: movie metadata followed by one flag column per genre.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date',
    'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = _load_table('Desktop/ml-100k/u.item', '|', i_cols)
# Hold out a single user as the "test" user; everyone with a lower id is
# training data. NOTE(review): 943 assumes ml-100k, where it is the highest
# user id -- the same assumption the per-user loop below (users 1..942) makes.
TEST_USER_ID = 943
RATING_COLS = ['user_id', 'movie_id', 'rating']

# Sort once and split on the user-id boundary. Splitting on ids instead of
# hard-coded row offsets guarantees that every rating lands in exactly one of
# the two sets (positional slices [:99832] / [99833:] skip row 99832).
ratings_by_user = data.sort_values('user_id')
is_train = ratings_by_user['user_id'] < TEST_USER_ID

# Plain (n, 3) arrays of [user_id, movie_id, rating].
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
utrain = ratings_by_user.loc[is_train, RATING_COLS].to_numpy()
utest = ratings_by_user.loc[~is_train, RATING_COLS].to_numpy()

# users_list[k] holds the rating rows of user k + 1 (possibly empty), each row
# being a 1-D array [user_id, movie_id, rating]. Unlike a consuming loop,
# this leaves `utrain` intact and does not rebind the builtin `list`.
users_list = [
    list(utrain[utrain[:, 0] == user_id])
    for user_id in range(1, TEST_USER_ID)
]
def EucledianScore(train_user, test_user, min_common=4, penalty=1000000):
    """Return the Euclidean distance between two users' ratings.

    Only movies rated by both users contribute: the result is the square
    root of the summed squared rating differences over those movies.

    Args:
        train_user: iterable of rows where ``row[1]`` is a movie id and
            ``row[2]`` is a rating (``row[0]`` is not read).
        test_user: iterable of rows with the same layout.
        min_common: minimum number of test ratings that must have a matching
            movie in ``train_user`` for the distance to be trusted.
        penalty: squared distance used when fewer than ``min_common``
            movies are shared, so such users sort last.

    Returns:
        float: ``sqrt`` of the summed squared differences, or
        ``sqrt(penalty)`` (1000.0 with the defaults) when too few movies
        are shared.
    """
    # One pass over the training rows gives O(1) lookups below instead of
    # rescanning train_user for every test rating. If a movie id occurs
    # more than once in train_user, the last rating is the one used.
    train_ratings = {int(row[1]): float(row[2]) for row in train_user}

    squared_error = 0.0
    common = 0
    for row in test_user:
        train_rating = train_ratings.get(int(row[1]))
        if train_rating is None:
            continue  # the training user has not rated this movie
        diff = float(row[2]) - train_rating
        squared_error += diff * diff
        common += 1

    # Too little overlap makes a small distance meaningless; report a large
    # fixed distance instead.
    if common < min_common:
        squared_error = penalty
    return math.sqrt(squared_error)
# Distance from every training user to the held-out test user.
# users_list[k] belongs to user id k + 1, hence the `position + 1`.
score_list = [
    [position + 1, EucledianScore(ratings, utest)]
    for position, ratings in enumerate(users_list)
]

# Smallest distance first, so the first row is the nearest neighbour.
score = pd.DataFrame(score_list, columns=['user_id', 'Eucledian Score'])
score = score.sort_values(by='Eucledian Score')
print(score)

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
score_matrix = score.to_numpy()

# Id of the training user closest to the test user.
user = int(score_matrix[0][0])
# Movie ids rated by the nearest neighbour (`user`) and by the test user.
full_list = {int(row[1]) for row in users_list[user - 1]}
test_user_movies = {int(row[1]) for row in utest}

# Candidates are the neighbour's movies the test user has not rated yet.
common_list = full_list & test_user_movies
recommendation = full_list - common_list

# Mean rating per movie, keyed by movie id. Keying by id (not by title or by
# row position) keeps the lookup below correct regardless of row order.
mean_rating = data.groupby('movie_id')['rating'].mean()

# `items` names its id column 'movie id' while `data` uses 'movie_id', so the
# titles are joined through an explicit id index rather than a column-name
# based merge.
titles = items.set_index('movie id')['movie title']

item_list = pd.DataFrame(
    {
        'movie_id': mean_rating.index,
        'mean rating': mean_rating.values,
        'movie title': titles.reindex(mean_rating.index).values,
    },
    columns=['movie_id', 'mean rating', 'movie title'],
)

# Keep only the candidate movies, best rated first.
recommendation = item_list[item_list['movie_id'].isin(recommendation)]
recommendation = recommendation.sort_values(by='mean rating', ascending=False)
print(recommendation[['mean rating', 'movie title']])