├── data.txt ├── ItemCF ├── ItemCF.png ├── README.md └── ItemCF.py ├── UserCF ├── UserCF.png ├── README.md └── UserCF.py └── README.md /data.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingzhexiaozhu/MovieRecommendation/HEAD/data.txt -------------------------------------------------------------------------------- /ItemCF/ItemCF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingzhexiaozhu/MovieRecommendation/HEAD/ItemCF/ItemCF.png -------------------------------------------------------------------------------- /UserCF/UserCF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingzhexiaozhu/MovieRecommendation/HEAD/UserCF/UserCF.png -------------------------------------------------------------------------------- /ItemCF/README.md: -------------------------------------------------------------------------------- 1 | # ItemCF基于项目的协同过滤推荐实现 2 | 3 | 关于推荐系统的介绍见博客:http://blog.csdn.net/u012050154/article/details/52267712 4 | 5 | ItemCF.py是代码实现 6 | ItemCF.png是程序运行结果 7 | 8 | 9 | -------------------------------------------------------------------------------- /UserCF/README.md: -------------------------------------------------------------------------------- 1 | # UserCF基于用户的协同过滤推荐实现 2 | 3 | UserCF的思想见博客:http://blog.csdn.net/u012050154/article/details/52268057 4 | UserCF.py是代码实现 5 | UserCF.png是程序运行结果 6 | 7 | 关于推荐系统的介绍见博客:http://blog.csdn.net/u012050154/article/details/52267712 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MovieRecommendation 2 | 3 | 基于Python3,实现电影推荐系统,数据集是MovieLens官方数据集【见data.txt】 4 | 5 | 基于用户的协同过滤算法UserCF,UserCF的思想见博客:http://blog.csdn.net/u012050154/article/details/52268057 6 | 基于项目的协同过滤算法ItemCF 7 | 8 | 
# README.md (continued): for an introduction to recommender systems, see the
# blog post http://blog.csdn.net/u012050154/article/details/52267712
#
# --------------------------------------------------------------------------------
# /UserCF/UserCF.py
# --------------------------------------------------------------------------------

# User-based collaborative filtering (UserCF) movie recommender.
import math
import random
import sys
from operator import itemgetter


class UserBasedCF():
    """Recommend movies to a user from the movies watched by similar users.

    Typical usage is ``get_dataset`` -> ``calc_user_sim`` -> ``evaluate``
    (or ``recommend`` for a single user).

    Attributes are plain dictionaries keyed by the string IDs read from the
    ratings file; ratings are kept as the strings found in the file.
    """

    def __init__(self):
        """Set the default neighbourhood/recommendation sizes and empty state."""
        # Use the 20 most similar users to recommend 10 movies.
        self.n_sim_user = 20
        self.n_rec_movie = 10

        # user -> {movie: rating}, filled by get_dataset().
        self.trainSet = {}
        self.testSet = {}

        # user -> {other user: similarity}, filled by calc_user_sim().
        self.user_sim_matrix = {}
        # Number of distinct movies in the training set.
        self.movie_count = 0

        print('Similar user number = %d' % self.n_sim_user)
        print('Recommneded movie number = %d' % self.n_rec_movie)

    def get_dataset(self, filename, pivot=0.75):
        """Read ``user,movie,rating[,...]`` rows and split them at random.

        Each row goes to ``trainSet`` when ``random.random() < pivot`` and to
        ``testSet`` otherwise, so seed ``random`` beforehand for a
        reproducible split.

        Args:
            filename: Path of the comma-separated ratings file; its first
                line is treated as a header and skipped.
            pivot: Probability that a row is assigned to the training set.

        Raises:
            ValueError: If a non-blank row has fewer than three fields.
        """
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            # Only the first three columns are used; anything after the
            # rating (the timestamp in the original data) is ignored.
            user, movie, rating = line.split(',')[:3]
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)

    def load_file(self, filename):
        """Yield every non-blank data line of ``filename`` without its newline.

        The first line (column titles) is skipped. Blank lines are skipped as
        well so that a trailing empty line cannot break the caller's parsing.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:  # skip the header row
                    continue
                line = line.strip('\r\n')
                if not line:
                    continue
                yield line
        print('Load %s success!' % filename)

    def calc_user_sim(self):
        """Build ``user_sim_matrix`` from ``trainSet``.

        The similarity of users u and v is the number of movies both have in
        the training set divided by ``sqrt(|movies(u)| * |movies(v)|)``.
        The matrix is rebuilt from scratch on every call, so calling this
        method again after changing ``trainSet`` is safe.
        """
        # Inverted index: movie -> set of users who have it in trainSet.
        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            for movie in movies:
                movie_user.setdefault(movie, set()).add(user)
        print('Build movie-user table success!')

        self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        # Count co-rated movies for every ordered pair of distinct users.
        print('Build user co-rated movies matrix ...')
        sim_matrix = {}
        for users in movie_user.values():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    row = sim_matrix.setdefault(u, {})
                    row[v] = row.get(v, 0) + 1
        print('Build user co-rated movies matrix success!')

        # Normalise the raw counts into similarities.
        print('Calculating user similarity matrix ...')
        for u, related_users in sim_matrix.items():
            n_u = len(self.trainSet[u])
            for v, count in related_users.items():
                related_users[v] = count / math.sqrt(n_u * len(self.trainSet[v]))
        self.user_sim_matrix = sim_matrix
        print('Calculate user similarity matrix success!')

    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        The ``n_sim_user`` most similar users are consulted; every movie they
        have that ``user`` has not gets the sum of those users' similarities
        as its score. The result is sorted by descending score.

        A user who is unknown or has no similar users gets an empty list
        (previously this raised ``KeyError``).
        """
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet.get(user, {})
        neighbours = self.user_sim_matrix.get(user, {})

        # v = similar user, wuv = similarity between ``user`` and v
        for v, wuv in sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:K]:
            for movie in self.trainSet[v]:
                if movie in watched_movies:
                    continue
                rank[movie] = rank.get(movie, 0) + wuv
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        """Recommend for every training user and report the quality metrics.

        Prints and returns ``(precision, recall, coverage)``:

        * precision = hits / (``n_rec_movie`` * number of training users)
        * recall    = hits / number of test-set ratings of those users
        * coverage  = distinct recommended movies / ``movie_count``

        A metric whose denominator is zero is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
        """
        print("Evaluation start ...")
        N = self.n_rec_movie
        # precision / recall counters
        hit = 0
        rec_count = 0
        test_count = 0
        # coverage
        all_rec_movies = set()

        for user in self.trainSet:
            test_movies = self.testSet.get(user, {})
            for movie, _ in self.recommend(user):
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            # N is counted even when fewer movies could be recommended.
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
        print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))
        return precision, recall, coverage


if __name__ == '__main__':
    # An optional first command-line argument overrides the default path.
    rating_file = 'D:\\学习资料\\推荐系统\\ml-latest-small\\ratings.csv'
    if len(sys.argv) > 1:
        rating_file = sys.argv[1]
    userCF = UserBasedCF()
    userCF.get_dataset(rating_file)
    userCF.calc_user_sim()
    userCF.evaluate()

# --------------------------------------------------------------------------------
# /ItemCF/ItemCF.py
# --------------------------------------------------------------------------------

# Item-based collaborative filtering (ItemCF) movie recommender.
# The imports are repeated so this section still works as a separate file.
import math
import random
import sys
from operator import itemgetter


class ItemBasedCF():
    """Recommend movies that are similar to the ones a user already rated.

    Typical usage is ``get_dataset`` -> ``calc_movie_sim`` -> ``evaluate``
    (or ``recommend`` for a single user).

    Attributes are plain dictionaries keyed by the string IDs read from the
    ratings file; ratings are kept as the strings found in the file and are
    converted with ``float`` when scores are computed.
    """

    def __init__(self):
        """Set the default neighbourhood/recommendation sizes and empty state."""
        # Use the 20 most similar movies to recommend 10 movies.
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        # user -> {movie: rating}, filled by get_dataset().
        self.trainSet = {}
        self.testSet = {}

        # movie -> {other movie: similarity}, filled by calc_movie_sim().
        self.movie_sim_matrix = {}
        # movie -> number of training users who have it.
        self.movie_popular = {}
        # Number of distinct movies in the training set.
        self.movie_count = 0

        print('Similar movie number = %d' % self.n_sim_movie)
        print('Recommneded movie number = %d' % self.n_rec_movie)

    def get_dataset(self, filename, pivot=0.75):
        """Read ``user,movie,rating[,...]`` rows and split them at random.

        Each row goes to ``trainSet`` when ``random.random() < pivot`` and to
        ``testSet`` otherwise, so seed ``random`` beforehand for a
        reproducible split.

        Args:
            filename: Path of the comma-separated ratings file; its first
                line is treated as a header and skipped.
            pivot: Probability that a row is assigned to the training set.

        Raises:
            ValueError: If a non-blank row has fewer than three fields.
        """
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            # Only the first three columns are used; anything after the
            # rating (the timestamp in the original data) is ignored.
            user, movie, rating = line.split(',')[:3]
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)

    def load_file(self, filename):
        """Yield every non-blank data line of ``filename`` without its newline.

        The first line (column titles) is skipped. Blank lines are skipped as
        well so that a trailing empty line cannot break the caller's parsing.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:  # skip the header row
                    continue
                line = line.strip('\r\n')
                if not line:
                    continue
                yield line
        print('Load %s success!' % filename)

    def calc_movie_sim(self):
        """Build ``movie_popular`` and ``movie_sim_matrix`` from ``trainSet``.

        The similarity of movies m1 and m2 is the number of users who have
        both in the training set divided by
        ``sqrt(popularity(m1) * popularity(m2))``. Both structures are rebuilt
        from scratch on every call, so calling this method again after
        changing ``trainSet`` is safe.
        """
        # Popularity: how many training users have each movie.
        popular = {}
        for movies in self.trainSet.values():
            for movie in movies:
                popular[movie] = popular.get(movie, 0) + 1
        self.movie_popular = popular

        self.movie_count = len(popular)
        print("Total movie number = %d" % self.movie_count)

        # Count co-rating users for every ordered pair of distinct movies.
        sim_matrix = {}
        for movies in self.trainSet.values():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2:
                        continue
                    row = sim_matrix.setdefault(m1, {})
                    row[m2] = row.get(m2, 0) + 1
        print("Build co-rated users matrix success!")

        # Normalise the raw counts into similarities. Every movie in the
        # matrix came from trainSet, so its popularity is at least 1 and the
        # denominator cannot be zero.
        print("Calculating movie similarity matrix ...")
        for m1, related_movies in sim_matrix.items():
            for m2, count in related_movies.items():
                related_movies[m2] = count / math.sqrt(popular[m1] * popular[m2])
        self.movie_sim_matrix = sim_matrix
        print('Calculate movie similarity matrix success!')

    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        For every movie the user has in ``trainSet`` the ``n_sim_movie`` most
        similar movies are looked up; each one the user has not seen gets
        ``similarity * rating`` added to its score. The result is sorted by
        descending score.

        An unknown user, or a watched movie with no similar movies, simply
        contributes nothing (previously this raised ``KeyError``).
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet.get(user, {})

        for movie, rating in watched_movies.items():
            related = self.movie_sim_matrix.get(movie, {})
            weight = float(rating)  # ratings are stored as text
            for related_movie, w in sorted(related.items(), key=itemgetter(1), reverse=True)[:K]:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + w * weight
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Recommend for every training user and report the quality metrics.

        Prints and returns ``(precision, recall, coverage)``:

        * precision = hits / (``n_rec_movie`` * number of training users)
        * recall    = hits / number of test-set ratings of those users
        * coverage  = distinct recommended movies / ``movie_count``

        A metric whose denominator is zero is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
        """
        print('Evaluating start ...')
        N = self.n_rec_movie
        # precision / recall counters
        hit = 0
        rec_count = 0
        test_count = 0
        # coverage
        all_rec_movies = set()

        for user in self.trainSet:
            test_movies = self.testSet.get(user, {})
            for movie, _ in self.recommend(user):
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            # N is counted even when fewer movies could be recommended.
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
        print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))
        return precision, recall, coverage


if __name__ == '__main__':
    # An optional first command-line argument overrides the default path.
    rating_file = 'D:\\学习资料\\推荐系统\\ml-latest-small\\ratings.csv'
    if len(sys.argv) > 1:
        rating_file = sys.argv[1]
    itemCF = ItemBasedCF()
    itemCF.get_dataset(rating_file)
    itemCF.calc_movie_sim()
    itemCF.evaluate()
# --------------------------------------------------------------------------------