├── CalcMrr.py ├── README.md ├── CalcAuc.py ├── CalcNdcg.py ├── CalcPreRec.py ├── LoadData.py ├── IMF.py ├── Evaluate.py └── main.py /CalcMrr.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | def MRR(rec_list,correct_items,ignore_items): 4 | pos = 0 5 | for i in range(rec_list.size): 6 | if rec_list[i] in ignore_items: 7 | continue 8 | 9 | pos = pos+1 10 | 11 | if rec_list[pos] in correct_items: 12 | mrr = 1 / (pos * 1.0) 13 | break 14 | return mrr 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MymediaLite_Python 2 | 3 | This recommendation framework is the Python version of MyMediaLite and only integrates the WRMF model temporarily. 4 | 5 | Run this program is easy, just run main.py and you can obtain the metrics of WRMF in the given parameters (of course you need to install the basic package like numpy and scipy). 6 | 7 | You can change the training and test file into your own files and tune parameters to run your own recommendation methods. 8 | -------------------------------------------------------------------------------- /CalcAuc.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | def AUC(rec_list,correct_items,ignore_items): 4 | num_eval_items = rec_list.size - ignore_items.size 5 | num_correct_items = correct_items.size 6 | num_eval_pairs = (num_eval_items - num_correct_items) * num_correct_items 7 | 8 | if (num_eval_pairs == 0): 9 | auc = 0.5 10 | 11 | num_correct_pairs = 0 12 | hit_count = 0 13 | for i in range(rec_list.size): 14 | if rec_list[i] in ignore_items: 15 | continue 16 | 17 | if rec_list[i] not in correct_items: 18 | num_correct_pairs = num_correct_pairs + hit_count 19 | else: 20 | hit_count = hit_count + 1 21 | auc = num_correct_pairs / (num_eval_pairs * 1.0) 22 | return auc -------------------------------------------------------------------------------- /CalcNdcg.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | import math 4 | 5 | def NDCG(rec_list,correct_items,ignore_items): 6 | dcg = 0 7 | idcg = computeIDCG(correct_items.size) 8 | left_out = 0 9 | 10 | for i in range(rec_list.size): 11 | item_id = rec_list[i] 12 | if item_id in ignore_items: 13 | left_out = left_out + 1 14 | continue 15 | 16 | if item_id not in correct_items: 17 | continue 18 | 19 | rank = i + 1 - left_out 20 | dcg = dcg + math.log(2) / (math.log(rank+1)) 21 | ndcg = dcg / idcg 22 | return ndcg 23 | 24 | def computeIDCG(n): 25 | idcg = 0 26 | for i in range(n): 27 | idcg = idcg + math.log(2) / (math.log(i+2)) 28 | return idcg 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /CalcPreRec.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | import numpy as np 4 | 5 | def PrecisionAndRecall(rec_list,correct_items,ignore_items,AtN): 6 | precision = np.zeros((1,len(AtN))) 7 | recall = np.zeros((1,len(AtN))) 8 | for i in range(len(AtN)): 9 | 10 | precision[0,i] = hitAt(rec_list,correct_items,ignore_items,AtN[i]) / (AtN[i] * 1.0) 11 | recall[0,i] = hitAt(rec_list,correct_items,ignore_items,AtN[i]) / (correct_items.size * 1.0) 12 | Map = AP(rec_list,correct_items,ignore_items) 13 | 14 | return precision, recall, Map 15 | 16 | def AP(rec_list,correct_items,ignore_items): 17 | # compute the average precision (AP) of a list of ranked items 18 | hit_count = 0 19 | avg_prec_sum = 0 20 | left_out = 0 21 | for i in range(rec_list.size): 22 | item_id = rec_list[i] 23 | if item_id in ignore_items: 24 | left_out = left_out + 1 25 | continue 26 | 27 | if item_id not in correct_items: 28 | continue 29 | 30 | hit_count = hit_count + 1 31 | 32 | avg_prec_sum = avg_prec_sum + (hit_count / ((i + 1 - left_out) * 1.0)) 33 | 34 | if hit_count != 0: 35 | map = avg_prec_sum / (hit_count * 1.0) 36 | else: 37 | map = 0 38 | return map 39 | 40 | def hitAt(rec_list,correct_items,ignore_items,n): 41 | hit_count = 0 42 | left_out = 0 43 | 44 | for i in range(rec_list.size): 45 | # print 'rec_list', rec_list.shape, i 46 | item_id = rec_list[i] 47 | if item_id in ignore_items: 48 | left_out = left_out + 1 49 | continue 50 | 51 | if item_id not in correct_items: 52 | continue 53 | 54 | if i < (n + left_out): 55 | hit_count = hit_count + 1 56 | else: 57 | break 58 | return hit_count -------------------------------------------------------------------------------- /LoadData.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | import time 4 | import numpy as np 5 | import scipy.sparse as sparse 6 | 7 | def load_train_matrix(file_name,num_users,num_items,cPos): 8 | ''' 9 | :return: user-item rating martix (sparse) 10 | ''' 11 | t0 = time.time() 12 | counts = np.zeros((num_users,num_items)) 13 | total = 0.0 # to store the number of nonzero entry 14 | num_zeros = num_users * num_items 15 | for i, line in enumerate(open(file_name,'r')): 16 | user, item = line.strip().split('\t') 17 | user = int(user) 18 | item = int(item) 19 | count = 1.0 20 | if user > num_users: 21 | continue 22 | if item > num_items: 23 | continue 24 | if count != 0: 25 | counts[user-1,item-1] = count 26 | total += count 27 | num_zeros -= 1 28 | if i % 100000 == 0: 29 | print "loaded %i counts..." % i 30 | counts *= cPos 31 | sparse_counts = sparse.csr_matrix(counts) # transformed the matrix into a sparse matrix 32 | t1 = time.time() 33 | print 'Finished loading train matrix in %f seconds' % (t1 - t0) 34 | return sparse_counts, counts 35 | 36 | def load_test_matrix(file_name,num_users,num_items): 37 | ''' 38 | :return: user-item rating martix (sparse) 39 | ''' 40 | t0 = time.time() 41 | counts = np.zeros((num_users,num_items)) 42 | total = 0.0 # to store the number of nonzero entry 43 | num_zeros = num_users * num_items 44 | for i, line in enumerate(open(file_name,'r')): 45 | user, item = line.strip().split('\t') 46 | user = int(user) 47 | item = int(item) 48 | count = 1.0 49 | if user > num_users: 50 | continue 51 | if item > num_items: 52 | continue 53 | if count != 0: 54 | counts[user-1,item-1] = count 55 | total += count 56 | num_zeros -= 1 57 | if i % 100000 == 0: 58 | print "loaded %i counts..." % i 59 | t1 = time.time() 60 | print 'Finished loading test matrix in %f seconds' % (t1 - t0) 61 | return counts 62 | 63 | def load_tuple(file_name,data_size): 64 | ''' 65 | :return: 66 | ''' 67 | rating_tuple = np.zeros((data_size,2)) 68 | for i,line in enumerate(open(file_name,'r')): 69 | user, item = line.strip().split('\t') 70 | rating_tuple[i-1,0] = int(user)-1 71 | rating_tuple[i-1,1] = int(item)-1 72 | return rating_tuple -------------------------------------------------------------------------------- /IMF.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | __author__ = 'jake221' 3 | 4 | import numpy as np 5 | import scipy.sparse as sparse 6 | from scipy.sparse.linalg import spsolve 7 | import time 8 | 9 | class ImplicitMF(): 10 | 11 | def __init__(self, counts, num_factors, num_iterations,reg_param): 12 | self.counts = counts 13 | self.num_users = counts.shape[0] 14 | self.num_items = counts.shape[1] 15 | self.num_factors = num_factors 16 | self.num_iterations = num_iterations 17 | self.reg_param = reg_param 18 | 19 | def train_model(self): 20 | #创建user_vectors和item_vectors,他们的元素~N(0,1)的正态分布 21 | self.user_vectors = np.random.normal(size=(self.num_users, 22 | self.num_factors)) 23 | self.item_vectors = np.random.normal(size=(self.num_items, 24 | self.num_factors)) 25 | '''要生成很大的数字序列的时候,用xrange会比range性能优很多, 26 | 因为不需要一上来就开辟一块很大的内存空间,这两个基本上都是在循环的时候用''' 27 | for i in xrange(self.num_iterations): 28 | t0 = time.time() 29 | print 'Solving for user vectors...' 30 | self.user_vectors = self.iteration(True, sparse.csr_matrix(self.item_vectors)) 31 | print 'Solving for item vectors...' 32 | self.item_vectors = self.iteration(False, sparse.csr_matrix(self.user_vectors)) 33 | t1 = time.time() 34 | print 'iteration %i finished in %f seconds' % (i + 1, t1 - t0) 35 | return self.user_vectors,self.item_vectors 36 | 37 | def iteration(self, user, fixed_vecs): 38 | #相当于C的三木运算符,if user=True num_solve = num_users,反之为num_items 39 | num_solve = self.num_users if user else self.num_items # 用户(或者产品)个数:待求解的向量的规模 40 | num_fixed = fixed_vecs.shape[0] # 产品(或者用户)个数:固定的向量的规模 41 | YTY = fixed_vecs.T.dot(fixed_vecs) # Y^T * Y 42 | eye = sparse.eye(num_fixed) # 用于计算后面的Y^T * C^u * p(u) 43 | lambda_eye = self.reg_param * sparse.eye(self.num_factors) # lambda * I 44 | solve_vecs = np.zeros((num_solve, self.num_factors)) # 结果存储器 45 | t = time.time() 46 | for i in xrange(num_solve): 47 | if user: 48 | counts_i = self.counts[i].toarray() # Return a dense ndarray representation of this matrix:将第i个用户的评分向量由稀疏矩阵转变为向量 49 | else: 50 | #如果要求item_vec,counts_i为counts中的第i列的转置 51 | counts_i = self.counts[:, i].T.toarray() 52 | ''' 原论文中c_ui=1+alpha*r_ui,但是在计算Y’CuY时为了降低时间复杂度,利用了 53 | Y'CuY=Y'Y+Y'(Cu-I)Y,由于Cu是对角矩阵,其元素为c_ui,即1+alpha*r_ui。 54 | 所以Cu-I也就是对角元素为alpha*r_ui的对角矩阵''' 55 | CuI = sparse.diags(counts_i, [0]) # 将r_ui放到对角线上,若r_ui != 0,则r_ii != 0 56 | pu = counts_i.copy() # 复制pu 57 | #np.where(pu != 0)返回pu中元素不为0的索引,然后将这些元素赋值为1,不知道这里为什么要赋值为1?:将pu中的非零项置为1;因为pu是由counts_i得到的而counts_i本质是alpha * r_ui 58 | pu[np.where(pu != 0)] = 1.0 59 | YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs) 60 | YTCupu = fixed_vecs.T.dot(CuI + eye).dot(sparse.csr_matrix(pu).T) 61 | xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu) # spsolve(A,B)求解Ax = B 62 | solve_vecs[i] = xu 63 | if i % 1000 == 0: 64 | print 'Solved %i vecs in %d seconds' % (i, time.time() - t) 65 | t = time.time() 66 | return solve_vecs -------------------------------------------------------------------------------- /Evaluate.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | import numpy as np 4 | import CalcPreRec 5 | import CalcAuc 6 | import CalcNdcg 7 | import CalcMrr 8 | import time 9 | 10 | class Evaluate(): 11 | ''' 12 | evaluate the effectiveness of the recommendation model using eight different information retrieval metrics 13 | ''' 14 | def __init__(self,user_vecs,item_vecs,train_matrix,test_matrix,test_users,candidate_items): 15 | self.user_vecs = user_vecs 16 | self.item_vecs = item_vecs 17 | self.train_matrix = train_matrix 18 | self.test_matrix = test_matrix 19 | self.test_users = test_users 20 | self.candidate_items = candidate_items 21 | 22 | def CalcMetrics(self): 23 | ''' 24 | :return: eight metrics 25 | ''' 26 | num_users = 0 27 | ret = np.zeros((8,1)) 28 | user_num = len(self.test_users) 29 | 30 | precision = np.zeros((user_num,2)) 31 | recall = np.zeros((user_num,2)) 32 | map = np.zeros((user_num,1)) 33 | auc = np.zeros((user_num,1)) 34 | ndcg = np.zeros((user_num,1)) 35 | mrr = np.zeros((user_num,1)) 36 | AtN = [5,10] 37 | 38 | t0 = time.time() 39 | print 'Start evaluating...' 40 | 41 | for i in xrange(user_num): 42 | user_id = self.test_users[i] 43 | # print 'user_id',user_id 44 | 45 | # find items that user has rated in the test set 46 | test_nonzero_idx = self.test_matrix[user_id,:].nonzero() 47 | test_items_idx = test_nonzero_idx[0] 48 | correct_items = np.intersect1d(test_items_idx,self.candidate_items) 49 | 50 | # find items that user has rated in the train set 51 | train_nonzero_idx = self.train_matrix[user_id,:].nonzero() 52 | # print 'train_nonzero_idx',train_nonzero_idx 53 | train_items_idx = train_nonzero_idx[0] 54 | # print 'self.candidate_items',self.candidate_items 55 | candidate_items_in_train = np.intersect1d(train_items_idx,self.candidate_items) 56 | 57 | num_eval_items = self.candidate_items.size - candidate_items_in_train.size 58 | 59 | # if user has not rated any items in test set or all items in test set are relevant then continue 60 | if correct_items.size == 0 | num_eval_items - correct_items.size == 0: 61 | continue 62 | 63 | # generate a item recommendation list for user_id 64 | recommendation_list = self.GenerateLists(self.user_vecs,self.item_vecs,user_id,self.candidate_items) 65 | 66 | ignore_items = train_items_idx 67 | 68 | precision[i,:],recall[i,:],map[i] = CalcPreRec.PrecisionAndRecall(recommendation_list, correct_items, ignore_items, AtN) 69 | auc[i] = CalcAuc.AUC(recommendation_list, correct_items, ignore_items) 70 | ndcg[i] = CalcNdcg.NDCG(recommendation_list, correct_items, ignore_items) 71 | mrr[i] = CalcMrr.MRR(recommendation_list, correct_items, ignore_items) 72 | num_users = num_users + 1 73 | 74 | t1 = time.time() 75 | print 'Evaluation finished in %f seconds' % (t1 - t0) 76 | 77 | ret[0] = sum(auc) / (num_users * 1.0) 78 | ret[1] = sum(precision[:,0]) / (num_users * 1.0) 79 | ret[2] = sum(precision[:,1]) / (num_users * 1.0) 80 | ret[3] = sum(map) / (num_users * 1.0) 81 | ret[4] = sum(recall[:,0]) / (num_users * 1.0) 82 | ret[5] = sum(recall[:,1]) / (num_users * 1.0) 83 | ret[6] = sum(ndcg) / (num_users * 1.0) 84 | ret[7] = sum(mrr) / (num_users * 1.0) 85 | 86 | return ret 87 | 88 | def GenerateLists(self,user_vecs,item_vecs,user_id,candidate_items): 89 | predict_list = np.zeros((candidate_items.size,1)) 90 | for i in range(candidate_items.size): 91 | predict_list[i] = np.dot(user_vecs[user_id,:], item_vecs[candidate_items[i],:]) 92 | list_asc = np.argsort(predict_list,axis=0) 93 | sorted_list = list_asc[::-1] 94 | 95 | return sorted_list -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jake221' 2 | 3 | ''' 4 | * Weighted matrix factorization method proposed by Hu et al. and Pan et al.. 5 | * 6 | * We use the fast learning method proposed by Hu et al. (alternating least squares), 7 | * and we use a global weight to penalize observed/unobserved values. 8 | * 9 | * Literature: 10 | * 11 | * Y. Hu, Y. Koren, C. Volinsky: Collaborative filtering for implicit feedback datasets. 12 | * ICDM 2008. 13 | * http://research.yahoo.net/files/HuKorenVolinsky-ICDM08.pdf 14 | * 15 | * R. Pan, Y. Zhou, B. Cao, N. N. Liu, R. M. Lukose, M. Scholz, Q. Yang: 16 | * One-class collaborative filtering, 17 | * ICDM 2008. 18 | * http://www.hpl.hp.com/techreports/2008/HPL-2008-48R1.pdf 19 | * 20 | * This recommendation framework is the Python version of MyMediaLite and only integrates the WRMF model. 21 | * You can contact me via wangjinkun@main.hfut.edu.cn 22 | ''' 23 | 24 | import LoadData 25 | from IMF import * 26 | from Evaluate import * 27 | 28 | class ItemRecommendation(): 29 | ''' 30 | This class includes three components: load train data and test data; train the recommendation model; evaluate the model by eight different metrics 31 | ''' 32 | def __init__(self,TRAIN_FILE,TEST_FILE,NUM_USERS,NUM_ITEMS,TRAIN_SIZE,TEST_SIZE): 33 | ''' 34 | :param TRAIN_FILE: 35 | :param TEST_FILE: 36 | :param NUM_USERS: 37 | :param NUM_ITEMS: 38 | :param TRAIN_SIZE: 39 | :param TEST_SIZE: 40 | :return: 41 | ''' 42 | self.TRAIN_FILE = TRAIN_FILE 43 | self.TEST_FILE = TEST_FILE 44 | self.NUM_USERS = NUM_USERS 45 | self.NUM_ITEMS = NUM_ITEMS 46 | self.TRAIN_SIZE = TRAIN_SIZE 47 | self.TEST_SIZE = TEST_SIZE 48 | 49 | def load_data(self,cPos): 50 | # load train matrix and test matrix 51 | self.train_sparsematrix, self.train_matrix = LoadData.load_train_matrix(self.TRAIN_FILE,self.NUM_USERS,self.NUM_ITEMS,cPos) 52 | self.train_tuple = LoadData.load_tuple(self.TRAIN_FILE,self.TRAIN_SIZE) 53 | self.test_matrix = LoadData.load_test_matrix(self.TEST_FILE,self.NUM_USERS,self.NUM_ITEMS) 54 | self.test_tuple = LoadData.load_tuple(self.TEST_FILE,self.TEST_SIZE) 55 | 56 | def model_train(self,NUM_FACTORS,NUM_ITERATIONS,REG_PARAMETERS): 57 | # train the model 58 | imf = ImplicitMF(self.train_sparsematrix,NUM_FACTORS,NUM_ITERATIONS,REG_PARAMETERS) 59 | self.userFactors,self.itemFactors = imf.train_model() 60 | 61 | def model_evaluate(self): 62 | # evaluate the model 63 | 64 | ## preprocess: find test_users and candidate_items 65 | test_users = np.unique(self.test_tuple[:,0]) # all users in the test set 66 | allItems_train = np.unique(self.train_tuple[:,1]) # all items in the train set 67 | allItems_test = np.unique(self.test_tuple[:,1]) # all items in the test set 68 | candidate_items = np.union1d(allItems_train,allItems_test) # all items in the train and test set 69 | 70 | recEval = Evaluate(self.userFactors,self.itemFactors,self.train_matrix,self.test_matrix,test_users,candidate_items) 71 | ret = recEval.CalcMetrics() 72 | print 'AUC =',ret[0],'Prec@5 =',ret[1],'Prec@10 =',ret[2], 'MAP =', ret[3], 'Rec@5 =', ret[4], 'Rec@10 =', ret[5], 'NDCG =', ret[6], 'MRR =', ret[7] 73 | return ret 74 | 75 | if __name__ == '__main__': 76 | ''' 77 | Parameter setting 78 | ''' 79 | TRAIN_FILE = './data/ml100k/train_data.txt' 80 | TEST_FILE = './data/ml100k/test_data.txt' 81 | NUM_USERS = 943 82 | NUM_ITEMS = 1682 83 | TRAIN_SIZE = 90570 84 | TEST_SIZE = 9430 85 | cPos = 2 86 | 87 | NUM_FACTORS = 10 88 | NUM_ITERATIONS = 20 89 | REG_PARAMETERS = 0.01 90 | 91 | # initialize the ItemRecommendation class 92 | item_rec = ItemRecommendation(TRAIN_FILE,TEST_FILE,NUM_USERS,NUM_ITEMS,TRAIN_SIZE,TEST_SIZE) 93 | # load data 94 | item_rec.load_data(cPos) 95 | item_rec.model_train(NUM_FACTORS,NUM_ITERATIONS,REG_PARAMETERS) 96 | metrics = item_rec.model_evaluate() --------------------------------------------------------------------------------