├── CalcMrr.py
├── README.md
├── CalcAuc.py
├── CalcNdcg.py
├── CalcPreRec.py
├── LoadData.py
├── IMF.py
├── Evaluate.py
└── main.py


/CalcMrr.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | def MRR(rec_list,correct_items,ignore_items):
 4 |     pos = 0
 5 |     for i in range(rec_list.size):
 6 |         if rec_list[i] in ignore_items:
 7 |             continue
 8 | 
 9 |         pos = pos+1
10 | 
11 |         if rec_list[pos] in correct_items:
12 |             mrr = 1 / (pos * 1.0)
13 |             break
14 |     return mrr
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MymediaLite_Python
2 | 
3 | This recommendation framework is a Python port of MyMediaLite; it currently integrates only the WRMF model.
4 | 
5 | Running the program is easy: just run main.py to obtain the WRMF metrics for the given parameters (you first need to install the basic packages, such as numpy and scipy).
6 | 
7 | You can replace the training and test files with your own files and tune the parameters to run your own recommendation methods.
8 | 


--------------------------------------------------------------------------------
/CalcAuc.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | def AUC(rec_list,correct_items,ignore_items):
 4 |     num_eval_items = rec_list.size - ignore_items.size
 5 |     num_correct_items = correct_items.size
 6 |     num_eval_pairs = (num_eval_items - num_correct_items) * num_correct_items
 7 | 
 8 |     if (num_eval_pairs == 0):
 9 |       auc = 0.5
10 | 
11 |     num_correct_pairs = 0
12 |     hit_count = 0
13 |     for i in range(rec_list.size):
14 |         if rec_list[i] in ignore_items:
15 |             continue
16 | 
17 |         if rec_list[i] not in correct_items:
18 |             num_correct_pairs = num_correct_pairs + hit_count
19 |         else:
20 |             hit_count = hit_count + 1
21 |     auc = num_correct_pairs / (num_eval_pairs * 1.0)
22 |     return auc


--------------------------------------------------------------------------------
/CalcNdcg.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | import math
 4 | 
 5 | def NDCG(rec_list,correct_items,ignore_items):
 6 |     dcg = 0
 7 |     idcg = computeIDCG(correct_items.size)
 8 |     left_out = 0
 9 | 
10 |     for i in range(rec_list.size):
11 |         item_id = rec_list[i]
12 |         if item_id in ignore_items:
13 |             left_out = left_out + 1
14 |             continue
15 | 
16 |         if item_id not in correct_items:
17 |             continue
18 | 
19 |         rank = i + 1 - left_out
20 |         dcg = dcg + math.log(2) / (math.log(rank+1))
21 |     ndcg = dcg / idcg
22 |     return ndcg
23 | 
24 | def computeIDCG(n):
25 |     idcg = 0
26 |     for i in range(n):
27 |         idcg = idcg + math.log(2) / (math.log(i+2))
28 |     return idcg
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/CalcPreRec.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | import numpy as np
 4 | 
 5 | def PrecisionAndRecall(rec_list,correct_items,ignore_items,AtN):
 6 |     precision = np.zeros((1,len(AtN)))
 7 |     recall = np.zeros((1,len(AtN)))
 8 |     for i in range(len(AtN)):
 9 | 
10 |         precision[0,i] = hitAt(rec_list,correct_items,ignore_items,AtN[i]) / (AtN[i] * 1.0)
11 |         recall[0,i] = hitAt(rec_list,correct_items,ignore_items,AtN[i]) / (correct_items.size * 1.0)
12 |     Map = AP(rec_list,correct_items,ignore_items)
13 | 
14 |     return precision, recall, Map
15 | 
def AP(rec_list,correct_items,ignore_items):
    '''
    Compute the average precision (AP) of a list of ranked items.

    Walks the list from best to worst, skipping ignored items, and averages
    the precision measured at the position of every relevant item.

    :param rec_list: numpy array of recommended items, best first
    :param correct_items: collection of relevant items
    :param ignore_items: collection of items to leave out of the ranking
    :return: average precision, or 0 if no relevant item is in the list
    '''
    hits = 0
    skipped = 0
    precision_total = 0

    for position in range(rec_list.size):
        candidate = rec_list[position]
        if candidate in ignore_items:
            skipped += 1
        elif candidate in correct_items:
            hits += 1
            # Precision at this item's rank among the non-ignored items.
            precision_total += hits / ((position + 1 - skipped) * 1.0)

    if hits == 0:
        return 0
    return precision_total / (hits * 1.0)
39 | 
def hitAt(rec_list,correct_items,ignore_items,n):
    '''
    Count the relevant items among the first n non-ignored items of a ranking.

    :param rec_list: numpy array of recommended items, best first
    :param correct_items: collection of relevant items
    :param ignore_items: collection of items to leave out of the ranking
    :param n: cutoff position
    :return: number of hits within the top n
    '''
    found = 0
    skipped = 0

    for position in range(rec_list.size):
        candidate = rec_list[position]
        if candidate in ignore_items:
            skipped += 1
            continue

        if candidate not in correct_items:
            continue

        # A relevant item beyond the cutoff ends the scan; later items
        # cannot be inside the top n either.
        if position >= n + skipped:
            break
        found += 1
    return found


--------------------------------------------------------------------------------
/LoadData.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | import time
 4 | import numpy as np
 5 | import scipy.sparse as sparse
 6 | 
 7 | def load_train_matrix(file_name,num_users,num_items,cPos):
 8 |     '''
 9 |     :return: user-item rating martix (sparse)
10 |     '''
11 |     t0 = time.time()
12 |     counts = np.zeros((num_users,num_items))
13 |     total = 0.0     # to store the number of nonzero entry
14 |     num_zeros = num_users * num_items
15 |     for i, line in enumerate(open(file_name,'r')):
16 |         user, item = line.strip().split('\t')
17 |         user = int(user)
18 |         item = int(item)
19 |         count = 1.0
20 |         if user > num_users:
21 |             continue
22 |         if item > num_items:
23 |             continue
24 |         if count != 0:
25 |             counts[user-1,item-1] = count
26 |             total += count
27 |             num_zeros -= 1
28 |         if i % 100000 == 0:
29 |             print "loaded %i counts..." % i
30 |     counts *=  cPos
31 |     sparse_counts = sparse.csr_matrix(counts)       # transformed the matrix into a sparse matrix
32 |     t1 = time.time()
33 |     print 'Finished loading train matrix in %f seconds' % (t1 - t0)
34 |     return sparse_counts, counts
35 | 
def load_test_matrix(file_name,num_users,num_items):
    '''
    Load the test interactions into a dense user-item matrix.

    Each non-blank line of the file must hold "user<TAB>item" with 1-based
    ids. Every observed pair is marked with 1.0. Pairs whose user or item id
    lies outside 1..num_users / 1..num_items are skipped.

    :param file_name: path of the tab-separated test file
    :param num_users: number of rows of the matrix
    :param num_items: number of columns of the matrix
    :return: dense numpy array of shape (num_users, num_items)
    '''
    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    # The context manager closes the file even if a line fails to parse.
    with open(file_name, 'r') as data_file:
        for i, line in enumerate(data_file):
            fields = line.strip()
            if fields:
                user, item = fields.split('\t')
                user = int(user)
                item = int(item)
                # Ids below 1 must be rejected too: index -1 would silently
                # wrap around to the last row/column.
                if 1 <= user <= num_users and 1 <= item <= num_items:
                    counts[user - 1, item - 1] = 1.0
            if i % 100000 == 0:
                print("loaded %i counts..." % i)
    t1 = time.time()
    print('Finished loading test matrix in %f seconds' % (t1 - t0))
    return counts
62 | 
def load_tuple(file_name,data_size):
    '''
    Load the (user, item) pairs of a file as 0-based integer indices.

    Each non-blank line must hold "user<TAB>item" with 1-based ids. Pairs
    are stored in file order, one per row. Ids are stored as integers so
    they can be used directly as array indices.

    :param file_name: path of the tab-separated data file
    :param data_size: number of rows to allocate; if the file holds fewer
                      pairs the remaining rows stay (0, 0), if it holds
                      more an IndexError is raised
    :return: integer numpy array of shape (data_size, 2) with columns
             (user index, item index)
    '''
    rating_tuple = np.zeros((data_size, 2), dtype=int)
    row = 0
    with open(file_name, 'r') as data_file:
        for line in data_file:
            fields = line.strip()
            if not fields:
                continue
            user, item = fields.split('\t')
            # Rows are filled from 0; the former `i-1` index stored the
            # first line in the last row.
            rating_tuple[row, 0] = int(user) - 1
            rating_tuple[row, 1] = int(item) - 1
            row += 1
    return rating_tuple


--------------------------------------------------------------------------------
/IMF.py:
--------------------------------------------------------------------------------
 1 | # coding: UTF-8
 2 | __author__ = 'jake221'
 3 | 
 4 | import numpy as np
 5 | import scipy.sparse as sparse
 6 | from scipy.sparse.linalg import spsolve
 7 | import time
 8 | 
 9 | class ImplicitMF():
10 | 
11 |     def __init__(self, counts, num_factors, num_iterations,reg_param):
12 |         self.counts = counts
13 |         self.num_users = counts.shape[0]
14 |         self.num_items = counts.shape[1]
15 |         self.num_factors = num_factors
16 |         self.num_iterations = num_iterations
17 |         self.reg_param = reg_param
18 | 
19 |     def train_model(self):
20 |         #创建user_vectors和item_vectors，他们的元素~N(0,1)的正态分布
21 |         self.user_vectors = np.random.normal(size=(self.num_users,
22 |                                                    self.num_factors))
23 |         self.item_vectors = np.random.normal(size=(self.num_items,
24 |                                                    self.num_factors))
25 |         '''要生成很大的数字序列的时候，用xrange会比range性能优很多，
26 |         因为不需要一上来就开辟一块很大的内存空间，这两个基本上都是在循环的时候用'''
27 |         for i in xrange(self.num_iterations):
28 |             t0 = time.time()
29 |             print 'Solving for user vectors...'
30 |             self.user_vectors = self.iteration(True, sparse.csr_matrix(self.item_vectors))
31 |             print 'Solving for item vectors...'
32 |             self.item_vectors = self.iteration(False, sparse.csr_matrix(self.user_vectors))
33 |             t1 = time.time()
34 |             print 'iteration %i finished in %f seconds' % (i + 1, t1 - t0)
35 |         return self.user_vectors,self.item_vectors
36 | 
37 |     def iteration(self, user, fixed_vecs):
38 |         #相当于C的三木运算符，if user=True num_solve = num_users,反之为num_items
39 |         num_solve = self.num_users if user else self.num_items  # 用户（或者产品）个数：待求解的向量的规模
40 |         num_fixed = fixed_vecs.shape[0]                         # 产品（或者用户）个数：固定的向量的规模
41 |         YTY = fixed_vecs.T.dot(fixed_vecs)      # Y^T * Y
42 |         eye = sparse.eye(num_fixed)             # 用于计算后面的Y^T * C^u * p(u)
43 |         lambda_eye = self.reg_param * sparse.eye(self.num_factors)  # lambda * I
44 |         solve_vecs = np.zeros((num_solve, self.num_factors))        # 结果存储器
45 |         t = time.time()
46 |         for i in xrange(num_solve):
47 |             if user:
48 |                 counts_i = self.counts[i].toarray()     # Return a dense ndarray representation of this matrix：将第i个用户的评分向量由稀疏矩阵转变为向量
49 |             else:
50 |                 #如果要求item_vec,counts_i为counts中的第i列的转置
51 |                 counts_i = self.counts[:, i].T.toarray()
52 |             ''' 原论文中c_ui=1+alpha*r_ui,但是在计算Y’CuY时为了降低时间复杂度,利用了
53 |                 Y'CuY=Y'Y+Y'(Cu-I)Y,由于Cu是对角矩阵,其元素为c_ui，即1+alpha*r_ui。
54 |                 所以Cu-I也就是对角元素为alpha*r_ui的对角矩阵'''
55 |             CuI = sparse.diags(counts_i, [0])           # 将r_ui放到对角线上，若r_ui != 0,则r_ii != 0
56 |             pu = counts_i.copy()                        # 复制pu
57 |             #np.where(pu != 0)返回pu中元素不为0的索引，然后将这些元素赋值为1,不知道这里为什么要赋值为1?:将pu中的非零项置为1；因为pu是由counts_i得到的而counts_i本质是alpha * r_ui
58 |             pu[np.where(pu != 0)] = 1.0
59 |             YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs)
60 |             YTCupu = fixed_vecs.T.dot(CuI + eye).dot(sparse.csr_matrix(pu).T)
61 |             xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu)     # spsolve(A,B)求解Ax = B
62 |             solve_vecs[i] = xu
63 |             if i % 1000 == 0:
64 |                 print 'Solved %i vecs in %d seconds' % (i, time.time() - t)
65 |                 t = time.time()
66 |         return solve_vecs


--------------------------------------------------------------------------------
/Evaluate.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | import numpy as np
 4 | import CalcPreRec
 5 | import CalcAuc
 6 | import CalcNdcg
 7 | import CalcMrr
 8 | import time
 9 | 
class Evaluate():
    '''
    Evaluate the effectiveness of the recommendation model using eight
    information retrieval metrics: AUC, Prec@5, Prec@10, MAP, Rec@5, Rec@10,
    NDCG and MRR.
    '''
    def __init__(self,user_vecs,item_vecs,train_matrix,test_matrix,test_users,candidate_items):
        '''
        :param user_vecs: user factor matrix, one row per user
        :param item_vecs: item factor matrix, one row per item
        :param train_matrix: user-item training matrix
        :param test_matrix: user-item test matrix
        :param test_users: 0-based indices of the users to evaluate
        :param candidate_items: 0-based indices of the items that may be recommended
        '''
        self.user_vecs = user_vecs
        self.item_vecs = item_vecs
        self.train_matrix = train_matrix
        self.test_matrix = test_matrix
        self.test_users = test_users
        self.candidate_items = candidate_items

    def CalcMetrics(self):
        '''
        Average the eight metrics over all test users that can be evaluated.

        :return: array of shape (8, 1) holding AUC, Prec@5, Prec@10, MAP,
                 Rec@5, Rec@10, NDCG, MRR (all zeros if no user could be
                 evaluated)
        '''
        num_users = 0
        ret = np.zeros((8,1))
        user_num = len(self.test_users)
        # Ids may arrive as floats; they are used as array indices below.
        candidate_items = np.asarray(self.candidate_items).astype(int)

        precision = np.zeros((user_num,2))
        recall = np.zeros((user_num,2))
        avg_precision = np.zeros((user_num,1))
        auc = np.zeros((user_num,1))
        ndcg = np.zeros((user_num,1))
        mrr = np.zeros((user_num,1))
        AtN = [5,10]

        t0 = time.time()
        print('Start evaluating...')

        for i in range(user_num):
            user_id = int(self.test_users[i])

            # find items that user has rated in the test set
            # (the last element of nonzero() holds the column indices for a
            # dense row as well as for a 1 x n sparse row)
            test_items_idx = self.test_matrix[user_id,:].nonzero()[-1]
            correct_items = np.intersect1d(test_items_idx,candidate_items)

            # find items that user has rated in the train set
            train_items_idx = self.train_matrix[user_id,:].nonzero()[-1]
            candidate_items_in_train = np.intersect1d(train_items_idx,candidate_items)

            num_eval_items = candidate_items.size - candidate_items_in_train.size

            # if user has not rated any items in test set or all items in test set are relevant then continue
            # (`or`, not `|`: the bitwise operator binds tighter than `==`
            # and turned the test into a chained comparison)
            if correct_items.size == 0 or num_eval_items - correct_items.size == 0:
                continue

            # generate a item recommendation list for user_id
            recommendation_list = self.GenerateLists(self.user_vecs,self.item_vecs,user_id,candidate_items)

            ignore_items = train_items_idx

            precision[i,:],recall[i,:],avg_precision[i] = CalcPreRec.PrecisionAndRecall(recommendation_list, correct_items, ignore_items, AtN)
            auc[i] = CalcAuc.AUC(recommendation_list, correct_items, ignore_items)
            ndcg[i] = CalcNdcg.NDCG(recommendation_list, correct_items, ignore_items)
            mrr[i] = CalcMrr.MRR(recommendation_list, correct_items, ignore_items)
            num_users = num_users + 1

        t1 = time.time()
        print('Evaluation finished in %f seconds' %  (t1 - t0))

        if num_users == 0:
            # Nothing was evaluated; avoid dividing by zero.
            return ret

        denominator = num_users * 1.0
        ret[0] = np.sum(auc) / denominator
        ret[1] = np.sum(precision[:,0]) / denominator
        ret[2] = np.sum(precision[:,1]) / denominator
        ret[3] = np.sum(avg_precision) / denominator
        ret[4] = np.sum(recall[:,0]) / denominator
        ret[5] = np.sum(recall[:,1]) / denominator
        ret[6] = np.sum(ndcg) / denominator
        ret[7] = np.sum(mrr) / denominator

        return ret

    def GenerateLists(self,user_vecs,item_vecs,user_id,candidate_items):
        '''
        Rank the candidate items for one user by predicted score.

        :param user_vecs: user factor matrix, one row per user
        :param item_vecs: item factor matrix, one row per item
        :param user_id: 0-based index of the user
        :param candidate_items: 0-based indices of the items to rank
        :return: integer array of shape (candidate_items.size, 1) with the
                 candidate item ids ordered from highest to lowest score
        '''
        item_ids = np.asarray(candidate_items).astype(int).ravel()
        # Score of every candidate: dot product of user and item factors.
        scores = np.dot(item_vecs[item_ids,:], user_vecs[int(user_id),:])
        order = np.argsort(scores)[::-1]
        # Map the sorted positions back to item ids so the list can be
        # compared with the correct/ignored item ids by the metrics.
        return item_ids[order].reshape(-1, 1)


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'jake221'
 2 | 
 3 | '''
 4 |  * Weighted matrix factorization method proposed by Hu et al. and Pan et al..
 5 |  *
 6 |  * We use the fast learning method proposed by Hu et al. (alternating least squares),
 7 |  * and we use a global weight to penalize observed/unobserved values.
 8 |  *
 9 |  * Literature:
10 |  *
11 |  *     Y. Hu, Y. Koren, C. Volinsky: Collaborative filtering for implicit feedback datasets.
12 |  *     ICDM 2008.
13 |  *     http://research.yahoo.net/files/HuKorenVolinsky-ICDM08.pdf
14 |  *
15 |  *     R. Pan, Y. Zhou, B. Cao, N. N. Liu, R. M. Lukose, M. Scholz, Q. Yang:
16 |  *     One-class collaborative filtering,
17 |  *     ICDM 2008.
18 |  *     http://www.hpl.hp.com/techreports/2008/HPL-2008-48R1.pdf
19 |  *
20 |  * This recommendation framework is the Python version of MyMediaLite and only integrates the WRMF model.
21 |  * You can contact me via wangjinkun@main.hfut.edu.cn
22 | '''
23 | 
24 | import LoadData
25 | from IMF import *
26 | from Evaluate import *
27 | 
class ItemRecommendation():
    '''
    This class includes three components: load train data and test data; train the recommendation model; evaluate the model by eight different metrics
    '''
    def __init__(self,TRAIN_FILE,TEST_FILE,NUM_USERS,NUM_ITEMS,TRAIN_SIZE,TEST_SIZE):
        '''
        :param TRAIN_FILE: path of the training file
        :param TEST_FILE: path of the test file
        :param NUM_USERS: number of users (rows of the user-item matrices)
        :param NUM_ITEMS: number of items (columns of the user-item matrices)
        :param TRAIN_SIZE: number of pairs to read from the training file
        :param TEST_SIZE: number of pairs to read from the test file
        :return:
        '''
        self.TRAIN_FILE = TRAIN_FILE
        self.TEST_FILE = TEST_FILE
        self.NUM_USERS = NUM_USERS
        self.NUM_ITEMS = NUM_ITEMS
        self.TRAIN_SIZE = TRAIN_SIZE
        self.TEST_SIZE = TEST_SIZE

    def load_data(self,cPos):
        '''
        Load the train/test matrices and the train/test (user, item) tuples.

        :param cPos: weight passed on to LoadData.load_train_matrix
        '''
        self.train_sparsematrix, self.train_matrix = LoadData.load_train_matrix(self.TRAIN_FILE,self.NUM_USERS,self.NUM_ITEMS,cPos)
        self.train_tuple = LoadData.load_tuple(self.TRAIN_FILE,self.TRAIN_SIZE)
        self.test_matrix = LoadData.load_test_matrix(self.TEST_FILE,self.NUM_USERS,self.NUM_ITEMS)
        self.test_tuple = LoadData.load_tuple(self.TEST_FILE,self.TEST_SIZE)

    def model_train(self,NUM_FACTORS,NUM_ITERATIONS,REG_PARAMETERS):
        '''
        Train the model on the sparse training matrix and keep the learned factors.

        :param NUM_FACTORS: number of latent factors
        :param NUM_ITERATIONS: number of training iterations
        :param REG_PARAMETERS: regularization weight
        '''
        imf = ImplicitMF(self.train_sparsematrix,NUM_FACTORS,NUM_ITERATIONS,REG_PARAMETERS)
        self.userFactors,self.itemFactors = imf.train_model()

    def model_evaluate(self):
        '''
        Evaluate the trained model and print the eight metrics.

        :return: the metric array returned by Evaluate.CalcMetrics
        '''
        ## preprocess: find test_users and candidate_items
        # Ids are cast to int because they are used as array indices
        # during evaluation.
        test_users = np.unique(self.test_tuple[:,0]).astype(int)         # all users in the test set
        allItems_train = np.unique(self.train_tuple[:,1]).astype(int)    # all items in the train set
        allItems_test = np.unique(self.test_tuple[:,1]).astype(int)      # all items in the test set
        candidate_items = np.union1d(allItems_train,allItems_test)       # all items in the train and test set

        recEval = Evaluate(self.userFactors,self.itemFactors,self.train_matrix,self.test_matrix,test_users,candidate_items)
        ret = recEval.CalcMetrics()
        labels = ['AUC =', 'Prec@5 =', 'Prec@10 =', 'MAP =', 'Rec@5 =', 'Rec@10 =', 'NDCG =', 'MRR =']
        # A single pre-joined string prints identically on Python 2 and 3.
        print(' '.join('%s %s' % (label, ret[k]) for k, label in enumerate(labels)))
        return ret
74 | 
if __name__ == '__main__':
    # Parameter setting: data set location and size.
    TRAIN_FILE = './data/ml100k/train_data.txt'
    TEST_FILE = './data/ml100k/test_data.txt'
    NUM_USERS, NUM_ITEMS = 943, 1682
    TRAIN_SIZE, TEST_SIZE = 90570, 9430

    # Parameter setting: weight of observed entries and model hyper-parameters.
    cPos = 2
    NUM_FACTORS, NUM_ITERATIONS, REG_PARAMETERS = 10, 20, 0.01

    # Build the pipeline, then load the data, train and evaluate in turn.
    item_rec = ItemRecommendation(
        TRAIN_FILE=TRAIN_FILE,
        TEST_FILE=TEST_FILE,
        NUM_USERS=NUM_USERS,
        NUM_ITEMS=NUM_ITEMS,
        TRAIN_SIZE=TRAIN_SIZE,
        TEST_SIZE=TEST_SIZE,
    )
    item_rec.load_data(cPos)
    item_rec.model_train(NUM_FACTORS, NUM_ITERATIONS, REG_PARAMETERS)
    metrics = item_rec.model_evaluate()


--------------------------------------------------------------------------------