├── README.md
├── evaluate.py
├── DataSet_pre.py
├── sparse-Deep-SetRank.py
└── MF-SetRank.jl

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SetRank

This is our implementation for the paper:

Chao Wang, Hengshu Zhu, Chen Zhu, Chuan Qin and Hui Xiong (2020). SetRank: A Setwise Bayesian Approach for Collaborative Ranking from Implicit Feedback. In Proceedings of AAAI'20, New York, New York, USA, February 7-12, 2020.

Please cite our AAAI'20 paper if you use our code. Thanks!

The code has been tested under Python 3.6.5 (TensorFlow 1.11.0) and Julia 0.6.4.

The dataset in this repository was provided by https://github.com/wuliwei9278/SQL-Rank.

We provide two implementations, MF-SetRank and Deep-SetRank. Both read the rating CSV files from `data/` and write their evaluation results to `dumper/`.

You can run the code for MF-SetRank like this:

```Julia
julia MF-SetRank.jl
```

You can run the code for Deep-SetRank like this:

```Python
python sparse-Deep-SetRank.py -print 10 -reg 1.8 -lr 0.0001 -negnum 30 -posnum 20
```

--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
# This is the evaluation function.
import numpy as np


def evaluate(postrain, posprobe, r, k):
    """Compute recall@k, precision@k and MAP@k for every cutoff in k."""
    userlist = list(posprobe.keys())
    for user in userlist:
        r[user, postrain[user]] = -9999  # mask out items already seen in training

    pred = np.argsort(r, axis=1)[:, ::-1]  # items ranked by descending score

    recall = []
    precision = []
    map = []
    for kk in k:
        recall_tmp = []
        precision_tmp = []
        map_tmp = []
        for user in userlist:
            predict_tmp = np.zeros(kk, dtype=float)
            ll = 1
            for l in range(kk):
                if pred[user, l] in posprobe[user]:
                    predict_tmp[l] = ll  # cumulative hit count at this rank
                    ll += 1
            recall_tmp.append(float(np.sum(predict_tmp > 0)) / len(posprobe[user]))
            precision_tmp.append(float(np.sum(predict_tmp > 0)) / kk)
            map_tmp.append(np.sum(predict_tmp / (np.arange(kk) + 1)) / kk)

        recall.append(np.mean(recall_tmp))
        precision.append(np.mean(precision_tmp))
        map.append(np.mean(map_tmp))

    return recall, precision, map


--------------------------------------------------------------------------------
/DataSet_pre.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# This is the data preprocessing for Deep-SetRank.
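# Expected input (inferred from load_data below): data/<trainName>.csv and
# data/<testName>.csv are headerless CSVs of (user, item, rating) triples with
# 1-indexed ids; ids are shifted to 0-indexed on load, and only triples with
# rating == 1 are kept as positive feedback.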

import numpy as np
import pandas as pd
from collections import defaultdict


class DataSet(object):
    def __init__(self, args):
        self.train, self.test, self.train_list = self.load_data(args.trainName, args.testName)
        self.nrow, self.ncol = np.max(self.train_list[:, 0:2], axis=0) + 1
        self.shape = [self.nrow, self.ncol]
        self.n_train = len(self.train_list)
        self.user_ratings = defaultdict(list)
        self.nuser_item = np.zeros(self.nrow, dtype=np.int64)
        for line in self.train_list:
            self.user_ratings[line[0]].append(line[1])
            self.nuser_item[line[0]] += 1
        self.maxn_item = np.minimum(np.max(self.nuser_item), args.posnum)
        # id_matrix masks the padded positive slots of users with fewer than maxn_item items
        self.id_matrix = np.ones((self.nrow, self.maxn_item), dtype=np.float64)
        for i in range(self.nrow):
            self.id_matrix[i, self.nuser_item[i]:] = 0
        self.train_matrix, self.trainDict = self.getEmbedding()
        self.item_all = np.arange(self.ncol)
        # negatives are sampled uniformly from each user's unobserved items
        self.sample_prob = 1 - self.train_matrix
        self.sample_prob = self.sample_prob / np.sum(self.sample_prob, axis=1, keepdims=True)

    def load_data(self, trainName, testName):
        data = {}
        data["test_list"] = pd.read_csv("data/" + testName + ".csv", header=None).values
        data["train_list"] = pd.read_csv("data/" + trainName + ".csv", header=None).values
        data["test_list"][:, 0:2] = data["test_list"][:, 0:2] - 1  # shift ids to 0-indexed
        data["train_list"][:, 0:2] = data["train_list"][:, 0:2] - 1
        train = []
        for i in data["train_list"]:
            if i[2] == 1:
                train.append((i[0], i[1], 1.0))
        return train, data["test_list"], data["train_list"]

    def getEmbedding(self):
        train_matrix = np.zeros([self.shape[0], self.shape[1]], dtype=np.float64)
        dataDict = {}
        for i in self.train:
            user = i[0]
            movie = i[1]
            rating = i[2]
            train_matrix[user][movie] = rating
            dataDict[(user, movie)] = rating
        return np.array(train_matrix), dataDict

    def getInstances(self, negNum):  # sample some positive items and random negative items per user
        item_pos = np.zeros((self.nrow, self.maxn_item), dtype=np.int64)
        item_neg = np.zeros((self.nrow, negNum), dtype=np.int64)
        for i in range(self.nrow):
            if self.user_ratings[i]:
                temp_pos = np.random.choice(self.user_ratings[i], size=np.minimum(self.maxn_item, len(self.user_ratings[i])), replace=False)
                item_pos[i, :] = np.pad(temp_pos, (0, self.maxn_item - len(temp_pos)), 'constant')
                item_neg[i, :] = np.random.choice(self.item_all, size=negNum, replace=False, p=self.sample_prob[i, :])
        return np.arange(self.nrow), item_pos, item_neg


--------------------------------------------------------------------------------
/sparse-Deep-SetRank.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# This is the implementation for Deep-SetRank using a sparse matrix.
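# Note: this script uses TensorFlow 1.x APIs (tf.placeholder, tf.Session,
# tf.sparse_tensor_dense_matmul); the README lists TensorFlow 1.11.0.
# The setwise loss defined in add_loss() scores each observed item against
# the pooled scores of the sampled unobserved items:
#     P(i | user) = sigmoid(y_i) / (sigmoid(y_i) + sum_j sigmoid(y_j)),
# and training maximizes the log of this probability over observed items.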

import tensorflow as tf
import numpy as np
import argparse
from DataSet_pre import DataSet
import os
import pandas as pd
from evaluate import *
from collections import defaultdict

tf.set_random_seed(1)
np.random.seed(1)


def main():
    parser = argparse.ArgumentParser(description="Options")

    parser.add_argument('-dev', action='store', dest='dev', default='0')
    parser.add_argument('-trainName', action='store', dest='trainName', default='ml1m_oc_50_train_ratings')
    parser.add_argument('-testName', action='store', dest='testName', default='ml1m_oc_50_test_ratings')
    parser.add_argument('-repath', action='store', dest='repath', default='model/set.ckpt')
    parser.add_argument('-reloop', type=int, default=199)
    parser.add_argument('-negnum', action='store', dest='negnum', default=30, type=int)  # number of sampled unobserved items per user in one epoch
    parser.add_argument('-posnum', action='store', dest='posnum', default=20, type=int)  # number of positive items per user in one epoch
    parser.add_argument('-userLayer', action='store', dest='userLayer', default=[512, 100])  # shape of the user network
    parser.add_argument('-itemLayer', action='store', dest='itemLayer', default=[1024, 100])  # shape of the item network
    parser.add_argument('-reg', action='store', dest='reg', default=1.2, type=float)  # regularization parameter
    parser.add_argument('-lr', action='store', dest='lr', default=0.0003, type=float)  # learning rate
    parser.add_argument('-keep', action='store', dest='keep', default=0.5, type=float)  # keep probability of the dropout layer
    parser.add_argument('-maxepochs', action='store', dest='maxEpochs', default=6000, type=int)
    parser.add_argument('-print', action='store', dest='print', default=1, type=int)

    args = parser.parse_args()
    classifier = Model(args)
    classifier.run()
    # restore_path = args.repath + '-%s' % args.reloop
    # classifier.saver.restore(classifier.sess, restore_path)
    # classifier.evaluate(classifier.sess)
    # classifier.run_epoch(classifier.sess)


def sparse_dropout(x, keep_prob, noise_shape):
    """Dropout for sparse tensors."""
    random_tensor = keep_prob
    random_tensor += tf.random_uniform([noise_shape], dtype=tf.float64)
    dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
    pre_out = tf.sparse_retain(x, dropout_mask)
    return pre_out * (1. / keep_prob)


recall = []
precision = []
map = []


class Model:
    def __init__(self, args):
        os.environ['CUDA_VISIBLE_DEVICES'] = args.dev
        self.dataSet = DataSet(args)
        self.nrow = self.dataSet.nrow
        self.ncol = self.dataSet.ncol
        self.shape = [self.nrow, self.ncol]
        self.train_list = self.dataSet.train_list
        self.test_list = self.dataSet.test
        self.id_matrix = tf.convert_to_tensor(self.dataSet.id_matrix)
        self.posprobe = defaultdict(list)
        for line in self.test_list:
            self.posprobe[line[0]].append(line[1])
        self.postrain = defaultdict(list)
        for line in self.train_list:
            self.postrain[line[0]].append(line[1])
        self.negNum = args.negnum
        self.reg = args.reg
        self.keep = args.keep
        self.add_embedding_matrix()

        self.add_placeholders()
        self.userLayer = args.userLayer
        self.itemLayer = args.itemLayer
        self.add_model()

        self.add_loss()

        self.lr = args.lr
        self.add_train_step()

        self.filename = "dumper/deep-setrank" + "_" + str(self.keep) + "_" + str(args.negnum) + "_" + str(self.reg) + "_" + str(self.lr) + "_" + str(self.itemLayer[-1])
        self.init_sess()

        self.maxEpochs = args.maxEpochs
        self.print = args.print
        self.repath = args.repath

    def add_placeholders(self):
        self.user = tf.placeholder(tf.int64)
        self.item = tf.placeholder(tf.int64)
        self.item2 = tf.placeholder(tf.int64)
        self.drop = tf.placeholder(tf.float64)

    def add_embedding_matrix(self):
        self.user_item_embedding = tf.SparseTensor(self.dataSet.train_list[:, 0:2], values=self.dataSet.train_list[:, 2].astype(np.float64), dense_shape=[self.nrow, self.ncol])
        self.item_user_embedding = tf.sparse_transpose(self.user_item_embedding)
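    # The model is a two-tower network: the user tower feeds each user's sparse
    # row of item interactions through an MLP, the item tower feeds each item's
    # sparse column of user interactions through another MLP, and scores are
    # inner products of the resulting embeddings.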
    def add_model(self):
        # user_input = tf.nn.embedding_lookup(self.user_item_embedding, self.user)
        user_input = self.user_item_embedding
        item_input = self.item_user_embedding
        self.id_input = tf.nn.embedding_lookup(self.id_matrix, self.user)
        self.n_id = tf.reduce_sum(self.id_input)
        user_input = sparse_dropout(user_input, self.drop, self.dataSet.n_train)
        item_input = sparse_dropout(item_input, self.drop, self.dataSet.n_train)

        def init_variable(shape, name):
            return tf.get_variable(name, shape, initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float64)

        with tf.name_scope("User_Layer"):
            user_W1 = init_variable([self.shape[1], self.userLayer[0]], "user_W1")
            user_b1 = tf.get_variable("user_b1", [self.userLayer[0]], initializer=tf.constant_initializer(0.0), dtype=tf.float64)
            self.user_out = tf.nn.sigmoid(tf.sparse_tensor_dense_matmul(user_input, user_W1) + user_b1)
            for i in range(0, len(self.userLayer) - 1):
                W = init_variable([self.userLayer[i], self.userLayer[i + 1]], "user_W" + str(i + 2))
                b = tf.get_variable("user_b" + str(i + 2), [self.userLayer[i + 1]], initializer=tf.constant_initializer(0.0), dtype=tf.float64)
                self.user_out = tf.nn.tanh(tf.add(tf.matmul(self.user_out, W), b))

        with tf.name_scope("Item_Layer"):
            item_W1 = init_variable([self.shape[0], self.itemLayer[0]], "item_W1")
            item_b1 = tf.get_variable("item_b1", [self.itemLayer[0]], initializer=tf.constant_initializer(0.0), dtype=tf.float64)
            self.item_preout = tf.nn.sigmoid(tf.sparse_tensor_dense_matmul(item_input, item_W1) + item_b1)
            for i in range(0, len(self.itemLayer) - 1):
                W = init_variable([self.itemLayer[i], self.itemLayer[i + 1]], "item_W" + str(i + 2))
                b = tf.get_variable("item_b" + str(i + 2), [self.itemLayer[i + 1]], initializer=tf.constant_initializer(0.0), dtype=tf.float64)
                self.item_preout = tf.nn.tanh(tf.add(tf.matmul(self.item_preout, W), b))

        # tile each user's vector so it can be paired with that user's positive and negative items
        self.user_tile = tf.reshape(tf.tile(self.user_out, [1, self.dataSet.maxn_item]), (-1, self.userLayer[-1]))
        self.user_tile2 = tf.reshape(tf.tile(self.user_out, [1, self.negNum]), (-1, self.userLayer[-1]))
        self.item_out = tf.nn.embedding_lookup(self.item_preout, self.item)
        self.item2_out = tf.nn.embedding_lookup(self.item_preout, self.item2)

        self.y = tf.reduce_sum(tf.multiply(self.user_tile, self.item_out), axis=1)
        self.y2 = tf.reduce_sum(tf.multiply(self.user_tile2, self.item2_out), axis=1)
        self.y_ = tf.reshape(tf.nn.sigmoid(self.y), (-1, self.dataSet.maxn_item))
        self.y2_ = tf.reshape(tf.nn.sigmoid(self.y2), (-1, self.negNum))

        self.y_sum = tf.reshape(tf.reduce_sum(self.y2_, axis=1), (-1, 1)) + self.y_

    def add_loss(self):
        # setwise log-likelihood: each positive item competes with the pooled negative scores
        self.model_loss = -tf.reduce_mean(tf.reduce_sum(tf.log(tf.maximum(self.y_ / self.y_sum, 1e-9)) * self.id_input, 1))
        self.norm_loss = self.reg * (tf.nn.l2_loss(self.user_out) / self.nrow + tf.nn.l2_loss(self.item_preout) / self.ncol)
        self.loss = self.model_loss + self.norm_loss

    def add_train_step(self):
        self.train_step = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def init_sess(self):
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())
        # self.saver = tf.train.Saver(self.weights, max_to_keep=30)

    def run(self):
        print("Start Training!")
        for epoch in range(self.maxEpochs):
            print("=" * 20 + "Epoch ", epoch, "=" * 20)
            if epoch % self.print == 0:
                self.evaluate(self.sess)
            # if epoch % 100 == 0:
            #     self.saver.save(self.sess, self.repath, global_step=epoch)
            self.run_epoch(self.sess)

    def run_epoch(self, sess):
        train_u, train_i, train_j = self.dataSet.getInstances(self.negNum)
        losses = []
        losses1 = []
        losses2 = []

        feed_dict = self.create_feed_dict(train_u, np.reshape(train_i, (-1)), np.reshape(train_j, (-1)), self.keep)
        _, tmp_loss, loss1, loss2 = sess.run([self.train_step, self.loss, self.model_loss, self.norm_loss], feed_dict=feed_dict)
        losses.append(tmp_loss)
        losses1.append(loss1)
        losses2.append(loss2)

        loss = np.mean(losses)
        loss1 = np.mean(losses1)
        loss2 = np.mean(losses2)
        print("\nMean loss in this epoch is: {} (model loss: {}, regularization: {})".format(loss, loss1, loss2))
        return loss

    def create_feed_dict(self, u, i, j, drop=None):
        return {self.user: u,
                self.item: i,
                self.item2: j,
                self.drop: drop}

    def evaluate(self, sess):
        testUser1 = np.arange(self.nrow)
        testItem1 = np.arange(self.ncol)
        self.u, self.v = self.sess.run((self.user_out, self.item_out), feed_dict={self.user: testUser1, self.item: testItem1, self.drop: 1})
        epoch_rating = np.dot(self.u, self.v.T)
        recall_batch, precision_batch, map_batch = evaluate(self.postrain, self.posprobe, epoch_rating, [1, 5, 10, 20])
        print(precision_batch, recall_batch, map_batch)
        precision.append(precision_batch)
        recall.append(recall_batch)
        map.append(map_batch)
        evaluation = pd.concat([pd.DataFrame(precision), pd.DataFrame(recall), pd.DataFrame(map)], axis=1)
        evaluation.to_csv(self.filename + ".csv", header=False, index=False)


if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/MF-SetRank.jl:
--------------------------------------------------------------------------------

# This is the implementation for MF-SetRank.
# This code is based on the implementation of SQL-Rank at https://github.com/wuliwei9278/SQL-Rank.

# The logistic (sigmoid) function.
function logit(x)
    return 1.0 / (1 + exp(-x))
end

# Sigmoid scores of all sampled (item, user) pairs under the current factors U, V.
function comp_m(rows, cols, U, V)
    m = zeros(length(rows));
    for i = 1:length(rows)
        m[i] = logit(dot(U[:, cols[i]], V[:, rows[i]]));
    end
    return m
end

function objective(index, m, rows, d1, lambda, U, V)
    res = 0.0;
    for i = 1:d1
        tt = 0.0;
        d_bar = index[i + 1] - index[i];
        for t = d_bar:-1:1
            tmp = m[index[i] - 1 + t];
            tt += exp(tmp);
            res -= tmp;
            res += log(tt);
        end
    end
    res += lambda / 2 * (vecnorm(U) ^ 2 + vecnorm(V) ^ 2);
    return res
end
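# Note on `objective` above: for each user i, the block m[index[i] : index[i+1]-1]
# holds the scores of that user's positive items followed by the sampled negatives,
# and the loop accumulates sum_t [ log(sum_{t' >= t} exp(m_t')) - m_t ], i.e. a
# Plackett-Luce style negative log-likelihood, plus the L2 penalty on U and V.
# The two gradient routines below differentiate this objective with respect to
# the user factors U and the item factors V.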
function comp_gradient_ui(rows, cols, index, d_bar, m, i, V, r, ratio)
    nowOnes = div(d_bar, 1 + ratio);  # number of positive items for this user
    cc = zeros(d_bar);

    total = 0
    for t = (nowOnes + 1):d_bar
        ttt = m[index[i] - 1 + t];
        total += exp(ttt);
    end
    total_i = zeros(nowOnes);
    total_sum = 0
    for sel_i = 1:nowOnes
        ttt = m[index[i] - 1 + sel_i];
        cc[sel_i] -= ttt * (1 - ttt);
        total_i[sel_i] = exp(ttt) + total;
        total_sum += 1 / total_i[sel_i]
    end
    for t = (nowOnes + 1):d_bar
        ttt = m[index[i] - 1 + t];
        cc[t] += exp(ttt) * ttt * (1 - ttt) * total_sum;
    end
    for sel_i = 1:nowOnes
        ttt = m[index[i] - 1 + sel_i];
        cc[sel_i] += exp(ttt) * ttt * (1 - ttt) / total_i[sel_i];
    end

    res = zeros(r);
    for t = 1:d_bar
        res += cc[t] * V[:, rows[index[i] - 1 + t]];
    end
    return res
end

function comp_gradient_U(rows, cols, index, m, U, V, s, d1, r, lambda, ratio)
    grad_U = zeros(size(U));
    for i = 1:d1
        d_bar = index[i + 1] - index[i];
        grad_U[:, i] = comp_gradient_ui(rows, cols, index, d_bar, m, i, V, r, ratio);
    end
    grad_U += lambda * U;
    return grad_U
end

function obtain_U(rows, cols, index, U, V, s, d1, r, lambda, ratio)
    m = comp_m(rows, cols, U, V);
    grad_U = comp_gradient_U(rows, cols, index, m, U, V, s, d1, r, lambda, ratio);
    U = U - s * grad_U;
    m = comp_m(rows, cols, U, V);
    return U, m
end

function comp_gradient_V(rows, cols, index, m, U, V, s, d1, r, lambda, ratio)
    grad_V = zeros(size(V));
    for i = 1:d1
        d_bar = index[i + 1] - index[i];
        cc = zeros(d_bar);
        nowOnes = div(d_bar, 1 + ratio);

        total = 0
        for t = (nowOnes + 1):d_bar
            ttt = m[index[i] - 1 + t];
            total += exp(ttt);
        end
        total_i = zeros(nowOnes);
        total_sum = 0
        for sel_i = 1:nowOnes
            ttt = m[index[i] - 1 + sel_i];
            cc[sel_i] -= ttt * (1 - ttt);
            total_i[sel_i] = exp(ttt) + total;
            total_sum += 1 / total_i[sel_i]
        end
        for t = (nowOnes + 1):d_bar
            ttt = m[index[i] - 1 + t];
            cc[t] += exp(ttt) * ttt * (1 - ttt) * total_sum;
        end
        for sel_i = 1:nowOnes
            ttt = m[index[i] - 1 + sel_i];
            cc[sel_i] += exp(ttt) * ttt * (1 - ttt) / total_i[sel_i];
        end

        for t = 1:d_bar
            j = rows[index[i] - 1 + t]
            grad_V[:, j] += cc[t] * U[:, i]
            # println("t: ", t, "cc[t]: ", cc[t]);
            # println("grad_V[:,j]: ", grad_V[:,j]);
        end
    end
    grad_V += lambda * V;
    return grad_V
end

function obtain_V(rows, cols, index, m, U, V, s, d1, r, lambda, ratio)
    grad_V = comp_gradient_V(rows, cols, index, m, U, V, s, d1, r, lambda, ratio);
    V = V - s * grad_V;
    return V
end

# Re-draw each user's negative items while shuffling the order of its positives.
function stochasticQueuing(rows, index, d1, d2, ratio)
    new_rows = zeros(Int, size(rows)[1]);
    for i = 1:d1
        nowlen = index[i + 1] - index[i];
        nowOnes = div(nowlen, 1 + ratio);
        newOrder = shuffle(1:nowOnes);
        rows_set = Set{Int}();
        for j = 1:nowOnes
            oldIdx = index[i] + j - 1;
            row_j = rows[oldIdx];
            push!(rows_set, row_j);
            newIdx = index[i] + newOrder[j] - 1;
            new_rows[newIdx] = row_j;
        end
        nowStart = index[i] + nowOnes;
        nowEnd = index[i + 1] - 1;
        for j = nowStart:nowEnd
            while true
                row_idx = rand(1:d2);
                if !(row_idx in rows_set)
                    new_rows[j] = row_idx;
                    push!(rows_set, row_idx);
                    break;
                end
            end
        end
    end
    return new_rows
end

function evaluate(U, V, X, Y, d1, d2, rows, vals, rows_t, vals_t, testsize, K)
    precision = zeros(length(K));
    recall = zeros(length(K));
    map = zeros(length(K));
    score = V' * U;
    for i = 1:d1
        precision_tmp = zeros(length(K));
        recall_tmp = zeros(length(K));
        map_tmp = zeros(length(K));
        tmp = nzrange(Y, i);
        test = Set{Int64}();
        for j in tmp
            push!(test, rows_t[j]);
        end
        if isempty(test)
            continue
        end
        tmp = nzrange(X, i);
        train = rows[tmp];
        score[train, i] = -999;  # mask out items already seen in training
        p = sortperm(score[:, i], rev = true);
        cc = 0;
        for c = 1:K[length(K)]
            j = p[c];
            if j in test
                cc += 1;
                for k in length(K):-1:1
                    if c <= K[k]
                        precision_tmp[k] += 1;
                        recall_tmp[k] += 1;
                        map_tmp[k] += cc / c;
                    else
                        break;
                    end
                end
            end
        end
        ntest = length(test);
        precision += precision_tmp ./ K;
        recall += recall_tmp / ntest;
        map += map_tmp ./ K;
    end
    return precision / testsize, recall / testsize, map / testsize
end


train = "data/ml1m_oc_50_train_ratings.csv"
test = "data/ml1m_oc_50_test_ratings.csv"

T = 1;               # every T epochs, learning_rate *= decay_rate
ratio = 3;           # the sampling ratio of negative items to positive items
learning_rate = 0.3; # the learning rate
decay_rate = 0.97;   # the decay rate
lambda = 1.4;        # the regularization parameter
r = 200;             # the dimension of the latent vectors

filename1 = string("dumper/", "mfsetrank", ".csv")

X = readdlm(train, ',', Int64);
x = vec(X[:, 1]);
y = vec(X[:, 2]);
v = vec(X[:, 3]);
Y = readdlm(test, ',', Int64);
xx = vec(Y[:, 1]);
yy = vec(Y[:, 2]);
vv = vec(Y[:, 3]);
n = max(maximum(x), maximum(xx));
msize = max(maximum(y), maximum(yy));
testsize = length(unique(xx));

X = sparse(x, y, v, n, msize);  # user id by movie id
Y = sparse(xx, yy, vv, n, msize);
X = X';
Y = Y';
rows = rowvals(X);
vals = nonzeros(X);
cols = zeros(Int, size(vals)[1]);
index = zeros(Int, n + 1);

d2, d1 = size(X);
cc = 0;
new_len = 0;
new_index = zeros(Int, d1 + 1);
new_index[1] = 1;
for i = 1:d1
    index[i] = cc + 1;
    tmp = nzrange(X, i);
    nowlen = size(tmp)[1];
    newlen = nowlen * (1 + ratio);
    new_len += newlen;
    new_index[i + 1] = new_index[i] + newlen;
    for j = 1:nowlen
        cc += 1;
        cols[cc] = i;
    end
end
index[d1 + 1] = cc + 1;

# Build the extended lists: each user's positives followed by ratio-times-as-many sampled negatives.
new_rows = zeros(Int, new_len);
new_cols = zeros(Int, new_len);
new_vals = zeros(Int, new_len);
for i = 1:d1
    rows_set = Set{Int}();
    for j = index[i]:(index[i + 1] - 1)
        push!(rows_set, rows[j]);
    end
    nowlen = new_index[i + 1] - new_index[i];
    nowOnes = div(nowlen, 1 + ratio);
    for j = 1:nowOnes
        new_rows[new_index[i] + j - 1] = rows[index[i] + j - 1];
        new_cols[new_index[i] + j - 1] = i;
        new_vals[new_index[i] + j - 1] = vals[index[i] + j - 1];
    end
    nowStart = new_index[i] + nowOnes;
    nowEnd = new_index[i + 1] - 1;
    for j = nowStart:nowEnd
        while true
            row_idx = rand(1:d2);
            if !(row_idx in rows_set)
                new_rows[j] = row_idx;
                new_cols[j] = i;
                new_vals[j] = 0;
                push!(rows_set, row_idx);
                break;
            end
        end
    end
end

rows_t = rowvals(Y);
vals_t = nonzeros(Y);
cols_t = zeros(Int, size(vals_t)[1]);
index_t = zeros(Int, n + 1)
cc = 0;
for i = 1:d1
    index_t[i] = cc + 1;
    tmp = nzrange(Y, i);
    nowlen = size(tmp)[1];
    for j = 1:nowlen
        cc += 1
        cols_t[cc] = i
    end
end
index_t[d1 + 1] = cc + 1;

srand(123456789);
U = 0.1 * randn(r, d1);
V = 0.1 * randn(r, d2);
m = comp_m(new_rows, new_cols, U, V);
println("rank: ", r, ", ratio of 0 vs 1: ", ratio, ", lambda: ", lambda, ", learning_rate: ", learning_rate);

num_epoch = 100;
num_iterations_per_epoch = 1;
K = [1, 5, 10, 20];
precision = zeros(Float64, num_epoch, 4)
recall = zeros(Float64, num_epoch, 4)
map = zeros(Float64, num_epoch, 4)
println("iter time objective_function precision@K = 1, 5, 10, 20");
obj = objective(new_index, m, new_rows, d1, lambda, U, V);
p1, p2, p3 = evaluate(U, V, X, Y, d1, d2, rows, vals, rows_t, vals_t, testsize, K);
println("[", 0, ", ", obj, ", ", p1, " ", p2, " ", p3, "],");

totaltime = 0.0;
nowobj = obj;
for epoch = 1:num_epoch
    tic();
    for iter = 1:num_iterations_per_epoch
        U, m = obtain_U(new_rows, new_cols, new_index, U, V, learning_rate, d1, r, lambda, ratio);
        V = obtain_V(new_rows, new_cols, new_index, m, U, V, learning_rate, d1, r, lambda, ratio);
    end

    new_rows = stochasticQueuing(new_rows, new_index, d1, d2, ratio);

    totaltime += toq();
    if (epoch - 1) % T == 0
        learning_rate = learning_rate * decay_rate
        p1, p2, p3 = evaluate(U, V, X, Y, d1, d2, rows, vals, rows_t, vals_t, testsize, K);
        precision[epoch, :] = p1
        recall[epoch, :] = p2
        map[epoch, :] = p3
        m = comp_m(new_rows, new_cols, U, V);
        nowobj = objective(new_index, m, new_rows, d1, lambda, U, V);
        println("[", epoch, ", ", totaltime, ", ", nowobj, ", ", p1, ", ", p2, ", ", p3, "],");
    else
        m = comp_m(new_rows, new_cols, U, V);
        nowobj = objective(new_index, m, new_rows, d1, lambda, U, V);
", totaltime, ", ", nowobj); 348 | end 349 | end 350 | 351 | writedlm(filename1, [precision recall map]) 352 | --------------------------------------------------------------------------------