├── Data ├── README ├── amazon-book │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt └── gowalla │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt ├── NGCF ├── NGCF.py ├── __init__.py ├── main.py └── utility │ ├── README.md │ ├── batch_test.py │ ├── helper.py │ ├── load_data.py │ ├── metrics.py │ └── parser.py └── README.md /Data/README: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/amazon-book/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? Please visit the [website](http://jmcauley.ucsd.edu/data/amazon). 2 | 3 | -------------------------------------------------------------------------------- /Data/gowalla/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? 2 | Please visit the [website](https://snap.stanford.edu/data/loc-gowalla.html). 3 | -------------------------------------------------------------------------------- /NGCF/NGCF.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on March 24, 2020 3 | 4 | @author: Tinglin Huang (huangtinglin@outlook.com) 5 | ''' 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class NGCF(nn.Module): 13 | def __init__(self, n_user, n_item, norm_adj, args): 14 | super(NGCF, self).__init__() 15 | self.n_user = n_user 16 | self.n_item = n_item 17 | self.device = args.device 18 | self.emb_size = args.embed_size 19 | self.batch_size = args.batch_size 20 | self.node_dropout = args.node_dropout[0] 21 | self.mess_dropout = args.mess_dropout 22 | 23 | 24 | self.norm_adj = norm_adj 25 | 26 | self.layers = eval(args.layer_size) 27 | self.decay = eval(args.regs)[0] 28 | 29 | """ 30 | ********************************************************* 31 | Initialize the user and item embedding weights. 32 | """ 33 | self.embedding_dict, self.weight_dict = self.init_weight() 34 | 35 | """ 36 | ********************************************************* 37 | Get sparse adj.
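The (n_user + n_item) x (n_user + n_item) normalized adjacency built in utility/load_data.py is converted once into a torch sparse tensor and reused in every forward pass.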
38 | """ 39 | self.sparse_norm_adj = self._convert_sp_mat_to_sp_tensor(self.norm_adj).to(self.device) 40 | 41 | def init_weight(self): 42 | # xavier init 43 | initializer = nn.init.xavier_uniform_ 44 | 45 | embedding_dict = nn.ParameterDict({ 46 | 'user_emb': nn.Parameter(initializer(torch.empty(self.n_user, 47 | self.emb_size))), 48 | 'item_emb': nn.Parameter(initializer(torch.empty(self.n_item, 49 | self.emb_size))) 50 | }) 51 | 52 | weight_dict = nn.ParameterDict() 53 | layers = [self.emb_size] + self.layers 54 | for k in range(len(self.layers)): 55 | weight_dict.update({'W_gc_%d'%k: nn.Parameter(initializer(torch.empty(layers[k], 56 | layers[k+1])))}) 57 | weight_dict.update({'b_gc_%d'%k: nn.Parameter(initializer(torch.empty(1, layers[k+1])))}) 58 | 59 | weight_dict.update({'W_bi_%d'%k: nn.Parameter(initializer(torch.empty(layers[k], 60 | layers[k+1])))}) 61 | weight_dict.update({'b_bi_%d'%k: nn.Parameter(initializer(torch.empty(1, layers[k+1])))}) 62 | 63 | return embedding_dict, weight_dict 64 | 65 | def _convert_sp_mat_to_sp_tensor(self, X): 66 | coo = X.tocoo() 67 | i = torch.LongTensor([coo.row, coo.col]) 68 | v = torch.from_numpy(coo.data).float() 69 | return torch.sparse.FloatTensor(i, v, coo.shape) 70 | 71 | def sparse_dropout(self, x, rate, noise_shape): 72 | random_tensor = 1 - rate 73 | random_tensor += torch.rand(noise_shape).to(x.device) 74 | dropout_mask = torch.floor(random_tensor).type(torch.bool) 75 | i = x._indices() 76 | v = x._values() 77 | 78 | i = i[:, dropout_mask] 79 | v = v[dropout_mask] 80 | 81 | out = torch.sparse.FloatTensor(i, v, x.shape).to(x.device) 82 | return out * (1. / (1 - rate)) 83 | 84 | def create_bpr_loss(self, users, pos_items, neg_items): 85 | pos_scores = torch.sum(torch.mul(users, pos_items), axis=1) 86 | neg_scores = torch.sum(torch.mul(users, neg_items), axis=1) 87 | 88 | maxi = nn.LogSigmoid()(pos_scores - neg_scores) 89 | 90 | mf_loss = -1 * torch.mean(maxi) 91 | 92 | # cul regularizer 93 | regularizer = (torch.norm(users) ** 2 94 | + torch.norm(pos_items) ** 2 95 | + torch.norm(neg_items) ** 2) / 2 96 | emb_loss = self.decay * regularizer / self.batch_size 97 | 98 | return mf_loss + emb_loss, mf_loss, emb_loss 99 | 100 | def rating(self, u_g_embeddings, pos_i_g_embeddings): 101 | return torch.matmul(u_g_embeddings, pos_i_g_embeddings.t()) 102 | 103 | def forward(self, users, pos_items, neg_items, drop_flag=True): 104 | 105 | A_hat = self.sparse_dropout(self.sparse_norm_adj, 106 | self.node_dropout, 107 | self.sparse_norm_adj._nnz()) if drop_flag else self.sparse_norm_adj 108 | 109 | ego_embeddings = torch.cat([self.embedding_dict['user_emb'], 110 | self.embedding_dict['item_emb']], 0) 111 | 112 | all_embeddings = [ego_embeddings] 113 | 114 | for k in range(len(self.layers)): 115 | side_embeddings = torch.sparse.mm(A_hat, ego_embeddings) 116 | 117 | # transformed sum messages of neighbors. 118 | sum_embeddings = torch.matmul(side_embeddings, self.weight_dict['W_gc_%d' % k]) \ 119 | + self.weight_dict['b_gc_%d' % k] 120 | 121 | # bi messages of neighbors. 122 | # element-wise product 123 | bi_embeddings = torch.mul(ego_embeddings, side_embeddings) 124 | # transformed bi messages of neighbors. 125 | bi_embeddings = torch.matmul(bi_embeddings, self.weight_dict['W_bi_%d' % k]) \ 126 | + self.weight_dict['b_bi_%d' % k] 127 | 128 | # non-linear activation. 129 | ego_embeddings = nn.LeakyReLU(negative_slope=0.2)(sum_embeddings + bi_embeddings) 130 | 131 | # message dropout. 
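# nn.Dropout(p) zeroes each entry of the layer-k embeddings with probability p = mess_dropout[k]
# and rescales the surviving entries by 1 / (1 - p).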
132 | ego_embeddings = nn.Dropout(self.mess_dropout[k])(ego_embeddings) 133 | 134 | # normalize the distribution of embeddings. 135 | norm_embeddings = F.normalize(ego_embeddings, p=2, dim=1) 136 | 137 | all_embeddings += [norm_embeddings] 138 | 139 | all_embeddings = torch.cat(all_embeddings, 1) 140 | u_g_embeddings = all_embeddings[:self.n_user, :] 141 | i_g_embeddings = all_embeddings[self.n_user:, :] 142 | 143 | """ 144 | ********************************************************* 145 | look up. 146 | """ 147 | u_g_embeddings = u_g_embeddings[users, :] 148 | pos_i_g_embeddings = i_g_embeddings[pos_items, :] 149 | neg_i_g_embeddings = i_g_embeddings[neg_items, :] 150 | 151 | return u_g_embeddings, pos_i_g_embeddings, neg_i_g_embeddings 152 | -------------------------------------------------------------------------------- /NGCF/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangtinglin/NGCF-PyTorch/f4e7f6960cd8cd443d5537261d30b89cf52ed52d/NGCF/__init__.py -------------------------------------------------------------------------------- /NGCF/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on March 24, 2020 3 | 4 | @author: Tinglin Huang (huangtinglin@outlook.com) 5 | ''' 6 | 7 | import torch 8 | import torch.optim as optim 9 | 10 | from NGCF import NGCF 11 | from utility.helper import * 12 | from utility.batch_test import * 13 | 14 | import warnings 15 | warnings.filterwarnings('ignore') 16 | from time import time 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | args.device = torch.device('cuda:' + str(args.gpu_id)) 22 | 23 | plain_adj, norm_adj, mean_adj = data_generator.get_adj_mat() 24 | 25 | args.node_dropout = eval(args.node_dropout) 26 | args.mess_dropout = eval(args.mess_dropout) 27 | 28 | model = NGCF(data_generator.n_users, 29 | data_generator.n_items, 30 | norm_adj, 31 | args).to(args.device) 32 | 33 | t0 = time() 34 | """ 35 | ********************************************************* 36 | Train. 37 | """ 38 | cur_best_pre_0, stopping_step = 0, 0 39 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 40 | 41 | loss_loger, pre_loger, rec_loger, ndcg_loger, hit_loger = [], [], [], [], [] 42 | for epoch in range(args.epoch): 43 | t1 = time() 44 | loss, mf_loss, emb_loss = 0., 0., 0. 
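# each epoch runs n_train // batch_size + 1 mini-batches; every batch samples batch_size users
# with one positive and one negative item per user (see Data.sample in utility/load_data.py).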
45 | n_batch = data_generator.n_train // args.batch_size + 1 46 | 47 | for idx in range(n_batch): 48 | users, pos_items, neg_items = data_generator.sample() 49 | u_g_embeddings, pos_i_g_embeddings, neg_i_g_embeddings = model(users, 50 | pos_items, 51 | neg_items, 52 | drop_flag=args.node_dropout_flag) 53 | 54 | batch_loss, batch_mf_loss, batch_emb_loss = model.create_bpr_loss(u_g_embeddings, 55 | pos_i_g_embeddings, 56 | neg_i_g_embeddings) 57 | optimizer.zero_grad() 58 | batch_loss.backward() 59 | optimizer.step() 60 | 61 | loss += batch_loss 62 | mf_loss += batch_mf_loss 63 | emb_loss += batch_emb_loss 64 | 65 | if (epoch + 1) % 10 != 0: 66 | if args.verbose > 0 and epoch % args.verbose == 0: 67 | perf_str = 'Epoch %d [%.1fs]: train==[%.5f=%.5f + %.5f]' % ( 68 | epoch, time() - t1, loss, mf_loss, emb_loss) 69 | print(perf_str) 70 | continue 71 | 72 | t2 = time() 73 | users_to_test = list(data_generator.test_set.keys()) 74 | ret = test(model, users_to_test, drop_flag=False) 75 | 76 | t3 = time() 77 | 78 | loss_loger.append(loss) 79 | rec_loger.append(ret['recall']) 80 | pre_loger.append(ret['precision']) 81 | ndcg_loger.append(ret['ndcg']) 82 | hit_loger.append(ret['hit_ratio']) 83 | 84 | if args.verbose > 0: 85 | perf_str = 'Epoch %d [%.1fs + %.1fs]: train==[%.5f=%.5f + %.5f], recall=[%.5f, %.5f], ' \ 86 | 'precision=[%.5f, %.5f], hit=[%.5f, %.5f], ndcg=[%.5f, %.5f]' % \ 87 | (epoch, t2 - t1, t3 - t2, loss, mf_loss, emb_loss, ret['recall'][0], ret['recall'][-1], 88 | ret['precision'][0], ret['precision'][-1], ret['hit_ratio'][0], ret['hit_ratio'][-1], 89 | ret['ndcg'][0], ret['ndcg'][-1]) 90 | print(perf_str) 91 | 92 | cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0, 93 | stopping_step, expected_order='acc', flag_step=5) 94 | 95 | # ********************************************************* 96 | # early stopping when cur_best_pre_0 is decreasing for ten successive steps. 97 | if should_stop == True: 98 | break 99 | 100 | # ********************************************************* 101 | # save the user & item embeddings for pretraining. 102 | if ret['recall'][0] == cur_best_pre_0 and args.save_flag == 1: 103 | torch.save(model.state_dict(), args.weights_path + str(epoch) + '.pkl') 104 | print('save the weights in path: ', args.weights_path + str(epoch) + '.pkl') 105 | 106 | recs = np.array(rec_loger) 107 | pres = np.array(pre_loger) 108 | ndcgs = np.array(ndcg_loger) 109 | hit = np.array(hit_loger) 110 | 111 | best_rec_0 = max(recs[:, 0]) 112 | idx = list(recs[:, 0]).index(best_rec_0) 113 | 114 | final_perf = "Best Iter=[%d]@[%.1f]\trecall=[%s], precision=[%s], hit=[%s], ndcg=[%s]" % \ 115 | (idx, time() - t0, '\t'.join(['%.5f' % r for r in recs[idx]]), 116 | '\t'.join(['%.5f' % r for r in pres[idx]]), 117 | '\t'.join(['%.5f' % r for r in hit[idx]]), 118 | '\t'.join(['%.5f' % r for r in ndcgs[idx]])) 119 | print(final_perf) -------------------------------------------------------------------------------- /NGCF/utility/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /NGCF/utility/batch_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2018 3 | Tensorflow Implementation of Neural Graph Collaborative Filtering (NGCF) model in: 4 | Wang Xiang et al. Neural Graph Collaborative Filtering. In SIGIR 2019. 
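Batch evaluation utilities: top-K ranking, recall/precision/NDCG/hit-ratio aggregation, and a multiprocessing loop over test users.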
5 | 6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | ''' 8 | import utility.metrics as metrics 9 | from utility.parser import parse_args 10 | from utility.load_data import * 11 | import multiprocessing 12 | import heapq 13 | 14 | cores = multiprocessing.cpu_count() // 2 15 | 16 | args = parse_args() 17 | Ks = eval(args.Ks) 18 | 19 | data_generator = Data(path=args.data_path + args.dataset, batch_size=args.batch_size) 20 | USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items 21 | N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test 22 | BATCH_SIZE = args.batch_size 23 | 24 | def ranklist_by_heapq(user_pos_test, test_items, rating, Ks): 25 | item_score = {} 26 | for i in test_items: 27 | item_score[i] = rating[i] 28 | 29 | K_max = max(Ks) 30 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 31 | 32 | r = [] 33 | for i in K_max_item_score: 34 | if i in user_pos_test: 35 | r.append(1) 36 | else: 37 | r.append(0) 38 | auc = 0. 39 | return r, auc 40 | 41 | def get_auc(item_score, user_pos_test): 42 | item_score = sorted(item_score.items(), key=lambda kv: kv[1]) 43 | item_score.reverse() 44 | item_sort = [x[0] for x in item_score] 45 | posterior = [x[1] for x in item_score] 46 | 47 | r = [] 48 | for i in item_sort: 49 | if i in user_pos_test: 50 | r.append(1) 51 | else: 52 | r.append(0) 53 | auc = metrics.auc(ground_truth=r, prediction=posterior) 54 | return auc 55 | 56 | def ranklist_by_sorted(user_pos_test, test_items, rating, Ks): 57 | item_score = {} 58 | for i in test_items: 59 | item_score[i] = rating[i] 60 | 61 | K_max = max(Ks) 62 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 63 | 64 | r = [] 65 | for i in K_max_item_score: 66 | if i in user_pos_test: 67 | r.append(1) 68 | else: 69 | r.append(0) 70 | auc = get_auc(item_score, user_pos_test) 71 | return r, auc 72 | 73 | def get_performance(user_pos_test, r, auc, Ks): 74 | precision, recall, ndcg, hit_ratio = [], [], [], [] 75 | 76 | for K in Ks: 77 | precision.append(metrics.precision_at_k(r, K)) 78 | recall.append(metrics.recall_at_k(r, K, len(user_pos_test))) 79 | ndcg.append(metrics.ndcg_at_k(r, K, user_pos_test)) 80 | hit_ratio.append(metrics.hit_at_k(r, K)) 81 | 82 | return {'recall': np.array(recall), 'precision': np.array(precision), 83 | 'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc} 84 | 85 | 86 | def test_one_user(x): 87 | # user u's ratings for user u 88 | rating = x[0] 89 | #uid 90 | u = x[1] 91 | #user u's items in the training set 92 | try: 93 | training_items = data_generator.train_items[u] 94 | except Exception: 95 | training_items = [] 96 | #user u's items in the test set 97 | user_pos_test = data_generator.test_set[u] 98 | 99 | all_items = set(range(ITEM_NUM)) 100 | 101 | test_items = list(all_items - set(training_items)) 102 | 103 | if args.test_flag == 'part': 104 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 105 | else: 106 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 107 | 108 | return get_performance(user_pos_test, r, auc, Ks) 109 | 110 | 111 | def test(model, users_to_test, drop_flag=False, batch_test_flag=False): 112 | result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)), 113 | 'hit_ratio': np.zeros(len(Ks)), 'auc': 0.} 114 | 115 | pool = multiprocessing.Pool(cores) 116 | 117 | u_batch_size = BATCH_SIZE * 2 118 | i_batch_size = BATCH_SIZE 119 | 120 | test_users = users_to_test 121 | n_test_users = len(test_users) 122 | 
n_user_batchs = n_test_users // u_batch_size + 1 123 | 124 | count = 0 125 | 126 | for u_batch_id in range(n_user_batchs): 127 | start = u_batch_id * u_batch_size 128 | end = (u_batch_id + 1) * u_batch_size 129 | 130 | user_batch = test_users[start: end] 131 | 132 | if batch_test_flag: 133 | # batch-item test 134 | n_item_batchs = ITEM_NUM // i_batch_size + 1 135 | rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM)) 136 | 137 | i_count = 0 138 | for i_batch_id in range(n_item_batchs): 139 | i_start = i_batch_id * i_batch_size 140 | i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM) 141 | 142 | item_batch = range(i_start, i_end) 143 | 144 | if drop_flag == False: 145 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch, 146 | item_batch, 147 | [], 148 | drop_flag=False) 149 | i_rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 150 | else: 151 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch, 152 | item_batch, 153 | [], 154 | drop_flag=True) 155 | i_rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 156 | 157 | rate_batch[:, i_start: i_end] = i_rate_batch 158 | i_count += i_rate_batch.shape[1] 159 | 160 | assert i_count == ITEM_NUM 161 | 162 | else: 163 | # all-item test 164 | item_batch = range(ITEM_NUM) 165 | 166 | if drop_flag == False: 167 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch, 168 | item_batch, 169 | [], 170 | drop_flag=False) 171 | rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 172 | else: 173 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch, 174 | item_batch, 175 | [], 176 | drop_flag=True) 177 | rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 178 | 179 | user_batch_rating_uid = zip(rate_batch.numpy(), user_batch) 180 | batch_result = pool.map(test_one_user, user_batch_rating_uid) 181 | count += len(batch_result) 182 | 183 | for re in batch_result: 184 | result['precision'] += re['precision']/n_test_users 185 | result['recall'] += re['recall']/n_test_users 186 | result['ndcg'] += re['ndcg']/n_test_users 187 | result['hit_ratio'] += re['hit_ratio']/n_test_users 188 | result['auc'] += re['auc']/n_test_users 189 | 190 | 191 | assert count == n_test_users 192 | pool.close() 193 | return result 194 | -------------------------------------------------------------------------------- /NGCF/utility/helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 19, 2016 3 | @author: Xiang Wang (xiangwang@u.nus.edu) 4 | ''' 5 | __author__ = "xiangwang" 6 | import os 7 | import re 8 | 9 | def txt2list(file_src): 10 | orig_file = open(file_src, "r") 11 | lines = orig_file.readlines() 12 | return lines 13 | 14 | def ensureDir(dir_path): 15 | d = os.path.dirname(dir_path) 16 | if not os.path.exists(d): 17 | os.makedirs(d) 18 | 19 | def uni2str(unicode_str): 20 | return str(unicode_str.encode('ascii', 'ignore')).replace('\n', '').strip() 21 | 22 | def hasNumbers(inputString): 23 | return bool(re.search(r'\d', inputString)) 24 | 25 | def delMultiChar(inputString, chars): 26 | for ch in chars: 27 | inputString = inputString.replace(ch, '') 28 | return inputString 29 | 30 | def merge_two_dicts(x, y): 31 | z = x.copy() # start with x's keys and values 32 | z.update(y) # modifies z with y's keys and values & returns None 33 | return z 34 | 35 | def early_stopping(log_value, best_value, stopping_step, expected_order='acc', flag_step=100): 36 | # early stopping strategy: 37 | assert 
expected_order in ['acc', 'dec'] 38 | 39 | if (expected_order == 'acc' and log_value >= best_value) or (expected_order == 'dec' and log_value <= best_value): 40 | stopping_step = 0 41 | best_value = log_value 42 | else: 43 | stopping_step += 1 44 | 45 | if stopping_step >= flag_step: 46 | print("Early stopping is trigger at step: {} log:{}".format(flag_step, log_value)) 47 | should_stop = True 48 | else: 49 | should_stop = False 50 | return best_value, stopping_step, should_stop 51 | -------------------------------------------------------------------------------- /NGCF/utility/load_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2018 3 | Tensorflow Implementation of Neural Graph Collaborative Filtering (NGCF) model in: 4 | Wang Xiang et al. Neural Graph Collaborative Filtering. In SIGIR 2019. 5 | 6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | ''' 8 | import numpy as np 9 | import random as rd 10 | import scipy.sparse as sp 11 | from time import time 12 | 13 | class Data(object): 14 | def __init__(self, path, batch_size): 15 | self.path = path 16 | self.batch_size = batch_size 17 | 18 | train_file = path + '/train.txt' 19 | test_file = path + '/test.txt' 20 | 21 | #get number of users and items 22 | self.n_users, self.n_items = 0, 0 23 | self.n_train, self.n_test = 0, 0 24 | self.neg_pools = {} 25 | 26 | self.exist_users = [] 27 | 28 | with open(train_file) as f: 29 | for l in f.readlines(): 30 | if len(l) > 0: 31 | l = l.strip('\n').split(' ') 32 | items = [int(i) for i in l[1:]] 33 | uid = int(l[0]) 34 | self.exist_users.append(uid) 35 | self.n_items = max(self.n_items, max(items)) 36 | self.n_users = max(self.n_users, uid) 37 | self.n_train += len(items) 38 | 39 | with open(test_file) as f: 40 | for l in f.readlines(): 41 | if len(l) > 0: 42 | l = l.strip('\n') 43 | try: 44 | items = [int(i) for i in l.split(' ')[1:]] 45 | except Exception: 46 | continue 47 | self.n_items = max(self.n_items, max(items)) 48 | self.n_test += len(items) 49 | self.n_items += 1 50 | self.n_users += 1 51 | 52 | self.print_statistics() 53 | 54 | self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32) 55 | 56 | self.train_items, self.test_set = {}, {} 57 | with open(train_file) as f_train: 58 | with open(test_file) as f_test: 59 | for l in f_train.readlines(): 60 | if len(l) == 0: 61 | break 62 | l = l.strip('\n') 63 | items = [int(i) for i in l.split(' ')] 64 | uid, train_items = items[0], items[1:] 65 | 66 | for i in train_items: 67 | self.R[uid, i] = 1. 
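# self.R is the n_users x n_items user-item interaction matrix; the DOK sparse format keeps
# single-entry assignment cheap while the training file is read line by line.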
68 | # self.R[uid][i] = 1 69 | 70 | self.train_items[uid] = train_items 71 | 72 | for l in f_test.readlines(): 73 | if len(l) == 0: break 74 | l = l.strip('\n') 75 | try: 76 | items = [int(i) for i in l.split(' ')] 77 | except Exception: 78 | continue 79 | 80 | uid, test_items = items[0], items[1:] 81 | self.test_set[uid] = test_items 82 | 83 | def get_adj_mat(self): 84 | try: 85 | t1 = time() 86 | adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz') 87 | norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz') 88 | mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz') 89 | print('already load adj matrix', adj_mat.shape, time() - t1) 90 | 91 | except Exception: 92 | adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat() 93 | sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat) 94 | sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat) 95 | sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat) 96 | return adj_mat, norm_adj_mat, mean_adj_mat 97 | 98 | def create_adj_mat(self): 99 | t1 = time() 100 | adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32) 101 | adj_mat = adj_mat.tolil() 102 | R = self.R.tolil() 103 | 104 | adj_mat[:self.n_users, self.n_users:] = R 105 | adj_mat[self.n_users:, :self.n_users] = R.T 106 | adj_mat = adj_mat.todok() 107 | print('already create adjacency matrix', adj_mat.shape, time() - t1) 108 | 109 | t2 = time() 110 | 111 | def mean_adj_single(adj): 112 | # D^-1 * A 113 | rowsum = np.array(adj.sum(1)) 114 | 115 | d_inv = np.power(rowsum, -1).flatten() 116 | d_inv[np.isinf(d_inv)] = 0. 117 | d_mat_inv = sp.diags(d_inv) 118 | 119 | norm_adj = d_mat_inv.dot(adj) 120 | # norm_adj = adj.dot(d_mat_inv) 121 | print('generate single-normalized adjacency matrix.') 122 | return norm_adj.tocoo() 123 | 124 | def normalized_adj_single(adj): 125 | # D^-1/2 * A * D^-1/2 126 | rowsum = np.array(adj.sum(1)) 127 | 128 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 129 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
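# rows with zero degree produce inf under the -0.5 power; resetting them to 0 keeps isolated
# nodes from contributing, so the product below is the symmetric D^-1/2 * A * D^-1/2.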
130 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 131 | 132 | # bi_lap = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt) 133 | bi_lap = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt) 134 | return bi_lap.tocoo() 135 | 136 | def check_adj_if_equal(adj): 137 | dense_A = np.array(adj.todense()) 138 | degree = np.sum(dense_A, axis=1, keepdims=False) 139 | 140 | temp = np.dot(np.diag(np.power(degree, -1)), dense_A) 141 | print('check normalized adjacency matrix whether equal to this laplacian matrix.') 142 | return temp 143 | 144 | norm_adj_mat = mean_adj_single(adj_mat + sp.eye(adj_mat.shape[0])) 145 | # norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0])) 146 | mean_adj_mat = mean_adj_single(adj_mat) 147 | 148 | print('already normalize adjacency matrix', time() - t2) 149 | return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr() 150 | 151 | def negative_pool(self): 152 | t1 = time() 153 | for u in self.train_items.keys(): 154 | neg_items = list(set(range(self.n_items)) - set(self.train_items[u])) 155 | pools = [rd.choice(neg_items) for _ in range(100)] 156 | self.neg_pools[u] = pools 157 | print('refresh negative pools', time() - t1) 158 | 159 | def sample(self): 160 | if self.batch_size <= self.n_users: 161 | users = rd.sample(self.exist_users, self.batch_size) 162 | else: 163 | users = [rd.choice(self.exist_users) for _ in range(self.batch_size)] 164 | 165 | def sample_pos_items_for_u(u, num): 166 | # sample num pos items for u-th user 167 | pos_items = self.train_items[u] 168 | n_pos_items = len(pos_items) 169 | pos_batch = [] 170 | while True: 171 | if len(pos_batch) == num: 172 | break 173 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 174 | pos_i_id = pos_items[pos_id] 175 | 176 | if pos_i_id not in pos_batch: 177 | pos_batch.append(pos_i_id) 178 | return pos_batch 179 | 180 | def sample_neg_items_for_u(u, num): 181 | # sample num neg items for u-th user 182 | neg_items = [] 183 | while True: 184 | if len(neg_items) == num: 185 | break 186 | neg_id = np.random.randint(low=0, high=self.n_items,size=1)[0] 187 | if neg_id not in self.train_items[u] and neg_id not in neg_items: 188 | neg_items.append(neg_id) 189 | return neg_items 190 | 191 | def sample_neg_items_for_u_from_pools(u, num): 192 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 193 | return rd.sample(neg_items, num) 194 | 195 | pos_items, neg_items = [], [] 196 | for u in users: 197 | pos_items += sample_pos_items_for_u(u, 1) 198 | neg_items += sample_neg_items_for_u(u, 1) 199 | 200 | return users, pos_items, neg_items 201 | 202 | def get_num_users_items(self): 203 | return self.n_users, self.n_items 204 | 205 | def print_statistics(self): 206 | print('n_users=%d, n_items=%d' % (self.n_users, self.n_items)) 207 | print('n_interactions=%d' % (self.n_train + self.n_test)) 208 | print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test)/(self.n_users * self.n_items))) 209 | 210 | def get_sparsity_split(self): 211 | try: 212 | split_uids, split_state = [], [] 213 | lines = open(self.path + '/sparsity.split', 'r').readlines() 214 | 215 | for idx, line in enumerate(lines): 216 | if idx % 2 == 0: 217 | split_state.append(line.strip()) 218 | print(line.strip()) 219 | else: 220 | split_uids.append([int(uid) for uid in line.strip().split(' ')]) 221 | print('get sparsity split.') 222 | 223 | except Exception: 224 | split_uids, split_state = self.create_sparsity_split() 225 | f = open(self.path + '/sparsity.split', 'w') 226 | 
for idx in range(len(split_state)): 227 | f.write(split_state[idx] + '\n') 228 | f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n') 229 | print('create sparsity split.') 230 | 231 | return split_uids, split_state 232 | 233 | def create_sparsity_split(self): 234 | all_users_to_test = list(self.test_set.keys()) 235 | user_n_iid = dict() 236 | 237 | # generate a dictionary to store (key=n_iids, value=a list of uid). 238 | for uid in all_users_to_test: 239 | train_iids = self.train_items[uid] 240 | test_iids = self.test_set[uid] 241 | 242 | n_iids = len(train_iids) + len(test_iids) 243 | 244 | if n_iids not in user_n_iid.keys(): 245 | user_n_iid[n_iids] = [uid] 246 | else: 247 | user_n_iid[n_iids].append(uid) 248 | split_uids = list() 249 | 250 | # split the whole user set into four subset. 251 | temp = [] 252 | count = 1 253 | fold = 4 254 | n_count = (self.n_train + self.n_test) 255 | n_rates = 0 256 | 257 | split_state = [] 258 | for idx, n_iids in enumerate(sorted(user_n_iid)): 259 | temp += user_n_iid[n_iids] 260 | n_rates += n_iids * len(user_n_iid[n_iids]) 261 | n_count -= n_iids * len(user_n_iid[n_iids]) 262 | 263 | if n_rates >= count * 0.25 * (self.n_train + self.n_test): 264 | split_uids.append(temp) 265 | 266 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' %(n_iids, len(temp), n_rates) 267 | split_state.append(state) 268 | print(state) 269 | 270 | temp = [] 271 | n_rates = 0 272 | fold -= 1 273 | 274 | if idx == len(user_n_iid.keys()) - 1 or n_count == 0: 275 | split_uids.append(temp) 276 | 277 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates) 278 | split_state.append(state) 279 | print(state) 280 | 281 | 282 | 283 | return split_uids, split_state 284 | -------------------------------------------------------------------------------- /NGCF/utility/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import roc_auc_score 3 | 4 | 5 | def recall(rank, ground_truth, N): 6 | return len(set(rank[:N]) & set(ground_truth)) / float(len(set(ground_truth))) 7 | 8 | 9 | def precision_at_k(r, k): 10 | """Score is precision @ k 11 | Relevance is binary (nonzero is relevant). 12 | Returns: 13 | Precision @ k 14 | Raises: 15 | ValueError: len(r) must be >= k 16 | """ 17 | assert k >= 1 18 | r = np.asarray(r)[:k] 19 | return np.mean(r) 20 | 21 | 22 | def average_precision(r,cut): 23 | """Score is average precision (area under PR curve) 24 | Relevance is binary (nonzero is relevant). 25 | Returns: 26 | Average precision 27 | """ 28 | r = np.asarray(r) 29 | out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]] 30 | if not out: 31 | return 0. 32 | return np.sum(out)/float(min(cut, np.sum(r))) 33 | 34 | 35 | def mean_average_precision(rs): 36 | """Score is mean average precision 37 | Relevance is binary (nonzero is relevant). 38 | Returns: 39 | Mean average precision 40 | """ 41 | return np.mean([average_precision(r) for r in rs]) 42 | 43 | 44 | def dcg_at_k(r, k, method=1): 45 | """Score is discounted cumulative gain (dcg) 46 | Relevance is positive real values. Can use binary 47 | as the previous methods. 
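With method=1, DCG@k = sum_{i=1..k} r_i / log2(i + 1).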
48 | Returns: 49 | Discounted cumulative gain 50 | """ 51 | r = np.asfarray(r)[:k] 52 | if r.size: 53 | if method == 0: 54 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 55 | elif method == 1: 56 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 57 | else: 58 | raise ValueError('method must be 0 or 1.') 59 | return 0. 60 | 61 | 62 | def ndcg_at_k(r, k, ground_truth, method=1): 63 | """Score is normalized discounted cumulative gain (ndcg) 64 | Relevance is positive real values. Can use binary 65 | as the previous methods. 66 | Returns: 67 | Normalized discounted cumulative gain 68 | 69 | Slow but correct definition 70 | """ 71 | GT = set(ground_truth) 72 | if len(GT) > k: 73 | sent_list = [1.0] * k 74 | else: 75 | sent_list = [1.0]*len(GT) + [0.0]*(k-len(GT)) 76 | dcg_max = dcg_at_k(sent_list, k, method) 77 | if not dcg_max: 78 | return 0. 79 | return dcg_at_k(r, k, method) / dcg_max 80 | 81 | 82 | def recall_at_k(r, k, all_pos_num): 83 | # if all_pos_num == 0: 84 | # return 0 85 | r = np.asfarray(r)[:k] 86 | return np.sum(r) / all_pos_num 87 | 88 | 89 | def hit_at_k(r, k): 90 | r = np.array(r)[:k] 91 | if np.sum(r) > 0: 92 | return 1. 93 | else: 94 | return 0. 95 | 96 | def F1(pre, rec): 97 | if pre + rec > 0: 98 | return (2.0 * pre * rec) / (pre + rec) 99 | else: 100 | return 0. 101 | 102 | def auc(ground_truth, prediction):  # called as metrics.auc in utility/batch_test.py 103 | try: 104 | res = roc_auc_score(y_true=ground_truth, y_score=prediction) 105 | except Exception: 106 | res = 0. 107 | return res -------------------------------------------------------------------------------- /NGCF/utility/parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2018 3 | Tensorflow Implementation of Neural Graph Collaborative Filtering (NGCF) model in: 4 | Wang Xiang et al. Neural Graph Collaborative Filtering. In SIGIR 2019.
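Command-line arguments for training and evaluating NGCF; parse_args() is imported by utility/batch_test.py and, through it, by main.py.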
5 | 6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | ''' 8 | import argparse 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="Run NGCF.") 12 | parser.add_argument('--weights_path', nargs='?', default='model/', 13 | help='Store model path.') 14 | parser.add_argument('--data_path', nargs='?', default='../Data/', 15 | help='Input data path.') 16 | parser.add_argument('--proj_path', nargs='?', default='', 17 | help='Project path.') 18 | 19 | parser.add_argument('--dataset', nargs='?', default='gowalla', 20 | help='Choose a dataset from {gowalla, yelp2018, amazon-book}') 21 | parser.add_argument('--pretrain', type=int, default=0, 22 | help='0: No pretrain, -1: Pretrain with the learned embeddings, 1: Pretrain with stored models.') 23 | parser.add_argument('--verbose', type=int, default=1, 24 | help='Interval of evaluation.') 25 | parser.add_argument('--epoch', type=int, default=400, 26 | help='Number of epochs.') 27 | 28 | parser.add_argument('--embed_size', type=int, default=64, 29 | help='Embedding size.') 30 | parser.add_argument('--layer_size', nargs='?', default='[64,64,64]', 31 | help='Output sizes of every layer.') 32 | parser.add_argument('--batch_size', type=int, default=1024, 33 | help='Batch size.') 34 | 35 | parser.add_argument('--regs', nargs='?', default='[1e-5]', 36 | help='Regularizations.') 37 | parser.add_argument('--lr', type=float, default=0.0001, 38 | help='Learning rate.') 39 | 40 | parser.add_argument('--model_type', nargs='?', default='ngcf', 41 | help='Specify the name of model (ngcf).') 42 | parser.add_argument('--adj_type', nargs='?', default='norm', 43 | help='Specify the type of the adjacency (laplacian) matrix from {plain, norm, mean}.') 44 | 45 | parser.add_argument('--gpu_id', type=int, default=6) 46 | 47 | parser.add_argument('--node_dropout_flag', type=int, default=1, 48 | help='0: Disable node dropout, 1: Activate node dropout') 49 | parser.add_argument('--node_dropout', nargs='?', default='[0.1]', 50 | help='Node dropout ratio for each deep layer. 0: no dropout.') 51 | parser.add_argument('--mess_dropout', nargs='?', default='[0.1,0.1,0.1]', 52 | help='Message dropout ratio for each deep layer. 0: no dropout.') 53 | 54 | parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]', 55 | help='K values at which the ranking metrics (recall, precision, hit ratio, ndcg) are computed.') 56 | 57 | parser.add_argument('--save_flag', type=int, default=0, 58 | help='0: Disable model saver, 1: Activate model saver') 59 | 60 | parser.add_argument('--test_flag', nargs='?', default='part', 61 | help='Specify the test type from {part, full}, indicating whether the inference is done in mini-batch') 62 | 63 | parser.add_argument('--report', type=int, default=0, 64 | help='0: Disable performance report w.r.t. sparsity levels, 1: Show performance report w.r.t. sparsity levels') 65 | return parser.parse_args() 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Graph Collaborative Filtering 2 | This is my PyTorch implementation for the paper: 3 | 4 | >Xiang Wang, Xiangnan He, Meng Wang, Fuli Feng, and Tat-Seng Chua (2019). Neural Graph Collaborative Filtering, [Paper in ACM DL](https://dl.acm.org/citation.cfm?doid=3331184.3331267) or [Paper in arXiv](https://arxiv.org/abs/1905.08108). In SIGIR'19, Paris, France, July 21-25, 2019.
5 | 6 | The TensorFlow implementation can be found [here](). 7 | 8 | ## Introduction 9 | My implementation mainly follows the original TensorFlow implementation and uses the same evaluation metrics as the original project. Here is an example result on the Gowalla dataset: 10 | 11 | ``` 12 | Best Iter=[38]@[32904.5] recall=[0.15571 0.21793 0.26385 0.30103 0.33170], precision=[0.04763 0.03370 0.02744 0.02359 0.02088], hit=[0.53996 0.64559 0.70464 0.74546 0.77406], ndcg=[0.22752 0.26555 0.29044 0.30926 0.32406] 13 | ``` 14 | 15 | Hope it can help you! 16 | 17 | ## Environment Requirement 18 | The code has been tested under Python 3.6.9. The required packages are as follows: 19 | * pytorch == 1.3.1 20 | * numpy == 1.18.1 21 | * scipy == 1.3.2 22 | * sklearn == 0.21.3 23 | 24 | ## Example to Run the Codes 25 | Instructions for the command-line arguments are stated in the code (see the parse_args function in NGCF/utility/parser.py). 26 | * Gowalla dataset 27 | ``` 28 | python main.py --dataset gowalla --regs [1e-5] --embed_size 64 --layer_size [64,64,64] --lr 0.0001 --save_flag 1 --pretrain 0 --batch_size 1024 --epoch 400 --verbose 1 --node_dropout [0.1] --mess_dropout [0.1,0.1,0.1] 29 | ``` 30 | 31 | * Amazon-book dataset 32 | ``` 33 | python main.py --dataset amazon-book --regs [1e-5] --embed_size 64 --layer_size [64,64,64] --lr 0.0005 --save_flag 1 --pretrain 0 --batch_size 1024 --epoch 200 --verbose 50 --node_dropout [0.1] --mess_dropout [0.1,0.1,0.1] 34 | ``` 35 | ## Supplement 36 | 37 | * The parameter `negative_slope` of LeakyReLU is set to 0.2, since the default values in PyTorch and TensorFlow are different. 38 | * If the argument `node_dropout_flag` is set to 1, node dropout is enabled during training, which leads to a higher computational cost. --------------------------------------------------------------------------------
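As a supplement to the run commands above, here is a minimal sketch of reloading a checkpoint written by `main.py` with `--save_flag 1` and scoring items for a few users with `model.rating`. It assumes the script is run from the `NGCF/` directory with the default command-line flags; the checkpoint name `39.pkl`, the CPU device, and the choice of the first five test users are placeholders for illustration.

```
import torch

from NGCF import NGCF
from utility.batch_test import data_generator, args  # args come from utility/parser.py

# mirror the argument post-processing done in main.py
args.device = torch.device('cpu')             # placeholder: evaluate on CPU
args.node_dropout = eval(args.node_dropout)
args.mess_dropout = eval(args.mess_dropout)

_, norm_adj, _ = data_generator.get_adj_mat()
model = NGCF(data_generator.n_users, data_generator.n_items, norm_adj, args).to(args.device)

# main.py saves checkpoints as <weights_path><epoch>.pkl; '39' is a placeholder epoch
state_dict = torch.load(args.weights_path + '39.pkl', map_location=args.device)
model.load_state_dict(state_dict)

# score every item for a handful of test users and show their top-20 (remapped) item ids
users = list(data_generator.test_set.keys())[:5]
u_emb, i_emb, _ = model(users, range(data_generator.n_items), [], drop_flag=False)
ratings = model.rating(u_emb, i_emb)          # shape: (len(users), n_items)
print(torch.topk(ratings, k=20, dim=1)[1])
```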