├── DGCF_v1 ├── DGCF.py ├── README.md └── utility │ ├── README.md │ ├── batch_test.py │ ├── helper.py │ ├── load_data.py │ ├── metrics.py │ └── parser.py ├── DGCF_v2 └── README.md ├── Data ├── README.md ├── amazon-book │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt ├── gowalla │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt └── yelp2018 │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt └── README.md /DGCF_v1/DGCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/bash 2 | ''' 3 | Created on Oct 10, 2019 4 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 5 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 6 | Note that: This implementation is based on the codes of NGCF. 7 | 8 | @author: Xiang Wang (xiangwang@u.nus.edu) 9 | ''' 10 | 11 | import tensorflow as tf 12 | import tensorflow.compat.v1 as tfv1 13 | import os 14 | import sys 15 | import random as rd 16 | import pickle 17 | import numpy as np 18 | 19 | 20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 21 | 22 | from utility.helper import * 23 | from utility.batch_test import * 24 | 25 | 26 | class GDCF(object): 27 | def __init__(self, data_config, pretrain_data): 28 | # argument settings 29 | 30 | self.pretrain_data = pretrain_data 31 | self.n_users = data_config['n_users'] 32 | self.n_items = data_config['n_items'] 33 | 34 | self.n_fold = 1 35 | self.norm_adj = data_config['norm_adj'] 36 | self.all_h_list = data_config['all_h_list'] 37 | self.all_t_list = data_config['all_t_list'] 38 | self.A_in_shape = self.norm_adj.tocoo().shape 39 | 40 | self.n_nonzero_elems = self.norm_adj.count_nonzero() 41 | self.lr = args.lr 42 | self.emb_dim = args.embed_size 43 | self.n_factors = args.n_factors 44 | self.n_iterations = args.n_iterations 45 | self.n_layers = args.n_layers 46 | self.pick_level = args.pick_scale 47 | self.cor_flag = args.cor_flag 48 | if args.pick == 1: 49 | self.is_pick = True 50 | else: 51 | self.is_pick = False 52 | 53 | self.batch_size = args.batch_size 54 | self.regs = eval(args.regs) 55 | self.decay = self.regs[0] 56 | self.verbose = args.verbose 57 | 58 | ''' 59 | ********************************************************* 60 | Create Placeholder for Input Data & Dropout. 61 | ''' 62 | # placeholder definition 63 | self.users = tfv1.placeholder(tf.int32, shape=(None,)) 64 | self.pos_items = tfv1.placeholder(tf.int32, shape=(None,)) 65 | self.neg_items = tfv1.placeholder(tf.int32, shape=(None,)) 66 | 67 | # additional placeholders for the distance correlation 68 | self.cor_users = tfv1.placeholder(tf.int32, shape=(None,)) 69 | self.cor_items = tfv1.placeholder(tf.int32, shape=(None,)) 70 | 71 | # assign different values with different factors (channels). 72 | self.A_values = tfv1.placeholder(tf.float32, shape=[self.n_factors, len(self.all_h_list)], name='A_values') 73 | 74 | """ 75 | ********************************************************* 76 | Create Model Parameters (i.e., Initialize Weights). 
77 | """ 78 | # initialization of model parameters 79 | self.weights = self._init_weights() 80 | 81 | # create models 82 | self.ua_embeddings, self.ia_embeddings, self.f_weight, self.ua_embeddings_t, self.ia_embeddings_t = self._create_star_routing_embed_with_P(pick_=self.is_pick) 83 | 84 | """ 85 | ********************************************************* 86 | Establish the final representations for user-item pairs in batch. 87 | """ 88 | self.u_g_embeddings = tf.nn.embedding_lookup(self.ua_embeddings, self.users) 89 | self.u_g_embeddings_t = tf.nn.embedding_lookup(self.ua_embeddings_t, self.users) 90 | self.pos_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.pos_items) 91 | self.pos_i_g_embeddings_t = tf.nn.embedding_lookup(self.ia_embeddings_t, self.pos_items) 92 | 93 | 94 | self.neg_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.neg_items) 95 | self.u_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['user_embedding'], self.users) 96 | self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['item_embedding'], self.pos_items) 97 | self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['item_embedding'], self.neg_items) 98 | 99 | self.cor_u_g_embeddings = tf.nn.embedding_lookup(self.ua_embeddings, self.cor_users) 100 | self.cor_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.cor_items) 101 | 102 | 103 | #Inference for the testing phase. 104 | self.batch_ratings = tf.matmul(self.u_g_embeddings_t, self.pos_i_g_embeddings_t, transpose_a=False, transpose_b=True) 105 | 106 | #Generate Predictions & Optimize via BPR loss. 107 | self.mf_loss, self.emb_loss = self.create_bpr_loss(self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings) 108 | 109 | # whether user distance correlation 110 | if args.corDecay < 1e-9: 111 | self.cor_loss = tf.constant(0.0) 112 | else: 113 | self.cor_loss = args.corDecay * self.create_cor_loss(self.cor_u_g_embeddings, self.cor_i_g_embeddings) 114 | 115 | # self.loss = self.mf_loss + self.emb_loss + self.reg_loss 116 | self.loss = self.mf_loss + self.emb_loss + self.cor_loss 117 | self.opt = tfv1.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss) 118 | 119 | def _init_weights(self): 120 | all_weights = dict() 121 | 122 | initializer = tf.contrib.layers.xavier_initializer() 123 | 124 | if self.pretrain_data is None: 125 | all_weights['user_embedding'] = tf.Variable(initializer([self.n_users, self.emb_dim]), 126 | name='user_embedding') 127 | all_weights['item_embedding'] = tf.Variable(initializer([self.n_items, self.emb_dim]), 128 | name='item_embedding') 129 | print('using xavier initialization') 130 | else: 131 | all_weights['user_embedding'] = tf.Variable(initial_value=self.pretrain_data['user_embed'], trainable=True, 132 | name='user_embedding', dtype=tf.float32) 133 | all_weights['item_embedding'] = tf.Variable(initial_value=self.pretrain_data['item_embed'], trainable=True, 134 | name='item_embedding', dtype=tf.float32) 135 | print('using pretrained initialization') 136 | 137 | return all_weights 138 | 139 | def _create_star_routing_embed_with_P(self, pick_ = False): 140 | ''' 141 | pick_ : True, the model would narrow the weight of the least important factor down to 1/args.pick_scale. 142 | pick_ : False, do nothing. 
143 | ''' 144 | p_test = False 145 | p_train = False 146 | 147 | A_values = tf.ones(shape=[self.n_factors, len(self.all_h_list)]) 148 | # get a (n_factors)-length list of [n_users+n_items, n_users+n_items] 149 | 150 | # load the initial all-one adjacency values 151 | # .... A_values is a all-ones dense tensor with the size of [n_factors, all_h_list]. 152 | 153 | 154 | # get the ID embeddings of users and items 155 | # .... ego_embeddings is a dense tensor with the size of [n_users+n_items, embed_size]; 156 | # .... all_embeddings stores a (n_layers)-len list of outputs derived from different layers. 157 | ego_embeddings = tf.concat([self.weights['user_embedding'], self.weights['item_embedding']], axis=0) 158 | all_embeddings = [ego_embeddings] 159 | all_embeddings_t = [ego_embeddings] 160 | 161 | output_factors_distribution = [] 162 | 163 | factor_num = [self.n_factors, self.n_factors, self.n_factors] 164 | iter_num = [self.n_iterations, self.n_iterations, self.n_iterations] 165 | for k in range(0, self.n_layers): 166 | # prepare the output embedding list 167 | # .... layer_embeddings stores a (n_factors)-len list of outputs derived from the last routing iterations. 168 | n_factors_l = factor_num[k] 169 | n_iterations_l = iter_num[k] 170 | layer_embeddings = [] 171 | layer_embeddings_t = [] 172 | 173 | # split the input embedding table 174 | # .... ego_layer_embeddings is a (n_factors)-leng list of embeddings [n_users+n_items, embed_size/n_factors] 175 | ego_layer_embeddings = tf.split(ego_embeddings, n_factors_l, 1) 176 | ego_layer_embeddings_t = tf.split(ego_embeddings, n_factors_l, 1) 177 | 178 | # perform routing mechanism 179 | for t in range(0, n_iterations_l): 180 | iter_embeddings = [] 181 | iter_embeddings_t = [] 182 | A_iter_values = [] 183 | 184 | # split the adjacency values & get three lists of [n_users+n_items, n_users+n_items] sparse tensors 185 | # .... A_factors is a (n_factors)-len list, each of which is an adjacency matrix 186 | # .... D_col_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. columns 187 | # .... D_row_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. rows 188 | if t == n_iterations_l - 1: 189 | p_test = pick_ 190 | p_train = False 191 | 192 | A_factors, D_col_factors, D_row_factors = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick= p_train) 193 | A_factors_t, D_col_factors_t, D_row_factors_t = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick= p_test) 194 | for i in range(0, n_factors_l): 195 | # update the embeddings via simplified graph convolution layer 196 | # .... D_col_factors[i] * A_factors[i] * D_col_factors[i] is Laplacian matrix w.r.t. the i-th factor 197 | # .... 
factor_embeddings is a dense tensor with the size of [n_users+n_items, embed_size/n_factors] 198 | factor_embeddings = tf.sparse.sparse_dense_matmul(D_col_factors[i], ego_layer_embeddings[i]) 199 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(D_col_factors_t[i], ego_layer_embeddings_t[i]) 200 | 201 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(A_factors_t[i], factor_embeddings_t) 202 | factor_embeddings = tf.sparse.sparse_dense_matmul(A_factors[i], factor_embeddings) 203 | 204 | factor_embeddings = tf.sparse.sparse_dense_matmul(D_col_factors[i], factor_embeddings) 205 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(D_col_factors_t[i], factor_embeddings_t) 206 | 207 | iter_embeddings.append(factor_embeddings) 208 | iter_embeddings_t.append(factor_embeddings_t) 209 | 210 | if t == n_iterations_l - 1: 211 | layer_embeddings = iter_embeddings 212 | layer_embeddings_t = iter_embeddings_t 213 | 214 | # get the factor-wise embeddings 215 | # .... head_factor_embeddings is a dense tensor with the size of [all_h_list, embed_size/n_factors] 216 | # .... analogous to tail_factor_embeddings 217 | head_factor_embedings = tf.nn.embedding_lookup(factor_embeddings, self.all_h_list) 218 | tail_factor_embedings = tf.nn.embedding_lookup(ego_layer_embeddings[i], self.all_t_list) 219 | 220 | # .... constrain the vector length 221 | # .... make the following attentive weights within the range of (0,1) 222 | head_factor_embedings = tf.math.l2_normalize(head_factor_embedings, axis=1) 223 | tail_factor_embedings = tf.math.l2_normalize(tail_factor_embedings, axis=1) 224 | 225 | # get the attentive weights 226 | # .... A_factor_values is a dense tensor with the size of [all_h_list,1] 227 | A_factor_values = tf.reduce_sum(tf.multiply(head_factor_embedings, tf.tanh(tail_factor_embedings)), axis=1) 228 | 229 | # update the attentive weights 230 | A_iter_values.append(A_factor_values) 231 | 232 | # pack (n_factors) adjacency values into one [n_factors, all_h_list] tensor 233 | A_iter_values = tf.stack(A_iter_values, 0) 234 | # add all layer-wise attentive weights up. 
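# .... A_values accumulates these attentive logits over the routing iterations; the factor-wise
# .... softmax inside _convert_A_values_to_A_factors_with_P turns them into per-intent edge weights
# .... when the adjacency factors are rebuilt at the next iteration (and the next layer).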
235 | A_values += A_iter_values 236 | 237 | if t == n_iterations_l - 1: 238 | #layer_embeddings = iter_embeddings 239 | output_factors_distribution.append(A_factors) 240 | 241 | # sum messages of neighbors, [n_users+n_items, embed_size] 242 | side_embeddings = tf.concat(layer_embeddings, 1) 243 | side_embeddings_t = tf.concat(layer_embeddings_t, 1) 244 | 245 | ego_embeddings = side_embeddings 246 | ego_embeddings_t = side_embeddings_t 247 | # concatenate outputs of all layers 248 | all_embeddings_t += [ego_embeddings_t] 249 | all_embeddings += [ego_embeddings] 250 | 251 | all_embeddings = tf.stack(all_embeddings, 1) 252 | all_embeddings = tf.reduce_mean(all_embeddings, axis=1, keepdims=False) 253 | 254 | all_embeddings_t = tf.stack(all_embeddings_t, 1) 255 | all_embeddings_t = tf.reduce_mean(all_embeddings_t, axis=1, keep_dims=False) 256 | 257 | u_g_embeddings, i_g_embeddings = tf.split(all_embeddings, [self.n_users, self.n_items], 0) 258 | u_g_embeddings_t, i_g_embeddings_t = tf.split(all_embeddings_t, [self.n_users, self.n_items], 0) 259 | 260 | return u_g_embeddings, i_g_embeddings, output_factors_distribution, u_g_embeddings_t, i_g_embeddings_t 261 | 262 | def create_bpr_loss(self, users, pos_items, neg_items): 263 | pos_scores = tf.reduce_sum(tf.multiply(users, pos_items), axis=1) 264 | neg_scores = tf.reduce_sum(tf.multiply(users, neg_items), axis=1) 265 | 266 | regularizer = tf.nn.l2_loss(self.u_g_embeddings_pre) + tf.nn.l2_loss( 267 | self.pos_i_g_embeddings_pre) + tf.nn.l2_loss(self.neg_i_g_embeddings_pre) 268 | regularizer = regularizer / self.batch_size 269 | 270 | # In the first version, we implement the bpr loss via the following codes: 271 | # We report the performance in our paper using this implementation. 272 | # maxi = tf.log(tf.nn.sigmoid(pos_scores - neg_scores)) 273 | # mf_loss = tf.negative(tf.reduce_mean(maxi)) 274 | 275 | ## In the second version, we implement the bpr loss via the following codes to avoid 'NAN' loss during training: 276 | ## However, it will change the training performance and training performance. 277 | ## Please retrain the model and do a grid search for the best experimental setting. 278 | 279 | mf_loss = tf.reduce_mean(tf.nn.softplus(-(pos_scores - neg_scores))) 280 | 281 | emb_loss = self.decay * regularizer 282 | 283 | return mf_loss, emb_loss 284 | 285 | def create_cor_loss(self, cor_u_embeddings, cor_i_embeddings): 286 | cor_loss = tf.constant(0.0, tf.float32) 287 | 288 | if self.cor_flag == 0: 289 | return cor_loss 290 | 291 | ui_embeddings = tf.concat([cor_u_embeddings, cor_i_embeddings], axis=0) 292 | ui_factor_embeddings = tf.split(ui_embeddings, self.n_factors, 1) 293 | 294 | for i in range(0, self.n_factors-1): 295 | x = ui_factor_embeddings[i] 296 | y = ui_factor_embeddings[i+1] 297 | cor_loss += self._create_distance_correlation(x, y) 298 | 299 | cor_loss /= ((self.n_factors + 1.0) * self.n_factors/2) 300 | 301 | return cor_loss 302 | 303 | def model_save(self, path, dataset, ses, savename='best_model'): 304 | save_pretrain_path = '%spretrain/%s/%s' % (path, dataset, savename) 305 | np.savez(save_pretrain_path,user_embed=np.array(self.weights['user_embedding'].eval(session=ses)), 306 | item_embed=np.array(model.weights['item_embedding'].eval(session=ses))) 307 | 308 | def _create_distance_correlation(self, X1, X2): 309 | 310 | def _create_centered_distance(X): 311 | ''' 312 | Used to calculate the distance matrix of N samples. 313 | (However how could tf store a HUGE matrix with the shape like 70000*70000*4 Bytes????) 
314 | ''' 315 | # calculate the pairwise distance of X 316 | # .... A with the size of [batch_size, embed_size/n_factors] 317 | # .... D with the size of [batch_size, batch_size] 318 | # X = tf.math.l2_normalize(XX, axis=1) 319 | 320 | r = tf.reduce_sum(tf.square(X), 1, keepdims=True) 321 | D = tf.sqrt(tf.maximum(r - 2 * tf.matmul(a=X, b=X, transpose_b=True) + tf.transpose(r), 0.0) + 1e-8) 322 | 323 | # # calculate the centered distance of X 324 | # # .... D with the size of [batch_size, batch_size] 325 | D = D - tf.reduce_mean(D, axis=0, keepdims=True) - tf.reduce_mean(D, axis=1, keepdims=True) \ 326 | + tf.reduce_mean(D) 327 | return D 328 | 329 | def _create_distance_covariance(D1, D2): 330 | # calculate distance covariance between D1 and D2 331 | n_samples = tf.dtypes.cast(tf.shape(D1)[0], tf.float32) 332 | dcov = tf.sqrt(tf.maximum(tf.reduce_sum(D1 * D2) / (n_samples * n_samples), 0.0) + 1e-8) 333 | # dcov = tf.sqrt(tf.maximum(tf.reduce_sum(D1 * D2)) / n_samples 334 | return dcov 335 | 336 | D1 = _create_centered_distance(X1) 337 | D2 = _create_centered_distance(X2) 338 | 339 | dcov_12 = _create_distance_covariance(D1, D2) 340 | dcov_11 = _create_distance_covariance(D1, D1) 341 | dcov_22 = _create_distance_covariance(D2, D2) 342 | 343 | # calculate the distance correlation 344 | dcor = dcov_12 / (tf.sqrt(tf.maximum(dcov_11 * dcov_22, 0.0)) + 1e-10) 345 | # return tf.reduce_sum(D1) + tf.reduce_sum(D2) 346 | return dcor 347 | 348 | def _convert_A_values_to_A_factors_with_P(self, f_num, A_factor_values, pick=True): 349 | 350 | A_factors = [] 351 | D_col_factors = [] 352 | D_row_factors = [] 353 | # get the indices of adjacency matrix. 354 | A_indices = np.mat([self.all_h_list, self.all_t_list]).transpose() 355 | D_indices = np.mat([list(range(self.n_users+self.n_items)), list(range(self.n_users+self.n_items))]).transpose() 356 | 357 | # apply factor-aware softmax function over the values of adjacency matrix 358 | # .... A_factor_values is [n_factors, all_h_list] 359 | if pick: 360 | A_factor_scores = tf.nn.softmax(A_factor_values, 0) 361 | min_A = tf.reduce_min(A_factor_scores, 0) 362 | index = A_factor_scores > (min_A + 0.0000001) 363 | index = tf.cast(index, tf.float32)*(self.pick_level-1.0) + 1.0 # adjust the weight of the minimum factor to 1/self.pick_level 364 | 365 | A_factor_scores = A_factor_scores * index 366 | A_factor_scores = A_factor_scores / tf.reduce_sum(A_factor_scores, 0) 367 | else: 368 | A_factor_scores = tf.nn.softmax(A_factor_values, 0) 369 | 370 | for i in range(0, f_num): 371 | # in the i-th factor, couple the adjacency values with the adjacency indices 372 | # .... A_i_tensor is a sparse tensor with size of [n_users+n_items, n_users+n_items] 373 | A_i_scores = A_factor_scores[i] 374 | A_i_tensor = tf.SparseTensor(A_indices, A_i_scores, self.A_in_shape) 375 | 376 | # get the degree values of A_i_tensor 377 | # .... D_i_scores_col is [n_users+n_items, 1] 378 | # .... D_i_scores_row is [1, n_users+n_items] 379 | D_i_col_scores = 1/tf.math.sqrt(tf.sparse_reduce_sum(A_i_tensor, axis=1)) 380 | D_i_row_scores = 1/tf.math.sqrt(tf.sparse_reduce_sum(A_i_tensor, axis=0)) 381 | 382 | # couple the laplacian values with the adjacency indices 383 | # .... 
A_i_tensor is a sparse tensor with size of [n_users+n_items, n_users+n_items] 384 | D_i_col_tensor = tf.SparseTensor(D_indices, D_i_col_scores, self.A_in_shape) 385 | D_i_row_tensor = tf.SparseTensor(D_indices, D_i_row_scores, self.A_in_shape) 386 | 387 | A_factors.append(A_i_tensor) 388 | D_col_factors.append(D_i_col_tensor) 389 | D_row_factors.append(D_i_row_tensor) 390 | 391 | # return a (n_factors)-length list of laplacian matrix 392 | return A_factors, D_col_factors, D_row_factors 393 | 394 | 395 | def load_best(name="best_model"): 396 | pretrain_path = '%spretrain/%s/%s.npz' % (args.proj_path, args.dataset, name) 397 | try: 398 | pretrain_data = np.load(pretrain_path) 399 | print('load the best model:', name) 400 | except Exception: 401 | pretrain_data = None 402 | return pretrain_data 403 | 404 | def load_adjacency_list_data(adj_mat): 405 | tmp = adj_mat.tocoo() 406 | all_h_list = list(tmp.row) 407 | all_t_list = list(tmp.col) 408 | all_v_list = list(tmp.data) 409 | 410 | return all_h_list, all_t_list, all_v_list 411 | 412 | def create_initial_A_values(n_factors, all_v_list): 413 | return np.array([all_v_list] * n_factors) 414 | 415 | def sample_cor_samples(n_users, n_items, cor_batch_size): 416 | ''' 417 | We have to sample some embedded representations out of all nodes. 418 | Becasue we have no way to store cor-distance for each pair. 419 | ''' 420 | cor_users = rd.sample(list(range(n_users)), cor_batch_size) 421 | cor_items = rd.sample(list(range(n_items)), cor_batch_size) 422 | 423 | return cor_users, cor_items 424 | 425 | if __name__ == '__main__': 426 | whether_test_batch = True 427 | 428 | print("************************* Run with following settings 🏃 ***************************") 429 | print(args) 430 | print("************************************************************************************") 431 | 432 | config = dict() 433 | config['n_users'] = data_generator.n_users 434 | config['n_items'] = data_generator.n_items 435 | 436 | """ 437 | ********************************************************* 438 | Generate the Laplacian matrix, where each entry defines the decay factor (e.g., p_ui) between two connected nodes. 439 | """ 440 | plain_adj, norm_adj, mean_adj, pre_adj = data_generator.get_adj_mat() 441 | 442 | all_h_list, all_t_list, all_v_list = load_adjacency_list_data(plain_adj) 443 | 444 | A_values_init = create_initial_A_values(args.n_factors, all_v_list) 445 | 446 | config['norm_adj'] = plain_adj 447 | config['all_h_list'] = all_h_list 448 | config['all_t_list'] = all_t_list 449 | 450 | 451 | t0 = time() 452 | """ 453 | ********************************************************* 454 | pretrain = 1: load embeddings with name such as embedding_xxx(.npz), l2_best_model(.npz) 455 | pretrain = 0: default value, no pretrained embeddings. 456 | """ 457 | if args.pretrain == 1: 458 | print("Try to load pretain: ", args.embed_name) 459 | pretrain_data = load_best(name=args.embed_name) 460 | if pretrain_data == None: 461 | print("Load pretrained model(%s)fail!!!!!!!!!!!!!!!"%(args.embed_name)) 462 | else: 463 | pretrain_data = None 464 | 465 | model = GDCF(data_config=config, pretrain_data=pretrain_data) 466 | 467 | 468 | tf_config = tfv1.ConfigProto() 469 | tf_config.gpu_options.allow_growth = True 470 | sess = tfv1.Session(config=tf_config) 471 | 472 | 473 | sess.run(tfv1.global_variables_initializer()) 474 | cur_best_pre_0 = 0. 
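# cur_best_pre_0 tracks the best recall at the smallest K in --Ks seen so far; it drives the
# early-stopping check and the optional model saving inside the training loop below.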
475 | 476 | 477 | """ 478 | ********************************************************* 479 | Train 480 | """ 481 | loss_loger, pre_loger, rec_loger, ndcg_loger, hit_loger = [], [], [], [], [] 482 | stopping_step = 0 483 | should_stop = False 484 | for epoch in range(args.epoch): 485 | t1 = time() 486 | loss, mf_loss, emb_loss, cor_loss = 0., 0., 0., 0. 487 | n_batch = data_generator.n_train // args.batch_size + 1 488 | cor_batch_size = int(max(data_generator.n_users/n_batch, data_generator.n_items/n_batch)) 489 | 490 | for idx in range(n_batch): 491 | users, pos_items, neg_items = data_generator.sample() 492 | cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size) 493 | _, batch_loss, batch_mf_loss, batch_emb_loss, batch_cor_loss = sess.run([model.opt, model.loss, 494 | model.mf_loss, model.emb_loss, 495 | model.cor_loss], 496 | feed_dict={model.users: users, 497 | model.pos_items: pos_items, 498 | model.neg_items: neg_items, 499 | model.cor_users: cor_users, 500 | model.cor_items: cor_items}) 501 | loss += batch_loss / n_batch 502 | mf_loss += batch_mf_loss / n_batch 503 | emb_loss += batch_emb_loss / n_batch 504 | cor_loss += batch_cor_loss / n_batch 505 | 506 | if np.isnan(loss) == True: 507 | print('ERROR: loss is nan.') 508 | print(mf_loss, emb_loss) 509 | sys.exit() 510 | 511 | # print the test evaluation metrics each 10 epochs; pos:neg = 1:10. 512 | if (epoch + 1) % args.show_step != 0: 513 | if args.verbose > 0 and epoch % args.verbose == 0: 514 | perf_str = 'Epoch %d [%.1fs]: train==[%.5f=%.5f + %.5f + %.5f]' % (epoch, time() - t1, loss, mf_loss, emb_loss, cor_loss) 515 | print(perf_str) 516 | # Skip testing 517 | continue 518 | 519 | # Begin test at this epoch. 520 | loss_test, mf_loss_test, emb_loss_test, cor_loss_test = 0., 0., 0., 0. 
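# estimate the BPR, embedding-regularization, and correlation losses on batches drawn from the test set
# (data_generator.sample_test), separately from the ranking metrics computed by test() afterwards.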
521 | for idx in range(n_batch): 522 | cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size) 523 | users, pos_items, neg_items = data_generator.sample_test() 524 | batch_loss_test, batch_mf_loss_test, batch_emb_loss_test, batch_cor_loss_test = sess.run( 525 | [model.loss, model.mf_loss, model.emb_loss, model.cor_loss], 526 | feed_dict={model.users: users, 527 | model.pos_items: pos_items, 528 | model.neg_items: neg_items, 529 | model.A_values: A_values_init, 530 | model.cor_users: cor_users, 531 | model.cor_items: cor_items}) 532 | loss_test += batch_loss_test / n_batch 533 | mf_loss_test += batch_mf_loss_test / n_batch 534 | emb_loss_test += batch_emb_loss_test / n_batch 535 | cor_loss_test += batch_cor_loss_test / n_batch 536 | 537 | t2 = time() 538 | users_to_test = list(data_generator.test_set.keys()) 539 | ret = test(sess, model, users_to_test, drop_flag=True, batch_test_flag=whether_test_batch) 540 | 541 | 542 | t3 = time() 543 | 544 | loss_loger.append(loss) 545 | rec_loger.append(ret['recall']) 546 | pre_loger.append(ret['precision']) 547 | ndcg_loger.append(ret['ndcg']) 548 | hit_loger.append(ret['hit_ratio']) 549 | 550 | if args.verbose > 0: 551 | perf_str = 'Epoch %d [%.1fs + %.1fs]: test==[%.5f=%.5f + %.5f + %.5f], recall=[%.5f, %.5f], ' \ 552 | 'precision=[%.5f, %.5f], hit=[%.5f, %.5f], ndcg=[%.5f, %.5f]' % \ 553 | (epoch, t2 - t1, t3 - t2, loss_test, mf_loss_test, emb_loss_test, cor_loss_test, ret['recall'][0], 554 | ret['recall'][-1], 555 | ret['precision'][0], ret['precision'][-1], ret['hit_ratio'][0], ret['hit_ratio'][-1], 556 | ret['ndcg'][0], ret['ndcg'][-1]) 557 | print(perf_str) 558 | 559 | cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0, stopping_step, expected_order='acc', flag_step=args.early) 560 | 561 | # early stopping when cur_best_pre_0 is decreasing for given steps. 562 | if should_stop == True: 563 | break 564 | 565 | # ********************************************************* 566 | # save the user & item embeddings for pretraining. 
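# .... model_save() writes the user_embed / item_embed arrays to
# .... <proj_path>pretrain/<dataset>/<save_name>.npz, which load_best() reloads when the script
# .... is launched with --pretrain 1.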
567 | if ret['recall'][0] == cur_best_pre_0 and args.save_flag == 1 : 568 | model.model_save(args.proj_path, args.dataset, sess, savename=args.save_name) 569 | print('save the model with performance: ', cur_best_pre_0) 570 | 571 | 572 | recs = np.array(rec_loger) 573 | pres = np.array(pre_loger) 574 | ndcgs = np.array(ndcg_loger) 575 | hit = np.array(hit_loger) 576 | 577 | best_rec_0 = max(recs[:, 0]) 578 | idx = list(recs[:, 0]).index(best_rec_0) 579 | 580 | final_perf = "Best Iter=[%d]@[%.1f]\trecall=[%s], precision=[%s], hit=[%s], ndcg=[%s]" % \ 581 | (idx, time() - t0, '\t'.join(['%.5f' % r for r in recs[idx]]), 582 | '\t'.join(['%.5f' % r for r in pres[idx]]), 583 | '\t'.join(['%.5f' % r for r in hit[idx]]), 584 | '\t'.join(['%.5f' % r for r in ndcgs[idx]])) 585 | print(final_perf) 586 | 587 | 588 | -------------------------------------------------------------------------------- /DGCF_v1/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DGCF_v1/utility/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DGCF_v1/utility/batch_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import utility.metrics as metrics 11 | from utility.parser import parse_args 12 | from utility.load_data import * 13 | import multiprocessing 14 | import heapq 15 | import pandas as pd 16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | cores = multiprocessing.cpu_count() // 2 20 | 21 | args = parse_args() 22 | Ks = eval(args.Ks) 23 | 24 | data_generator = Data(path=args.data_path + args.dataset, batch_size=args.batch_size) 25 | USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items 26 | N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test 27 | if args.dataset=='amazon-book': 28 | BATCH_SIZE = args.batch_size//4 29 | else: 30 | BATCH_SIZE = args.batch_size 31 | 32 | def ranklist_by_heapq(user_pos_test, test_items, rating, Ks): 33 | item_score = {} 34 | for i in test_items: 35 | item_score[i] = rating[i] 36 | 37 | K_max = max(Ks) 38 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 39 | 40 | r = [] 41 | for i in K_max_item_score: 42 | if i in user_pos_test: 43 | r.append(1) 44 | else: 45 | r.append(0) 46 | auc = 0. 
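# AUC is not computed in this heapq-based path; run with --test_flag full to use
# ranklist_by_sorted, which also reports AUC via get_auc.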
47 | return r, auc 48 | 49 | def get_auc(item_score, user_pos_test): 50 | item_score = sorted(item_score.items(), key=lambda kv: kv[1]) 51 | item_score.reverse() 52 | item_sort = [x[0] for x in item_score] 53 | posterior = [x[1] for x in item_score] 54 | 55 | r = [] 56 | for i in item_sort: 57 | if i in user_pos_test: 58 | r.append(1) 59 | else: 60 | r.append(0) 61 | auc = metrics.auc(ground_truth=r, prediction=posterior) 62 | return auc 63 | 64 | def ranklist_by_sorted(user_pos_test, test_items, rating, Ks): 65 | item_score = {} 66 | for i in test_items: 67 | item_score[i] = rating[i] 68 | 69 | K_max = max(Ks) 70 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 71 | 72 | r = [] 73 | for i in K_max_item_score: 74 | if i in user_pos_test: 75 | r.append(1) 76 | else: 77 | r.append(0) 78 | auc = get_auc(item_score, user_pos_test) 79 | return r, auc 80 | 81 | def get_performance(user_pos_test, r, auc, Ks): 82 | precision, recall, ndcg, hit_ratio = [], [], [], [] 83 | 84 | for K in Ks: 85 | precision.append(metrics.precision_at_k(r, K)) 86 | recall.append(metrics.recall_at_k(r, K, len(user_pos_test))) 87 | ndcg.append(metrics.ndcg_at_k(r, K, user_pos_test)) 88 | hit_ratio.append(metrics.hit_at_k(r, K)) 89 | 90 | return {'recall': np.array(recall), 'precision': np.array(precision), 91 | 'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc} 92 | 93 | 94 | def test_one_user(x): 95 | # user u's ratings for user u 96 | rating = x[0] 97 | #uid 98 | u = x[1] 99 | #user u's items in the training set 100 | try: 101 | training_items = data_generator.train_items[u] 102 | except Exception: 103 | training_items = [] 104 | #user u's items in the test set 105 | user_pos_test = data_generator.test_set[u] 106 | 107 | all_items = set(range(ITEM_NUM)) 108 | 109 | test_items = list(all_items - set(training_items)) 110 | 111 | if args.test_flag == 'part': 112 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 113 | else: 114 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 115 | 116 | return get_performance(user_pos_test, r, auc, Ks) 117 | 118 | def test_one_user_train(x): 119 | # user u's ratings for user u 120 | rating = x[0] 121 | # uid 122 | u = x[1] 123 | # user u's items in the training set 124 | 125 | training_items = [] 126 | # user u's items in the test set 127 | user_pos_test = data_generator.train_items[u] 128 | 129 | all_items = set(range(ITEM_NUM)) 130 | 131 | test_items = list(all_items - set(training_items)) 132 | 133 | if args.test_flag == 'part': 134 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 135 | else: 136 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 137 | 138 | return get_performance(user_pos_test, r, auc, Ks) 139 | 140 | def test(sess, model, users_to_test, drop_flag=False, batch_test_flag=False,train_set_flag=0): 141 | result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)), 142 | 'hit_ratio': np.zeros(len(Ks)), 'auc': 0.} 143 | 144 | pool = multiprocessing.Pool(cores) 145 | 146 | u_batch_size = BATCH_SIZE * 2 147 | i_batch_size = BATCH_SIZE 148 | 149 | test_users = users_to_test 150 | n_test_users = len(test_users) 151 | n_user_batchs = n_test_users // u_batch_size + 1 152 | 153 | count = 0 154 | for u_batch_id in range(n_user_batchs): 155 | start = u_batch_id * u_batch_size 156 | end = (u_batch_id + 1) * u_batch_size 157 | 158 | user_batch = test_users[start: end] 159 | 160 | if batch_test_flag: 161 | 162 | n_item_batchs = 
ITEM_NUM // i_batch_size + 1 163 | rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM)) 164 | 165 | i_count = 0 166 | for i_batch_id in range(n_item_batchs): 167 | i_start = i_batch_id * i_batch_size 168 | i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM) 169 | 170 | item_batch = range(i_start, i_end) 171 | 172 | if drop_flag == False: 173 | i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 174 | else: 175 | i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 176 | 177 | rate_batch[:, i_start: i_end] = i_rate_batch 178 | i_count += i_rate_batch.shape[1] 179 | 180 | assert i_count == ITEM_NUM 181 | 182 | else: 183 | item_batch = range(ITEM_NUM) 184 | if drop_flag == False: 185 | rate_batch, _1 = sess.run([model.batch_ratings, model._1], {model.users: user_batch, 186 | model.pos_items: item_batch}) 187 | else: 188 | rate_batch, _1, _2 = sess.run([model.batch_ratings, model.print_pick, model.print_embed], {model.users: user_batch, 189 | model.pos_items: item_batch}) 190 | 191 | 192 | user_batch_rating_uid = zip(rate_batch, user_batch) 193 | 194 | if train_set_flag==0: 195 | batch_result = pool.map(test_one_user, user_batch_rating_uid) 196 | else: 197 | batch_result = pool.map(test_one_user_train, user_batch_rating_uid) 198 | count += len(batch_result) 199 | 200 | for re in batch_result: 201 | result['precision'] += re['precision']/n_test_users 202 | result['recall'] += re['recall']/n_test_users 203 | result['ndcg'] += re['ndcg']/n_test_users 204 | result['hit_ratio'] += re['hit_ratio']/n_test_users 205 | result['auc'] += re['auc']/n_test_users 206 | 207 | 208 | assert count == n_test_users 209 | pool.close() 210 | return result 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /DGCF_v1/utility/helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | __author__ = "xiangwang" 11 | import os 12 | import re 13 | 14 | def txt2list(file_src): 15 | orig_file = open(file_src, "r") 16 | lines = orig_file.readlines() 17 | return lines 18 | 19 | def ensureDir(dir_path): 20 | d = os.path.dirname(dir_path) 21 | if not os.path.exists(d): 22 | os.makedirs(d) 23 | 24 | def uni2str(unicode_str): 25 | return str(unicode_str.encode('ascii', 'ignore')).replace('\n', '').strip() 26 | 27 | def hasNumbers(inputString): 28 | return bool(re.search(r'\d', inputString)) 29 | 30 | def delMultiChar(inputString, chars): 31 | for ch in chars: 32 | inputString = inputString.replace(ch, '') 33 | return inputString 34 | 35 | def merge_two_dicts(x, y): 36 | z = x.copy() # start with x's keys and values 37 | z.update(y) # modifies z with y's keys and values & returns None 38 | return z 39 | 40 | def early_stopping(log_value, best_value, stopping_step, expected_order='acc', flag_step=100): 41 | # early stopping strategy: 42 | assert expected_order in ['acc', 'dec'] 43 | 44 | if (expected_order == 'acc' and log_value >= best_value) or (expected_order == 'dec' and log_value <= best_value): 45 | stopping_step = 0 46 | best_value = log_value 47 | else: 48 | stopping_step += 1 49 | 50 | if stopping_step >= flag_step: 51 | print("Early stopping is trigger at step: {} log:{}".format(flag_step, log_value)) 52 | should_stop = True 53 | else: 54 | should_stop = False 55 | return best_value, stopping_step, should_stop 56 | -------------------------------------------------------------------------------- /DGCF_v1/utility/load_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import numpy as np 11 | import random as rd 12 | import scipy.sparse as sp 13 | from time import time 14 | 15 | class Data(object): 16 | def __init__(self, path, batch_size): 17 | self.path = path 18 | self.batch_size = batch_size 19 | 20 | train_file = path + '/train.txt' 21 | test_file = path + '/test.txt' 22 | 23 | #get number of users and items 24 | self.n_users, self.n_items = 0, 0 25 | self.n_train, self.n_test = 0, 0 26 | self.neg_pools = {} 27 | 28 | self.exist_users = [] 29 | 30 | with open(train_file) as f: 31 | for l in f.readlines(): 32 | if len(l) > 0: 33 | l = l.strip('\n').split(' ') 34 | items = [int(i) for i in l[1:]] 35 | uid = int(l[0]) 36 | self.exist_users.append(uid) 37 | self.n_items = max(self.n_items, max(items)) 38 | self.n_users = max(self.n_users, uid) 39 | self.n_train += len(items) 40 | 41 | with open(test_file) as f: 42 | for l in f.readlines(): 43 | if len(l) > 0: 44 | l = l.strip('\n') 45 | try: 46 | items = [int(i) for i in l.split(' ')[1:]] 47 | except Exception: 48 | continue 49 | self.n_items = max(self.n_items, max(items)) 50 | self.n_test += len(items) 51 | self.n_items += 1 52 | self.n_users += 1 53 | 54 | self.print_statistics() 55 | 56 | self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32) 57 | 58 | self.train_items, self.test_set = {}, {} 59 | with open(train_file) as f_train: 60 | with open(test_file) as f_test: 61 | for l in f_train.readlines(): 62 | if len(l) == 0: break 63 | l = l.strip('\n') 64 | items = [int(i) for i in l.split(' ')] 65 | uid, train_items = items[0], items[1:] 66 | 67 | for i in train_items: 68 | self.R[uid, i] = 1. 69 | # self.R[uid][i] = 1 70 | 71 | self.train_items[uid] = train_items 72 | 73 | for l in f_test.readlines(): 74 | if len(l) == 0: break 75 | l = l.strip('\n') 76 | try: 77 | items = [int(i) for i in l.split(' ')] 78 | except Exception: 79 | continue 80 | 81 | uid, test_items = items[0], items[1:] 82 | self.test_set[uid] = test_items 83 | 84 | def get_adj_mat(self): 85 | try: 86 | t1 = time() 87 | adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz') 88 | norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz') 89 | mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz') 90 | print('already load adj matrix', adj_mat.shape, time() - t1) 91 | 92 | except Exception: 93 | adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat() 94 | sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat) 95 | sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat) 96 | sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat) 97 | 98 | try: 99 | pre_adj_mat = sp.load_npz(self.path + '/s_pre_adj_mat.npz') 100 | except Exception: 101 | adj_mat=adj_mat 102 | rowsum = np.array(adj_mat.sum(1)) 103 | d_inv = np.power(rowsum, -0.5).flatten() 104 | d_inv[np.isinf(d_inv)] = 0. 
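# build D^{-1/2} as a diagonal matrix and apply it on both sides, so that
# pre_adj_mat = D^{-1/2} * A * D^{-1/2} (symmetric normalization of the adjacency matrix).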
105 | d_mat_inv = sp.diags(d_inv) 106 | 107 | norm_adj = d_mat_inv.dot(adj_mat) 108 | norm_adj = norm_adj.dot(d_mat_inv) 109 | print('generate pre adjacency matrix.') 110 | pre_adj_mat = norm_adj.tocsr() 111 | sp.save_npz(self.path + '/s_pre_adj_mat.npz', norm_adj) 112 | 113 | 114 | return adj_mat, norm_adj_mat, mean_adj_mat, pre_adj_mat 115 | 116 | def create_adj_mat(self): 117 | t1 = time() 118 | adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32) 119 | adj_mat = adj_mat.tolil() 120 | R = self.R.tolil() 121 | 122 | adj_mat[:self.n_users, self.n_users:] = R 123 | adj_mat[self.n_users:, :self.n_users] = R.T 124 | adj_mat = adj_mat.todok() 125 | print('already create adjacency matrix', adj_mat.shape, time() - t1) 126 | 127 | t2 = time() 128 | 129 | def normalized_adj_single(adj): 130 | rowsum = np.array(adj.sum(1)) 131 | 132 | d_inv = np.power(rowsum, -1).flatten() 133 | d_inv[np.isinf(d_inv)] = 0. 134 | d_mat_inv = sp.diags(d_inv) 135 | 136 | norm_adj = d_mat_inv.dot(adj) 137 | # norm_adj = adj.dot(d_mat_inv) 138 | print('generate single-normalized adjacency matrix.') 139 | return norm_adj.tocoo() 140 | 141 | def check_adj_if_equal(adj): 142 | dense_A = np.array(adj.todense()) 143 | degree = np.sum(dense_A, axis=1, keepdims=False) 144 | 145 | temp = np.dot(np.diag(np.power(degree, -1)), dense_A) 146 | print('check normalized adjacency matrix whether equal to this laplacian matrix.') 147 | return temp 148 | 149 | norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0])) 150 | mean_adj_mat = normalized_adj_single(adj_mat) 151 | 152 | print('already normalize adjacency matrix', time() - t2) 153 | return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr() 154 | 155 | def negative_pool(self): 156 | t1 = time() 157 | for u in self.train_items.keys(): 158 | neg_items = list(set(range(self.n_items)) - set(self.train_items[u])) 159 | pools = [rd.choice(neg_items) for _ in range(100)] 160 | self.neg_pools[u] = pools 161 | print('refresh negative pools', time() - t1) 162 | 163 | def sample(self): 164 | if self.batch_size <= self.n_users: 165 | users = rd.sample(self.exist_users, self.batch_size) 166 | else: 167 | users = [rd.choice(self.exist_users) for _ in range(self.batch_size)] 168 | 169 | 170 | def sample_pos_items_for_u(u, num): 171 | pos_items = self.train_items[u] 172 | n_pos_items = len(pos_items) 173 | pos_batch = [] 174 | while True: 175 | if len(pos_batch) == num: break 176 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 177 | pos_i_id = pos_items[pos_id] 178 | 179 | if pos_i_id not in pos_batch: 180 | pos_batch.append(pos_i_id) 181 | return pos_batch 182 | 183 | def sample_neg_items_for_u(u, num): 184 | neg_items = [] 185 | while True: 186 | if len(neg_items) == num: break 187 | neg_id = np.random.randint(low=0, high=self.n_items,size=1)[0] 188 | if neg_id not in self.train_items[u] and neg_id not in neg_items: 189 | neg_items.append(neg_id) 190 | return neg_items 191 | 192 | def sample_neg_items_for_u_from_pools(u, num): 193 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 194 | return rd.sample(neg_items, num) 195 | 196 | pos_items, neg_items = [], [] 197 | for u in users: 198 | pos_items += sample_pos_items_for_u(u, 1) 199 | neg_items += sample_neg_items_for_u(u, 1) 200 | 201 | return users, pos_items, neg_items 202 | 203 | def sample_test(self): 204 | if self.batch_size <= self.n_users: 205 | users = rd.sample(self.test_set.keys(), self.batch_size) 206 | else: 207 | users = 
[rd.choice(self.exist_users) for _ in range(self.batch_size)] 208 | 209 | def sample_pos_items_for_u(u, num): 210 | pos_items = self.test_set[u] 211 | n_pos_items = len(pos_items) 212 | pos_batch = [] 213 | while True: 214 | if len(pos_batch) == num: break 215 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 216 | pos_i_id = pos_items[pos_id] 217 | 218 | if pos_i_id not in pos_batch: 219 | pos_batch.append(pos_i_id) 220 | return pos_batch 221 | 222 | def sample_neg_items_for_u(u, num): 223 | neg_items = [] 224 | while True: 225 | if len(neg_items) == num: break 226 | neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0] 227 | if neg_id not in (self.test_set[u]+self.train_items[u]) and neg_id not in neg_items: 228 | neg_items.append(neg_id) 229 | return neg_items 230 | 231 | def sample_neg_items_for_u_from_pools(u, num): 232 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 233 | return rd.sample(neg_items, num) 234 | 235 | pos_items, neg_items = [], [] 236 | for u in users: 237 | pos_items += sample_pos_items_for_u(u, 1) 238 | neg_items += sample_neg_items_for_u(u, 1) 239 | 240 | return users, pos_items, neg_items 241 | 242 | def get_num_users_items(self): 243 | return self.n_users, self.n_items 244 | 245 | def print_statistics(self): 246 | print('n_users=%d, n_items=%d' % (self.n_users, self.n_items)) 247 | print('n_interactions=%d' % (self.n_train + self.n_test)) 248 | print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test)/(self.n_users * self.n_items))) 249 | 250 | def get_sparsity_split(self): 251 | try: 252 | split_uids, split_state = [], [] 253 | lines = open(self.path + '/sparsity.split', 'r').readlines() 254 | 255 | for idx, line in enumerate(lines): 256 | if idx % 2 == 0: 257 | split_state.append(line.strip()) 258 | print(line.strip()) 259 | else: 260 | split_uids.append([int(uid) for uid in line.strip().split(' ')]) 261 | print('get sparsity split.') 262 | 263 | except Exception: 264 | split_uids, split_state = self.create_sparsity_split() 265 | f = open(self.path + '/sparsity.split', 'w') 266 | for idx in range(len(split_state)): 267 | f.write(split_state[idx] + '\n') 268 | f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n') 269 | print('create sparsity split.') 270 | 271 | return split_uids, split_state 272 | 273 | def create_sparsity_split(self): 274 | all_users_to_test = list(self.test_set.keys()) 275 | user_n_iid = dict() 276 | 277 | # generate a dictionary to store (key=n_iids, value=a list of uid). 278 | for uid in all_users_to_test: 279 | train_iids = self.train_items[uid] 280 | test_iids = self.test_set[uid] 281 | 282 | n_iids = len(train_iids) + len(test_iids) 283 | 284 | if n_iids not in user_n_iid.keys(): 285 | user_n_iid[n_iids] = [uid] 286 | else: 287 | user_n_iid[n_iids].append(uid) 288 | split_uids = list() 289 | 290 | # split the whole user set into four subset. 
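# .... users are bucketed by their total number of interactions (train + test); a split is closed
# .... roughly every time the accumulated interactions cover another quarter of all interactions,
# .... giving user groups ordered from the sparsest to the most active.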
291 | temp = [] 292 | count = 1 293 | fold = 4 294 | n_count = (self.n_train + self.n_test) 295 | n_rates = 0 296 | 297 | split_state = [] 298 | for idx, n_iids in enumerate(sorted(user_n_iid)): 299 | temp += user_n_iid[n_iids] 300 | n_rates += n_iids * len(user_n_iid[n_iids]) 301 | n_count -= n_iids * len(user_n_iid[n_iids]) 302 | 303 | if n_rates >= count * 0.25 * (self.n_train + self.n_test): 304 | split_uids.append(temp) 305 | 306 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' %(n_iids, len(temp), n_rates) 307 | split_state.append(state) 308 | print(state) 309 | 310 | temp = [] 311 | n_rates = 0 312 | fold -= 1 313 | 314 | if idx == len(user_n_iid.keys()) - 1 or n_count == 0: 315 | split_uids.append(temp) 316 | 317 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates) 318 | split_state.append(state) 319 | print(state) 320 | 321 | 322 | 323 | return split_uids, split_state 324 | -------------------------------------------------------------------------------- /DGCF_v1/utility/metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import numpy as np 11 | from sklearn.metrics import roc_auc_score 12 | 13 | def recall(rank, ground_truth, N): 14 | return len(set(rank[:N]) & set(ground_truth)) / float(len(set(ground_truth))) 15 | 16 | def precision_at_k(r, k): 17 | """Score is precision @ k 18 | Relevance is binary (nonzero is relevant). 19 | Returns: 20 | Precision @ k 21 | Raises: 22 | ValueError: len(r) must be >= k 23 | """ 24 | assert k >= 1 25 | r = np.asarray(r)[:k] 26 | return np.mean(r) 27 | 28 | def average_precision(r,cut): 29 | """Score is average precision (area under PR curve) 30 | Relevance is binary (nonzero is relevant). 31 | Returns: 32 | Average precision 33 | """ 34 | r = np.asarray(r) 35 | out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]] 36 | if not out: 37 | return 0. 38 | return np.sum(out)/float(min(cut, np.sum(r))) 39 | 40 | def mean_average_precision(rs): 41 | """Score is mean average precision 42 | Relevance is binary (nonzero is relevant). 43 | Returns: 44 | Mean average precision 45 | """ 46 | return np.mean([average_precision(r) for r in rs]) 47 | 48 | def dcg_at_k(r, k, method=1): 49 | """Score is discounted cumulative gain (dcg) 50 | Relevance is positive real values. Can use binary 51 | as the previous methods. 52 | Returns: 53 | Discounted cumulative gain 54 | """ 55 | r = np.asfarray(r)[:k] 56 | if r.size: 57 | if method == 0: 58 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 59 | elif method == 1: 60 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 61 | else: 62 | raise ValueError('method must be 0 or 1.') 63 | return 0. 64 | 65 | def ndcg_at_k(r, k, ground_truth, method=1): 66 | """Score is normalized discounted cumulative gain (ndcg) 67 | Relevance is positive real values. Can use binary 68 | as the previous methods. 
69 | Returns: 70 | Normalized discounted cumulative gain 71 | 72 | Low but correct defination 73 | """ 74 | GT = set(ground_truth) 75 | if len(GT) > k : 76 | sent_list = [1.0] * k 77 | else: 78 | sent_list = [1.0]*len(GT) + [0.0]*(k-len(GT)) 79 | dcg_max = dcg_at_k(sent_list, k, method) 80 | if not dcg_max: 81 | return 0. 82 | return dcg_at_k(r, k, method) / dcg_max 83 | 84 | def recall_at_k(r, k, all_pos_num): 85 | r = np.asfarray(r)[:k] 86 | return np.sum(r) / all_pos_num 87 | 88 | def hit_at_k(r, k): 89 | r = np.array(r)[:k] 90 | if np.sum(r) > 0: 91 | return 1. 92 | else: 93 | return 0. 94 | 95 | def F1(pre, rec): 96 | if pre + rec > 0: 97 | return (2.0 * pre * rec) / (pre + rec) 98 | else: 99 | return 0. 100 | 101 | def auc(ground_truth, prediction): 102 | try: 103 | res = roc_auc_score(y_true=ground_truth, y_score=prediction) 104 | except Exception: 105 | res = 0. 106 | return res -------------------------------------------------------------------------------- /DGCF_v1/utility/parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | import argparse 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description="Run DGCF.") 13 | parser.add_argument('--data_path', nargs='?', default='../Data/', 14 | help='Input data path.') 15 | parser.add_argument('--proj_path', nargs='?', default='', 16 | help='Project path.') 17 | 18 | parser.add_argument('--pick', type=int, default=0, 19 | help='O for no pick, 1 for pick') 20 | parser.add_argument('--pick_scale', type=float, default=1e10, 21 | help='Scale') 22 | parser.add_argument('--dataset', nargs='?', default='gowalla', 23 | help='Choose a dataset from {gowalla, yelp2018, amazon-book}') 24 | parser.add_argument('--pretrain', type=int, default=0, 25 | help='0: No pretrain, 1:Use stored models.') 26 | parser.add_argument('--embed_name', nargs='?', default='', 27 | help='Name for pretrained model.') 28 | parser.add_argument('--verbose', type=int, default=1, 29 | help='Interval of evaluation.') 30 | 31 | 32 | parser.add_argument('--epoch', type=int, default=3000, 33 | help='Number of epochs') 34 | parser.add_argument('--embed_size', type=int, default=64, 35 | help='Embedding size.') 36 | parser.add_argument('--layer_size', nargs='?', default='[64]', 37 | help='Output sizes of every layer') 38 | parser.add_argument('--batch_size', type=int, default=1024, 39 | help='Batch size.') 40 | parser.add_argument('--lr', type=float, default=0.01, 41 | help='Learning rate.') 42 | parser.add_argument('--cor_flag', type=int, default=1, 43 | help='Correlation matrix flag') 44 | parser.add_argument('--corDecay', type=float, default=0.0, 45 | help='Distance Correlation Weight') 46 | parser.add_argument('--regs', nargs='?', default='[1e-3,1e-4,1e-4]', 47 | help='Regularizations.') 48 | 49 | parser.add_argument('--n_layers', type=int, default=1, 50 | help='Layer numbers.') 51 | parser.add_argument('--n_factors', type=int, default=4, 52 | help='Number of factors to disentangle the original embed-size representation.') 53 | parser.add_argument('--n_iterations', type=int, default=2, 54 | help='Number of iterations to perform the routing mechanism.') 55 | 56 | 57 | 
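# evaluation, early-stopping, and model-saving options; with --save_flag 1 the best embeddings are
# stored under <proj_path>pretrain/<dataset>/<save_name>.npz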
parser.add_argument('--show_step', type=int, default=15, 58 | help='Test every show_step epochs.') 59 | parser.add_argument('--early', type=int, default=40, 60 | help='Patience (number of evaluations) for early stopping.') 61 | parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]', 62 | help='K values at which the top-K metrics are computed.') 63 | 64 | parser.add_argument('--save_flag', type=int, default=0, 65 | help='0: Disable model saver, 1: Save the better model') 66 | parser.add_argument('--save_name', nargs='?', default='best_model', 67 | help='Name for the saved model.') 68 | 69 | parser.add_argument('--test_flag', nargs='?', default='part', 70 | help='Specify the test type from {part, full}, indicating whether the inference is done in mini-batches') 71 | 72 | 73 | return parser.parse_args() 74 | -------------------------------------------------------------------------------- /DGCF_v2/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/amazon-book/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? Please visit the [website](http://jmcauley.ucsd.edu/data/amazon). 2 | -------------------------------------------------------------------------------- /Data/gowalla/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? 2 | Please visit the [website](https://snap.stanford.edu/data/loc-gowalla.html). 3 | -------------------------------------------------------------------------------- /Data/yelp2018/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Disentangled Graph Collaborative Filtering 2 | This is our TensorFlow implementation for the paper: 3 | 4 | >Xiang Wang, Hongye Jin, An Zhang, Xiangnan He, Tong Xu, and Tat-Seng Chua (2020). Disentangled Graph Collaborative Filtering, [Paper in arXiv](https://arxiv.org/abs/2007.01764). In SIGIR'20, Xi'an, China, July 25-30, 2020. 5 | 6 | Author: Dr. Xiang Wang (xiangwang at u.nus.edu) 7 | 8 | ## Introduction 9 | Disentangled Graph Collaborative Filtering (DGCF) is an explainable recommendation framework equipped with (1) the dynamic routing mechanism of capsule networks, to refine the strengths of user-item interactions in intent-aware graphs, (2) the embedding propagation mechanism of graph neural networks, to distill pertinent information from higher-order connectivity, and (3) the distance correlation of independence modeling, to ensure independence among intents. As such, we explicitly disentangle the hidden intents of users in representation learning.
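The independence module in (3) is implemented as a distance-correlation penalty between pairs of intent-aware embedding chunks (see `create_cor_loss` and `_create_distance_correlation` in `DGCF_v1/DGCF.py`). For reference, here is a minimal NumPy sketch of that quantity; it mirrors the TensorFlow implementation but is illustrative only, and the toy shapes at the end are arbitrary placeholders:
```python
import numpy as np

def centered_distance(X, eps=1e-8):
    # pairwise Euclidean distances, then double-centering over rows, columns, and the grand mean
    r = np.sum(np.square(X), axis=1, keepdims=True)
    D = np.sqrt(np.maximum(r - 2 * X @ X.T + r.T, 0.0) + eps)
    return D - D.mean(axis=0, keepdims=True) - D.mean(axis=1, keepdims=True) + D.mean()

def distance_correlation(X1, X2, eps=1e-8):
    # dCor(X1, X2) = dCov(X1, X2) / sqrt(dCov(X1, X1) * dCov(X2, X2))
    D1, D2 = centered_distance(X1, eps), centered_distance(X2, eps)
    n = float(D1.shape[0])
    dcov = lambda A, B: np.sqrt(np.maximum((A * B).sum() / (n * n), 0.0) + eps)
    return dcov(D1, D2) / (np.sqrt(np.maximum(dcov(D1, D1) * dcov(D2, D2), 0.0)) + 1e-10)

# toy example: two factor-wise chunks of shape [batch_size, embed_size / n_factors]
x, y = np.random.rand(128, 16), np.random.rand(128, 16)
print(distance_correlation(x, y))
```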
10 | 11 | ## Citation 12 | If you want to use our codes and datasets in your research, please cite: 13 | ``` 14 | @inproceedings{DGCF19, 15 | author = {Xiang Wang and 16 | Hongye Jin and 17 | An Zhang and 18 | Xiangnan He and 19 | Tong Xu and 20 | Tat{-}Seng Chua}, 21 | title = {Disentangled Graph Collaborative Filtering}, 22 | booktitle = {Proceedings of the 43rd International {ACM} {SIGIR} Conference on 23 | Research and Development in Information Retrieval, {SIGIR} 2020, Xi'an, 24 | China, July 25-30, 2020.}, 25 | year = {2020}, 26 | } 27 | ``` 28 | ## Environment Requirement 29 | We recommend running this code on GPUs. The code has been tested under Python 3.6.5. The required packages are as follows: 30 | * tensorflow_gpu == 1.14.0 31 | * numpy == 1.14.3 32 | * scipy == 1.1.0 33 | * sklearn == 0.19.1 34 | 35 | ## Versions 36 | We released the implementation based on the NGCF code as DGCF_v1. Later, we will release another implementation based on the LightGCN code as DGCF_v2, which is equipped with some speedup techniques. 37 | 38 | ## Example to Run the Codes 39 | The commands and their arguments are documented in the code (see the parse_args function in DGCF_v1/utility/parser.py). 40 | * Gowalla dataset 41 | ``` 42 | CUDA_VISIBLE_DEVICES=0 python DGCF.py --dataset gowalla --batch_size 2000 --n_layers 1 --n_iterations 2 --corDecay 0.01 --n_factors 4 --show_step 3 --lr 0.001 43 | ``` 44 | 45 | Some important arguments (in addition to those of NGCF): 46 | * `cor_flag` 47 | * It specifies whether the distance correlation (i.e., independence modeling) is activated. 48 | * Here we provide two options: 49 | * 1 (by default), which activates the distance correlation in [Disentangled Graph Collaborative Filtering](https://arxiv.org/abs/2007.01764), SIGIR 2020. Usage: `--cor_flag 1`. 50 | * 0, which disables the distance correlation. Usage: `--cor_flag 0`. 51 | 52 | * `corDecay` 53 | * It specifies the weight to control the distance correlation. 54 | * Here we provide two options: 55 | * 0.0 (by default), which similarly disables the distance correlation and makes DGCF rely only on the dynamic routing mechanism to disentangle the user intents. Usage: `--corDecay 0.0`. 56 | * other scales like 0.1, which uses 0.1 to control the strength of the distance correlation. Usage: `--corDecay 0.1`. 57 | 58 | * `n_factors` 59 | * It indicates the number of latent intents used to disentangle the holistic representation into chunked intent-aware representations. Usage: `--n_factors 4`. 60 | * Note that the argument `embed_size` needs to be exactly divisible by the argument `n_factors`. 61 | 62 | * `n_iterations` 63 | * It indicates the number of iterations of the dynamic routing mechanism. Usage: `--n_iterations 2`. 64 | 65 | ## Dataset 66 | Following our prior work NGCF and LightGCN, we provide three processed datasets: Gowalla, Amazon-book, and Yelp2018. 67 | Note that the Yelp2018 dataset used in DGCF is slightly different from the original one in NGCF, since we found some bugs in the preprocessing code that constructs the Yelp2018 dataset. We reran the experiments and report the performance on the corrected dataset. 68 | 69 | ## Acknowledgement 70 | 71 | This research is supported by the National Research Foundation, Singapore under its International Research Centres in Singapore Funding Initiative. Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not reflect the views of the National Research Foundation, Singapore.
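## Data Format
`DGCF_v1/utility/load_data.py` reads `train.txt` and `test.txt` with one user per line: a space-separated list whose first token is the (remapped) user ID and whose remaining tokens are the IDs of the items that user interacted with; `n_users` and `n_items` are then inferred as the largest IDs plus one, so IDs are expected to be zero-based integers. A minimal parsing sketch (the IDs below are made-up placeholders):
```python
# one line of train.txt / test.txt looks like: "<user_id> <item_id> <item_id> ... <item_id>"
line = "0 12 7 345"                                   # made-up IDs, for illustration only
tokens = line.strip('\n').split(' ')
uid, items = int(tokens[0]), [int(i) for i in tokens[1:]]
```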
72 | --------------------------------------------------------------------------------