├── DGCF_v1 ├── DGCF.py ├── README.md └── utility │ ├── README.md │ ├── batch_test.py │ ├── helper.py │ ├── load_data.py │ ├── metrics.py │ └── parser.py ├── DGCF_v2 └── README.md ├── Data ├── README.md ├── amazon-book │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt ├── gowalla │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt └── yelp2018 │ ├── README.md │ ├── item_list.txt │ ├── test.txt │ ├── train.txt │ └── user_list.txt └── README.md /DGCF_v1/DGCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/bash 2 | ''' 3 | Created on Oct 10, 2019 4 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 5 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 6 | Note that: This implementation is based on the codes of NGCF. 7 | 8 | @author: Xiang Wang (xiangwang@u.nus.edu) 9 | ''' 10 | 11 | import tensorflow as tf 12 | import tensorflow.compat.v1 as tfv1 13 | import os 14 | import sys 15 | import random as rd 16 | import pickle 17 | import numpy as np 18 | 19 | 20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 21 | 22 | from utility.helper import * 23 | from utility.batch_test import * 24 | 25 | 26 | class GDCF(object): 27 | def __init__(self, data_config, pretrain_data): 28 | # argument settings 29 | 30 | self.pretrain_data = pretrain_data 31 | self.n_users = data_config['n_users'] 32 | self.n_items = data_config['n_items'] 33 | 34 | self.n_fold = 1 35 | self.norm_adj = data_config['norm_adj'] 36 | self.all_h_list = data_config['all_h_list'] 37 | self.all_t_list = data_config['all_t_list'] 38 | self.A_in_shape = self.norm_adj.tocoo().shape 39 | 40 | self.n_nonzero_elems = self.norm_adj.count_nonzero() 41 | self.lr = args.lr 42 | self.emb_dim = args.embed_size 43 | self.n_factors = args.n_factors 44 | self.n_iterations = args.n_iterations 45 | self.n_layers = args.n_layers 46 | self.pick_level = args.pick_scale 47 | self.cor_flag = args.cor_flag 48 | if args.pick == 1: 49 | self.is_pick = True 50 | else: 51 | self.is_pick = False 52 | 53 | self.batch_size = args.batch_size 54 | self.regs = eval(args.regs) 55 | self.decay = self.regs[0] 56 | self.verbose = args.verbose 57 | 58 | ''' 59 | ********************************************************* 60 | Create Placeholder for Input Data & Dropout. 61 | ''' 62 | # placeholder definition 63 | self.users = tfv1.placeholder(tf.int32, shape=(None,)) 64 | self.pos_items = tfv1.placeholder(tf.int32, shape=(None,)) 65 | self.neg_items = tfv1.placeholder(tf.int32, shape=(None,)) 66 | 67 | # additional placeholders for the distance correlation 68 | self.cor_users = tfv1.placeholder(tf.int32, shape=(None,)) 69 | self.cor_items = tfv1.placeholder(tf.int32, shape=(None,)) 70 | 71 | # assign different values with different factors (channels). 72 | self.A_values = tfv1.placeholder(tf.float32, shape=[self.n_factors, len(self.all_h_list)], name='A_values') 73 | 74 | """ 75 | ********************************************************* 76 | Create Model Parameters (i.e., Initialize Weights). 
77 | """ 78 | # initialization of model parameters 79 | self.weights = self._init_weights() 80 | 81 | # create models 82 | self.ua_embeddings, self.ia_embeddings, self.f_weight, self.ua_embeddings_t, self.ia_embeddings_t = self._create_star_routing_embed_with_P(pick_=self.is_pick) 83 | 84 | """ 85 | ********************************************************* 86 | Establish the final representations for user-item pairs in batch. 87 | """ 88 | self.u_g_embeddings = tf.nn.embedding_lookup(self.ua_embeddings, self.users) 89 | self.u_g_embeddings_t = tf.nn.embedding_lookup(self.ua_embeddings_t, self.users) 90 | self.pos_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.pos_items) 91 | self.pos_i_g_embeddings_t = tf.nn.embedding_lookup(self.ia_embeddings_t, self.pos_items) 92 | 93 | 94 | self.neg_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.neg_items) 95 | self.u_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['user_embedding'], self.users) 96 | self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['item_embedding'], self.pos_items) 97 | self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(self.weights['item_embedding'], self.neg_items) 98 | 99 | self.cor_u_g_embeddings = tf.nn.embedding_lookup(self.ua_embeddings, self.cor_users) 100 | self.cor_i_g_embeddings = tf.nn.embedding_lookup(self.ia_embeddings, self.cor_items) 101 | 102 | 103 | #Inference for the testing phase. 104 | self.batch_ratings = tf.matmul(self.u_g_embeddings_t, self.pos_i_g_embeddings_t, transpose_a=False, transpose_b=True) 105 | 106 | #Generate Predictions & Optimize via BPR loss. 107 | self.mf_loss, self.emb_loss = self.create_bpr_loss(self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings) 108 | 109 | # whether user distance correlation 110 | if args.corDecay < 1e-9: 111 | self.cor_loss = tf.constant(0.0) 112 | else: 113 | self.cor_loss = args.corDecay * self.create_cor_loss(self.cor_u_g_embeddings, self.cor_i_g_embeddings) 114 | 115 | # self.loss = self.mf_loss + self.emb_loss + self.reg_loss 116 | self.loss = self.mf_loss + self.emb_loss + self.cor_loss 117 | self.opt = tfv1.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss) 118 | 119 | def _init_weights(self): 120 | all_weights = dict() 121 | 122 | initializer = tf.contrib.layers.xavier_initializer() 123 | 124 | if self.pretrain_data is None: 125 | all_weights['user_embedding'] = tf.Variable(initializer([self.n_users, self.emb_dim]), 126 | name='user_embedding') 127 | all_weights['item_embedding'] = tf.Variable(initializer([self.n_items, self.emb_dim]), 128 | name='item_embedding') 129 | print('using xavier initialization') 130 | else: 131 | all_weights['user_embedding'] = tf.Variable(initial_value=self.pretrain_data['user_embed'], trainable=True, 132 | name='user_embedding', dtype=tf.float32) 133 | all_weights['item_embedding'] = tf.Variable(initial_value=self.pretrain_data['item_embed'], trainable=True, 134 | name='item_embedding', dtype=tf.float32) 135 | print('using pretrained initialization') 136 | 137 | return all_weights 138 | 139 | def _create_star_routing_embed_with_P(self, pick_ = False): 140 | ''' 141 | pick_ : True, the model would narrow the weight of the least important factor down to 1/args.pick_scale. 142 | pick_ : False, do nothing. 
143 | ''' 144 | p_test = False 145 | p_train = False 146 | 147 | A_values = tf.ones(shape=[self.n_factors, len(self.all_h_list)]) 148 | # get a (n_factors)-length list of [n_users+n_items, n_users+n_items] 149 | 150 | # load the initial all-one adjacency values 151 | # .... A_values is a all-ones dense tensor with the size of [n_factors, all_h_list]. 152 | 153 | 154 | # get the ID embeddings of users and items 155 | # .... ego_embeddings is a dense tensor with the size of [n_users+n_items, embed_size]; 156 | # .... all_embeddings stores a (n_layers)-len list of outputs derived from different layers. 157 | ego_embeddings = tf.concat([self.weights['user_embedding'], self.weights['item_embedding']], axis=0) 158 | all_embeddings = [ego_embeddings] 159 | all_embeddings_t = [ego_embeddings] 160 | 161 | output_factors_distribution = [] 162 | 163 | factor_num = [self.n_factors, self.n_factors, self.n_factors] 164 | iter_num = [self.n_iterations, self.n_iterations, self.n_iterations] 165 | for k in range(0, self.n_layers): 166 | # prepare the output embedding list 167 | # .... layer_embeddings stores a (n_factors)-len list of outputs derived from the last routing iterations. 168 | n_factors_l = factor_num[k] 169 | n_iterations_l = iter_num[k] 170 | layer_embeddings = [] 171 | layer_embeddings_t = [] 172 | 173 | # split the input embedding table 174 | # .... ego_layer_embeddings is a (n_factors)-leng list of embeddings [n_users+n_items, embed_size/n_factors] 175 | ego_layer_embeddings = tf.split(ego_embeddings, n_factors_l, 1) 176 | ego_layer_embeddings_t = tf.split(ego_embeddings, n_factors_l, 1) 177 | 178 | # perform routing mechanism 179 | for t in range(0, n_iterations_l): 180 | iter_embeddings = [] 181 | iter_embeddings_t = [] 182 | A_iter_values = [] 183 | 184 | # split the adjacency values & get three lists of [n_users+n_items, n_users+n_items] sparse tensors 185 | # .... A_factors is a (n_factors)-len list, each of which is an adjacency matrix 186 | # .... D_col_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. columns 187 | # .... D_row_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. rows 188 | if t == n_iterations_l - 1: 189 | p_test = pick_ 190 | p_train = False 191 | 192 | A_factors, D_col_factors, D_row_factors = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick= p_train) 193 | A_factors_t, D_col_factors_t, D_row_factors_t = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick= p_test) 194 | for i in range(0, n_factors_l): 195 | # update the embeddings via simplified graph convolution layer 196 | # .... D_col_factors[i] * A_factors[i] * D_col_factors[i] is Laplacian matrix w.r.t. the i-th factor 197 | # .... 
factor_embeddings is a dense tensor with the size of [n_users+n_items, embed_size/n_factors] 198 | factor_embeddings = tf.sparse.sparse_dense_matmul(D_col_factors[i], ego_layer_embeddings[i]) 199 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(D_col_factors_t[i], ego_layer_embeddings_t[i]) 200 | 201 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(A_factors_t[i], factor_embeddings_t) 202 | factor_embeddings = tf.sparse.sparse_dense_matmul(A_factors[i], factor_embeddings) 203 | 204 | factor_embeddings = tf.sparse.sparse_dense_matmul(D_col_factors[i], factor_embeddings) 205 | factor_embeddings_t = tf.sparse.sparse_dense_matmul(D_col_factors_t[i], factor_embeddings_t) 206 | 207 | iter_embeddings.append(factor_embeddings) 208 | iter_embeddings_t.append(factor_embeddings_t) 209 | 210 | if t == n_iterations_l - 1: 211 | layer_embeddings = iter_embeddings 212 | layer_embeddings_t = iter_embeddings_t 213 | 214 | # get the factor-wise embeddings 215 | # .... head_factor_embeddings is a dense tensor with the size of [all_h_list, embed_size/n_factors] 216 | # .... analogous to tail_factor_embeddings 217 | head_factor_embedings = tf.nn.embedding_lookup(factor_embeddings, self.all_h_list) 218 | tail_factor_embedings = tf.nn.embedding_lookup(ego_layer_embeddings[i], self.all_t_list) 219 | 220 | # .... constrain the vector length 221 | # .... make the following attentive weights within the range of (0,1) 222 | head_factor_embedings = tf.math.l2_normalize(head_factor_embedings, axis=1) 223 | tail_factor_embedings = tf.math.l2_normalize(tail_factor_embedings, axis=1) 224 | 225 | # get the attentive weights 226 | # .... A_factor_values is a dense tensor with the size of [all_h_list,1] 227 | A_factor_values = tf.reduce_sum(tf.multiply(head_factor_embedings, tf.tanh(tail_factor_embedings)), axis=1) 228 | 229 | # update the attentive weights 230 | A_iter_values.append(A_factor_values) 231 | 232 | # pack (n_factors) adjacency values into one [n_factors, all_h_list] tensor 233 | A_iter_values = tf.stack(A_iter_values, 0) 234 | # add all layer-wise attentive weights up. 
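# .... A_values accumulates these attentive logits over the routing iterations; the factor-wise
# .... softmax inside _convert_A_values_to_A_factors_with_P turns them into per-intent edge weights
# .... when the adjacency factors are rebuilt at the next iteration (and the next layer).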
235 | A_values += A_iter_values 236 | 237 | if t == n_iterations_l - 1: 238 | #layer_embeddings = iter_embeddings 239 | output_factors_distribution.append(A_factors) 240 | 241 | # sum messages of neighbors, [n_users+n_items, embed_size] 242 | side_embeddings = tf.concat(layer_embeddings, 1) 243 | side_embeddings_t = tf.concat(layer_embeddings_t, 1) 244 | 245 | ego_embeddings = side_embeddings 246 | ego_embeddings_t = side_embeddings_t 247 | # concatenate outputs of all layers 248 | all_embeddings_t += [ego_embeddings_t] 249 | all_embeddings += [ego_embeddings] 250 | 251 | all_embeddings = tf.stack(all_embeddings, 1) 252 | all_embeddings = tf.reduce_mean(all_embeddings, axis=1, keepdims=False) 253 | 254 | all_embeddings_t = tf.stack(all_embeddings_t, 1) 255 | all_embeddings_t = tf.reduce_mean(all_embeddings_t, axis=1, keep_dims=False) 256 | 257 | u_g_embeddings, i_g_embeddings = tf.split(all_embeddings, [self.n_users, self.n_items], 0) 258 | u_g_embeddings_t, i_g_embeddings_t = tf.split(all_embeddings_t, [self.n_users, self.n_items], 0) 259 | 260 | return u_g_embeddings, i_g_embeddings, output_factors_distribution, u_g_embeddings_t, i_g_embeddings_t 261 | 262 | def create_bpr_loss(self, users, pos_items, neg_items): 263 | pos_scores = tf.reduce_sum(tf.multiply(users, pos_items), axis=1) 264 | neg_scores = tf.reduce_sum(tf.multiply(users, neg_items), axis=1) 265 | 266 | regularizer = tf.nn.l2_loss(self.u_g_embeddings_pre) + tf.nn.l2_loss( 267 | self.pos_i_g_embeddings_pre) + tf.nn.l2_loss(self.neg_i_g_embeddings_pre) 268 | regularizer = regularizer / self.batch_size 269 | 270 | # In the first version, we implement the bpr loss via the following codes: 271 | # We report the performance in our paper using this implementation. 272 | # maxi = tf.log(tf.nn.sigmoid(pos_scores - neg_scores)) 273 | # mf_loss = tf.negative(tf.reduce_mean(maxi)) 274 | 275 | ## In the second version, we implement the bpr loss via the following codes to avoid 'NAN' loss during training: 276 | ## However, it will change the training performance and training performance. 277 | ## Please retrain the model and do a grid search for the best experimental setting. 278 | 279 | mf_loss = tf.reduce_mean(tf.nn.softplus(-(pos_scores - neg_scores))) 280 | 281 | emb_loss = self.decay * regularizer 282 | 283 | return mf_loss, emb_loss 284 | 285 | def create_cor_loss(self, cor_u_embeddings, cor_i_embeddings): 286 | cor_loss = tf.constant(0.0, tf.float32) 287 | 288 | if self.cor_flag == 0: 289 | return cor_loss 290 | 291 | ui_embeddings = tf.concat([cor_u_embeddings, cor_i_embeddings], axis=0) 292 | ui_factor_embeddings = tf.split(ui_embeddings, self.n_factors, 1) 293 | 294 | for i in range(0, self.n_factors-1): 295 | x = ui_factor_embeddings[i] 296 | y = ui_factor_embeddings[i+1] 297 | cor_loss += self._create_distance_correlation(x, y) 298 | 299 | cor_loss /= ((self.n_factors + 1.0) * self.n_factors/2) 300 | 301 | return cor_loss 302 | 303 | def model_save(self, path, dataset, ses, savename='best_model'): 304 | save_pretrain_path = '%spretrain/%s/%s' % (path, dataset, savename) 305 | np.savez(save_pretrain_path,user_embed=np.array(self.weights['user_embedding'].eval(session=ses)), 306 | item_embed=np.array(model.weights['item_embedding'].eval(session=ses))) 307 | 308 | def _create_distance_correlation(self, X1, X2): 309 | 310 | def _create_centered_distance(X): 311 | ''' 312 | Used to calculate the distance matrix of N samples. 313 | (However how could tf store a HUGE matrix with the shape like 70000*70000*4 Bytes????) 
314 | ''' 315 | # calculate the pairwise distance of X 316 | # .... A with the size of [batch_size, embed_size/n_factors] 317 | # .... D with the size of [batch_size, batch_size] 318 | # X = tf.math.l2_normalize(XX, axis=1) 319 | 320 | r = tf.reduce_sum(tf.square(X), 1, keepdims=True) 321 | D = tf.sqrt(tf.maximum(r - 2 * tf.matmul(a=X, b=X, transpose_b=True) + tf.transpose(r), 0.0) + 1e-8) 322 | 323 | # # calculate the centered distance of X 324 | # # .... D with the size of [batch_size, batch_size] 325 | D = D - tf.reduce_mean(D, axis=0, keepdims=True) - tf.reduce_mean(D, axis=1, keepdims=True) \ 326 | + tf.reduce_mean(D) 327 | return D 328 | 329 | def _create_distance_covariance(D1, D2): 330 | # calculate distance covariance between D1 and D2 331 | n_samples = tf.dtypes.cast(tf.shape(D1)[0], tf.float32) 332 | dcov = tf.sqrt(tf.maximum(tf.reduce_sum(D1 * D2) / (n_samples * n_samples), 0.0) + 1e-8) 333 | # dcov = tf.sqrt(tf.maximum(tf.reduce_sum(D1 * D2)) / n_samples 334 | return dcov 335 | 336 | D1 = _create_centered_distance(X1) 337 | D2 = _create_centered_distance(X2) 338 | 339 | dcov_12 = _create_distance_covariance(D1, D2) 340 | dcov_11 = _create_distance_covariance(D1, D1) 341 | dcov_22 = _create_distance_covariance(D2, D2) 342 | 343 | # calculate the distance correlation 344 | dcor = dcov_12 / (tf.sqrt(tf.maximum(dcov_11 * dcov_22, 0.0)) + 1e-10) 345 | # return tf.reduce_sum(D1) + tf.reduce_sum(D2) 346 | return dcor 347 | 348 | def _convert_A_values_to_A_factors_with_P(self, f_num, A_factor_values, pick=True): 349 | 350 | A_factors = [] 351 | D_col_factors = [] 352 | D_row_factors = [] 353 | # get the indices of adjacency matrix. 354 | A_indices = np.mat([self.all_h_list, self.all_t_list]).transpose() 355 | D_indices = np.mat([list(range(self.n_users+self.n_items)), list(range(self.n_users+self.n_items))]).transpose() 356 | 357 | # apply factor-aware softmax function over the values of adjacency matrix 358 | # .... A_factor_values is [n_factors, all_h_list] 359 | if pick: 360 | A_factor_scores = tf.nn.softmax(A_factor_values, 0) 361 | min_A = tf.reduce_min(A_factor_scores, 0) 362 | index = A_factor_scores > (min_A + 0.0000001) 363 | index = tf.cast(index, tf.float32)*(self.pick_level-1.0) + 1.0 # adjust the weight of the minimum factor to 1/self.pick_level 364 | 365 | A_factor_scores = A_factor_scores * index 366 | A_factor_scores = A_factor_scores / tf.reduce_sum(A_factor_scores, 0) 367 | else: 368 | A_factor_scores = tf.nn.softmax(A_factor_values, 0) 369 | 370 | for i in range(0, f_num): 371 | # in the i-th factor, couple the adjacency values with the adjacency indices 372 | # .... A_i_tensor is a sparse tensor with size of [n_users+n_items, n_users+n_items] 373 | A_i_scores = A_factor_scores[i] 374 | A_i_tensor = tf.SparseTensor(A_indices, A_i_scores, self.A_in_shape) 375 | 376 | # get the degree values of A_i_tensor 377 | # .... D_i_scores_col is [n_users+n_items, 1] 378 | # .... D_i_scores_row is [1, n_users+n_items] 379 | D_i_col_scores = 1/tf.math.sqrt(tf.sparse_reduce_sum(A_i_tensor, axis=1)) 380 | D_i_row_scores = 1/tf.math.sqrt(tf.sparse_reduce_sum(A_i_tensor, axis=0)) 381 | 382 | # couple the laplacian values with the adjacency indices 383 | # .... 
A_i_tensor is a sparse tensor with size of [n_users+n_items, n_users+n_items] 384 | D_i_col_tensor = tf.SparseTensor(D_indices, D_i_col_scores, self.A_in_shape) 385 | D_i_row_tensor = tf.SparseTensor(D_indices, D_i_row_scores, self.A_in_shape) 386 | 387 | A_factors.append(A_i_tensor) 388 | D_col_factors.append(D_i_col_tensor) 389 | D_row_factors.append(D_i_row_tensor) 390 | 391 | # return a (n_factors)-length list of laplacian matrix 392 | return A_factors, D_col_factors, D_row_factors 393 | 394 | 395 | def load_best(name="best_model"): 396 | pretrain_path = '%spretrain/%s/%s.npz' % (args.proj_path, args.dataset, name) 397 | try: 398 | pretrain_data = np.load(pretrain_path) 399 | print('load the best model:', name) 400 | except Exception: 401 | pretrain_data = None 402 | return pretrain_data 403 | 404 | def load_adjacency_list_data(adj_mat): 405 | tmp = adj_mat.tocoo() 406 | all_h_list = list(tmp.row) 407 | all_t_list = list(tmp.col) 408 | all_v_list = list(tmp.data) 409 | 410 | return all_h_list, all_t_list, all_v_list 411 | 412 | def create_initial_A_values(n_factors, all_v_list): 413 | return np.array([all_v_list] * n_factors) 414 | 415 | def sample_cor_samples(n_users, n_items, cor_batch_size): 416 | ''' 417 | We have to sample some embedded representations out of all nodes. 418 | Becasue we have no way to store cor-distance for each pair. 419 | ''' 420 | cor_users = rd.sample(list(range(n_users)), cor_batch_size) 421 | cor_items = rd.sample(list(range(n_items)), cor_batch_size) 422 | 423 | return cor_users, cor_items 424 | 425 | if __name__ == '__main__': 426 | whether_test_batch = True 427 | 428 | print("************************* Run with following settings 🏃 ***************************") 429 | print(args) 430 | print("************************************************************************************") 431 | 432 | config = dict() 433 | config['n_users'] = data_generator.n_users 434 | config['n_items'] = data_generator.n_items 435 | 436 | """ 437 | ********************************************************* 438 | Generate the Laplacian matrix, where each entry defines the decay factor (e.g., p_ui) between two connected nodes. 439 | """ 440 | plain_adj, norm_adj, mean_adj, pre_adj = data_generator.get_adj_mat() 441 | 442 | all_h_list, all_t_list, all_v_list = load_adjacency_list_data(plain_adj) 443 | 444 | A_values_init = create_initial_A_values(args.n_factors, all_v_list) 445 | 446 | config['norm_adj'] = plain_adj 447 | config['all_h_list'] = all_h_list 448 | config['all_t_list'] = all_t_list 449 | 450 | 451 | t0 = time() 452 | """ 453 | ********************************************************* 454 | pretrain = 1: load embeddings with name such as embedding_xxx(.npz), l2_best_model(.npz) 455 | pretrain = 0: default value, no pretrained embeddings. 456 | """ 457 | if args.pretrain == 1: 458 | print("Try to load pretain: ", args.embed_name) 459 | pretrain_data = load_best(name=args.embed_name) 460 | if pretrain_data == None: 461 | print("Load pretrained model(%s)fail!!!!!!!!!!!!!!!"%(args.embed_name)) 462 | else: 463 | pretrain_data = None 464 | 465 | model = GDCF(data_config=config, pretrain_data=pretrain_data) 466 | 467 | 468 | tf_config = tfv1.ConfigProto() 469 | tf_config.gpu_options.allow_growth = True 470 | sess = tfv1.Session(config=tf_config) 471 | 472 | 473 | sess.run(tfv1.global_variables_initializer()) 474 | cur_best_pre_0 = 0. 
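# cur_best_pre_0 tracks the best recall at the smallest K in --Ks seen so far; it drives the
# early-stopping check and the optional model saving inside the training loop below.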
475 | 476 | 477 | """ 478 | ********************************************************* 479 | Train 480 | """ 481 | loss_loger, pre_loger, rec_loger, ndcg_loger, hit_loger = [], [], [], [], [] 482 | stopping_step = 0 483 | should_stop = False 484 | for epoch in range(args.epoch): 485 | t1 = time() 486 | loss, mf_loss, emb_loss, cor_loss = 0., 0., 0., 0. 487 | n_batch = data_generator.n_train // args.batch_size + 1 488 | cor_batch_size = int(max(data_generator.n_users/n_batch, data_generator.n_items/n_batch)) 489 | 490 | for idx in range(n_batch): 491 | users, pos_items, neg_items = data_generator.sample() 492 | cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size) 493 | _, batch_loss, batch_mf_loss, batch_emb_loss, batch_cor_loss = sess.run([model.opt, model.loss, 494 | model.mf_loss, model.emb_loss, 495 | model.cor_loss], 496 | feed_dict={model.users: users, 497 | model.pos_items: pos_items, 498 | model.neg_items: neg_items, 499 | model.cor_users: cor_users, 500 | model.cor_items: cor_items}) 501 | loss += batch_loss / n_batch 502 | mf_loss += batch_mf_loss / n_batch 503 | emb_loss += batch_emb_loss / n_batch 504 | cor_loss += batch_cor_loss / n_batch 505 | 506 | if np.isnan(loss) == True: 507 | print('ERROR: loss is nan.') 508 | print(mf_loss, emb_loss) 509 | sys.exit() 510 | 511 | # print the test evaluation metrics each 10 epochs; pos:neg = 1:10. 512 | if (epoch + 1) % args.show_step != 0: 513 | if args.verbose > 0 and epoch % args.verbose == 0: 514 | perf_str = 'Epoch %d [%.1fs]: train==[%.5f=%.5f + %.5f + %.5f]' % (epoch, time() - t1, loss, mf_loss, emb_loss, cor_loss) 515 | print(perf_str) 516 | # Skip testing 517 | continue 518 | 519 | # Begin test at this epoch. 520 | loss_test, mf_loss_test, emb_loss_test, cor_loss_test = 0., 0., 0., 0. 
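# estimate the BPR, embedding-regularization, and correlation losses on batches drawn from the test set
# (data_generator.sample_test), separately from the ranking metrics computed by test() afterwards.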
521 | for idx in range(n_batch): 522 | cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size) 523 | users, pos_items, neg_items = data_generator.sample_test() 524 | batch_loss_test, batch_mf_loss_test, batch_emb_loss_test, batch_cor_loss_test = sess.run( 525 | [model.loss, model.mf_loss, model.emb_loss, model.cor_loss], 526 | feed_dict={model.users: users, 527 | model.pos_items: pos_items, 528 | model.neg_items: neg_items, 529 | model.A_values: A_values_init, 530 | model.cor_users: cor_users, 531 | model.cor_items: cor_items}) 532 | loss_test += batch_loss_test / n_batch 533 | mf_loss_test += batch_mf_loss_test / n_batch 534 | emb_loss_test += batch_emb_loss_test / n_batch 535 | cor_loss_test += batch_cor_loss_test / n_batch 536 | 537 | t2 = time() 538 | users_to_test = list(data_generator.test_set.keys()) 539 | ret = test(sess, model, users_to_test, drop_flag=True, batch_test_flag=whether_test_batch) 540 | 541 | 542 | t3 = time() 543 | 544 | loss_loger.append(loss) 545 | rec_loger.append(ret['recall']) 546 | pre_loger.append(ret['precision']) 547 | ndcg_loger.append(ret['ndcg']) 548 | hit_loger.append(ret['hit_ratio']) 549 | 550 | if args.verbose > 0: 551 | perf_str = 'Epoch %d [%.1fs + %.1fs]: test==[%.5f=%.5f + %.5f + %.5f], recall=[%.5f, %.5f], ' \ 552 | 'precision=[%.5f, %.5f], hit=[%.5f, %.5f], ndcg=[%.5f, %.5f]' % \ 553 | (epoch, t2 - t1, t3 - t2, loss_test, mf_loss_test, emb_loss_test, cor_loss_test, ret['recall'][0], 554 | ret['recall'][-1], 555 | ret['precision'][0], ret['precision'][-1], ret['hit_ratio'][0], ret['hit_ratio'][-1], 556 | ret['ndcg'][0], ret['ndcg'][-1]) 557 | print(perf_str) 558 | 559 | cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0, stopping_step, expected_order='acc', flag_step=args.early) 560 | 561 | # early stopping when cur_best_pre_0 is decreasing for given steps. 562 | if should_stop == True: 563 | break 564 | 565 | # ********************************************************* 566 | # save the user & item embeddings for pretraining. 
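# .... model_save() writes the user_embed / item_embed arrays to
# .... <proj_path>pretrain/<dataset>/<save_name>.npz, which load_best() reloads when the script
# .... is launched with --pretrain 1.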
567 | if ret['recall'][0] == cur_best_pre_0 and args.save_flag == 1 : 568 | model.model_save(args.proj_path, args.dataset, sess, savename=args.save_name) 569 | print('save the model with performance: ', cur_best_pre_0) 570 | 571 | 572 | recs = np.array(rec_loger) 573 | pres = np.array(pre_loger) 574 | ndcgs = np.array(ndcg_loger) 575 | hit = np.array(hit_loger) 576 | 577 | best_rec_0 = max(recs[:, 0]) 578 | idx = list(recs[:, 0]).index(best_rec_0) 579 | 580 | final_perf = "Best Iter=[%d]@[%.1f]\trecall=[%s], precision=[%s], hit=[%s], ndcg=[%s]" % \ 581 | (idx, time() - t0, '\t'.join(['%.5f' % r for r in recs[idx]]), 582 | '\t'.join(['%.5f' % r for r in pres[idx]]), 583 | '\t'.join(['%.5f' % r for r in hit[idx]]), 584 | '\t'.join(['%.5f' % r for r in ndcgs[idx]])) 585 | print(final_perf) 586 | 587 | 588 | -------------------------------------------------------------------------------- /DGCF_v1/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DGCF_v1/utility/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DGCF_v1/utility/batch_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import utility.metrics as metrics 11 | from utility.parser import parse_args 12 | from utility.load_data import * 13 | import multiprocessing 14 | import heapq 15 | import pandas as pd 16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | cores = multiprocessing.cpu_count() // 2 20 | 21 | args = parse_args() 22 | Ks = eval(args.Ks) 23 | 24 | data_generator = Data(path=args.data_path + args.dataset, batch_size=args.batch_size) 25 | USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items 26 | N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test 27 | if args.dataset=='amazon-book': 28 | BATCH_SIZE = args.batch_size//4 29 | else: 30 | BATCH_SIZE = args.batch_size 31 | 32 | def ranklist_by_heapq(user_pos_test, test_items, rating, Ks): 33 | item_score = {} 34 | for i in test_items: 35 | item_score[i] = rating[i] 36 | 37 | K_max = max(Ks) 38 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 39 | 40 | r = [] 41 | for i in K_max_item_score: 42 | if i in user_pos_test: 43 | r.append(1) 44 | else: 45 | r.append(0) 46 | auc = 0. 
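# AUC is not computed in this heapq-based path; run with --test_flag full to use
# ranklist_by_sorted, which also reports AUC via get_auc.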
47 | return r, auc 48 | 49 | def get_auc(item_score, user_pos_test): 50 | item_score = sorted(item_score.items(), key=lambda kv: kv[1]) 51 | item_score.reverse() 52 | item_sort = [x[0] for x in item_score] 53 | posterior = [x[1] for x in item_score] 54 | 55 | r = [] 56 | for i in item_sort: 57 | if i in user_pos_test: 58 | r.append(1) 59 | else: 60 | r.append(0) 61 | auc = metrics.auc(ground_truth=r, prediction=posterior) 62 | return auc 63 | 64 | def ranklist_by_sorted(user_pos_test, test_items, rating, Ks): 65 | item_score = {} 66 | for i in test_items: 67 | item_score[i] = rating[i] 68 | 69 | K_max = max(Ks) 70 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 71 | 72 | r = [] 73 | for i in K_max_item_score: 74 | if i in user_pos_test: 75 | r.append(1) 76 | else: 77 | r.append(0) 78 | auc = get_auc(item_score, user_pos_test) 79 | return r, auc 80 | 81 | def get_performance(user_pos_test, r, auc, Ks): 82 | precision, recall, ndcg, hit_ratio = [], [], [], [] 83 | 84 | for K in Ks: 85 | precision.append(metrics.precision_at_k(r, K)) 86 | recall.append(metrics.recall_at_k(r, K, len(user_pos_test))) 87 | ndcg.append(metrics.ndcg_at_k(r, K, user_pos_test)) 88 | hit_ratio.append(metrics.hit_at_k(r, K)) 89 | 90 | return {'recall': np.array(recall), 'precision': np.array(precision), 91 | 'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc} 92 | 93 | 94 | def test_one_user(x): 95 | # user u's ratings for user u 96 | rating = x[0] 97 | #uid 98 | u = x[1] 99 | #user u's items in the training set 100 | try: 101 | training_items = data_generator.train_items[u] 102 | except Exception: 103 | training_items = [] 104 | #user u's items in the test set 105 | user_pos_test = data_generator.test_set[u] 106 | 107 | all_items = set(range(ITEM_NUM)) 108 | 109 | test_items = list(all_items - set(training_items)) 110 | 111 | if args.test_flag == 'part': 112 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 113 | else: 114 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 115 | 116 | return get_performance(user_pos_test, r, auc, Ks) 117 | 118 | def test_one_user_train(x): 119 | # user u's ratings for user u 120 | rating = x[0] 121 | # uid 122 | u = x[1] 123 | # user u's items in the training set 124 | 125 | training_items = [] 126 | # user u's items in the test set 127 | user_pos_test = data_generator.train_items[u] 128 | 129 | all_items = set(range(ITEM_NUM)) 130 | 131 | test_items = list(all_items - set(training_items)) 132 | 133 | if args.test_flag == 'part': 134 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 135 | else: 136 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 137 | 138 | return get_performance(user_pos_test, r, auc, Ks) 139 | 140 | def test(sess, model, users_to_test, drop_flag=False, batch_test_flag=False,train_set_flag=0): 141 | result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)), 142 | 'hit_ratio': np.zeros(len(Ks)), 'auc': 0.} 143 | 144 | pool = multiprocessing.Pool(cores) 145 | 146 | u_batch_size = BATCH_SIZE * 2 147 | i_batch_size = BATCH_SIZE 148 | 149 | test_users = users_to_test 150 | n_test_users = len(test_users) 151 | n_user_batchs = n_test_users // u_batch_size + 1 152 | 153 | count = 0 154 | for u_batch_id in range(n_user_batchs): 155 | start = u_batch_id * u_batch_size 156 | end = (u_batch_id + 1) * u_batch_size 157 | 158 | user_batch = test_users[start: end] 159 | 160 | if batch_test_flag: 161 | 162 | n_item_batchs = 
ITEM_NUM // i_batch_size + 1 163 | rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM)) 164 | 165 | i_count = 0 166 | for i_batch_id in range(n_item_batchs): 167 | i_start = i_batch_id * i_batch_size 168 | i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM) 169 | 170 | item_batch = range(i_start, i_end) 171 | 172 | if drop_flag == False: 173 | i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 174 | else: 175 | i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 176 | 177 | rate_batch[:, i_start: i_end] = i_rate_batch 178 | i_count += i_rate_batch.shape[1] 179 | 180 | assert i_count == ITEM_NUM 181 | 182 | else: 183 | item_batch = range(ITEM_NUM) 184 | if drop_flag == False: 185 | rate_batch, _1 = sess.run([model.batch_ratings, model._1], {model.users: user_batch, 186 | model.pos_items: item_batch}) 187 | else: 188 | rate_batch, _1, _2 = sess.run([model.batch_ratings, model.print_pick, model.print_embed], {model.users: user_batch, 189 | model.pos_items: item_batch}) 190 | 191 | 192 | user_batch_rating_uid = zip(rate_batch, user_batch) 193 | 194 | if train_set_flag==0: 195 | batch_result = pool.map(test_one_user, user_batch_rating_uid) 196 | else: 197 | batch_result = pool.map(test_one_user_train, user_batch_rating_uid) 198 | count += len(batch_result) 199 | 200 | for re in batch_result: 201 | result['precision'] += re['precision']/n_test_users 202 | result['recall'] += re['recall']/n_test_users 203 | result['ndcg'] += re['ndcg']/n_test_users 204 | result['hit_ratio'] += re['hit_ratio']/n_test_users 205 | result['auc'] += re['auc']/n_test_users 206 | 207 | 208 | assert count == n_test_users 209 | pool.close() 210 | return result 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /DGCF_v1/utility/helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | __author__ = "xiangwang" 11 | import os 12 | import re 13 | 14 | def txt2list(file_src): 15 | orig_file = open(file_src, "r") 16 | lines = orig_file.readlines() 17 | return lines 18 | 19 | def ensureDir(dir_path): 20 | d = os.path.dirname(dir_path) 21 | if not os.path.exists(d): 22 | os.makedirs(d) 23 | 24 | def uni2str(unicode_str): 25 | return str(unicode_str.encode('ascii', 'ignore')).replace('\n', '').strip() 26 | 27 | def hasNumbers(inputString): 28 | return bool(re.search(r'\d', inputString)) 29 | 30 | def delMultiChar(inputString, chars): 31 | for ch in chars: 32 | inputString = inputString.replace(ch, '') 33 | return inputString 34 | 35 | def merge_two_dicts(x, y): 36 | z = x.copy() # start with x's keys and values 37 | z.update(y) # modifies z with y's keys and values & returns None 38 | return z 39 | 40 | def early_stopping(log_value, best_value, stopping_step, expected_order='acc', flag_step=100): 41 | # early stopping strategy: 42 | assert expected_order in ['acc', 'dec'] 43 | 44 | if (expected_order == 'acc' and log_value >= best_value) or (expected_order == 'dec' and log_value <= best_value): 45 | stopping_step = 0 46 | best_value = log_value 47 | else: 48 | stopping_step += 1 49 | 50 | if stopping_step >= flag_step: 51 | print("Early stopping is trigger at step: {} log:{}".format(flag_step, log_value)) 52 | should_stop = True 53 | else: 54 | should_stop = False 55 | return best_value, stopping_step, should_stop 56 | -------------------------------------------------------------------------------- /DGCF_v1/utility/load_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import numpy as np 11 | import random as rd 12 | import scipy.sparse as sp 13 | from time import time 14 | 15 | class Data(object): 16 | def __init__(self, path, batch_size): 17 | self.path = path 18 | self.batch_size = batch_size 19 | 20 | train_file = path + '/train.txt' 21 | test_file = path + '/test.txt' 22 | 23 | #get number of users and items 24 | self.n_users, self.n_items = 0, 0 25 | self.n_train, self.n_test = 0, 0 26 | self.neg_pools = {} 27 | 28 | self.exist_users = [] 29 | 30 | with open(train_file) as f: 31 | for l in f.readlines(): 32 | if len(l) > 0: 33 | l = l.strip('\n').split(' ') 34 | items = [int(i) for i in l[1:]] 35 | uid = int(l[0]) 36 | self.exist_users.append(uid) 37 | self.n_items = max(self.n_items, max(items)) 38 | self.n_users = max(self.n_users, uid) 39 | self.n_train += len(items) 40 | 41 | with open(test_file) as f: 42 | for l in f.readlines(): 43 | if len(l) > 0: 44 | l = l.strip('\n') 45 | try: 46 | items = [int(i) for i in l.split(' ')[1:]] 47 | except Exception: 48 | continue 49 | self.n_items = max(self.n_items, max(items)) 50 | self.n_test += len(items) 51 | self.n_items += 1 52 | self.n_users += 1 53 | 54 | self.print_statistics() 55 | 56 | self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32) 57 | 58 | self.train_items, self.test_set = {}, {} 59 | with open(train_file) as f_train: 60 | with open(test_file) as f_test: 61 | for l in f_train.readlines(): 62 | if len(l) == 0: break 63 | l = l.strip('\n') 64 | items = [int(i) for i in l.split(' ')] 65 | uid, train_items = items[0], items[1:] 66 | 67 | for i in train_items: 68 | self.R[uid, i] = 1. 69 | # self.R[uid][i] = 1 70 | 71 | self.train_items[uid] = train_items 72 | 73 | for l in f_test.readlines(): 74 | if len(l) == 0: break 75 | l = l.strip('\n') 76 | try: 77 | items = [int(i) for i in l.split(' ')] 78 | except Exception: 79 | continue 80 | 81 | uid, test_items = items[0], items[1:] 82 | self.test_set[uid] = test_items 83 | 84 | def get_adj_mat(self): 85 | try: 86 | t1 = time() 87 | adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz') 88 | norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz') 89 | mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz') 90 | print('already load adj matrix', adj_mat.shape, time() - t1) 91 | 92 | except Exception: 93 | adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat() 94 | sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat) 95 | sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat) 96 | sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat) 97 | 98 | try: 99 | pre_adj_mat = sp.load_npz(self.path + '/s_pre_adj_mat.npz') 100 | except Exception: 101 | adj_mat=adj_mat 102 | rowsum = np.array(adj_mat.sum(1)) 103 | d_inv = np.power(rowsum, -0.5).flatten() 104 | d_inv[np.isinf(d_inv)] = 0. 
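# build D^{-1/2} as a diagonal matrix and apply it on both sides, so that
# pre_adj_mat = D^{-1/2} * A * D^{-1/2} (symmetric normalization of the adjacency matrix).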
105 | d_mat_inv = sp.diags(d_inv) 106 | 107 | norm_adj = d_mat_inv.dot(adj_mat) 108 | norm_adj = norm_adj.dot(d_mat_inv) 109 | print('generate pre adjacency matrix.') 110 | pre_adj_mat = norm_adj.tocsr() 111 | sp.save_npz(self.path + '/s_pre_adj_mat.npz', norm_adj) 112 | 113 | 114 | return adj_mat, norm_adj_mat, mean_adj_mat, pre_adj_mat 115 | 116 | def create_adj_mat(self): 117 | t1 = time() 118 | adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32) 119 | adj_mat = adj_mat.tolil() 120 | R = self.R.tolil() 121 | 122 | adj_mat[:self.n_users, self.n_users:] = R 123 | adj_mat[self.n_users:, :self.n_users] = R.T 124 | adj_mat = adj_mat.todok() 125 | print('already create adjacency matrix', adj_mat.shape, time() - t1) 126 | 127 | t2 = time() 128 | 129 | def normalized_adj_single(adj): 130 | rowsum = np.array(adj.sum(1)) 131 | 132 | d_inv = np.power(rowsum, -1).flatten() 133 | d_inv[np.isinf(d_inv)] = 0. 134 | d_mat_inv = sp.diags(d_inv) 135 | 136 | norm_adj = d_mat_inv.dot(adj) 137 | # norm_adj = adj.dot(d_mat_inv) 138 | print('generate single-normalized adjacency matrix.') 139 | return norm_adj.tocoo() 140 | 141 | def check_adj_if_equal(adj): 142 | dense_A = np.array(adj.todense()) 143 | degree = np.sum(dense_A, axis=1, keepdims=False) 144 | 145 | temp = np.dot(np.diag(np.power(degree, -1)), dense_A) 146 | print('check normalized adjacency matrix whether equal to this laplacian matrix.') 147 | return temp 148 | 149 | norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0])) 150 | mean_adj_mat = normalized_adj_single(adj_mat) 151 | 152 | print('already normalize adjacency matrix', time() - t2) 153 | return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr() 154 | 155 | def negative_pool(self): 156 | t1 = time() 157 | for u in self.train_items.keys(): 158 | neg_items = list(set(range(self.n_items)) - set(self.train_items[u])) 159 | pools = [rd.choice(neg_items) for _ in range(100)] 160 | self.neg_pools[u] = pools 161 | print('refresh negative pools', time() - t1) 162 | 163 | def sample(self): 164 | if self.batch_size <= self.n_users: 165 | users = rd.sample(self.exist_users, self.batch_size) 166 | else: 167 | users = [rd.choice(self.exist_users) for _ in range(self.batch_size)] 168 | 169 | 170 | def sample_pos_items_for_u(u, num): 171 | pos_items = self.train_items[u] 172 | n_pos_items = len(pos_items) 173 | pos_batch = [] 174 | while True: 175 | if len(pos_batch) == num: break 176 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 177 | pos_i_id = pos_items[pos_id] 178 | 179 | if pos_i_id not in pos_batch: 180 | pos_batch.append(pos_i_id) 181 | return pos_batch 182 | 183 | def sample_neg_items_for_u(u, num): 184 | neg_items = [] 185 | while True: 186 | if len(neg_items) == num: break 187 | neg_id = np.random.randint(low=0, high=self.n_items,size=1)[0] 188 | if neg_id not in self.train_items[u] and neg_id not in neg_items: 189 | neg_items.append(neg_id) 190 | return neg_items 191 | 192 | def sample_neg_items_for_u_from_pools(u, num): 193 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 194 | return rd.sample(neg_items, num) 195 | 196 | pos_items, neg_items = [], [] 197 | for u in users: 198 | pos_items += sample_pos_items_for_u(u, 1) 199 | neg_items += sample_neg_items_for_u(u, 1) 200 | 201 | return users, pos_items, neg_items 202 | 203 | def sample_test(self): 204 | if self.batch_size <= self.n_users: 205 | users = rd.sample(self.test_set.keys(), self.batch_size) 206 | else: 207 | users = 
[rd.choice(self.exist_users) for _ in range(self.batch_size)] 208 | 209 | def sample_pos_items_for_u(u, num): 210 | pos_items = self.test_set[u] 211 | n_pos_items = len(pos_items) 212 | pos_batch = [] 213 | while True: 214 | if len(pos_batch) == num: break 215 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 216 | pos_i_id = pos_items[pos_id] 217 | 218 | if pos_i_id not in pos_batch: 219 | pos_batch.append(pos_i_id) 220 | return pos_batch 221 | 222 | def sample_neg_items_for_u(u, num): 223 | neg_items = [] 224 | while True: 225 | if len(neg_items) == num: break 226 | neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0] 227 | if neg_id not in (self.test_set[u]+self.train_items[u]) and neg_id not in neg_items: 228 | neg_items.append(neg_id) 229 | return neg_items 230 | 231 | def sample_neg_items_for_u_from_pools(u, num): 232 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 233 | return rd.sample(neg_items, num) 234 | 235 | pos_items, neg_items = [], [] 236 | for u in users: 237 | pos_items += sample_pos_items_for_u(u, 1) 238 | neg_items += sample_neg_items_for_u(u, 1) 239 | 240 | return users, pos_items, neg_items 241 | 242 | def get_num_users_items(self): 243 | return self.n_users, self.n_items 244 | 245 | def print_statistics(self): 246 | print('n_users=%d, n_items=%d' % (self.n_users, self.n_items)) 247 | print('n_interactions=%d' % (self.n_train + self.n_test)) 248 | print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test)/(self.n_users * self.n_items))) 249 | 250 | def get_sparsity_split(self): 251 | try: 252 | split_uids, split_state = [], [] 253 | lines = open(self.path + '/sparsity.split', 'r').readlines() 254 | 255 | for idx, line in enumerate(lines): 256 | if idx % 2 == 0: 257 | split_state.append(line.strip()) 258 | print(line.strip()) 259 | else: 260 | split_uids.append([int(uid) for uid in line.strip().split(' ')]) 261 | print('get sparsity split.') 262 | 263 | except Exception: 264 | split_uids, split_state = self.create_sparsity_split() 265 | f = open(self.path + '/sparsity.split', 'w') 266 | for idx in range(len(split_state)): 267 | f.write(split_state[idx] + '\n') 268 | f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n') 269 | print('create sparsity split.') 270 | 271 | return split_uids, split_state 272 | 273 | def create_sparsity_split(self): 274 | all_users_to_test = list(self.test_set.keys()) 275 | user_n_iid = dict() 276 | 277 | # generate a dictionary to store (key=n_iids, value=a list of uid). 278 | for uid in all_users_to_test: 279 | train_iids = self.train_items[uid] 280 | test_iids = self.test_set[uid] 281 | 282 | n_iids = len(train_iids) + len(test_iids) 283 | 284 | if n_iids not in user_n_iid.keys(): 285 | user_n_iid[n_iids] = [uid] 286 | else: 287 | user_n_iid[n_iids].append(uid) 288 | split_uids = list() 289 | 290 | # split the whole user set into four subset. 
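# .... users are bucketed by their total number of interactions (train + test); a split is closed
# .... roughly every time the accumulated interactions cover another quarter of all interactions,
# .... giving user groups ordered from the sparsest to the most active.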
291 | temp = [] 292 | count = 1 293 | fold = 4 294 | n_count = (self.n_train + self.n_test) 295 | n_rates = 0 296 | 297 | split_state = [] 298 | for idx, n_iids in enumerate(sorted(user_n_iid)): 299 | temp += user_n_iid[n_iids] 300 | n_rates += n_iids * len(user_n_iid[n_iids]) 301 | n_count -= n_iids * len(user_n_iid[n_iids]) 302 | 303 | if n_rates >= count * 0.25 * (self.n_train + self.n_test): 304 | split_uids.append(temp) 305 | 306 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' %(n_iids, len(temp), n_rates) 307 | split_state.append(state) 308 | print(state) 309 | 310 | temp = [] 311 | n_rates = 0 312 | fold -= 1 313 | 314 | if idx == len(user_n_iid.keys()) - 1 or n_count == 0: 315 | split_uids.append(temp) 316 | 317 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates) 318 | split_state.append(state) 319 | print(state) 320 | 321 | 322 | 323 | return split_uids, split_state 324 | -------------------------------------------------------------------------------- /DGCF_v1/utility/metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | 10 | import numpy as np 11 | from sklearn.metrics import roc_auc_score 12 | 13 | def recall(rank, ground_truth, N): 14 | return len(set(rank[:N]) & set(ground_truth)) / float(len(set(ground_truth))) 15 | 16 | def precision_at_k(r, k): 17 | """Score is precision @ k 18 | Relevance is binary (nonzero is relevant). 19 | Returns: 20 | Precision @ k 21 | Raises: 22 | ValueError: len(r) must be >= k 23 | """ 24 | assert k >= 1 25 | r = np.asarray(r)[:k] 26 | return np.mean(r) 27 | 28 | def average_precision(r,cut): 29 | """Score is average precision (area under PR curve) 30 | Relevance is binary (nonzero is relevant). 31 | Returns: 32 | Average precision 33 | """ 34 | r = np.asarray(r) 35 | out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]] 36 | if not out: 37 | return 0. 38 | return np.sum(out)/float(min(cut, np.sum(r))) 39 | 40 | def mean_average_precision(rs): 41 | """Score is mean average precision 42 | Relevance is binary (nonzero is relevant). 43 | Returns: 44 | Mean average precision 45 | """ 46 | return np.mean([average_precision(r) for r in rs]) 47 | 48 | def dcg_at_k(r, k, method=1): 49 | """Score is discounted cumulative gain (dcg) 50 | Relevance is positive real values. Can use binary 51 | as the previous methods. 52 | Returns: 53 | Discounted cumulative gain 54 | """ 55 | r = np.asfarray(r)[:k] 56 | if r.size: 57 | if method == 0: 58 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 59 | elif method == 1: 60 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 61 | else: 62 | raise ValueError('method must be 0 or 1.') 63 | return 0. 64 | 65 | def ndcg_at_k(r, k, ground_truth, method=1): 66 | """Score is normalized discounted cumulative gain (ndcg) 67 | Relevance is positive real values. Can use binary 68 | as the previous methods. 
69 | Returns: 70 | Normalized discounted cumulative gain 71 | 72 | Low but correct defination 73 | """ 74 | GT = set(ground_truth) 75 | if len(GT) > k : 76 | sent_list = [1.0] * k 77 | else: 78 | sent_list = [1.0]*len(GT) + [0.0]*(k-len(GT)) 79 | dcg_max = dcg_at_k(sent_list, k, method) 80 | if not dcg_max: 81 | return 0. 82 | return dcg_at_k(r, k, method) / dcg_max 83 | 84 | def recall_at_k(r, k, all_pos_num): 85 | r = np.asfarray(r)[:k] 86 | return np.sum(r) / all_pos_num 87 | 88 | def hit_at_k(r, k): 89 | r = np.array(r)[:k] 90 | if np.sum(r) > 0: 91 | return 1. 92 | else: 93 | return 0. 94 | 95 | def F1(pre, rec): 96 | if pre + rec > 0: 97 | return (2.0 * pre * rec) / (pre + rec) 98 | else: 99 | return 0. 100 | 101 | def auc(ground_truth, prediction): 102 | try: 103 | res = roc_auc_score(y_true=ground_truth, y_score=prediction) 104 | except Exception: 105 | res = 0. 106 | return res -------------------------------------------------------------------------------- /DGCF_v1/utility/parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 10, 2019 3 | Tensorflow Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | 7 | @author: Xiang Wang (xiangwang@u.nus.edu) 8 | ''' 9 | import argparse 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description="Run DGCF.") 13 | parser.add_argument('--data_path', nargs='?', default='../Data/', 14 | help='Input data path.') 15 | parser.add_argument('--proj_path', nargs='?', default='', 16 | help='Project path.') 17 | 18 | parser.add_argument('--pick', type=int, default=0, 19 | help='O for no pick, 1 for pick') 20 | parser.add_argument('--pick_scale', type=float, default=1e10, 21 | help='Scale') 22 | parser.add_argument('--dataset', nargs='?', default='gowalla', 23 | help='Choose a dataset from {gowalla, yelp2018, amazon-book}') 24 | parser.add_argument('--pretrain', type=int, default=0, 25 | help='0: No pretrain, 1:Use stored models.') 26 | parser.add_argument('--embed_name', nargs='?', default='', 27 | help='Name for pretrained model.') 28 | parser.add_argument('--verbose', type=int, default=1, 29 | help='Interval of evaluation.') 30 | 31 | 32 | parser.add_argument('--epoch', type=int, default=3000, 33 | help='Number of epochs') 34 | parser.add_argument('--embed_size', type=int, default=64, 35 | help='Embedding size.') 36 | parser.add_argument('--layer_size', nargs='?', default='[64]', 37 | help='Output sizes of every layer') 38 | parser.add_argument('--batch_size', type=int, default=1024, 39 | help='Batch size.') 40 | parser.add_argument('--lr', type=float, default=0.01, 41 | help='Learning rate.') 42 | parser.add_argument('--cor_flag', type=int, default=1, 43 | help='Correlation matrix flag') 44 | parser.add_argument('--corDecay', type=float, default=0.0, 45 | help='Distance Correlation Weight') 46 | parser.add_argument('--regs', nargs='?', default='[1e-3,1e-4,1e-4]', 47 | help='Regularizations.') 48 | 49 | parser.add_argument('--n_layers', type=int, default=1, 50 | help='Layer numbers.') 51 | parser.add_argument('--n_factors', type=int, default=4, 52 | help='Number of factors to disentangle the original embed-size representation.') 53 | parser.add_argument('--n_iterations', type=int, default=2, 54 | help='Number of iterations to perform the routing mechanism.') 55 | 56 | 57 | 
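# evaluation, early-stopping, and model-saving options; with --save_flag 1 the best embeddings are
# stored under <proj_path>pretrain/<dataset>/<save_name>.npz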
parser.add_argument('--show_step', type=int, default=15, 58 | help='Test every show_step epochs.') 59 | parser.add_argument('--early', type=int, default=40, 60 | help='Patience (number of evaluations) for early stopping.') 61 | parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]', 62 | help='K values at which the top-K metrics are computed.') 63 | 64 | parser.add_argument('--save_flag', type=int, default=0, 65 | help='0: Disable model saver, 1: Save the better model') 66 | parser.add_argument('--save_name', nargs='?', default='best_model', 67 | help='Name for the saved model.') 68 | 69 | parser.add_argument('--test_flag', nargs='?', default='part', 70 | help='Specify the test type from {part, full}, indicating whether the inference is done in mini-batches') 71 | 72 | 73 | return parser.parse_args() 74 | -------------------------------------------------------------------------------- /DGCF_v2/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/amazon-book/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? Please visit the [website](http://jmcauley.ucsd.edu/data/amazon). 2 | -------------------------------------------------------------------------------- /Data/gowalla/README.md: -------------------------------------------------------------------------------- 1 | Looking for the full dataset? 2 | Please visit the [website](https://snap.stanford.edu/data/loc-gowalla.html). 3 | -------------------------------------------------------------------------------- /Data/yelp2018/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Disentangled Graph Collaborative Filtering 2 | This is our TensorFlow implementation for the paper: 3 | 4 | >Xiang Wang, Hongye Jin, An Zhang, Xiangnan He, Tong Xu, and Tat-Seng Chua (2020). Disentangled Graph Collaborative Filtering, [Paper in arXiv](https://arxiv.org/abs/2007.01764). In SIGIR'20, Xi'an, China, July 25-30, 2020. 5 | 6 | Author: Dr. Xiang Wang (xiangwang at u.nus.edu) 7 | 8 | ## Introduction 9 | Disentangled Graph Collaborative Filtering (DGCF) is an explainable recommendation framework equipped with (1) the dynamic routing mechanism of capsule networks, to refine the strengths of user-item interactions in intent-aware graphs, (2) the embedding propagation mechanism of graph neural networks, to distill pertinent information from higher-order connectivity, and (3) the distance correlation of independence modeling, to ensure independence among intents. As such, we explicitly disentangle the hidden intents of users in representation learning.
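The independence module in (3) is implemented as a distance-correlation penalty between pairs of intent-aware embedding chunks (see `create_cor_loss` and `_create_distance_correlation` in `DGCF_v1/DGCF.py`). For reference, here is a minimal NumPy sketch of that quantity; it mirrors the TensorFlow implementation but is illustrative only, and the toy shapes at the end are arbitrary placeholders:
```python
import numpy as np

def centered_distance(X, eps=1e-8):
    # pairwise Euclidean distances, then double-centering over rows, columns, and the grand mean
    r = np.sum(np.square(X), axis=1, keepdims=True)
    D = np.sqrt(np.maximum(r - 2 * X @ X.T + r.T, 0.0) + eps)
    return D - D.mean(axis=0, keepdims=True) - D.mean(axis=1, keepdims=True) + D.mean()

def distance_correlation(X1, X2, eps=1e-8):
    # dCor(X1, X2) = dCov(X1, X2) / sqrt(dCov(X1, X1) * dCov(X2, X2))
    D1, D2 = centered_distance(X1, eps), centered_distance(X2, eps)
    n = float(D1.shape[0])
    dcov = lambda A, B: np.sqrt(np.maximum((A * B).sum() / (n * n), 0.0) + eps)
    return dcov(D1, D2) / (np.sqrt(np.maximum(dcov(D1, D1) * dcov(D2, D2), 0.0)) + 1e-10)

# toy example: two factor-wise chunks of shape [batch_size, embed_size / n_factors]
x, y = np.random.rand(128, 16), np.random.rand(128, 16)
print(distance_correlation(x, y))
```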
10 | 11 | ## Citation 12 | If you want to use our codes and datasets in your research, please cite: 13 | ``` 14 | @inproceedings{DGCF19, 15 | author = {Xiang Wang and 16 | Hongye Jin and 17 | An Zhang and 18 | Xiangnan He and 19 | Tong Xu and 20 | Tat{-}Seng Chua}, 21 | title = {Disentangled Graph Collaborative Filtering}, 22 | booktitle = {Proceedings of the 43rd International {ACM} {SIGIR} Conference on 23 | Research and Development in Information Retrieval, {SIGIR} 2020, Xi'an, 24 | China, July 25-30, 2020.}, 25 | year = {2020}, 26 | } 27 | ``` 28 | ## Environment Requirement 29 | We recommend running this code on GPUs. The code has been tested under Python 3.6.5. The required packages are as follows: 30 | * tensorflow_gpu == 1.14.0 31 | * numpy == 1.14.3 32 | * scipy == 1.1.0 33 | * sklearn == 0.19.1 34 | 35 | ## Versions 36 | We released the implementation based on the NGCF code as DGCF_v1. Later, we will release another implementation based on the LightGCN code as DGCF_v2, which is equipped with some speedup techniques. 37 | 38 | ## Example to Run the Codes 39 | The commands and their arguments are documented in the code (see the parse_args function in DGCF_v1/utility/parser.py). 40 | * Gowalla dataset 41 | ``` 42 | CUDA_VISIBLE_DEVICES=0 python DGCF.py --dataset gowalla --batch_size 2000 --n_layers 1 --n_iterations 2 --corDecay 0.01 --n_factors 4 --show_step 3 --lr 0.001 43 | ``` 44 | 45 | Some important arguments (in addition to those of NGCF): 46 | * `cor_flag` 47 | * It specifies whether the distance correlation (i.e., independence modeling) is activated. 48 | * Here we provide two options: 49 | * 1 (by default), which activates the distance correlation in [Disentangled Graph Collaborative Filtering](https://arxiv.org/abs/2007.01764), SIGIR 2020. Usage: `--cor_flag 1`. 50 | * 0, which disables the distance correlation. Usage: `--cor_flag 0`. 51 | 52 | * `corDecay` 53 | * It specifies the weight to control the distance correlation. 54 | * Here we provide two options: 55 | * 0.0 (by default), which similarly disables the distance correlation and makes DGCF rely only on the dynamic routing mechanism to disentangle the user intents. Usage: `--corDecay 0.0`. 56 | * other scales like 0.1, which uses 0.1 to control the strength of the distance correlation. Usage: `--corDecay 0.1`. 57 | 58 | * `n_factors` 59 | * It indicates the number of latent intents used to disentangle the holistic representation into chunked intent-aware representations. Usage: `--n_factors 4`. 60 | * Note that the argument `embed_size` needs to be exactly divisible by the argument `n_factors`. 61 | 62 | * `n_iterations` 63 | * It indicates the number of iterations of the dynamic routing mechanism. Usage: `--n_iterations 2`. 64 | 65 | ## Dataset 66 | Following our prior work NGCF and LightGCN, we provide three processed datasets: Gowalla, Amazon-book, and Yelp2018. 67 | Note that the Yelp2018 dataset used in DGCF is slightly different from the original one in NGCF, since we found some bugs in the preprocessing code that constructs the Yelp2018 dataset. We reran the experiments and report the performance on the corrected dataset. 68 | 69 | ## Acknowledgement 70 | 71 | This research is supported by the National Research Foundation, Singapore under its International Research Centres in Singapore Funding Initiative. Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not reflect the views of the National Research Foundation, Singapore.
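## Data Format
`DGCF_v1/utility/load_data.py` reads `train.txt` and `test.txt` with one user per line: a space-separated list whose first token is the (remapped) user ID and whose remaining tokens are the IDs of the items that user interacted with; `n_users` and `n_items` are then inferred as the largest IDs plus one, so IDs are expected to be zero-based integers. A minimal parsing sketch (the IDs below are made-up placeholders):
```python
# one line of train.txt / test.txt looks like: "<user_id> <item_id> <item_id> ... <item_id>"
line = "0 12 7 345"                                   # made-up IDs, for illustration only
tokens = line.strip('\n').split(' ')
uid, items = int(tokens[0]), [int(i) for i in tokens[1:]]
```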
72 | --------------------------------------------------------------------------------