├── DGCF.py
├── Data
│   ├── amazon-book
│   │   ├── README.md
│   │   ├── item_list.txt
│   │   ├── test.txt
│   │   ├── train.txt
│   │   └── user_list.txt
│   ├── gowalla
│   │   ├── README.md
│   │   ├── item_list.txt
│   │   ├── test.txt
│   │   ├── train.txt
│   │   └── user_list.txt
│   └── yelp2018
│       ├── README.md
│       ├── item_list.txt
│       ├── test.txt
│       ├── train.txt
│       └── user_list.txt
├── README.md
└── utility
    ├── batch_test.py
    ├── helper.py
    ├── load_data.py
    ├── metrics.py
    └── parser.py

--------------------------------------------------------------------------------
/DGCF.py:
--------------------------------------------------------------------------------
'''
Created on Apr , 2021
Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in:
Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020.
Note that: This implementation is based on the codes of NGCF.
@author: Xiang Wang (xiangwang@u.nus.edu)
@author: Jisu Rho (jsroh1013@gmail.com)
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
import random as rd
import pickle
import numpy as np
from pathlib import Path
import multiprocessing

import warnings
warnings.filterwarnings('ignore')
from time import time

from utility.helper import *
from utility.batch_test import *

class DGCF(nn.Module):
    def __init__(self, data_config, pretrain_data):
        super(DGCF, self).__init__()
        # argument settings
        self.pretrain_data = pretrain_data
        self.n_users = data_config['n_users']
        self.n_items = data_config['n_items']

        self.n_fold = 1
        self.norm_adj = data_config['norm_adj']
        self.all_h_list = data_config['all_h_list']
        self.all_t_list = data_config['all_t_list']
        self.A_in_shape = self.norm_adj.tocoo().shape

        self.n_nonzero_elems = self.norm_adj.count_nonzero()
        self.lr = args.lr
        self.emb_dim = args.embed_size
        self.n_factors = args.n_factors
        self.n_iterations = args.n_iterations
        self.n_layers = args.n_layers
        self.pick_level = args.pick_scale
        self.cor_flag = args.cor_flag
        if args.pick == 1:
            self.is_pick = True
        else:
            self.is_pick = False
        self.batch_size = args.batch_size
        # regularization
        self.regs = eval(args.regs)
        self.decay = self.regs[0]
        # interval of evaluation
        self.verbose = args.verbose
        '''
        *********************************************************
        Create Placeholder for Input Data & Dropout.
        # placeholder definition
        self.users = tfv1.placeholder(tf.int32, shape=(None,))
        self.pos_items = tfv1.placeholder(tf.int32, shape=(None,))
        self.neg_items = tfv1.placeholder(tf.int32, shape=(None,))

        # additional placeholders for the distance correlation
        self.cor_users = tfv1.placeholder(tf.int32, shape=(None,))
        self.cor_items = tfv1.placeholder(tf.int32, shape=(None,))

        # assign different values with different factors (channels).
        self.A_values = tfv1.placeholder(tf.float32, shape=[self.n_factors, len(self.all_h_list)], name='A_values')
        '''
        """
        *********************************************************
        Create Model Parameters (i.e., Initialize Weights).
        """
        # initialization of model parameters
        self.init_weights()
        # create models
        #self.ua_embeddings, self.ia_embeddings, self.f_weight, self.ua_embeddings_t, self.ia_embeddings_t = self._create_star_routing_embed_with_P(pick_=self.is_pick)
        """
        *********************************************************
        Establish the final representations for user-item pairs in batch.
        """
        '''
        self.u_g_embeddings = nn.Embedding(self.ua_embeddings, self.users)
        self.u_g_embeddings_t = nn.Embedding(self.ua_embeddings_t, self.users)
        self.pos_i_g_embeddings = nn.Embedding(self.ia_embeddings, self.pos_items)
        self.pos_i_g_embeddings_t = nn.Embedding(self.ia_embeddings_t, self.pos_items)

        self.neg_i_g_embeddings = nn.Embedding(self.ia_embeddings, self.neg_items)
        self.u_g_embeddings_pre = nn.Embedding(self.weights['user_embedding'], self.users)
        self.pos_i_g_embeddings_pre = nn.Embedding(self.weights['item_embedding'], self.pos_items)
        self.neg_i_g_embeddings_pre = nn.Embedding(self.weights['item_embedding'], self.neg_items)

        self.cor_u_g_embeddings = nn.Embedding(self.ua_embeddings, self.cor_users)
        self.cor_i_g_embeddings = nn.Embedding(self.ia_embeddings, self.cor_items)

        #Inference for the testing phase.
        self.batch_ratings = torch.matmul(self.u_g_embeddings_t, self.pos_i_g_embeddings_t.t())

        #Generate Predictions & Optimize via BPR loss.
        self.mf_loss, self.emb_loss = self.create_bpr_loss(self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings)

        # whether user distance correlation
        if args.corDecay < 1e-9:
            self.cor_loss = torch.zeros(1)
        else:
            self.cor_loss = args.corDecay * self.create_cor_loss(self.cor_u_g_embeddings, self.cor_i_g_embeddings)

        self.loss = self.mf_loss + self.emb_loss + self.cor_loss
        #self.opt = tfv1.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
        #self.opt = optim.Adam(model.parameters(), lr=args.lr) #main
        '''

    def init_weights(self):
        # xavier init
        initializer = nn.init.xavier_uniform_

        if self.pretrain_data is None:
            all_weights = nn.ParameterDict({
                'user_embedding': nn.Parameter(initializer(torch.empty(self.n_users, self.emb_dim))),
                # the item table must have n_items rows (the original code mistakenly used n_users here)
                'item_embedding': nn.Parameter(initializer(torch.empty(self.n_items, self.emb_dim)))
            })
            print('using xavier initialization')
        else:
            # pretrained embeddings are stored as numpy arrays (np.savez), so convert them back to tensors
            all_weights = nn.ParameterDict({
                'user_embedding': nn.Parameter(torch.tensor(self.pretrain_data['user_embed'], dtype=torch.float32)),
                'item_embedding': nn.Parameter(torch.tensor(self.pretrain_data['item_embed'], dtype=torch.float32))
            })
            print('using pretrained initialization')

        # register the parameter dict so the rest of the model can access self.weights
        self.weights = all_weights

        # NOTE: the TF-style per-batch embedding lookups that used to follow here duplicated the
        # commented-out block in __init__ above. nn.Embedding is a layer constructor, not a lookup op,
        # and tensors such as self.ua_embeddings / self.users only exist once a batch is processed,
        # so those lookups belong in the forward pass as plain row indexing (see the sketch below),
        # not in init_weights().
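    # ------------------------------------------------------------------
    # Illustration (not part of the original code): in PyTorch, the TF-style
    # tf.nn.embedding_lookup calls that are commented out above reduce to plain
    # row indexing on the propagated embedding tables. A minimal sketch, assuming
    # `users`, `pos_items` and `neg_items` are lists (or LongTensors) of ids
    # produced by data_generator.sample():
    #
    #   ua_embeddings, ia_embeddings, _, ua_embeddings_t, ia_embeddings_t = \
    #       model._create_star_routing_embed_with_P(pick_=model.is_pick)
    #   u_g_embeddings = ua_embeddings[users]            # [batch_size, embed_size]
    #   pos_i_g_embeddings = ia_embeddings[pos_items]    # [batch_size, embed_size]
    #   neg_i_g_embeddings = ia_embeddings[neg_items]    # [batch_size, embed_size]
    #   batch_ratings = torch.matmul(ua_embeddings_t[users], ia_embeddings_t[pos_items].t())
    #
    # The same indexing applies to the cor_users / cor_items batches consumed by
    # the distance-correlation loss.
    # ------------------------------------------------------------------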

    def _create_star_routing_embed_with_P(self, pick_=False):
        '''
        pick_ : True, the model would narrow the weight of the least important factor down to 1/args.pick_scale.
        pick_ : False, do nothing.
        '''
        p_test = False
        p_train = False

        A_values = torch.ones(self.n_factors, len(self.all_h_list))
        # get a (n_factors)-length list of [n_users+n_items, n_users+n_items]

        # load the initial all-one adjacency values
        # .... A_values is an all-ones dense tensor with the size of [n_factors, all_h_list].

        # get the ID embeddings of users and items
        # .... ego_embeddings is a dense tensor with the size of [n_users+n_items, embed_size];
        # .... all_embeddings stores a (n_layers)-len list of outputs derived from different layers.
        ego_embeddings = torch.cat([self.weights['user_embedding'], self.weights['item_embedding']], 0)
        all_embeddings = [ego_embeddings]
        all_embeddings_t = [ego_embeddings]

        output_factors_distribution = []

        factor_num = [self.n_factors, self.n_factors, self.n_factors]
        iter_num = [self.n_iterations, self.n_iterations, self.n_iterations]
        for k in range(0, self.n_layers):
            # prepare the output embedding list
            # .... layer_embeddings stores a (n_factors)-len list of outputs derived from the last routing iterations.
            n_factors_l = factor_num[k]
            n_iterations_l = iter_num[k]
            layer_embeddings = []
            layer_embeddings_t = []

            # split the input embedding table into n_factors chunks
            # .... ego_layer_embeddings is a (n_factors)-len list of embeddings [n_users+n_items, embed_size/n_factors]
            # (torch.chunk splits into n_factors_l pieces; torch.split would treat the argument as a chunk size)
            ego_layer_embeddings = torch.chunk(ego_embeddings, n_factors_l, 1)
            ego_layer_embeddings_t = torch.chunk(ego_embeddings, n_factors_l, 1)

            # perform routing mechanism
            for t in range(0, n_iterations_l):
                iter_embeddings = []
                iter_embeddings_t = []
                A_iter_values = []

                # split the adjacency values & get three lists of [n_users+n_items, n_users+n_items] sparse tensors
                # .... A_factors is a (n_factors)-len list, each of which is an adjacency matrix
                # .... D_col_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. columns
                # .... D_row_factors is a (n_factors)-len list, each of which is a degree matrix w.r.t. rows
                if t == n_iterations_l - 1:
                    p_test = pick_
                    p_train = False

                A_factors, D_col_factors, D_row_factors = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick=p_train)
                A_factors_t, D_col_factors_t, D_row_factors_t = self._convert_A_values_to_A_factors_with_P(n_factors_l, A_values, pick=p_test)
                for i in range(0, n_factors_l):
                    # update the embeddings via simplified graph convolution layer
                    # .... D_col_factors[i] * A_factors[i] * D_col_factors[i] is the Laplacian matrix w.r.t. the i-th factor
                    # .... factor_embeddings is a dense tensor with the size of [n_users+n_items, embed_size/n_factors]
                    factor_embeddings = torch.sparse.mm(D_col_factors[i], ego_layer_embeddings[i])
                    factor_embeddings_t = torch.sparse.mm(D_col_factors_t[i], ego_layer_embeddings_t[i])

                    factor_embeddings_t = torch.sparse.mm(A_factors_t[i], factor_embeddings_t)
                    factor_embeddings = torch.sparse.mm(A_factors[i], factor_embeddings)

                    factor_embeddings = torch.sparse.mm(D_col_factors[i], factor_embeddings)
                    factor_embeddings_t = torch.sparse.mm(D_col_factors_t[i], factor_embeddings_t)

                    iter_embeddings.append(factor_embeddings)
                    iter_embeddings_t.append(factor_embeddings_t)

                    if t == n_iterations_l - 1:
                        layer_embeddings = iter_embeddings
                        layer_embeddings_t = iter_embeddings_t

                    # get the factor-wise embeddings
                    # .... head_factor_embeddings is a dense tensor with the size of [all_h_list, embed_size/n_factors]
                    # .... analogous to tail_factor_embeddings
                    # (plain row indexing replaces the earlier nn.Embedding misuse: these are lookups, not layers)
                    head_factor_embedings = factor_embeddings[self.all_h_list]
                    tail_factor_embedings = ego_layer_embeddings[i][self.all_t_list]

                    # .... constrain the vector length
                    # .... make the following attentive weights within the range of (0,1)
                    head_factor_embedings = F.normalize(head_factor_embedings, dim=1)
                    tail_factor_embedings = F.normalize(tail_factor_embedings, dim=1)

                    # get the attentive weights
                    # .... A_factor_values is a dense tensor with the size of [all_h_list,1]
                    A_factor_values = torch.sum(torch.mul(head_factor_embedings, torch.tanh(tail_factor_embedings)), dim=1)

                    # update the attentive weights
                    A_iter_values.append(A_factor_values)

                # pack (n_factors) adjacency values into one [n_factors, all_h_list] tensor
                A_iter_values = torch.stack(A_iter_values, 0)
                # add all layer-wise attentive weights up.
                A_values += A_iter_values

                if t == n_iterations_l - 1:
                    #layer_embeddings = iter_embeddings
                    output_factors_distribution.append(A_factors)

            # sum messages of neighbors, [n_users+n_items, embed_size]
            side_embeddings = torch.cat(layer_embeddings, 1)
            side_embeddings_t = torch.cat(layer_embeddings_t, 1)

            ego_embeddings = side_embeddings
            ego_embeddings_t = side_embeddings_t
            # concatenate outputs of all layers
            all_embeddings_t += [ego_embeddings_t]
            all_embeddings += [ego_embeddings]

        all_embeddings = torch.stack(all_embeddings, 1)
        all_embeddings = torch.mean(all_embeddings, dim=1, keepdim=False)

        all_embeddings_t = torch.stack(all_embeddings_t, 1)
        all_embeddings_t = torch.mean(all_embeddings_t, dim=1, keepdim=False)

        u_g_embeddings, i_g_embeddings = torch.split(all_embeddings, [self.n_users, self.n_items], 0)
        u_g_embeddings_t, i_g_embeddings_t = torch.split(all_embeddings_t, [self.n_users, self.n_items], 0)

        return u_g_embeddings, i_g_embeddings, output_factors_distribution, u_g_embeddings_t, i_g_embeddings_t
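    # ------------------------------------------------------------------
    # Shape walkthrough (illustrative only, with made-up numbers): with
    # embed_size = 64 and n_factors = 4, torch.chunk yields 4 factor tables of
    # size [n_users+n_items, 16]. For a single interaction (h, t) and factor i,
    # the attentive update above is simply:
    #
    #   e_h = F.normalize(factor_embeddings[h:h+1], dim=1)          # [1, 16]
    #   e_t = F.normalize(ego_layer_embeddings[i][t:t+1], dim=1)    # [1, 16]
    #   a_i = (e_h * torch.tanh(e_t)).sum(dim=1)                    # scalar score of factor i for edge (h, t)
    #
    # The n_factors scores collected for the same edge are then compared through
    # a softmax over the factor dimension in _convert_A_values_to_A_factors_with_P.
    # ------------------------------------------------------------------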

    def create_bpr_loss(self, users, pos_items, neg_items):
        pos_scores = torch.sum(torch.mul(users, pos_items), dim=1)
        # negative scores must be computed against the negative items (the original code reused pos_items here)
        neg_scores = torch.sum(torch.mul(users, neg_items), dim=1)

        regularizer = (torch.norm(self.u_g_embeddings_pre) ** 2 + torch.norm(self.pos_i_g_embeddings_pre) ** 2 +
                       torch.norm(self.neg_i_g_embeddings_pre) ** 2) / 2
        regularizer = regularizer / self.batch_size

        mf_loss = torch.mean(F.softplus(neg_scores - pos_scores))
        emb_loss = self.decay * regularizer

        return mf_loss, emb_loss

    def create_cor_loss(self, cor_u_embeddings, cor_i_embeddings):
        cor_loss = torch.zeros(1)

        if self.cor_flag == 0:
            return cor_loss

        ui_embeddings = torch.cat([cor_u_embeddings, cor_i_embeddings], 0)
        # chunk into n_factors sub-embeddings of size embed_size/n_factors
        ui_factor_embeddings = torch.chunk(ui_embeddings, self.n_factors, 1)

        for i in range(0, self.n_factors - 1):
            x = ui_factor_embeddings[i]
            y = ui_factor_embeddings[i + 1]
            cor_loss += self._create_distance_correlation(x, y)

        cor_loss /= ((self.n_factors + 1.0) * self.n_factors / 2)

        return cor_loss

    def model_save(self, path, dataset, savename='best_model'):
        save_pretrain_path = '%spretrain/%s/%s' % (path, dataset, savename)
        '''
        out_dir = '%spretrain/' % (path)

        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

        model_file = Path(save_pretrain_path)
        model_file.touch(exist_ok=True)

        print("Saving model...")
        torch.save(model.state_dict(), model_file)
        '''
        np.savez(save_pretrain_path,
                 user_embed=self.weights['user_embedding'].detach().cpu().numpy(),
                 item_embed=self.weights['item_embedding'].detach().cpu().numpy())

    def _create_distance_correlation(self, X1, X2):

        def _create_centered_distance(X):
            '''
            Used to calculate the distance matrix of N samples
            '''
            # calculate the pairwise distance of X
            # .... A with the size of [batch_size, embed_size/n_factors]
            # .... D with the size of [batch_size, batch_size]
            # X = tf.math.l2_normalize(XX, axis=1)
            r = torch.sum(torch.square(X), dim=1, keepdim=True)
            D = torch.sqrt(torch.clamp(r - 2 * torch.matmul(X, X.t()) + r.t(), min=0.0) + 1e-8)

            # calculate the centered distance of X
            # .... D with the size of [batch_size, batch_size]
            D = D - torch.mean(D, dim=0, keepdim=True) - torch.mean(D, dim=1, keepdim=True) \
                + torch.mean(D)
            return D

        def _create_distance_covariance(D1, D2):
            # calculate distance covariance between D1 and D2
            n_samples = float(D1.shape[0])
            dcov = torch.sqrt(torch.clamp(torch.sum(D1 * D2) / (n_samples * n_samples), min=0.0) + 1e-8)
            # dcov = torch.sqrt(torch.maximum(torch.sum(D1 * D2)) / n_samples)
            return dcov

        D1 = _create_centered_distance(X1)
        D2 = _create_centered_distance(X2)

        dcov_12 = _create_distance_covariance(D1, D2)
        dcov_11 = _create_distance_covariance(D1, D1)
        dcov_22 = _create_distance_covariance(D2, D2)

        # calculate the distance correlation
        dcor = dcov_12 / (torch.sqrt(torch.clamp(dcov_11 * dcov_22, min=0.0)) + 1e-10)
        # return tf.reduce_sum(D1) + tf.reduce_sum(D2)
        return dcor
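    # ------------------------------------------------------------------
    # Quick sanity check (illustrative, not part of the model): distance
    # correlation is ~1 when two factor embeddings carry the same information
    # and shrinks toward 0 when they are independent, which is why it serves as
    # an independence regularizer between factor chunks. Given any constructed
    # `model` (the helper never reads `self`):
    #
    #   x = torch.randn(128, 16)
    #   y = torch.randn(128, 16)                     # drawn independently of x
    #   model._create_distance_correlation(x, x)     # ~= 1.0
    #   model._create_distance_correlation(x, y)     # much smaller (it does not vanish exactly on finite samples)
    # ------------------------------------------------------------------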

    def _convert_A_values_to_A_factors_with_P(self, f_num, A_factor_values, pick=True):
        A_factors = []
        D_col_factors = []
        D_row_factors = []
        # get the indices of the adjacency matrix
        # torch.sparse_coo_tensor expects indices of shape [2, nnz]: one row of head ids, one row of tail ids
        A_indices = torch.LongTensor([self.all_h_list, self.all_t_list])
        D_indices = torch.LongTensor([list(range(self.n_users + self.n_items)),
                                      list(range(self.n_users + self.n_items))])

        # apply factor-aware softmax function over the values of adjacency matrix
        # .... A_factor_values is [n_factors, all_h_list]
        if pick:
            A_factor_scores = F.softmax(A_factor_values, 0)
            min_A = torch.min(A_factor_scores, dim=0).values
            index = A_factor_scores > (min_A + 0.0000001)
            index = index.type(torch.float32) * (self.pick_level - 1.0) + 1.0  # adjust the weight of the minimum factor to 1/self.pick_level

            A_factor_scores = A_factor_scores * index
            A_factor_scores = A_factor_scores / torch.sum(A_factor_scores, 0)
        else:
            A_factor_scores = F.softmax(A_factor_values, 0)

        for i in range(0, f_num):
            # in the i-th factor, couple the adjacency values with the adjacency indices
            # .... A_i_tensor is a sparse tensor with size of [n_users+n_items, n_users+n_items]
            A_i_scores = A_factor_scores[i]
            A_i_tensor = torch.sparse_coo_tensor(A_indices, A_i_scores, self.A_in_shape)

            # get the degree values of A_i_tensor
            # .... D_i_col_scores is [n_users+n_items, 1]
            # .... D_i_row_scores is [1, n_users+n_items]
            D_i_col_scores = 1 / torch.sqrt(torch.sparse.sum(A_i_tensor, dim=1).to_dense())
            D_i_row_scores = 1 / torch.sqrt(torch.sparse.sum(A_i_tensor, dim=0).to_dense())

            # couple the laplacian values with the adjacency indices
            # .... D_i_col_tensor / D_i_row_tensor are diagonal sparse tensors with size of [n_users+n_items, n_users+n_items]
            D_i_col_tensor = torch.sparse_coo_tensor(D_indices, D_i_col_scores, self.A_in_shape)
            D_i_row_tensor = torch.sparse_coo_tensor(D_indices, D_i_row_scores, self.A_in_shape)

            A_factors.append(A_i_tensor)
            D_col_factors.append(D_i_col_tensor)
            D_row_factors.append(D_i_row_tensor)

        # return a (n_factors)-length list of laplacian matrices
        return A_factors, D_col_factors, D_row_factors
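    # ------------------------------------------------------------------
    # Toy example (illustrative only): building one factor's normalized adjacency
    # for a graph with 2 users and 2 items (4 nodes; edges u0-i0, u0-i1, u1-i1,
    # stored in both directions, as in all_h_list / all_t_list):
    #
    #   h = [0, 0, 1, 2, 3, 3]
    #   t = [2, 3, 3, 0, 0, 1]
    #   vals = torch.ones(len(h))                                    # one factor's edge scores
    #   A = torch.sparse_coo_tensor(torch.LongTensor([h, t]), vals, (4, 4))
    #   d = 1 / torch.sqrt(torch.sparse.sum(A, dim=1).to_dense())    # D^{-1/2} diagonal values
    #   D = torch.sparse_coo_tensor(torch.LongTensor([list(range(4)), list(range(4))]), d, (4, 4))
    #   # propagation for this factor: D @ (A @ (D @ X)), exactly as in the routing loop above
    # ------------------------------------------------------------------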

    def forward(self):
        # create models
        self.ua_embeddings, self.ia_embeddings, self.f_weight, self.ua_embeddings_t, self.ia_embeddings_t = self._create_star_routing_embed_with_P(pick_=self.is_pick)
        # NOTE: the per-batch lookups (self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings,
        # self.cor_u_g_embeddings, ...) still need to be materialized from ua_embeddings / ia_embeddings for the
        # current batch, as in the commented-out block in __init__; otherwise the attributes used below do not exist.
        # Inference for the testing phase.
        self.batch_ratings = torch.matmul(self.u_g_embeddings_t, self.pos_i_g_embeddings_t.t())
        # Generate predictions & optimize via BPR loss.
        self.mf_loss, self.emb_loss = self.create_bpr_loss(self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings)
        # whether to use distance correlation
        if args.corDecay < 1e-9:
            self.cor_loss = torch.zeros(1)
        else:
            self.cor_loss = args.corDecay * self.create_cor_loss(self.cor_u_g_embeddings, self.cor_i_g_embeddings)
        self.loss = self.mf_loss + self.emb_loss + self.cor_loss

        return self.loss, self.mf_loss, self.emb_loss, self.cor_loss

def load_best(name="best_model"):
    pretrain_path = '%spretrain/%s/%s.npz' % (args.proj_path, args.dataset, name)
    try:
        # the embeddings are written with np.savez (see model_save), so load them with numpy, not torch.load
        pretrain_data = np.load(pretrain_path)
        print('load the best model: ', name)
    except Exception:
        pretrain_data = None
    return pretrain_data

def load_adjacency_list_data(adj_mat):
    tmp = adj_mat.tocoo()
    all_h_list = list(tmp.row)
    all_t_list = list(tmp.col)
    all_v_list = list(tmp.data)
    return all_h_list, all_t_list, all_v_list

def create_initial_A_values(n_factors, all_v_list):
    return np.array([all_v_list] * n_factors)

def sample_cor_samples(n_users, n_items, cor_batch_size):
    '''
    We have to sample some embedded representations out of all nodes.
    Because we have no way to store cor-distance for each pair.
    '''
    cor_users = rd.sample(list(range(n_users)), cor_batch_size)
    cor_items = rd.sample(list(range(n_items)), cor_batch_size)
    return cor_users, cor_items
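# ----------------------------------------------------------------------
# Illustration (not part of the original code): how the pretrained-embedding
# round trip fits together, assuming a trained `model` and the `config` built
# in __main__ below:
#
#   model.model_save(args.proj_path, args.dataset, savename=args.save_name)
#       # -> writes <proj_path>pretrain/<dataset>/<save_name>.npz with user_embed / item_embed arrays
#   data = load_best(name=args.save_name)
#       # -> numpy NpzFile exposing data['user_embed'] and data['item_embed']
#   warm_model = DGCF(data_config=config, pretrain_data=data)
#       # -> init_weights() converts the arrays back into nn.Parameters ('using pretrained initialization')
# ----------------------------------------------------------------------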

if __name__ == '__main__':
    whether_test_batch = True

    print("************************* Run with following settings 🏃 ***************************")
    print(args)
    print("************************************************************************************")

    GPU = torch.cuda.is_available()
    device = torch.device('cuda:' + str(args.gpu_id) if GPU else "cpu")
    CORES = multiprocessing.cpu_count() // 2

    config = dict()
    config['n_users'] = data_generator.n_users
    config['n_items'] = data_generator.n_items

    """
    *********************************************************
    Generate the Laplacian matrix, where each entry defines the decay factor (e.g., p_ui) between two connected nodes.
    """
    plain_adj, norm_adj, mean_adj, pre_adj = data_generator.get_adj_mat()

    all_h_list, all_t_list, all_v_list = load_adjacency_list_data(plain_adj)

    A_values_init = create_initial_A_values(args.n_factors, all_v_list)

    config['norm_adj'] = plain_adj
    config['all_h_list'] = all_h_list
    config['all_t_list'] = all_t_list

    t0 = time()
    """
    ***********************************************************
    pretrain = 1: load embeddings with name such as embedding_xxx(.npz), l2_best_model(.npz)
    pretrain = 0: default value, no pretrained embeddings.
    """
    if args.pretrain == 1:
        print("Try to load pretrain: ", args.embed_name)
        pretrain_data = load_best(name=args.embed_name)
        if pretrain_data is None:
            print("Load pretrained model (%s) failed!" % (args.embed_name))
    else:
        pretrain_data = None

    # build the model first, then the optimizer over its parameters
    # (the original code constructed the optimizer before the model existed and misspelled the class name as GDCF)
    model = DGCF(data_config=config, pretrain_data=pretrain_data).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    """
    *********************************************************
    Train
    """
    loss_loger, pre_loger, rec_loger, ndcg_loger, hit_loger = [], [], [], [], []
    cur_best_pre_0 = 0.  # best recall@Ks[0] so far, needed by early_stopping
    stopping_step = 0
    should_stop = False
    for epoch in range(args.epoch):
        t1 = time()
        loss, mf_loss, emb_loss, cor_loss = 0., 0., 0., 0.
        n_batch = data_generator.n_train // args.batch_size + 1
        cor_batch_size = int(max(data_generator.n_users / n_batch, data_generator.n_items / n_batch))

        for idx in range(n_batch):
            users, pos_items, neg_items = data_generator.sample()
            cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size)

            # standard PyTorch optimization step (missing in the original loop)
            optimizer.zero_grad()
            batch_loss, batch_mf_loss, batch_emb_loss, batch_cor_loss = model()
            batch_loss.backward()
            optimizer.step()

            loss += batch_loss.item() / n_batch
            mf_loss += batch_mf_loss.item() / n_batch
            emb_loss += batch_emb_loss.item() / n_batch
            cor_loss += batch_cor_loss.item() / n_batch

        if np.isnan(loss):
            print('ERROR: loss is nan.')
            print(mf_loss, emb_loss)
            sys.exit()

        # print the test evaluation metrics every show_step epochs; pos:neg = 1:10.
        if (epoch + 1) % args.show_step != 0:
            if args.verbose > 0 and epoch % args.verbose == 0:
                perf_str = 'Epoch %d [%.1fs]: train==[%.5f=%.5f + %.5f + %.5f]' % (epoch, time() - t1, loss, mf_loss, emb_loss, cor_loss)
                print(perf_str)
            # Skip testing
            continue

        loss_test, mf_loss_test, emb_loss_test, cor_loss_test = 0., 0., 0., 0.
        for idx in range(n_batch):
            cor_users, cor_items = sample_cor_samples(data_generator.n_users, data_generator.n_items, cor_batch_size)
            users, pos_items, neg_items = data_generator.sample_test()

            batch_loss_test, batch_mf_loss_test, batch_emb_loss_test, batch_cor_loss_test = model()

            loss_test += batch_loss_test.item() / n_batch
            mf_loss_test += batch_mf_loss_test.item() / n_batch
            emb_loss_test += batch_emb_loss_test.item() / n_batch
            cor_loss_test += batch_cor_loss_test.item() / n_batch

        t2 = time()
        users_to_test = list(data_generator.test_set.keys())
        ret = test(model, users_to_test, drop_flag=True, batch_test_flag=whether_test_batch)

        t3 = time()

        loss_loger.append(loss)
        rec_loger.append(ret['recall'])
        pre_loger.append(ret['precision'])
        ndcg_loger.append(ret['ndcg'])
        hit_loger.append(ret['hit_ratio'])

        if args.verbose > 0:
            perf_str = 'Epoch %d [%.1fs + %.1fs]: test==[%.5f=%.5f + %.5f + %.5f], recall=[%.5f, %.5f], ' \
                       'precision=[%.5f, %.5f], hit=[%.5f, %.5f], ndcg=[%.5f, %.5f]' % \
                       (epoch, t2 - t1, t3 - t2, loss_test, mf_loss_test, emb_loss_test, cor_loss_test, ret['recall'][0],
                        ret['recall'][-1],
                        ret['precision'][0], ret['precision'][-1], ret['hit_ratio'][0], ret['hit_ratio'][-1],
                        ret['ndcg'][0], ret['ndcg'][-1])
            print(perf_str)

        cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0, stopping_step, expected_order='acc', flag_step=args.early)

        # early stopping when cur_best_pre_0 is decreasing for given steps.
        if should_stop:
            break

        # *********************************************************
        # save the user & item embeddings for pretraining.
        if ret['recall'][0] == cur_best_pre_0 and args.save_flag == 1:
            model.model_save(args.proj_path, args.dataset, savename=args.save_name)
            print('save the model with performance: ', cur_best_pre_0)

    recs = np.array(rec_loger)
    pres = np.array(pre_loger)
    ndcgs = np.array(ndcg_loger)
    hit = np.array(hit_loger)

    best_rec_0 = max(recs[:, 0])
    idx = list(recs[:, 0]).index(best_rec_0)

    final_perf = "Best Iter=[%d]@[%.1f]\trecall=[%s], precision=[%s], hit=[%s], ndcg=[%s]" % \
                 (idx, time() - t0, '\t'.join(['%.5f' % r for r in recs[idx]]),
                  '\t'.join(['%.5f' % r for r in pres[idx]]),
                  '\t'.join(['%.5f' % r for r in hit[idx]]),
                  '\t'.join(['%.5f' % r for r in ndcgs[idx]]))
    print(final_perf)


--------------------------------------------------------------------------------
/Data/amazon-book/README.md:
--------------------------------------------------------------------------------
Looking for the full dataset? Please visit the [website](http://jmcauley.ucsd.edu/data/amazon).

--------------------------------------------------------------------------------
/Data/gowalla/README.md:
--------------------------------------------------------------------------------
Looking for the full dataset?
Please visit the [website](https://snap.stanford.edu/data/loc-gowalla.html).

--------------------------------------------------------------------------------
/Data/yelp2018/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
'''
Created on Apr , 2021

Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in:

Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020.

Note that: This implementation is based on the codes of NGCF.

@ Jisu Rho (jsroh1013@gmail.com)
'''

# Disentangled Graph Collaborative Filtering
This is a PyTorch implementation for the paper:

>Xiang Wang, Hongye Jin, An Zhang, Xiangnan He, Tong Xu, and Tat-Seng Chua (2020). Disentangled Graph Collaborative Filtering, [Paper in arXiv](https://arxiv.org/abs/2007.01764). In SIGIR'20, Xi'an, China, July 25-30, 2020.

Author: Dr. Xiang Wang (xiangwang at u.nus.edu)

## Introduction
Disentangled Graph Collaborative Filtering (DGCF) is an explainable recommendation framework, which is equipped with (1) the dynamic routing mechanism of capsule networks, to refine the strengths of user-item interactions in intent-aware graphs, (2) the embedding propagation mechanism of graph neural networks, to distill pertinent information from higher-order connectivity, and (3) the distance correlation of independence modeling, to ensure the independence among intents. As such, we explicitly disentangle the hidden intents of users in the representation learning.

## Environment Requirement
We recommend running this code on GPUs. The code has been tested running under Python 3.6.5.
The required packages are as follows: 29 | * torch == 1.4.0 30 | * scipy == 1.5.4 31 | * numpy == 1.16.1 32 | * sklearn == 0.24.1 33 | -------------------------------------------------------------------------------- /utility/batch_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr , 2021 3 | Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | @author: Jisu Rho (jsroh1013@gmail.com) 8 | ''' 9 | 10 | import utility.metrics as metrics 11 | from utility.parser import parse_args 12 | from utility.load_data import * 13 | import multiprocessing 14 | import heapq 15 | import pandas as pd 16 | #import tensorflow as tf 17 | import numpy as np 18 | 19 | cores = multiprocessing.cpu_count() // 2 20 | 21 | args = parse_args() 22 | Ks = eval(args.Ks) 23 | 24 | data_generator = Data(path=args.data_path + args.dataset, batch_size=args.batch_size) 25 | USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items 26 | N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test 27 | if args.dataset=='amazon-book': 28 | BATCH_SIZE = args.batch_size//4 29 | else: 30 | BATCH_SIZE = args.batch_size 31 | 32 | def ranklist_by_heapq(user_pos_test, test_items, rating, Ks): 33 | item_score = {} 34 | for i in test_items: 35 | item_score[i] = rating[i] 36 | 37 | K_max = max(Ks) 38 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 39 | 40 | r = [] 41 | for i in K_max_item_score: 42 | if i in user_pos_test: 43 | r.append(1) 44 | else: 45 | r.append(0) 46 | auc = 0. 
47 | return r, auc 48 | 49 | def get_auc(item_score, user_pos_test): 50 | item_score = sorted(item_score.items(), key=lambda kv: kv[1]) 51 | item_score.reverse() 52 | item_sort = [x[0] for x in item_score] 53 | posterior = [x[1] for x in item_score] 54 | 55 | r = [] 56 | for i in item_sort: 57 | if i in user_pos_test: 58 | r.append(1) 59 | else: 60 | r.append(0) 61 | auc = metrics.auc(ground_truth=r, prediction=posterior) 62 | return auc 63 | 64 | def ranklist_by_sorted(user_pos_test, test_items, rating, Ks): 65 | item_score = {} 66 | for i in test_items: 67 | item_score[i] = rating[i] 68 | 69 | K_max = max(Ks) 70 | K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get) 71 | 72 | r = [] 73 | for i in K_max_item_score: 74 | if i in user_pos_test: 75 | r.append(1) 76 | else: 77 | r.append(0) 78 | auc = get_auc(item_score, user_pos_test) 79 | return r, auc 80 | 81 | def get_performance(user_pos_test, r, auc, Ks): 82 | precision, recall, ndcg, hit_ratio = [], [], [], [] 83 | 84 | for K in Ks: 85 | precision.append(metrics.precision_at_k(r, K)) 86 | recall.append(metrics.recall_at_k(r, K, len(user_pos_test))) 87 | ndcg.append(metrics.ndcg_at_k(r, K, user_pos_test)) 88 | hit_ratio.append(metrics.hit_at_k(r, K)) 89 | 90 | return {'recall': np.array(recall), 'precision': np.array(precision), 91 | 'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc} 92 | 93 | 94 | def test_one_user(x): 95 | # user u's ratings for user u 96 | rating = x[0] 97 | #uid 98 | u = x[1] 99 | #user u's items in the training set 100 | try: 101 | training_items = data_generator.train_items[u] 102 | except Exception: 103 | training_items = [] 104 | #user u's items in the test set 105 | user_pos_test = data_generator.test_set[u] 106 | 107 | all_items = set(range(ITEM_NUM)) 108 | 109 | test_items = list(all_items - set(training_items)) 110 | 111 | if args.test_flag == 'part': 112 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 113 | else: 114 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 115 | 116 | return get_performance(user_pos_test, r, auc, Ks) 117 | 118 | def test_one_user_train(x): 119 | # user u's ratings for user u 120 | rating = x[0] 121 | # uid 122 | u = x[1] 123 | # user u's items in the training set 124 | 125 | training_items = [] 126 | # user u's items in the test set 127 | user_pos_test = data_generator.train_items[u] 128 | 129 | all_items = set(range(ITEM_NUM)) 130 | 131 | test_items = list(all_items - set(training_items)) 132 | 133 | if args.test_flag == 'part': 134 | r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks) 135 | else: 136 | r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks) 137 | 138 | return get_performance(user_pos_test, r, auc, Ks) 139 | 140 | def test(model, users_to_test, drop_flag=False, batch_test_flag=False,train_set_flag=0): 141 | result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)), 142 | 'hit_ratio': np.zeros(len(Ks)), 'auc': 0.} 143 | 144 | pool = multiprocessing.Pool(cores) 145 | 146 | u_batch_size = BATCH_SIZE * 2 147 | i_batch_size = BATCH_SIZE 148 | 149 | test_users = users_to_test 150 | n_test_users = len(test_users) 151 | n_user_batchs = n_test_users // u_batch_size + 1 152 | 153 | count = 0 154 | 155 | for u_batch_id in range(n_user_batchs): 156 | start = u_batch_id * u_batch_size 157 | end = (u_batch_id + 1) * u_batch_size 158 | 159 | user_batch = test_users[start: end] 160 | 161 | if batch_test_flag: 162 | #batch-item test 163 | 
n_item_batchs = ITEM_NUM // i_batch_size + 1 164 | rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM)) 165 | 166 | i_count = 0 167 | for i_batch_id in range(n_item_batchs): 168 | i_start = i_batch_id * i_batch_size 169 | i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM) 170 | 171 | item_batch = range(i_start, i_end) 172 | 173 | if drop_flag == False: 174 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch,item_batch,[],drop_flag=False) 175 | i_rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 176 | #i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 177 | else: 178 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch,item_batch,[],drop_flag=False) 179 | i_rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 180 | #i_rate_batch = sess.run(model.batch_ratings, {model.users: user_batch, model.pos_items: item_batch}) 181 | 182 | rate_batch[:, i_start: i_end] = i_rate_batch 183 | i_count += i_rate_batch.shape[1] 184 | 185 | assert i_count == ITEM_NUM 186 | 187 | else: 188 | item_batch = range(ITEM_NUM) 189 | if drop_flag == False: 190 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch,item_batch,[],drop_flag=False) 191 | rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 192 | #rate_batch, _1 = sess.run([model.batch_ratings, model._1], {model.users: user_batch, 193 | # model.pos_items: item_batch}) 194 | else: 195 | u_g_embeddings, pos_i_g_embeddings, _ = model(user_batch,item_batch,[],drop_flag=False) 196 | rate_batch = model.rating(u_g_embeddings, pos_i_g_embeddings).detach().cpu() 197 | #rate_batch, _1, _2 = sess.run([model.batch_ratings, model.print_pick, model.print_embed], {model.users: user_batch, 198 | # model.pos_items: item_batch}) 199 | 200 | 201 | user_batch_rating_uid = zip(rate_batch.numpy(), user_batch) 202 | 203 | if train_set_flag==0: 204 | batch_result = pool.map(test_one_user, user_batch_rating_uid) 205 | else: 206 | batch_result = pool.map(test_one_user_train, user_batch_rating_uid) 207 | count += len(batch_result) 208 | 209 | for re in batch_result: 210 | result['precision'] += re['precision']/n_test_users 211 | result['recall'] += re['recall']/n_test_users 212 | result['ndcg'] += re['ndcg']/n_test_users 213 | result['hit_ratio'] += re['hit_ratio']/n_test_users 214 | result['auc'] += re['auc']/n_test_users 215 | 216 | 217 | assert count == n_test_users 218 | pool.close() 219 | return result -------------------------------------------------------------------------------- /utility/helper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr , 2021 3 | Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | @author: Jisu Rho (jsroh1013@gmail.com) 8 | ''' 9 | 10 | __author__ = "xiangwang" 11 | import os 12 | import re 13 | 14 | def txt2list(file_src): 15 | orig_file = open(file_src, "r") 16 | lines = orig_file.readlines() 17 | return lines 18 | 19 | def ensureDir(dir_path): 20 | d = os.path.dirname(dir_path) 21 | if not os.path.exists(d): 22 | os.makedirs(d) 23 | 24 | def uni2str(unicode_str): 25 | return str(unicode_str.encode('ascii', 'ignore')).replace('\n', '').strip() 26 | 27 | def hasNumbers(inputString): 28 | return bool(re.search(r'\d', inputString)) 29 | 30 | def delMultiChar(inputString, chars): 31 | for ch in chars: 32 | inputString = inputString.replace(ch, '') 33 | return inputString 34 | 35 | def merge_two_dicts(x, y): 36 | z = x.copy() # start with x's keys and values 37 | z.update(y) # modifies z with y's keys and values & returns None 38 | return z 39 | 40 | def early_stopping(log_value, best_value, stopping_step, expected_order='acc', flag_step=100): 41 | # early stopping strategy: 42 | assert expected_order in ['acc', 'dec'] 43 | 44 | if (expected_order == 'acc' and log_value >= best_value) or (expected_order == 'dec' and log_value <= best_value): 45 | stopping_step = 0 46 | best_value = log_value 47 | else: 48 | stopping_step += 1 49 | 50 | if stopping_step >= flag_step: 51 | print("Early stopping is trigger at step: {} log:{}".format(flag_step, log_value)) 52 | should_stop = True 53 | else: 54 | should_stop = False 55 | return best_value, stopping_step, should_stop -------------------------------------------------------------------------------- /utility/load_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr , 2021 3 | Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 
6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | @author: Jisu Rho (jsroh1013@gmail.com) 8 | ''' 9 | 10 | import numpy as np 11 | import random as rd 12 | import scipy.sparse as sp 13 | from time import time 14 | 15 | class Data(object): 16 | def __init__(self, path, batch_size): 17 | self.path = path 18 | self.batch_size = batch_size 19 | 20 | train_file = path + '/train.txt' 21 | test_file = path + '/test.txt' 22 | 23 | #get number of users and items 24 | self.n_users, self.n_items = 0, 0 25 | self.n_train, self.n_test = 0, 0 26 | self.neg_pools = {} 27 | 28 | self.exist_users = [] 29 | 30 | with open(train_file) as f: 31 | for l in f.readlines(): 32 | if len(l) > 0: 33 | l = l.strip('\n').split(' ') 34 | items = [int(i) for i in l[1:]] 35 | uid = int(l[0]) 36 | self.exist_users.append(uid) 37 | self.n_items = max(self.n_items, max(items)) 38 | self.n_users = max(self.n_users, uid) 39 | self.n_train += len(items) 40 | 41 | with open(test_file) as f: 42 | for l in f.readlines(): 43 | if len(l) > 0: 44 | l = l.strip('\n') 45 | try: 46 | items = [int(i) for i in l.split(' ')[1:]] 47 | except Exception: 48 | continue 49 | self.n_items = max(self.n_items, max(items)) 50 | self.n_test += len(items) 51 | self.n_items += 1 52 | self.n_users += 1 53 | 54 | self.print_statistics() 55 | 56 | self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32) 57 | 58 | self.train_items, self.test_set = {}, {} 59 | with open(train_file) as f_train: 60 | with open(test_file) as f_test: 61 | for l in f_train.readlines(): 62 | if len(l) == 0: break 63 | l = l.strip('\n') 64 | items = [int(i) for i in l.split(' ')] 65 | uid, train_items = items[0], items[1:] 66 | 67 | for i in train_items: 68 | self.R[uid, i] = 1. 69 | # self.R[uid][i] = 1 70 | 71 | self.train_items[uid] = train_items 72 | 73 | for l in f_test.readlines(): 74 | if len(l) == 0: break 75 | l = l.strip('\n') 76 | try: 77 | items = [int(i) for i in l.split(' ')] 78 | except Exception: 79 | continue 80 | 81 | uid, test_items = items[0], items[1:] 82 | self.test_set[uid] = test_items 83 | 84 | def get_adj_mat(self): 85 | try: 86 | t1 = time() 87 | adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz') 88 | norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz') 89 | mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz') 90 | print('already load adj matrix', adj_mat.shape, time() - t1) 91 | 92 | except Exception: 93 | adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat() 94 | sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat) 95 | sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat) 96 | sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat) 97 | # pre_adj_mat ??? 98 | try: 99 | pre_adj_mat = sp.load_npz(self.path + '/s_pre_adj_mat.npz') 100 | except Exception: 101 | adj_mat=adj_mat 102 | rowsum = np.array(adj_mat.sum(1)) 103 | d_inv = np.power(rowsum, -0.5).flatten() 104 | d_inv[np.isinf(d_inv)] = 0. 
105 | d_mat_inv = sp.diags(d_inv) 106 | 107 | norm_adj = d_mat_inv.dot(adj_mat) 108 | norm_adj = norm_adj.dot(d_mat_inv) 109 | print('generate pre adjacency matrix.') 110 | pre_adj_mat = norm_adj.tocsr() 111 | sp.save_npz(self.path + '/s_pre_adj_mat.npz', norm_adj) 112 | 113 | 114 | return adj_mat, norm_adj_mat, mean_adj_mat, pre_adj_mat 115 | 116 | def create_adj_mat(self): 117 | t1 = time() 118 | adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32) 119 | adj_mat = adj_mat.tolil() 120 | R = self.R.tolil() 121 | 122 | adj_mat[:self.n_users, self.n_users:] = R 123 | adj_mat[self.n_users:, :self.n_users] = R.T 124 | adj_mat = adj_mat.todok() 125 | print('already create adjacency matrix', adj_mat.shape, time() - t1) 126 | 127 | t2 = time() 128 | 129 | def normalized_adj_single(adj): 130 | rowsum = np.array(adj.sum(1)) 131 | 132 | d_inv = np.power(rowsum, -1).flatten() 133 | d_inv[np.isinf(d_inv)] = 0. 134 | d_mat_inv = sp.diags(d_inv) 135 | 136 | norm_adj = d_mat_inv.dot(adj) 137 | # norm_adj = adj.dot(d_mat_inv) 138 | print('generate single-normalized adjacency matrix.') 139 | return norm_adj.tocoo() 140 | 141 | def check_adj_if_equal(adj): 142 | dense_A = np.array(adj.todense()) 143 | degree = np.sum(dense_A, axis=1, keepdims=False) 144 | 145 | temp = np.dot(np.diag(np.power(degree, -1)), dense_A) 146 | print('check normalized adjacency matrix whether equal to this laplacian matrix.') 147 | return temp 148 | 149 | norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0])) 150 | mean_adj_mat = normalized_adj_single(adj_mat) 151 | 152 | print('already normalize adjacency matrix', time() - t2) 153 | return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr() 154 | 155 | def negative_pool(self): 156 | t1 = time() 157 | for u in self.train_items.keys(): 158 | neg_items = list(set(range(self.n_items)) - set(self.train_items[u])) 159 | pools = [rd.choice(neg_items) for _ in range(100)] 160 | self.neg_pools[u] = pools 161 | print('refresh negative pools', time() - t1) 162 | 163 | def sample(self): 164 | if self.batch_size <= self.n_users: 165 | users = rd.sample(self.exist_users, self.batch_size) 166 | else: 167 | users = [rd.choice(self.exist_users) for _ in range(self.batch_size)] 168 | 169 | 170 | def sample_pos_items_for_u(u, num): 171 | #sample num pos items for u-th user 172 | pos_items = self.train_items[u] 173 | n_pos_items = len(pos_items) 174 | pos_batch = [] 175 | while True: 176 | if len(pos_batch) == num: 177 | break 178 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 179 | pos_i_id = pos_items[pos_id] 180 | 181 | if pos_i_id not in pos_batch: 182 | pos_batch.append(pos_i_id) 183 | return pos_batch 184 | 185 | def sample_neg_items_for_u(u, num): 186 | #sample num neg items for u-th user 187 | neg_items = [] 188 | while True: 189 | if len(neg_items) == num: 190 | break 191 | neg_id = np.random.randint(low=0, high=self.n_items,size=1)[0] 192 | if neg_id not in self.train_items[u] and neg_id not in neg_items: 193 | neg_items.append(neg_id) 194 | return neg_items 195 | 196 | def sample_neg_items_for_u_from_pools(u, num): 197 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 198 | return rd.sample(neg_items, num) 199 | 200 | pos_items, neg_items = [], [] 201 | for u in users: 202 | pos_items += sample_pos_items_for_u(u, 1) 203 | neg_items += sample_neg_items_for_u(u, 1) 204 | 205 | return users, pos_items, neg_items 206 | 207 | def sample_test(self): 208 | if self.batch_size <= 
self.n_users: 209 | users = rd.sample(self.test_set.keys(), self.batch_size) 210 | else: 211 | users = [rd.choice(self.exist_users) for _ in range(self.batch_size)] 212 | 213 | def sample_pos_items_for_u(u, num): 214 | pos_items = self.test_set[u] 215 | n_pos_items = len(pos_items) 216 | pos_batch = [] 217 | while True: 218 | if len(pos_batch) == num: 219 | break 220 | pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0] 221 | pos_i_id = pos_items[pos_id] 222 | 223 | if pos_i_id not in pos_batch: 224 | pos_batch.append(pos_i_id) 225 | return pos_batch 226 | 227 | def sample_neg_items_for_u(u, num): 228 | neg_items = [] 229 | while True: 230 | if len(neg_items) == num: 231 | break 232 | neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0] 233 | if neg_id not in (self.test_set[u]+self.train_items[u]) and neg_id not in neg_items: 234 | neg_items.append(neg_id) 235 | return neg_items 236 | 237 | def sample_neg_items_for_u_from_pools(u, num): 238 | neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u])) 239 | return rd.sample(neg_items, num) 240 | 241 | pos_items, neg_items = [], [] 242 | for u in users: 243 | pos_items += sample_pos_items_for_u(u, 1) 244 | neg_items += sample_neg_items_for_u(u, 1) 245 | 246 | return users, pos_items, neg_items 247 | 248 | def get_num_users_items(self): 249 | return self.n_users, self.n_items 250 | 251 | def print_statistics(self): 252 | print('n_users=%d, n_items=%d' % (self.n_users, self.n_items)) 253 | print('n_interactions=%d' % (self.n_train + self.n_test)) 254 | print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test)/(self.n_users * self.n_items))) 255 | 256 | def get_sparsity_split(self): 257 | try: 258 | split_uids, split_state = [], [] 259 | lines = open(self.path + '/sparsity.split', 'r').readlines() 260 | 261 | for idx, line in enumerate(lines): 262 | if idx % 2 == 0: 263 | split_state.append(line.strip()) 264 | print(line.strip()) 265 | else: 266 | split_uids.append([int(uid) for uid in line.strip().split(' ')]) 267 | print('get sparsity split.') 268 | 269 | except Exception: 270 | split_uids, split_state = self.create_sparsity_split() 271 | f = open(self.path + '/sparsity.split', 'w') 272 | for idx in range(len(split_state)): 273 | f.write(split_state[idx] + '\n') 274 | f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n') 275 | print('create sparsity split.') 276 | 277 | return split_uids, split_state 278 | 279 | def create_sparsity_split(self): 280 | all_users_to_test = list(self.test_set.keys()) 281 | user_n_iid = dict() 282 | 283 | # generate a dictionary to store (key=n_iids, value=a list of uid). 284 | for uid in all_users_to_test: 285 | train_iids = self.train_items[uid] 286 | test_iids = self.test_set[uid] 287 | 288 | n_iids = len(train_iids) + len(test_iids) 289 | 290 | if n_iids not in user_n_iid.keys(): 291 | user_n_iid[n_iids] = [uid] 292 | else: 293 | user_n_iid[n_iids].append(uid) 294 | split_uids = list() 295 | 296 | # split the whole user set into four subset. 
297 | temp = [] 298 | count = 1 299 | fold = 4 300 | n_count = (self.n_train + self.n_test) 301 | n_rates = 0 302 | 303 | split_state = [] 304 | for idx, n_iids in enumerate(sorted(user_n_iid)): 305 | temp += user_n_iid[n_iids] 306 | n_rates += n_iids * len(user_n_iid[n_iids]) 307 | n_count -= n_iids * len(user_n_iid[n_iids]) 308 | 309 | if n_rates >= count * 0.25 * (self.n_train + self.n_test): 310 | split_uids.append(temp) 311 | 312 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' %(n_iids, len(temp), n_rates) 313 | split_state.append(state) 314 | print(state) 315 | 316 | temp = [] 317 | n_rates = 0 318 | fold -= 1 319 | 320 | if idx == len(user_n_iid.keys()) - 1 or n_count == 0: 321 | split_uids.append(temp) 322 | 323 | state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates) 324 | split_state.append(state) 325 | print(state) 326 | 327 | 328 | 329 | return split_uids, split_state -------------------------------------------------------------------------------- /utility/metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr , 2021 3 | Pytorch Implementation of Disentangled Graph Collaborative Filtering (DGCF) model in: 4 | Wang Xiang et al. Disentangled Graph Collaborative Filtering. In SIGIR 2020. 5 | Note that: This implementation is based on the codes of NGCF. 6 | @author: Xiang Wang (xiangwang@u.nus.edu) 7 | @author: Jisu Rho (jsroh1013@gmail.com) 8 | ''' 9 | 10 | import numpy as np 11 | from sklearn.metrics import roc_auc_score 12 | 13 | def recall(rank, ground_truth, N): 14 | return len(set(rank[:N]) & set(ground_truth)) / float(len(set(ground_truth))) 15 | 16 | def precision_at_k(r, k): 17 | """Score is precision @ k 18 | Relevance is binary (nonzero is relevant). 19 | Returns: 20 | Precision @ k 21 | Raises: 22 | ValueError: len(r) must be >= k 23 | """ 24 | assert k >= 1 25 | r = np.asarray(r)[:k] 26 | return np.mean(r) 27 | 28 | def average_precision(r,cut): 29 | """Score is average precision (area under PR curve) 30 | Relevance is binary (nonzero is relevant). 31 | Returns: 32 | Average precision 33 | """ 34 | r = np.asarray(r) 35 | out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]] 36 | if not out: 37 | return 0. 38 | return np.sum(out)/float(min(cut, np.sum(r))) 39 | 40 | def mean_average_precision(rs): 41 | """Score is mean average precision 42 | Relevance is binary (nonzero is relevant). 43 | Returns: 44 | Mean average precision 45 | """ 46 | return np.mean([average_precision(r) for r in rs]) 47 | 48 | def dcg_at_k(r, k, method=1): 49 | """Score is discounted cumulative gain (dcg) 50 | Relevance is positive real values. Can use binary 51 | as the previous methods. 52 | Returns: 53 | Discounted cumulative gain 54 | """ 55 | r = np.asfarray(r)[:k] 56 | if r.size: 57 | if method == 0: 58 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 59 | elif method == 1: 60 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 61 | else: 62 | raise ValueError('method must be 0 or 1.') 63 | return 0. 64 | 65 | def ndcg_at_k(r, k, ground_truth, method=1): 66 | """Score is normalized discounted cumulative gain (ndcg) 67 | Relevance is positive real values. Can use binary 68 | as the previous methods. 
69 | Returns: 70 | Normalized discounted cumulative gain 71 | Low but correct defination 72 | """ 73 | GT = set(ground_truth) 74 | if len(GT) > k : 75 | sent_list = [1.0] * k 76 | else: 77 | sent_list = [1.0]*len(GT) + [0.0]*(k-len(GT)) 78 | dcg_max = dcg_at_k(sent_list, k, method) 79 | if not dcg_max: 80 | return 0. 81 | return dcg_at_k(r, k, method) / dcg_max 82 | 83 | def recall_at_k(r, k, all_pos_num): 84 | #if all_pos_num == 0: 85 | #return 0 86 | r = np.asfarray(r)[:k] 87 | return np.sum(r) / all_pos_num 88 | 89 | def hit_at_k(r, k): 90 | r = np.array(r)[:k] 91 | if np.sum(r) > 0: 92 | return 1. 93 | else: 94 | return 0. 95 | 96 | def F1(pre, rec): 97 | if pre + rec > 0: 98 | return (2.0 * pre * rec) / (pre + rec) 99 | else: 100 | return 0. 101 | 102 | def auc(ground_truth, prediction): 103 | try: 104 | res = roc_auc_score(y_true=ground_truth, y_score=prediction) 105 | except Exception: 106 | res = 0. 107 | return res -------------------------------------------------------------------------------- /utility/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_args(): 4 | parser = argparse.ArgumentParser(description="Run DGCF.") 5 | parser.add_argument('--data_path', nargs='?', default='../Data/', 6 | help='Input data path.') 7 | parser.add_argument('--proj_path', nargs='?', default='../', 8 | help='Project path.') 9 | 10 | parser.add_argument('--pick', type=int, default=0, 11 | help='O for no pick, 1 for pick') 12 | parser.add_argument('--pick_scale', type=float, default=1e10, 13 | help='Scale') 14 | parser.add_argument('--dataset', nargs='?', default='gowalla', 15 | help='Choose a dataset from {gowalla, yelp2018, amazon-book}') 16 | parser.add_argument('--pretrain', type=int, default=0, 17 | help='0: No pretrain, 1:Use stored models.') 18 | parser.add_argument('--embed_name', nargs='?', default='', 19 | help='Name for pretrained model.') 20 | parser.add_argument('--verbose', type=int, default=1, 21 | help='Interval of evaluation.') 22 | 23 | 24 | parser.add_argument('--epoch', type=int, default=3000, 25 | help='Number of epochs') 26 | parser.add_argument('--embed_size', type=int, default=64, 27 | help='Embedding size.') 28 | parser.add_argument('--layer_size', nargs='?', default='[64]', 29 | help='Output sizes of every layer') 30 | parser.add_argument('--batch_size', type=int, default=1024, 31 | help='Batch size.') 32 | parser.add_argument('--lr', type=float, default=0.01, 33 | help='Learning rate.') 34 | parser.add_argument('--cor_flag', type=int, default=1, 35 | help='Correlation matrix flag') 36 | parser.add_argument('--corDecay', type=float, default=0.0, 37 | help='Distance Correlation Weight') 38 | parser.add_argument('--regs', nargs='?', default='[1e-3,1e-4,1e-4]', 39 | help='Regularizations.') 40 | parser.add_argument('--gpu_id', type=int, default=0) 41 | parser.add_argument('--multicore', type=int, default=0, help='whether we use multiprocessing or not in test') 42 | 43 | parser.add_argument('--n_layers', type=int, default=1, 44 | help='Layer numbers.') 45 | parser.add_argument('--n_factors', type=int, default=4, 46 | help='Number of factors to disentangle the original embed-size representation.') 47 | parser.add_argument('--n_iterations', type=int, default=2, 48 | help='Number of iterations to perform the routing mechanism.') 49 | 50 | 51 | parser.add_argument('--show_step', type=int, default=15, 52 | help='Test every show_step epochs.') 53 | parser.add_argument('--early', type=int, default=40, 54 | 
help='Step for stopping') 55 | parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]', 56 | help='Metrics scale') 57 | 58 | parser.add_argument('--save_flag', type=int, default=0, 59 | help='0: Disable model saver, 1: Save Better Model') 60 | parser.add_argument('--save_name', nargs='?', default='best_model', 61 | help='Save_name.') 62 | 63 | parser.add_argument('--test_flag', nargs='?', default='part', 64 | help='Specify the test type from {part, full}, indicating whether the reference is done in mini-batch') 65 | 66 | 67 | return parser.parse_args() --------------------------------------------------------------------------------
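
--------------------------------------------------------------------------------
Example run (illustrative only; every flag below is defined in utility/parser.py, and the values are placeholders to adjust per experiment — the --data_path/--proj_path defaults point one directory up, so they are overridden here for a launch from the repository root):

    python DGCF.py --data_path ./Data/ --proj_path ./ --dataset gowalla --embed_size 64 --n_layers 1 --n_factors 4 --n_iterations 2 --batch_size 1024 --lr 0.01 --cor_flag 1 --corDecay 0.01 --show_step 15 --early 40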