├── base ├── __init__.py ├── seq_recommender.py ├── ssl_interface.py ├── torch_interface.py ├── tf_interface.py ├── recommender.py └── graph_recommender.py ├── conf ├── __init__.py └── CPTPP.conf ├── data ├── __init__.py ├── feature.py ├── data.py ├── sequence.py ├── graph.py ├── augmentor.py ├── loader.py ├── social.py └── ui_graph.py ├── dataset ├── gowalla │ └── process.py ├── douban │ └── split.py └── ml-1M │ └── split.py ├── README.MD ├── util ├── logger.py ├── loss_tf.py ├── loss_torch.py ├── structure.py ├── conf.py ├── sampler.py ├── algorithm.py └── evaluation.py ├── main.py ├── SELFRec.py └── model └── graph └── CPTPP.py /base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/feature.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/seq_recommender.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/ssl_interface.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/data.py: -------------------------------------------------------------------------------- 1 | class Data(object): 2 | def __init__(self, conf, training, test): 3 | self.config = conf 4 | self.training_data = training[:] 5 | self.test_data = test[:] 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /dataset/gowalla/process.py: -------------------------------------------------------------------------------- 1 | record = [] 2 | with open('train.txt') as f: 3 | for line in f: 4 | items = line.strip().split() 5 | for i in items[1:]: 6 | record.append(items[0]+' '+i+' 1\n') 7 | with open('train.txt','w') as f: 8 | f.writelines(record) -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | The source codes take [SELFRec](https://github.com/Coder-Yu/SELFRec) as the backbone to implement baselines and our proposed method. Please follow the detailed instructions in SELFRec to run the codes. The hyper-parameter settings are provided in our paper. 2 | 3 | Please cite both our work and [SELFRec](https://github.com/Coder-Yu/SELFRec) if you would like to use our source codes. 
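For a quick start with the configuration shipped in this repository (`./conf/CPTPP.conf`, whose active block points at `./dataset/gowalla/train.txt`), the model can be launched either by running `python main.py` and entering `CPTPP` at the prompt, or programmatically. The snippet below is a minimal sketch that mirrors what `main.py` does after the prompt:

```python
# Equivalent to running `python main.py` and typing "CPTPP" at the interactive prompt.
from SELFRec import SELFRec
from util.conf import ModelConf

conf = ModelConf('./conf/CPTPP.conf')  # dataset paths and hyper-parameters live here
SELFRec(conf).execute()                # imports model/graph/CPTPP.py, then pre-trains and trains
```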
-------------------------------------------------------------------------------- /base/torch_interface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class TorchGraphInterface(object): 4 | def __init__(self): 5 | pass 6 | 7 | @staticmethod 8 | def convert_sparse_mat_to_tensor(X): 9 | coo = X.tocoo() 10 | i = torch.LongTensor([coo.row, coo.col]) 11 | v = torch.from_numpy(coo.data).float() 12 | return torch.sparse.FloatTensor(i, v, coo.shape) -------------------------------------------------------------------------------- /dataset/douban/split.py: -------------------------------------------------------------------------------- 1 | import random 2 | random.seed(12345) 3 | train = [] 4 | test = [] 5 | test_ratio=0.2 6 | with open('ratings.txt') as f: 7 | for line in f: 8 | items = line.strip().split() 9 | if random.random()>test_ratio: 10 | train.append(line) 11 | else: 12 | test.append(line) 13 | 14 | with open('train.txt','w') as f: 15 | f.writelines(train) 16 | 17 | with open('test.txt','w') as f: 18 | f.writelines(test) 19 | 20 | -------------------------------------------------------------------------------- /dataset/ml-1M/split.py: -------------------------------------------------------------------------------- 1 | import random 2 | train = [] 3 | test = [] 4 | with open('ratings.dat') as f: 5 | for line in f: 6 | items = line.strip().split('::') 7 | new_line = ' '.join(items[:-1])+'\n' 8 | if int(items[-2])<4: 9 | continue 10 | if random.random() > 0.2: 11 | train.append(new_line) 12 | else: 13 | test.append(new_line) 14 | 15 | with open('train.txt','w') as f: 16 | f.writelines(train) 17 | 18 | with open('test.txt','w') as f: 19 | f.writelines(test) -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | 5 | class Log(object): 6 | def __init__(self,module,filename): 7 | self.logger = logging.getLogger(module) 8 | self.logger.setLevel(level=logging.INFO) 9 | if not os.path.exists('./log/'): 10 | os.makedirs('./log/') 11 | handler = logging.FileHandler('./log/'+filename+'.log') 12 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | handler.setFormatter(formatter) 14 | self.logger.addHandler(handler) 15 | 16 | def add(self,text): 17 | self.logger.info(text) 18 | -------------------------------------------------------------------------------- /base/tf_interface.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class TFGraphInterface(object): 6 | def __init__(self): 7 | pass 8 | 9 | @staticmethod 10 | def convert_sparse_mat_to_tensor(adj): 11 | row, col = adj.nonzero() 12 | indices = np.array(list(zip(row, col))) 13 | adj_tensor = tf.SparseTensor(indices=indices, values=adj.data, dense_shape=adj.shape) 14 | return adj_tensor 15 | 16 | @staticmethod 17 | def convert_sparse_mat_to_tensor_inputs(X): 18 | coo = X.tocoo() 19 | indices = np.mat([coo.row, coo.col]).transpose() 20 | return indices, coo.data, coo.shape -------------------------------------------------------------------------------- /data/sequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from data.data import Data 3 | 4 | 5 | class Sequence(Data): 6 | def __init__(self, conf, training, test): 7 | 
super(Sequence, self).__init__(conf, training, test) 8 | self.item = {} 9 | self.id2item = {} 10 | self.__generate_set() 11 | self.raw_seq_num = len(self.training_data) 12 | self.item_num = len(self.item) 13 | 14 | def __generate_set(self): 15 | for seq in self.training_data: 16 | for item in seq: 17 | if item not in self.item: 18 | self.item[item] = len(self.item) 19 | self.id2item[self.item[item]] = item 20 | 21 | 22 | 23 | def get_item_id(self, i): 24 | if i in self.item: 25 | return self.item[i] 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /data/graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | 5 | class Graph(object): 6 | def __init__(self): 7 | pass 8 | 9 | @staticmethod 10 | def normalize_graph_mat(adj_mat): 11 | shape = adj_mat.get_shape() 12 | rowsum = np.array(adj_mat.sum(1)) 13 | if shape[0] == shape[1]: 14 | d_inv = np.power(rowsum, -0.5).flatten() 15 | d_inv[np.isinf(d_inv)] = 0. 16 | d_mat_inv = sp.diags(d_inv) 17 | norm_adj_tmp = d_mat_inv.dot(adj_mat) 18 | norm_adj_mat = norm_adj_tmp.dot(d_mat_inv) 19 | else: 20 | d_inv = np.power(rowsum, -1).flatten() 21 | d_inv[np.isinf(d_inv)] = 0. 22 | d_mat_inv = sp.diags(d_inv) 23 | norm_adj_mat = d_mat_inv.dot(adj_mat) 24 | return norm_adj_mat 25 | 26 | def convert_to_laplacian_mat(self, adj_mat): 27 | pass 28 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from SELFRec import SELFRec 2 | from util.conf import ModelConf 3 | 4 | if __name__ == '__main__': 5 | # Register your model here 6 | baseline = ['LightGCN','MF'] 7 | graph_models = ['SGL', 'SimGCL', 'BUIR', 'SelfCF', 'NCL', 'CPTPP'] 8 | sequential_models = [] 9 | 10 | print('=' * 80) 11 | print(' SELFRec: A library for self-supervised recommendation. 
') 12 | print('=' * 80) 13 | 14 | print('Baseline Models:') 15 | print(' '.join(baseline)) 16 | print('-' * 80) 17 | print('Graph-Based Models:') 18 | print(' '.join(graph_models)) 19 | 20 | print('=' * 80) 21 | model = input('Please enter the model you want to run:') 22 | import time 23 | 24 | s = time.time() 25 | if model in baseline or model in graph_models or model in sequential_models: 26 | conf = ModelConf('./conf/' + model + '.conf') 27 | else: 28 | print('Wrong model name!') 29 | exit(-1) 30 | rec = SELFRec(conf) 31 | rec.execute() 32 | e = time.time() 33 | print("Running time: %f s" % (e - s)) 34 | -------------------------------------------------------------------------------- /conf/CPTPP.conf: -------------------------------------------------------------------------------- 1 | # training.set=./dataset/ml-1M/train.txt 2 | # test.set=./dataset/ml-1M/test.txt 3 | # model.name=CPTPP 4 | # model.type=graph 5 | # item.ranking=-topN 5,20 6 | # embbedding.size=64 7 | # num.max.preepoch=10 8 | # num.max.epoch=100 9 | # batch_size=512 10 | # learnRate=0.003 11 | # reg.lambda=0.0001 12 | # CPTPP=-n_layer 2 -lambda 0.1 -droprate 0.1 -augtype 1 -temp 0.2 -inputs_type 2 -prompt_size 256 13 | # output.setup=-dir ./results/ 14 | 15 | 16 | # training.set=./dataset/douban/train.txt 17 | # test.set=./dataset/douban/test.txt 18 | # model.name=CPTPP 19 | # model.type=graph 20 | # item.ranking=-topN 5,20 21 | # embbedding.size=64 22 | # num.max.preepoch=10 23 | # num.max.epoch=100 24 | # batch_size=512 25 | # learnRate=0.001 26 | # reg.lambda=0.0001 27 | # CPTPP=-n_layer 2 -lambda 0.1 -droprate 0.1 -augtype 1 -temp 0.2 -inputs_type 2 -prompt_size 256 28 | # output.setup=-dir ./results/ 29 | 30 | 31 | training.set=./dataset/gowalla/train.txt 32 | test.set=./dataset/gowalla/test.txt 33 | model.name=CPTPP 34 | model.type=graph 35 | item.ranking=-topN 5,20 36 | embbedding.size=64 37 | num.max.preepoch=10 38 | num.max.epoch=100 39 | batch_size=2048 40 | learnRate=0.001 41 | reg.lambda=0.0001 42 | CPTPP=-n_layer 2 -lambda 0.1 -droprate 0.1 -augtype 1 -temp 0.2 -inputs_type 2 -prompt_size 256 43 | output.setup=-dir ./results/ -------------------------------------------------------------------------------- /util/loss_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def bpr_loss(user_emb, pos_item_emb, neg_item_emb): 5 | score = tf.reduce_sum(tf.multiply(user_emb, pos_item_emb), 1) - tf.reduce_sum(tf.multiply(user_emb, neg_item_emb), 1) 6 | loss = -tf.reduce_sum(tf.log(tf.sigmoid(score) + 10e-8)) 7 | return loss 8 | 9 | 10 | def InfoNCE(view1, view2, temperature): 11 | pos_score = tf.reduce_sum(tf.multiply(view1, view2), axis=1) 12 | ttl_score = tf.matmul(view1, view2, transpose_a=False, transpose_b=True) 13 | pos_score = tf.exp(pos_score / temperature) 14 | ttl_score = tf.reduce_sum(tf.exp(ttl_score / temperature), axis=1) 15 | cl_loss = -tf.reduce_sum(tf.log(pos_score / ttl_score)) 16 | return cl_loss 17 | 18 | 19 | # Sampled Softmax 20 | def ssm_loss(user_emb, pos_item_emb, neg_item_emb): 21 | user_emb = tf.nn.l2_normalize(user_emb, 1) 22 | pos_item_emb = tf.nn.l2_normalize(pos_item_emb, 1) 23 | neg_item_emb = tf.nn.l2_normalize(neg_item_emb, 1) 24 | pos_score = tf.reduce_sum(tf.multiply(user_emb, pos_item_emb), 1) 25 | ttl_score = tf.matmul(user_emb, neg_item_emb, transpose_a=False, transpose_b=True) 26 | ttl_score = tf.concat([tf.reshape(pos_score, (-1, 1)), ttl_score], axis=1) 27 | pos_score = tf.exp(pos_score / 0.2) 28 | 
ttl_score = tf.reduce_sum(tf.exp(ttl_score / 0.2), axis=1) 29 | return -tf.reduce_mean(tf.log(pos_score / ttl_score)) 30 | -------------------------------------------------------------------------------- /SELFRec.py: -------------------------------------------------------------------------------- 1 | from data.loader import FileIO 2 | 3 | 4 | class SELFRec(object): 5 | def __init__(self, config): 6 | self.social_data = [] 7 | self.feature_data = [] 8 | self.config = config 9 | if config['model.type'] == 'sequential': 10 | self.training_data, self.test_data = FileIO.load_data_set(config['sequence.data'], config['model.type']) 11 | else: 12 | self.training_data = FileIO.load_data_set(config['training.set'], config['model.type']) 13 | self.test_data = FileIO.load_data_set(config['test.set'], config['model.type']) 14 | 15 | self.kwargs = {} 16 | if config.contain('social.data'): 17 | social_data = FileIO.load_social_data(self.config['social.data']) 18 | self.kwargs['social.data'] = social_data 19 | # if config.contains('feature.data'): 20 | # self.social_data = FileIO.loadFeature(config,self.config['feature.data']) 21 | print('Reading data and preprocessing...') 22 | 23 | def execute(self): 24 | # import the model module 25 | import_str = 'from model.'+ self.config['model.type'] +'.' + self.config['model.name'] + ' import ' + self.config['model.name'] 26 | exec(import_str) 27 | recommender = self.config['model.name'] + '(self.config,self.training_data,self.test_data,**self.kwargs)' 28 | eval(recommender).execute() 29 | -------------------------------------------------------------------------------- /data/augmentor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import scipy.sparse as sp 4 | 5 | class GraphAugmentor(object): 6 | def __init__(self): 7 | pass 8 | 9 | @staticmethod 10 | def node_dropout(sp_adj, drop_rate): 11 | """Input: a sparse adjacency matrix and a dropout rate.""" 12 | adj_shape = sp_adj.get_shape() 13 | row_idx, col_idx = sp_adj.nonzero() 14 | drop_user_idx = random.sample(range(adj_shape[0]), int(adj_shape[0] * drop_rate)) 15 | drop_item_idx = random.sample(range(adj_shape[1]), int(adj_shape[1] * drop_rate)) 16 | indicator_user = np.ones(adj_shape[0], dtype=np.float32) 17 | indicator_item = np.ones(adj_shape[1], dtype=np.float32) 18 | indicator_user[drop_user_idx] = 0. 19 | indicator_item[drop_item_idx] = 0. 
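        # The 0/1 keep-indicators built above are expanded into diagonal matrices below;
        # left-multiplying the binary interaction matrix by the user diagonal zeroes the
        # rows of dropped users, and right-multiplying by the item diagonal zeroes the
        # columns of dropped items, so node dropout reduces to two sparse matrix products.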
20 | diag_indicator_user = sp.diags(indicator_user) 21 | diag_indicator_item = sp.diags(indicator_item) 22 | mat = sp.csr_matrix( 23 | (np.ones_like(row_idx, dtype=np.float32), (row_idx, col_idx)), 24 | shape=(adj_shape[0], adj_shape[1])) 25 | mat_prime = diag_indicator_user.dot(mat).dot(diag_indicator_item) 26 | return mat_prime 27 | 28 | @staticmethod 29 | def edge_dropout(sp_adj, drop_rate): 30 | """Input: a sparse user-item adjacency matrix and a dropout rate.""" 31 | adj_shape = sp_adj.get_shape() 32 | edge_count = sp_adj.count_nonzero() 33 | row_idx, col_idx = sp_adj.nonzero() 34 | keep_idx = random.sample(range(edge_count), int(edge_count * (1 - drop_rate))) 35 | user_np = np.array(row_idx)[keep_idx] 36 | item_np = np.array(col_idx)[keep_idx] 37 | edges = np.ones_like(user_np, dtype=np.float32) 38 | dropped_adj = sp.csr_matrix((edges, (user_np, item_np)), shape=adj_shape) 39 | return dropped_adj 40 | 41 | -------------------------------------------------------------------------------- /util/loss_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def bpr_loss(user_emb, pos_item_emb, neg_item_emb): 6 | pos_score = torch.mul(user_emb, pos_item_emb).sum(dim=1) 7 | neg_score = torch.mul(user_emb, neg_item_emb).sum(dim=1) 8 | loss = -torch.log(10e-8 + torch.sigmoid(pos_score - neg_score)) 9 | return torch.mean(loss) 10 | 11 | 12 | def l2_reg_loss(reg, *args): 13 | emb_loss = 0 14 | for emb in args: 15 | emb_loss += torch.norm(emb, p=2) 16 | return emb_loss * reg 17 | 18 | 19 | def batch_softmax_loss(user_emb, item_emb, temperature): 20 | user_emb, item_emb = F.normalize(user_emb, dim=1), F.normalize(item_emb, dim=1) 21 | pos_score = (user_emb * item_emb).sum(dim=-1) 22 | pos_score = torch.exp(pos_score / temperature) 23 | ttl_score = torch.matmul(user_emb, item_emb.transpose(0, 1)) 24 | ttl_score = torch.exp(ttl_score / temperature).sum(dim=1) 25 | loss = -torch.log(pos_score / ttl_score) 26 | return torch.mean(loss) 27 | 28 | 29 | def InfoNCE(view1, view2, temperature): 30 | view1, view2 = F.normalize(view1, dim=1), F.normalize(view2, dim=1) 31 | pos_score = (view1 * view2).sum(dim=-1) 32 | pos_score = torch.exp(pos_score / temperature) 33 | ttl_score = torch.matmul(view1, view2.transpose(0, 1)) 34 | ttl_score = torch.exp(ttl_score / temperature).sum(dim=1) 35 | cl_loss = -torch.log(pos_score / ttl_score) 36 | return torch.mean(cl_loss) 37 | 38 | 39 | def kl_divergence(p_logit, q_logit): 40 | p = F.softmax(p_logit, dim=-1) 41 | kl = torch.sum(p * (F.log_softmax(p_logit, dim=-1) - F.log_softmax(q_logit, dim=-1)), 1) 42 | return torch.mean(kl) 43 | 44 | def js_divergence(p_logit, q_logit): 45 | p = F.softmax(p_logit, dim=-1) 46 | q = F.softmax(q_logit, dim=-1) 47 | kl_p = torch.sum(p * (F.log_softmax(p_logit, dim=-1) - F.log_softmax(q_logit, dim=-1)), 1) 48 | kl_q = torch.sum(q * (F.log_softmax(q_logit, dim=-1) - F.log_softmax(p_logit, dim=-1)), 1) 49 | return torch.mean(kl_p+kl_q) -------------------------------------------------------------------------------- /util/structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class SparseMatrix(): 5 | def __init__(self,triple): 6 | self.matrix_user = {} 7 | self.matrix_item = {} 8 | for item in triple: 9 | if item[0] not in self.matrix_user: 10 | self.matrix_user[item[0]] = {} 11 | if item[1] not in self.matrix_item: 12 | self.matrix_item[item[1]] = {} 13 | 
self.matrix_user[item[0]][item[1]] = item[2] 14 | self.matrix_item[item[1]][item[0]] = item[2] 15 | self.elemNum = len(triple) 16 | self.size = len(self.matrix_user), len(self.matrix_item) 17 | 18 | def row(self,r): 19 | if r not in self.matrix_user: 20 | return {} 21 | else: 22 | return self.matrix_user[r] 23 | 24 | def col(self,c): 25 | if c not in self.matrix_item: 26 | return {} 27 | else: 28 | return self.matrix_item[c] 29 | 30 | def dense_row(self,r): 31 | if r not in self.matrix_user: 32 | return np.zeros((1,self.size[1])) 33 | else: 34 | array = np.zeros((1,self.size[1])) 35 | ind = list(self.matrix_user[r].keys()) 36 | val = list(self.matrix_user[r].values()) 37 | array[0][ind] = val 38 | return array 39 | 40 | def dense_col(self,c): 41 | if c not in self.matrix_item: 42 | return np.zeros((1,self.size[0])) 43 | else: 44 | array = np.zeros((1,self.size[0])) 45 | ind = list(self.matrix_item[c].keys()) 46 | val = list(self.matrix_item[c].values()) 47 | array[0][ind] = val 48 | return array 49 | 50 | def elem(self,r,c): 51 | if not self.contain(r,c): 52 | return 0 53 | return self.matrix_user[r][c] 54 | 55 | def contain(self,r,c): 56 | if r in self.matrix_user and c in self.matrix_user[r]: 57 | return True 58 | return False 59 | 60 | def elem_count(self): 61 | return self.elemNum 62 | 63 | def size(self): 64 | return self.size 65 | 66 | 67 | -------------------------------------------------------------------------------- /data/loader.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from os import remove 3 | from re import split 4 | 5 | 6 | class FileIO(object): 7 | def __init__(self): 8 | pass 9 | 10 | @staticmethod 11 | def write_file(dir, file, content, op='w'): 12 | if not os.path.exists(dir): 13 | os.makedirs(dir) 14 | with open(dir + file, op) as f: 15 | f.writelines(content) 16 | 17 | @staticmethod 18 | def delete_file(file_path): 19 | if os.path.exists(file_path): 20 | remove(file_path) 21 | 22 | @staticmethod 23 | def load_data_set(file, dtype): 24 | data = [] 25 | if dtype == 'graph': 26 | with open(file) as f: 27 | for line in f: 28 | items = split(' ', line.strip()) 29 | user_id = items[0] 30 | item_id = items[1] 31 | weight = items[2] 32 | data.append([user_id, item_id, float(weight)]) 33 | 34 | if dtype == 'sequential': 35 | training_data, test_data = [], [] 36 | with open(file) as f: 37 | for line in f: 38 | items = split(':', line.strip()) 39 | user_id = items[0] 40 | seq = items[1].strip().split() 41 | training_data.append(seq[:-1]) 42 | test_data.append(seq[-1]) 43 | data = (training_data, test_data) 44 | return data 45 | 46 | @staticmethod 47 | def load_user_list(file): 48 | user_list = [] 49 | print('loading user List...') 50 | with open(file) as f: 51 | for line in f: 52 | user_list.append(line.strip().split()[0]) 53 | return user_list 54 | 55 | @staticmethod 56 | def load_social_data(file): 57 | social_data = [] 58 | print('loading social data...') 59 | with open(file) as f: 60 | for line in f: 61 | items = split(' ', line.strip()) 62 | user1 = items[0] 63 | user2 = items[1] 64 | if len(items) < 3: 65 | weight = 1 66 | else: 67 | weight = float(items[2]) 68 | social_data.append([user1, user2, weight]) 69 | return social_data 70 | -------------------------------------------------------------------------------- /util/conf.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | 4 | class ModelConf(object): 5 | def __init__(self,file): 6 | self.config = 
{} 7 | self.read_configuration(file) 8 | 9 | def __getitem__(self, item): 10 | if not self.contain(item): 11 | print('parameter '+item+' is not found in the configuration file!') 12 | exit(-1) 13 | return self.config[item] 14 | 15 | def contain(self,key): 16 | return key in self.config 17 | 18 | def read_configuration(self,file): 19 | if not os.path.exists(file): 20 | print('config file is not found!') 21 | raise IOError 22 | with open(file) as f: 23 | for ind,line in enumerate(f): 24 | if line.strip()!='': 25 | try: 26 | key,value=line.strip().split('=') 27 | self.config[key]=value 28 | except ValueError: 29 | print('config file is not in the correct format! Error Line:%d' % ind) 30 | 31 | 32 | class OptionConf(object): 33 | def __init__(self,content): 34 | self.line = content.strip().split(' ') 35 | self.options = {} 36 | self.mainOption = False 37 | if self.line[0] == 'on': 38 | self.mainOption = True 39 | elif self.line[0] == 'off': 40 | self.mainOption = False 41 | for i,item in enumerate(self.line): 42 | if (item.startswith('-') or item.startswith('--')) and not item[1:].isdigit(): 43 | ind = i+1 44 | for j,sub in enumerate(self.line[ind:]): 45 | if (sub.startswith('-') or sub.startswith('--')) and not sub[1:].isdigit(): 46 | ind = j 47 | break 48 | if j == len(self.line[ind:])-1: 49 | ind=j+1 50 | break 51 | try: 52 | self.options[item] = ' '.join(self.line[i+1:i+1+ind]) 53 | except IndexError: 54 | self.options[item] = 1 55 | 56 | def __getitem__(self, item): 57 | if not self.contain(item): 58 | print('parameter '+item+' is invalid!') 59 | exit(-1) 60 | return self.options[item] 61 | 62 | def keys(self): 63 | return self.options.keys() 64 | 65 | def is_main_on(self): 66 | return self.mainOption 67 | 68 | def contain(self,key): 69 | return key in self.options 70 | 71 | 72 | -------------------------------------------------------------------------------- /util/sampler.py: -------------------------------------------------------------------------------- 1 | from random import shuffle,randint,choice 2 | 3 | 4 | def next_batch_pairwise(data,batch_size): 5 | training_data = data.training_data 6 | shuffle(training_data) 7 | batch_id = 0 8 | data_size = len(training_data) 9 | while batch_id < data_size: 10 | if batch_id + batch_size <= data_size: 11 | users = [training_data[idx][0] for idx in range(batch_id, batch_size + batch_id)] 12 | items = [training_data[idx][1] for idx in range(batch_id, batch_size + batch_id)] 13 | batch_id += batch_size 14 | else: 15 | users = [training_data[idx][0] for idx in range(batch_id, data_size)] 16 | items = [training_data[idx][1] for idx in range(batch_id, data_size)] 17 | batch_id = data_size 18 | u_idx, i_idx, j_idx = [], [], [] 19 | item_list = list(data.item.keys()) 20 | for i, user in enumerate(users): 21 | i_idx.append(data.item[items[i]]) 22 | u_idx.append(data.user[user]) 23 | neg_item = choice(item_list) 24 | while neg_item in data.training_set_u[user]: 25 | neg_item = choice(item_list) 26 | j_idx.append(data.item[neg_item]) 27 | yield u_idx, i_idx, j_idx 28 | 29 | 30 | def next_batch_pointwise(data,batch_size): 31 | training_data = data.training_data 32 | data_size = len(training_data) 33 | batch_id = 0 34 | while batch_id < data_size: 35 | if batch_id + batch_size <= data_size: 36 | users = [training_data[idx][0] for idx in range(batch_id, batch_size + batch_id)] 37 | items = [training_data[idx][1] for idx in range(batch_id, batch_size + batch_id)] 38 | batch_id += batch_size 39 | else: 40 | users = [training_data[idx][0] for idx in 
range(batch_id, data_size)] 41 | items = [training_data[idx][1] for idx in range(batch_id, data_size)] 42 | batch_id = data_size 43 | u_idx, i_idx, y = [], [], [] 44 | for i, user in enumerate(users): 45 | i_idx.append(data.item[items[i]]) 46 | u_idx.append(data.user[user]) 47 | y.append(1) 48 | for instance in range(4): 49 | item_j = randint(0, data.item_num - 1) 50 | while data.id2item[item_j] in data.training_set_u[user]: 51 | item_j = randint(0, data.item_num - 1) 52 | u_idx.append(data.user[user]) 53 | i_idx.append(item_j) 54 | y.append(0) 55 | yield u_idx, i_idx, y -------------------------------------------------------------------------------- /base/recommender.py: -------------------------------------------------------------------------------- 1 | from data.data import Data 2 | from util.conf import OptionConf 3 | from util.logger import Log 4 | from os.path import abspath 5 | from time import strftime, localtime, time 6 | 7 | 8 | class Recommender(object): 9 | def __init__(self, conf, training_set, test_set, **kwargs): 10 | self.config = conf 11 | self.data = Data(self.config, training_set, test_set) 12 | self.model_name = self.config['model.name'] 13 | self.ranking = OptionConf(self.config['item.ranking']) 14 | self.emb_size = int(self.config['embbedding.size']) 15 | self.maxEpoch = int(self.config['num.max.epoch']) 16 | self.maxPreEpoch = int(self.config['num.max.preepoch']) 17 | self.batch_size = int(self.config['batch_size']) 18 | self.lRate = float(self.config['learnRate']) 19 | self.reg = float(self.config['reg.lambda']) 20 | self.output = OptionConf(self.config['output.setup']) 21 | current_time = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) 22 | self.model_log = Log(self.model_name, self.model_name + ' ' + current_time) 23 | self.result = [] 24 | self.recOutput = [] 25 | 26 | def initializing_log(self): 27 | self.model_log.add('### model configuration ###') 28 | for k in self.config.config: 29 | self.model_log.add(k + '=' + self.config[k]) 30 | 31 | def print_model_info(self): 32 | print('Model:', self.config['model.name']) 33 | print('Training Set:', abspath(self.config['training.set'])) 34 | print('Test Set:', abspath(self.config['test.set'])) 35 | print('Embedding Dimension:', self.emb_size) 36 | print('Maximum Epoch:', self.maxEpoch) 37 | print('Learning Rate:', self.lRate) 38 | print('Batch Size:', self.batch_size) 39 | print('Regularization Parameter: reg %.4f' % self.reg) 40 | parStr = '' 41 | if self.config.contain(self.config['model.name']): 42 | args = OptionConf(self.config[self.config['model.name']]) 43 | for key in args.keys(): 44 | parStr += key[1:] + ':' + args[key] + ' ' 45 | print('Specific parameters:', parStr) 46 | 47 | def build(self): 48 | pass 49 | 50 | def train(self): 51 | pass 52 | 53 | def predict(self, u): 54 | pass 55 | 56 | def test(self): 57 | pass 58 | 59 | def save(self): 60 | pass 61 | 62 | def load(self): 63 | pass 64 | 65 | def evaluate(self, rec_list): 66 | pass 67 | 68 | def execute(self): 69 | self.initializing_log() 70 | self.print_model_info() 71 | print('Initializing and building model...') 72 | self.build() 73 | print('Training Model...') 74 | self.train() 75 | print('Testing...') 76 | rec_list = self.test() 77 | print('Evaluating...') 78 | self.evaluate(rec_list) 79 | -------------------------------------------------------------------------------- /data/social.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from data.graph import Graph 3 | import 
numpy as np 4 | import scipy.sparse as sp 5 | 6 | 7 | class Relation(Graph): 8 | def __init__(self, conf, relation, user): 9 | super().__init__() 10 | self.config = conf 11 | self.social_user = {} 12 | self.relation = relation 13 | self.followees = defaultdict(dict) 14 | self.followers = defaultdict(dict) 15 | self.user = user 16 | self.__initialize() 17 | 18 | def __initialize(self): 19 | idx = [] 20 | for n, pair in enumerate(self.relation): 21 | if pair[0] not in self.user or pair[1] not in self.user: 22 | idx.append(n) 23 | for item in reversed(idx): 24 | del self.relation[item] 25 | for line in self.relation: 26 | user1, user2, weight = line 27 | # add relations to dict 28 | self.followees[user1][user2] = weight 29 | self.followers[user2][user1] = weight 30 | 31 | def get_social_mat(self): 32 | row, col, entries = [], [], [] 33 | for pair in self.relation: 34 | row += [self.user[pair[0]]] 35 | col += [self.user[pair[1]]] 36 | entries += [1.0] 37 | social_mat = sp.csr_matrix((entries, (row, col)), shape=(len(self.user), len(self.user)), dtype=np.float32) 38 | return social_mat 39 | 40 | def get_birectional_social_mat(self): 41 | social_mat = self.get_social_mat() 42 | bi_social_mat = social_mat.multiply(social_mat) 43 | return bi_social_mat 44 | 45 | def convert_to_laplacian_mat(self, adj_mat): 46 | adj_shape = adj_mat.get_shape() 47 | (row_np_keep, col_np_keep) = adj_mat.nonzero() 48 | ratings_keep = adj_mat.data 49 | tmp_adj = sp.csr_matrix((ratings_keep, (row_np_keep, col_np_keep)), shape=adj_shape, dtype=np.float32) 50 | return self.normalize_graph_mat(tmp_adj) 51 | 52 | def weight(self, u1, u2): 53 | if u1 in self.followees and u2 in self.followees[u1]: 54 | return self.followees[u1][u2] 55 | else: 56 | return 0 57 | 58 | def get_followers(self, u): 59 | if u in self.followers: 60 | return self.followers[u] 61 | else: 62 | return {} 63 | 64 | def get_followees(self, u): 65 | if u in self.followees: 66 | return self.followees[u] 67 | else: 68 | return {} 69 | 70 | def has_followee(self, u1, u2): 71 | if u1 in self.followees: 72 | if u2 in self.followees[u1]: 73 | return True 74 | else: 75 | return False 76 | return False 77 | 78 | def has_follower(self, u1, u2): 79 | if u1 in self.followers: 80 | if u2 in self.followers[u1]: 81 | return True 82 | else: 83 | return False 84 | return False 85 | 86 | def size(self): 87 | return len(self.followers), len(self.relation) 88 | -------------------------------------------------------------------------------- /util/algorithm.py: -------------------------------------------------------------------------------- 1 | from numpy.linalg import norm 2 | from math import sqrt, exp 3 | from numba import jit 4 | 5 | 6 | def l1(x): 7 | return norm(x, ord=1) 8 | 9 | 10 | def l2(x): 11 | return norm(x) 12 | 13 | 14 | def common(x1, x2): 15 | # find common ratings 16 | overlap = (x1 != 0) & (x2 != 0) 17 | new_x1 = x1[overlap] 18 | new_x2 = x2[overlap] 19 | return new_x1, new_x2 20 | 21 | 22 | def cosine_sp(x1, x2): 23 | 'x1,x2 are dicts,this version is for sparse representation' 24 | total = 0 25 | denom1 = 0 26 | denom2 = 0 27 | try: 28 | for k in x1: 29 | if k in x2: 30 | total += x1[k] * x2[k] 31 | denom1 += x1[k] ** 2 32 | denom2 += x2[k] ** 2 33 | return total / (sqrt(denom1) * sqrt(denom2)) 34 | except ZeroDivisionError: 35 | return 0 36 | 37 | 38 | def euclidean_sp(x1, x2): 39 | 'x1,x2 are dicts,this version is for sparse representation' 40 | total = 0 41 | try: 42 | for k in x1: 43 | if k in x2: 44 | total += x1[k] ** 2 - x2[k] ** 2 45 | return 1 / 
total 46 | except ZeroDivisionError: 47 | return 0 48 | 49 | 50 | def cosine(x1, x2): 51 | # find common ratings 52 | # new_x1, new_x2 = common(x1,x2) 53 | # compute the cosine similarity between two vectors 54 | total = x1.dot(x2) 55 | denom = sqrt(x1.dot(x1) * x2.dot(x2)) 56 | try: 57 | return total / denom 58 | except ZeroDivisionError: 59 | return 0 60 | 61 | # return cosine_similarity(x1,x2)[0][0] 62 | 63 | 64 | def pearson_sp(x1, x2): 65 | total = 0 66 | denom1 = 0 67 | denom2 = 0 68 | overlapped = False 69 | try: 70 | mean1 = sum(x1.values()) / len(x1) 71 | mean2 = sum(x2.values()) / len(x2) 72 | for k in x1: 73 | if k in x2: 74 | total += (x1[k] - mean1) * (x2[k] - mean2) 75 | denom1 += (x1[k] - mean1) ** 2 76 | denom2 += (x2[k] - mean2) ** 2 77 | overlapped = True 78 | return total / (sqrt(denom1) * sqrt(denom2)) 79 | except ZeroDivisionError: 80 | if overlapped: 81 | return 1 82 | return 0 83 | 84 | 85 | def euclidean(x1, x2): 86 | # find common ratings 87 | new_x1, new_x2 = common(x1, x2) 88 | # compute the euclidean between two vectors 89 | diff = new_x1 - new_x2 90 | denom = sqrt((diff.dot(diff))) 91 | try: 92 | return 1 / denom 93 | except ZeroDivisionError: 94 | return 0 95 | 96 | 97 | def pearson(x1, x2): 98 | # find common ratings 99 | # new_x1, new_x2 = common(x1, x2) 100 | # compute the pearson similarity between two vectors 101 | # ind1 = new_x1 > 0 102 | # ind2 = new_x2 > 0 103 | try: 104 | mean_x1 = x1.sum() / len(x1) 105 | mean_x2 = x2.sum() / len(x2) 106 | new_x1 = x1 - mean_x1 107 | new_x2 = x2 - mean_x2 108 | total = new_x1.dot(new_x2) 109 | denom = sqrt((new_x1.dot(new_x1)) * (new_x2.dot(new_x2))) 110 | return total / denom 111 | except ZeroDivisionError: 112 | return 0 113 | 114 | 115 | def similarity(x1, x2, sim): 116 | if sim == 'pcc': 117 | return pearson_sp(x1, x2) 118 | if sim == 'euclidean': 119 | return euclidean_sp(x1, x2) 120 | else: 121 | return cosine_sp(x1, x2) 122 | 123 | 124 | def normalize(vec, maxVal, minVal): 125 | 'get the normalized value using min-max normalization' 126 | if maxVal > minVal: 127 | return (vec - minVal) / (maxVal - minVal) 128 | elif maxVal == minVal: 129 | return vec / maxVal 130 | else: 131 | print('error... 
maximum value is less than minimum value.') 132 | raise ArithmeticError 133 | 134 | 135 | def sigmoid(val): 136 | return 1 / (1 + exp(-val)) 137 | 138 | 139 | def denormalize(vec, max_val, min_val): 140 | return min_val + (vec - 0.01) * (max_val - min_val) 141 | 142 | 143 | @jit(nopython=True) 144 | def find_k_largest(K, candidates): 145 | n_candidates = [] 146 | for iid, score in enumerate(candidates[:K]): 147 | n_candidates.append((iid, score)) 148 | n_candidates.sort(key=lambda d: d[1], reverse=True) 149 | k_largest_scores = [item[1] for item in n_candidates] 150 | ids = [item[0] for item in n_candidates] 151 | # find the K biggest scores 152 | for iid, score in enumerate(candidates): 153 | ind = K 154 | l = 0 155 | r = K - 1 156 | if k_largest_scores[r] < score: 157 | while r >= l: 158 | mid = int((r - l) / 2) + l 159 | if k_largest_scores[mid] >= score: 160 | l = mid + 1 161 | elif k_largest_scores[mid] < score: 162 | r = mid - 1 163 | if r < l: 164 | ind = r 165 | break 166 | # move the items backwards 167 | if ind < K - 2: 168 | k_largest_scores[ind + 2:] = k_largest_scores[ind + 1:-1] 169 | ids[ind + 2:] = ids[ind + 1:-1] 170 | if ind < K - 1: 171 | k_largest_scores[ind + 1] = score 172 | ids[ind + 1] = iid 173 | return ids, k_largest_scores 174 | -------------------------------------------------------------------------------- /util/evaluation.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | class Metric(object): 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def hits(origin, res): 10 | hit_count = {} 11 | for user in origin: 12 | items = list(origin[user].keys()) 13 | predicted = [item[0] for item in res[user]] 14 | hit_count[user] = len(set(items).intersection(set(predicted))) 15 | return hit_count 16 | 17 | @staticmethod 18 | def hit_ratio(origin, hits): 19 | """ 20 | Note: This type of hit ratio calculates the fraction: 21 | (# retrieved interactions in the test set / #all the interactions in the test set) 22 | """ 23 | total_num = 0 24 | for user in origin: 25 | items = list(origin[user].keys()) 26 | total_num += len(items) 27 | hit_num = 0 28 | for user in hits: 29 | hit_num += hits[user] 30 | return hit_num/total_num 31 | 32 | # # @staticmethod 33 | # def hit_ratio(origin, hits): 34 | # """ 35 | # Note: This type of hit ratio calculates the fraction: 36 | # (# users who are recommended items in the test set / #all the users in the test set) 37 | # """ 38 | # hit_num = 0 39 | # for user in hits: 40 | # if hits[user] > 0: 41 | # hit_num += 1 42 | # return hit_num / len(origin) 43 | 44 | @staticmethod 45 | def precision(hits, N): 46 | prec = sum([hits[user] for user in hits]) 47 | return prec / (len(hits) * N) 48 | 49 | @staticmethod 50 | def recall(hits, origin): 51 | recall_list = [hits[user]/len(origin[user]) for user in hits] 52 | recall = sum(recall_list) / len(recall_list) 53 | return recall 54 | 55 | @staticmethod 56 | def F1(prec, recall): 57 | if (prec + recall) != 0: 58 | return 2 * prec * recall / (prec + recall) 59 | else: 60 | return 0 61 | 62 | @staticmethod 63 | def MAE(res): 64 | error = 0 65 | count = 0 66 | for entry in res: 67 | error+=abs(entry[2]-entry[3]) 68 | count+=1 69 | if count==0: 70 | return error 71 | return error/count 72 | 73 | @staticmethod 74 | def RMSE(res): 75 | error = 0 76 | count = 0 77 | for entry in res: 78 | error += (entry[2] - entry[3])**2 79 | count += 1 80 | if count==0: 81 | return error 82 | return math.sqrt(error/count) 83 | 84 | @staticmethod 85 | def 
NDCG(origin,res,N): 86 | sum_NDCG = 0 87 | for user in res: 88 | DCG = 0 89 | IDCG = 0 90 | #1 = related, 0 = unrelated 91 | for n, item in enumerate(res[user]): 92 | if item[0] in origin[user]: 93 | DCG+= 1.0/math.log(n+2) 94 | for n, item in enumerate(list(origin[user].keys())[:N]): 95 | IDCG+=1.0/math.log(n+2) 96 | sum_NDCG += DCG / IDCG 97 | return sum_NDCG / len(res) 98 | 99 | # @staticmethod 100 | # def MAP(origin, res, N): 101 | # sum_prec = 0 102 | # for user in res: 103 | # hits = 0 104 | # precision = 0 105 | # for n, item in enumerate(res[user]): 106 | # if item[0] in origin[user]: 107 | # hits += 1 108 | # precision += hits / (n + 1.0) 109 | # sum_prec += precision / min(len(origin[user]), N) 110 | # return sum_prec / len(res) 111 | 112 | # @staticmethod 113 | # def AUC(origin, res, rawRes): 114 | # 115 | # from random import choice 116 | # sum_AUC = 0 117 | # for user in origin: 118 | # count = 0 119 | # larger = 0 120 | # itemList = rawRes[user].keys() 121 | # for item in origin[user]: 122 | # item2 = choice(itemList) 123 | # count += 1 124 | # try: 125 | # if rawRes[user][item] > rawRes[user][item2]: 126 | # larger += 1 127 | # except KeyError: 128 | # count -= 1 129 | # if count: 130 | # sum_AUC += float(larger) / count 131 | # 132 | # return float(sum_AUC) / len(origin) 133 | 134 | 135 | def ranking_evaluation(origin, res, N): 136 | measure = [] 137 | for n in N: 138 | predicted = {} 139 | for user in res: 140 | predicted[user] = res[user][:n] 141 | indicators = [] 142 | if len(origin) != len(predicted): 143 | print('The Lengths of test set and predicted set do not match!') 144 | exit(-1) 145 | hits = Metric.hits(origin, predicted) 146 | hr = Metric.hit_ratio(origin, hits) 147 | indicators.append('Hit Ratio:' + str(hr) + '\n') 148 | prec = Metric.precision(hits, n) 149 | indicators.append('Precision:' + str(prec) + '\n') 150 | recall = Metric.recall(hits, origin) 151 | indicators.append('Recall:' + str(recall) + '\n') 152 | # F1 = Metric.F1(prec, recall) 153 | # indicators.append('F1:' + str(F1) + '\n') 154 | #MAP = Measure.MAP(origin, predicted, n) 155 | #indicators.append('MAP:' + str(MAP) + '\n') 156 | NDCG = Metric.NDCG(origin, predicted, n) 157 | indicators.append('NDCG:' + str(NDCG) + '\n') 158 | # AUC = Measure.AUC(origin,res,rawRes) 159 | # measure.append('AUC:' + str(AUC) + '\n') 160 | measure.append('Top ' + str(n) + '\n') 161 | measure += indicators 162 | return measure 163 | 164 | def rating_evaluation(res): 165 | measure = [] 166 | mae = Metric.MAE(res) 167 | measure.append('MAE:' + str(mae) + '\n') 168 | rmse = Metric.RMSE(res) 169 | measure.append('RMSE:' + str(rmse) + '\n') 170 | return measure -------------------------------------------------------------------------------- /base/graph_recommender.py: -------------------------------------------------------------------------------- 1 | from base.recommender import Recommender 2 | from data.ui_graph import Interaction 3 | from util.algorithm import find_k_largest 4 | from time import strftime, localtime, time 5 | from data.loader import FileIO 6 | from os.path import abspath 7 | from util.evaluation import ranking_evaluation 8 | import sys 9 | 10 | 11 | class GraphRecommender(Recommender): 12 | def __init__(self, conf, training_set, test_set, **kwargs): 13 | super(GraphRecommender, self).__init__(conf, training_set, test_set, **kwargs) 14 | self.data = Interaction(conf, training_set, test_set) 15 | self.bestPerformance = [] 16 | top = self.ranking['-topN'].split(',') 17 | self.topN = [int(num) for num in top] 
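        # item.ranking is parsed by OptionConf, so "-topN 5,20" becomes topN = [5, 20];
        # max_N below is the largest cut-off, i.e. the list length test() asks find_k_largest for.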
18 | self.max_N = max(self.topN) 19 | 20 | def print_model_info(self): 21 | super(GraphRecommender, self).print_model_info() 22 | # # print dataset statistics 23 | print('Training Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.training_size())) 24 | print('Test Set Size: (user number: %d, item number %d, interaction number: %d)' % (self.data.test_size())) 25 | print('=' * 80) 26 | 27 | def build(self): 28 | pass 29 | 30 | def train(self): 31 | pass 32 | 33 | def predict(self, u): 34 | pass 35 | 36 | def test(self): 37 | def process_bar(num, total): 38 | rate = float(num) / total 39 | ratenum = int(50 * rate) 40 | r = '\rProgress: [{}{}]{}%'.format('+' * ratenum, ' ' * (50 - ratenum), ratenum*2) 41 | sys.stdout.write(r) 42 | sys.stdout.flush() 43 | 44 | # predict 45 | rec_list = {} 46 | user_count = len(self.data.test_set) 47 | for i, user in enumerate(self.data.test_set): 48 | candidates = self.predict(user) 49 | # predictedItems = denormalize(predictedItems, self.data.rScale[-1], self.data.rScale[0]) 50 | rated_list, li = self.data.user_rated(user) 51 | for item in rated_list: 52 | candidates[self.data.item[item]] = -10e8 53 | ids, scores = find_k_largest(self.max_N, candidates) 54 | item_names = [self.data.id2item[iid] for iid in ids] 55 | rec_list[user] = list(zip(item_names, scores)) 56 | if i % 1000 == 0: 57 | process_bar(i, user_count) 58 | process_bar(user_count, user_count) 59 | print('') 60 | return rec_list 61 | 62 | def evaluate(self, rec_list): 63 | self.recOutput.append('userId: recommendations in (itemId, ranking score) pairs, * means the item is hit.\n') 64 | for user in self.data.test_set: 65 | line = user + ':' 66 | for item in rec_list[user]: 67 | line += ' (' + item[0] + ',' + str(item[1]) + ')' 68 | if item[0] in self.data.test_set[user]: 69 | line += '*' 70 | line += '\n' 71 | self.recOutput.append(line) 72 | current_time = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) 73 | # output prediction result 74 | out_dir = self.output['-dir'] 75 | file_name = self.config['model.name'] + '@' + current_time + '-top-' + str(self.max_N) + 'items' + '.txt' 76 | FileIO.write_file(out_dir, file_name, self.recOutput) 77 | print('The result has been output to ', abspath(out_dir), '.') 78 | file_name = self.config['model.name'] + '@' + current_time + '-performance' + '.txt' 79 | self.result = ranking_evaluation(self.data.test_set, rec_list, self.topN) 80 | self.model_log.add('###Evaluation Results###') 81 | self.model_log.add(self.result) 82 | FileIO.write_file(out_dir, file_name, self.result) 83 | print('The result of %s:\n%s' % (self.model_name, ''.join(self.result))) 84 | 85 | def fast_evaluation(self, epoch): 86 | print('evaluating the model...') 87 | rec_list = self.test() 88 | measure = ranking_evaluation(self.data.test_set, rec_list, [self.max_N]) 89 | if len(self.bestPerformance) > 0: 90 | count = 0 91 | performance = {} 92 | for m in measure[1:]: 93 | k, v = m.strip().split(':') 94 | performance[k] = float(v) 95 | for k in self.bestPerformance[1]: 96 | if self.bestPerformance[1][k] > performance[k]: 97 | count += 1 98 | else: 99 | count -= 1 100 | if count < 0: 101 | self.bestPerformance[1] = performance 102 | self.bestPerformance[0] = epoch + 1 103 | self.save() 104 | else: 105 | self.bestPerformance.append(epoch + 1) 106 | performance = {} 107 | for m in measure[1:]: 108 | k, v = m.strip().split(':') 109 | performance[k] = float(v) 110 | self.bestPerformance.append(performance) 111 | self.save() 112 | print('-' * 120) 113 | print('Quick 
Ranking Performance ' + ' (Top-' + str(self.max_N) + ' Item Recommendation)') 114 | measure = [m.strip() for m in measure[1:]] 115 | print('*Current Performance*') 116 | print('Epoch:', str(epoch + 1) + ',', ' | '.join(measure)) 117 | bp = '' 118 | # for k in self.bestPerformance[1]: 119 | # bp+=k+':'+str(self.bestPerformance[1][k])+' | ' 120 | bp += 'Hit Ratio' + ':' + str(self.bestPerformance[1]['Hit Ratio']) + ' | ' 121 | bp += 'Precision' + ':' + str(self.bestPerformance[1]['Precision']) + ' | ' 122 | bp += 'Recall' + ':' + str(self.bestPerformance[1]['Recall']) + ' | ' 123 | # bp += 'F1' + ':' + str(self.bestPerformance[1]['F1']) + ' | ' 124 | bp += 'NDCG' + ':' + str(self.bestPerformance[1]['NDCG']) 125 | print('*Best Performance* ') 126 | print('Epoch:', str(self.bestPerformance[0]) + ',', bp) 127 | print('-' * 120) 128 | return measure 129 | -------------------------------------------------------------------------------- /data/ui_graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from data.data import Data 4 | from data.graph import Graph 5 | import scipy.sparse as sp 6 | import pickle 7 | 8 | class Interaction(Data,Graph): 9 | def __init__(self, conf, training, test): 10 | Graph.__init__(self) 11 | Data.__init__(self,conf,training,test) 12 | 13 | self.user = {} 14 | self.item = {} 15 | self.id2user = {} 16 | self.id2item = {} 17 | self.training_set_u = defaultdict(dict) 18 | self.training_set_i = defaultdict(dict) 19 | self.test_set = defaultdict(dict) 20 | self.test_set_item = set() 21 | self.__generate_set() 22 | self.user_num = len(self.training_set_u) 23 | self.item_num = len(self.training_set_i) 24 | self.ui_adj = self.__create_sparse_bipartite_adjacency() 25 | self.norm_adj = self.normalize_graph_mat(self.ui_adj) 26 | self.interaction_mat = self.__create_sparse_interaction_matrix() 27 | # popularity_user = {} 28 | # for u in self.user: 29 | # popularity_user[self.user[u]] = len(self.training_set_u[u]) 30 | # popularity_item = {} 31 | # for u in self.item: 32 | # popularity_item[self.item[u]] = len(self.training_set_i[u]) 33 | 34 | 35 | def __generate_set(self): 36 | for entry in self.training_data: 37 | user, item, rating = entry 38 | if user not in self.user: 39 | self.user[user] = len(self.user) 40 | self.id2user[self.user[user]] = user 41 | if item not in self.item: 42 | self.item[item] = len(self.item) 43 | self.id2item[self.item[item]] = item 44 | # userList.append 45 | self.training_set_u[user][item] = rating 46 | self.training_set_i[item][user] = rating 47 | for entry in self.test_data: 48 | user, item, rating = entry 49 | if user not in self.user: 50 | continue 51 | self.test_set[user][item] = rating 52 | self.test_set_item.add(item) 53 | 54 | def __create_sparse_bipartite_adjacency(self, self_connection=False): 55 | ''' 56 | return a sparse adjacency matrix with the shape (user number + item number, user number + item number) 57 | ''' 58 | n_nodes = self.user_num + self.item_num 59 | row_idx = [self.user[pair[0]] for pair in self.training_data] 60 | col_idx = [self.item[pair[1]] for pair in self.training_data] 61 | user_np = np.array(row_idx) 62 | item_np = np.array(col_idx) 63 | ratings = np.ones_like(user_np, dtype=np.float32) 64 | tmp_adj = sp.csr_matrix((ratings, (user_np, item_np + self.user_num)), shape=(n_nodes, n_nodes),dtype=np.float32) 65 | adj_mat = tmp_adj + tmp_adj.T 66 | if self_connection: 67 | adj_mat += sp.eye(n_nodes) 68 | return adj_mat 69 | 
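    # convert_to_laplacian_mat below lifts a (user x item) matrix into the full
    # (user+item) x (user+item) bipartite adjacency [[0, R], [R^T, 0]] and then applies
    # the symmetric D^{-1/2} A D^{-1/2} normalization from Graph.normalize_graph_mat;
    # SGL_Encoder.random_graph_augment uses it to turn a dropped interaction matrix
    # back into a propagation matrix.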
70 | def convert_to_laplacian_mat(self, adj_mat): 71 | adj_shape = adj_mat.get_shape() 72 | n_nodes = adj_shape[0]+adj_shape[1] 73 | (user_np_keep, item_np_keep) = adj_mat.nonzero() 74 | ratings_keep = adj_mat.data 75 | tmp_adj = sp.csr_matrix((ratings_keep, (user_np_keep, item_np_keep + adj_shape[0])),shape=(n_nodes, n_nodes),dtype=np.float32) 76 | tmp_adj = tmp_adj + tmp_adj.T 77 | return self.normalize_graph_mat(tmp_adj) 78 | 79 | def __create_sparse_interaction_matrix(self): 80 | """ 81 | return a sparse adjacency matrix with the shape (user number, item number) 82 | """ 83 | row, col, entries = [], [], [] 84 | for pair in self.training_data: 85 | row += [self.user[pair[0]]] 86 | col += [self.item[pair[1]]] 87 | entries += [1.0] 88 | interaction_mat = sp.csr_matrix((entries, (row, col)), shape=(self.user_num,self.item_num),dtype=np.float32) 89 | return interaction_mat 90 | 91 | def get_user_id(self, u): 92 | if u in self.user: 93 | return self.user[u] 94 | 95 | def get_item_id(self, i): 96 | if i in self.item: 97 | return self.item[i] 98 | 99 | def training_size(self): 100 | return len(self.user), len(self.item), len(self.training_data) 101 | 102 | def test_size(self): 103 | return len(self.test_set), len(self.test_set_item), len(self.test_data) 104 | 105 | def contain(self, u, i): 106 | 'whether user u rated item i' 107 | if u in self.user and i in self.training_set_u[u]: 108 | return True 109 | else: 110 | return False 111 | 112 | def contain_user(self, u): 113 | 'whether user is in training set' 114 | if u in self.user: 115 | return True 116 | else: 117 | return False 118 | 119 | def contain_item(self, i): 120 | """whether item is in training set""" 121 | if i in self.item: 122 | return True 123 | else: 124 | return False 125 | 126 | def user_rated(self, u): 127 | return list(self.training_set_u[u].keys()), list(self.training_set_u[u].values()) 128 | 129 | def item_rated(self, i): 130 | return list(self.training_set_i[i].keys()), list(self.training_set_i[i].values()) 131 | 132 | def row(self, u): 133 | u = self.id2user[u] 134 | k, v = self.user_rated(u) 135 | vec = np.zeros(len(self.item)) 136 | # print vec 137 | for pair in zip(k, v): 138 | iid = self.item[pair[0]] 139 | vec[iid] = pair[1] 140 | return vec 141 | 142 | def col(self, i): 143 | i = self.id2item[i] 144 | k, v = self.item_rated(i) 145 | vec = np.zeros(len(self.user)) 146 | # print vec 147 | for pair in zip(k, v): 148 | uid = self.user[pair[0]] 149 | vec[uid] = pair[1] 150 | return vec 151 | 152 | def matrix(self): 153 | m = np.zeros((len(self.user), len(self.item))) 154 | for u in self.user: 155 | k, v = self.user_rated(u) 156 | vec = np.zeros(len(self.item)) 157 | # print vec 158 | for pair in zip(k, v): 159 | iid = self.item[pair[0]] 160 | vec[iid] = pair[1] 161 | m[self.user[u]] = vec 162 | return m 163 | -------------------------------------------------------------------------------- /model/graph/CPTPP.py: -------------------------------------------------------------------------------- 1 | ########################## 2 | # This code take SGL, implemented by Coder-Yu on Github, as the backbone. 
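# Four components are defined below: CPTPP (the recommender and its training loop),
# Prompts_Generator and Fusion_MLP (the prompt-generation and fusion MLPs), and
# SGL_Encoder (the graph encoder pre-trained with graph augmentation + InfoNCE).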
3 | ########################## 4 | 5 | 6 | from turtle import forward 7 | import torch 8 | torch.manual_seed(12345) 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from base.graph_recommender import GraphRecommender 12 | from util.conf import OptionConf 13 | from util.sampler import next_batch_pairwise 14 | from base.torch_interface import TorchGraphInterface 15 | from util.loss_torch import bpr_loss, l2_reg_loss, InfoNCE 16 | from data.augmentor import GraphAugmentor 17 | from sklearn.decomposition import NMF 18 | import numpy as np 19 | 20 | # Paper: self-supervised graph learning for recommendation. SIGIR'21 21 | 22 | 23 | class CPTPP(GraphRecommender): 24 | def __init__(self, conf, training_set, test_set): 25 | super(CPTPP, self).__init__(conf, training_set, test_set) 26 | 27 | args = OptionConf(self.config['CPTPP']) 28 | self.cl_rate = float(args['-lambda']) 29 | aug_type = self.aug_type = int(args['-augtype']) 30 | drop_rate = float(args['-droprate']) 31 | self.n_layers = int(args['-n_layer']) 32 | temp = float(args['-temp']) 33 | self.inputs_type = int(args['-inputs_type']) 34 | prompt_size = int(args['-prompt_size']) 35 | 36 | self.model = SGL_Encoder(self.data, self.emb_size, drop_rate, self.n_layers, temp, aug_type) 37 | self.prompts_generator = Prompts_Generator(self.emb_size, prompt_size).cuda() 38 | self.fusion_mlp = Fusion_MLP(self.emb_size, prompt_size).cuda() 39 | 40 | if self.inputs_type == 0: 41 | self.interaction_mat = TorchGraphInterface.convert_sparse_mat_to_tensor(self.data.interaction_mat).cuda() 42 | if self.inputs_type == 2: 43 | # small dataset 44 | # self.adj_sparse = TorchGraphInterface.convert_sparse_mat_to_tensor(self.data.ui_adj) 45 | # self.ui_high_order = torch.sparse.mm(self.adj_sparse, self.adj_sparse.to_dense()).cuda() 46 | 47 | # big dataset Ciao 48 | self.sparse_norm_adj = TorchGraphInterface.convert_sparse_mat_to_tensor(self.data.norm_adj).cuda() 49 | 50 | def _pre_train(self): 51 | pre_trained_model = self.model.cuda() 52 | optimizer = torch.optim.Adam(pre_trained_model.parameters(), lr=self.lRate) 53 | 54 | print('############## Pre-Training Phase ##############') 55 | for epoch in range(self.maxPreEpoch): 56 | dropped_adj1 = pre_trained_model.graph_reconstruction() 57 | dropped_adj2 = pre_trained_model.graph_reconstruction() 58 | 59 | for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)): 60 | user_idx, pos_idx, neg_idx = batch 61 | cl_loss = pre_trained_model.cal_cl_loss([user_idx,pos_idx],dropped_adj1,dropped_adj2) 62 | batch_loss = cl_loss 63 | # Backward and optimize 64 | optimizer.zero_grad() 65 | if epoch == self.maxEpoch-1: 66 | batch_loss.backward(retain_graph=True) 67 | else: 68 | batch_loss.backward() 69 | optimizer.step() 70 | if n % 100==0: 71 | print('pre-training:', epoch + 1, 'batch', n, 'cl_loss', cl_loss.item()) 72 | 73 | def _csr_to_pytorch_dense(self, csr): 74 | array = csr.toarray() 75 | dense = torch.Tensor(array) 76 | return dense.cuda() 77 | 78 | def _prompts_generation(self, item_emb, user_emb): 79 | if self.inputs_type == 0: 80 | inputs = self._historical_records(item_emb) 81 | # elif self.inputs_type == 1: 82 | # inputs = self._adjacency_matrix_factorization() 83 | elif self.inputs_type == 2: 84 | inputs = self._high_order_u_relations(item_emb, user_emb) 85 | prompts = self.prompts_generator(inputs) 86 | return prompts 87 | 88 | def _historical_records(self, item_emb): 89 | inputs = torch.mm(self.interaction_mat, item_emb) 90 | return inputs 91 | 92 | # def 
_adjacency_matrix_factorization(self): 93 | # adjacency_matrix = self.data.interaction_mat 94 | # adjacency_matrix = adjacency_matrix.toarray() 95 | 96 | # print('######### Adjacency Matrix Factorization #############') 97 | # nmf = NMF(n_components=self.emb_size) 98 | # user_profiles = nmf.fit_transform(adjacency_matrix) 99 | # inputs = torch.Tensor(user_profiles).cuda() 100 | # return inputs 101 | 102 | def _high_order_u_relations(self, item_emb, user_emb): 103 | # small dataset 104 | # emb = torch.cat((user_emb, item_emb), 0) 105 | # inputs = torch.sparse.mm(self.ui_high_order, emb) 106 | # inputs = inputs[:self.data.user_num, :] 107 | # return inputs 108 | 109 | # big dataset Ciao 110 | ego_embeddings = torch.cat((user_emb, item_emb), 0) 111 | all_embeddings = [ego_embeddings] 112 | for k in range(self.n_layers): 113 | ego_embeddings = torch.sparse.mm(self.sparse_norm_adj, ego_embeddings) 114 | all_embeddings.append(ego_embeddings) 115 | all_embeddings = torch.stack(all_embeddings, dim=1) 116 | all_embeddings = torch.mean(all_embeddings, dim=1) 117 | inputs, item_all_embeddings = torch.split(all_embeddings, [self.data.user_num, self.data.item_num]) 118 | return inputs 119 | 120 | def _prompts_u_embeddings_fusion(self, prompts, user_emb): 121 | prompts_user_emb = torch.cat((prompts, user_emb), 1) 122 | prompted_user_emb = self.fusion_mlp(prompts_user_emb) 123 | return prompted_user_emb 124 | 125 | def train(self): 126 | self._pre_train() 127 | 128 | model = self.model.cuda() 129 | optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate) 130 | 131 | if self.inputs_type == 1: 132 | nmf = NMF(n_components=self.emb_size, max_iter=1000) 133 | self.user_profiles = torch.Tensor(nmf.fit_transform(self.data.interaction_mat.toarray())).cuda() 134 | 135 | print('############## Downstream Training Phase ##############') 136 | for epoch in range(self.maxEpoch): 137 | # dropped_adj1 = model.graph_reconstruction() 138 | # dropped_adj2 = model.graph_reconstruction() 139 | for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)): 140 | user_emb, item_emb = model() 141 | if self.inputs_type == 0 or self.inputs_type == 2: 142 | prompts = self._prompts_generation(item_emb, user_emb) 143 | else: 144 | prompts = self.prompts_generator(self.user_profiles) 145 | prompted_user_emb = self._prompts_u_embeddings_fusion(prompts, user_emb) 146 | 147 | user_idx, pos_idx, neg_idx = batch 148 | # rec_user_emb, rec_item_emb = model() 149 | rec_user_emb, rec_item_emb = prompted_user_emb, item_emb 150 | user_emb, pos_item_emb, neg_item_emb = rec_user_emb[user_idx], rec_item_emb[pos_idx], rec_item_emb[neg_idx] 151 | rec_loss = bpr_loss(user_emb, pos_item_emb, neg_item_emb) 152 | # cl_loss = self.cl_rate * model.cal_cl_loss([user_idx,pos_idx],dropped_adj1,dropped_adj2) 153 | batch_loss = rec_loss + l2_reg_loss(self.reg, user_emb, pos_item_emb) #+ cl_loss 154 | # Backward and optimize 155 | 156 | batch_loss.backward() 157 | optimizer.step() 158 | optimizer.zero_grad() 159 | 160 | if n % 100==0: 161 | print('training:', epoch + 1, 'batch', n, 'rec_loss:', rec_loss.item())#, 'cl_loss', cl_loss.item()) 162 | with torch.no_grad(): 163 | user_emb, self.item_emb = self.model() 164 | if self.inputs_type == 0 or self.inputs_type == 2: 165 | prompts = self._prompts_generation(self.item_emb, user_emb) 166 | else: 167 | prompts = self.prompts_generator(self.user_profiles) 168 | prompted_user_emb = self._prompts_u_embeddings_fusion(prompts, user_emb) 169 | self.user_emb = prompted_user_emb 170 | if epoch>=5: 171 
    def train(self):
        self._pre_train()

        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)

        if self.inputs_type == 1:
            nmf = NMF(n_components=self.emb_size, max_iter=1000)
            self.user_profiles = torch.Tensor(nmf.fit_transform(self.data.interaction_mat.toarray())).cuda()

        print('############## Downstream Training Phase ##############')
        for epoch in range(self.maxEpoch):
            # dropped_adj1 = model.graph_reconstruction()
            # dropped_adj2 = model.graph_reconstruction()
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                user_emb, item_emb = model()
                if self.inputs_type == 0 or self.inputs_type == 2:
                    prompts = self._prompts_generation(item_emb, user_emb)
                else:
                    prompts = self.prompts_generator(self.user_profiles)
                prompted_user_emb = self._prompts_u_embeddings_fusion(prompts, user_emb)

                user_idx, pos_idx, neg_idx = batch
                # rec_user_emb, rec_item_emb = model()
                rec_user_emb, rec_item_emb = prompted_user_emb, item_emb
                user_emb, pos_item_emb, neg_item_emb = rec_user_emb[user_idx], rec_item_emb[pos_idx], rec_item_emb[neg_idx]
                rec_loss = bpr_loss(user_emb, pos_item_emb, neg_item_emb)
                # cl_loss = self.cl_rate * model.cal_cl_loss([user_idx, pos_idx], dropped_adj1, dropped_adj2)
                batch_loss = rec_loss + l2_reg_loss(self.reg, user_emb, pos_item_emb)  # + cl_loss
                # Backward and optimize
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                if n % 100 == 0:
                    print('training:', epoch + 1, 'batch', n, 'rec_loss:', rec_loss.item())  # , 'cl_loss', cl_loss.item()
            with torch.no_grad():
                user_emb, self.item_emb = self.model()
                if self.inputs_type == 0 or self.inputs_type == 2:
                    prompts = self._prompts_generation(self.item_emb, user_emb)
                else:
                    prompts = self.prompts_generator(self.user_profiles)
                prompted_user_emb = self._prompts_u_embeddings_fusion(prompts, user_emb)
                self.user_emb = prompted_user_emb
            if epoch >= 5:
                self.fast_evaluation(epoch)
        self.user_emb, self.item_emb = self.best_user_emb, self.best_item_emb

        #### save user embeddings
        # np_user_emb = self.user_emb.cpu().numpy()
        # np.save('./user_emb/cptpp-r-gowalla.npy', np_user_emb)

    def save(self):
        with torch.no_grad():
            best_user_emb, self.best_item_emb = self.model.forward()
            if self.inputs_type == 0 or self.inputs_type == 2:
                prompts = self._prompts_generation(self.best_item_emb, best_user_emb)
            else:
                prompts = self.prompts_generator(self.user_profiles)
            prompted_user_emb = self._prompts_u_embeddings_fusion(prompts, best_user_emb)
            self.best_user_emb = prompted_user_emb

    def predict(self, u):
        u = self.data.get_user_id(u)
        score = torch.matmul(self.user_emb[u], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class Prompts_Generator(nn.Module):
    def __init__(self, emb_size, prompt_size):
        super(Prompts_Generator, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(emb_size, prompt_size), nn.Linear(prompt_size, prompt_size)])
        self.activation = nn.Tanh()
        # self.activation = nn.Sigmoid()

    def forward(self, inputs):
        prompts = inputs
        for i in range(len(self.layers)):
            prompts = self.layers[i](prompts)
            prompts = self.activation(prompts)
        return prompts


class Fusion_MLP(nn.Module):
    def __init__(self, emb_size, prompt_size):
        super(Fusion_MLP, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(emb_size + prompt_size, emb_size), nn.Linear(emb_size, emb_size)])
        self.activation = nn.Tanh()

    def forward(self, x):
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            x = self.activation(x)
        return x

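
# SGL_Encoder (below) is the graph contrastive pre-training backbone: LightGCN-style
# propagation over the normalized user-item adjacency. graph_reconstruction() and
# random_graph_augment() build node- or edge-dropped views of the interaction graph,
# and cal_cl_loss() contrasts the two views with an InfoNCE loss.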
class SGL_Encoder(nn.Module):
    def __init__(self, data, emb_size, drop_rate, n_layers, temp, aug_type):
        super(SGL_Encoder, self).__init__()
        self.data = data
        self.drop_rate = drop_rate
        self.emb_size = emb_size
        self.n_layers = n_layers
        self.temp = temp
        self.aug_type = aug_type
        self.norm_adj = data.norm_adj
        self.embedding_dict = self._init_model()
        self.sparse_norm_adj = TorchGraphInterface.convert_sparse_mat_to_tensor(self.norm_adj).cuda()

    def _init_model(self):
        initializer = nn.init.xavier_uniform_
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(initializer(torch.empty(self.data.user_num, self.emb_size))),
            'item_emb': nn.Parameter(initializer(torch.empty(self.data.item_num, self.emb_size))),
        })
        return embedding_dict

    def graph_reconstruction(self):
        # aug_type 0/1 share one dropped graph across layers; otherwise a separate
        # dropped graph is drawn for every propagation layer.
        if self.aug_type == 0 or self.aug_type == 1:
            dropped_adj = self.random_graph_augment()
        else:
            dropped_adj = []
            for k in range(self.n_layers):
                dropped_adj.append(self.random_graph_augment())
        return dropped_adj

    def random_graph_augment(self):
        dropped_mat = None
        if self.aug_type == 0:
            dropped_mat = GraphAugmentor.node_dropout(self.data.interaction_mat, self.drop_rate)
        elif self.aug_type == 1 or self.aug_type == 2:
            dropped_mat = GraphAugmentor.edge_dropout(self.data.interaction_mat, self.drop_rate)
        dropped_mat = self.data.convert_to_laplacian_mat(dropped_mat)
        return TorchGraphInterface.convert_sparse_mat_to_tensor(dropped_mat).cuda()

    def forward(self, perturbed_adj=None):
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], self.embedding_dict['item_emb']], 0)
        all_embeddings = [ego_embeddings]
        for k in range(self.n_layers):
            if perturbed_adj is not None:
                if isinstance(perturbed_adj, list):
                    ego_embeddings = torch.sparse.mm(perturbed_adj[k], ego_embeddings)
                else:
                    ego_embeddings = torch.sparse.mm(perturbed_adj, ego_embeddings)
            else:
                ego_embeddings = torch.sparse.mm(self.sparse_norm_adj, ego_embeddings)
            all_embeddings.append(ego_embeddings)
        all_embeddings = torch.stack(all_embeddings, dim=1)
        all_embeddings = torch.mean(all_embeddings, dim=1)
        user_all_embeddings, item_all_embeddings = torch.split(all_embeddings, [self.data.user_num, self.data.item_num])
        return user_all_embeddings, item_all_embeddings

    def cal_cl_loss(self, idx, perturbed_mat1, perturbed_mat2):
        u_idx = torch.unique(torch.Tensor(idx[0]).type(torch.long)).cuda()
        i_idx = torch.unique(torch.Tensor(idx[1]).type(torch.long)).cuda()
        user_view_1, item_view_1 = self.forward(perturbed_mat1)
        user_view_2, item_view_2 = self.forward(perturbed_mat2)
        view1 = torch.cat((user_view_1[u_idx], item_view_1[i_idx]), 0)
        view2 = torch.cat((user_view_2[u_idx], item_view_2[i_idx]), 0)
        # user_cl_loss = InfoNCE(user_view_1[u_idx], user_view_2[u_idx], self.temp)
        # item_cl_loss = InfoNCE(item_view_1[i_idx], item_view_2[i_idx], self.temp)
        # return user_cl_loss + item_cl_loss
        return InfoNCE(view1, view2, self.temp)
--------------------------------------------------------------------------------
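
For reference, below is a minimal, hypothetical sketch (not part of the repository) of how the two prompt modules defined in model/graph/CPTPP.py compose: Prompts_Generator maps pre-trained user representations to prompts, and Fusion_MLP fuses those prompts back into the user embeddings, mirroring _prompts_u_embeddings_fusion. The sizes, the random tensor, and the import path are illustrative assumptions only (they presume the repo root is on PYTHONPATH).

import torch
from model.graph.CPTPP import Prompts_Generator, Fusion_MLP    # assumed import path

emb_size, prompt_size, n_users = 64, 128, 1000                 # illustrative sizes
generator = Prompts_Generator(emb_size, prompt_size)           # emb_size -> prompt_size
fusion = Fusion_MLP(emb_size, prompt_size)                     # (prompt_size + emb_size) -> emb_size

user_emb = torch.randn(n_users, emb_size)                      # stand-in for pre-trained user embeddings
prompts = generator(user_emb)                                  # (n_users, prompt_size) personalized prompts
prompted_user_emb = fusion(torch.cat((prompts, user_emb), 1))  # (n_users, emb_size) prompted user embeddings
print(prompted_user_emb.shape)                                 # torch.Size([1000, 64])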