├── README.md ├── datasets ├── preprocess.py ├── sample │ ├── all_train_seq.txt │ ├── test.txt │ └── train.txt ├── sample_train-item-views.csv ├── sample_train-item-views_bac.csv └── test.py ├── pytorch_code ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── main.py ├── model.py ├── test.py ├── testEmbedding.py └── utils.py ├── tensorflow_code ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── main.py ├── model.py └── utils.py └── testData.py /README.md: -------------------------------------------------------------------------------- 1 | # SR-GNN中文注释 2 | 3 | ## 基于会话的图神经网络推荐 4 | 5 | 本项目来自[这里](https://github.com/CRIPAC-DIG/SR-GNN),只不过是增加了中文注释的学习版本。 6 | 7 | 8 | -------------------------------------------------------------------------------- /datasets/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python36 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on July, 2018 5 | 6 | @author: Tangrizzly 7 | """ 8 | 9 | import argparse 10 | import time 11 | import csv 12 | import pickle 13 | import operator 14 | import datetime 15 | import os 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--dataset', default='sample', help='dataset name: diginetica/yoochoose/sample') 19 | opt = parser.parse_args() 20 | print(opt) 21 | 22 | dataset = 'sample_train-item-views.csv' 23 | if opt.dataset == 'diginetica': 24 | dataset = 'train-item-views.csv' 25 | elif opt.dataset =='yoochoose': 26 | dataset = 'yoochoose-clicks.dat' 27 | 28 | print("-- Starting @ %ss" % datetime.datetime.now()) 29 | with open(dataset, "r") as f: 30 | if opt.dataset == 'yoochoose': 31 | reader = csv.DictReader(f, delimiter=',') 32 | else: 33 | reader = csv.DictReader(f, delimiter=';') 34 | sess_clicks = {} #存储所有点击事件的会话的集合 35 | sess_date = {} 36 | ctr = 0 37 | curid = -1 38 | curdate = None #再循环内存储事件的日期 39 | for data in reader: 40 | sessid = data['session_id'] 41 | if curdate and not curid == sessid: 42 | date = '' 43 | if opt.dataset == 'yoochoose': 44 | date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S')) 45 | else: 46 | date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) 47 | sess_date[curid] = date 48 | curid = sessid 49 | if opt.dataset == 'yoochoose': 50 | item = data['item_id'] 51 | else: 52 | item = data['item_id'], int(data['timeframe']) 53 | curdate = '' 54 | if opt.dataset == 'yoochoose': 55 | curdate = data['timestamp'] 56 | else: 57 | curdate = data['eventdate'] 58 | 59 | if sessid in sess_clicks: #同一个会话增加点击的商品项目 60 | sess_clicks[sessid] += [item] 61 | else: 62 | sess_clicks[sessid] = [item] 63 | ctr += 1 64 | date = '' 65 | if opt.dataset == 'yoochoose': 66 | date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S')) 67 | else: 68 | date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) 69 | for i in list(sess_clicks): 70 | sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1)) 71 | sess_clicks[i] = [c[0] for c in sorted_clicks] 72 | sess_date[curid] = date 73 | print("-- Reading data @ %ss" % datetime.datetime.now()) 74 | 75 | # Filter out length 1 sessions 76 | for s in list(sess_clicks): 77 | if len(sess_clicks[s]) == 1: 78 | del sess_clicks[s] 79 | del sess_date[s] 80 | 81 | # Count number of times each item appears 82 | iid_counts = {} 83 | for s in sess_clicks: 84 | seq = sess_clicks[s] 85 | for iid in seq: 86 | if iid in iid_counts: 87 | iid_counts[iid] += 1 88 | else: 89 | iid_counts[iid] = 1 90 | 91 | sorted_counts = sorted(iid_counts.items(), 
key=operator.itemgetter(1)) 92 | 93 | length = len(sess_clicks) 94 | for s in list(sess_clicks): 95 | curseq = sess_clicks[s] 96 | filseq = list(filter(lambda i: iid_counts[i] >= 5, curseq)) 97 | if len(filseq) < 2: 98 | del sess_clicks[s] 99 | del sess_date[s] 100 | else: 101 | sess_clicks[s] = filseq 102 | 103 | # Split out test set based on dates 104 | dates = list(sess_date.items()) 105 | maxdate = dates[0][1] 106 | 107 | for _, date in dates: 108 | if maxdate < date: 109 | maxdate = date 110 | 111 | # 7 days for test 112 | splitdate = 0 113 | if opt.dataset == 'yoochoose': 114 | splitdate = maxdate - 86400 * 1 # the number of seconds for a day:86400 115 | else: 116 | splitdate = maxdate - 86400 * 7 117 | 118 | print('Splitting date', splitdate) # Yoochoose: ('Split date', 1411930799.0) 119 | tra_sess = filter(lambda x: x[1] < splitdate, dates) 120 | tes_sess = filter(lambda x: x[1] > splitdate, dates) 121 | 122 | # Sort sessions by date 123 | tra_sess = sorted(tra_sess, key=operator.itemgetter(1)) # [(session_id, timestamp), (), ] 124 | tes_sess = sorted(tes_sess, key=operator.itemgetter(1)) # [(session_id, timestamp), (), ] 125 | print(len(tra_sess)) # 186670 # 7966257 126 | print(len(tes_sess)) # 15979 # 15324 127 | print(tra_sess[:3]) 128 | print(tes_sess[:3]) 129 | print("-- Splitting train set and test set @ %ss" % datetime.datetime.now()) 130 | 131 | # Choosing item count >=5 gives approximately the same number of items as reported in paper 132 | item_dict = {} 133 | # Convert training sessions to sequences and renumber items to start from 1 134 | def obtian_tra(): 135 | train_ids = [] 136 | train_seqs = [] 137 | train_dates = [] 138 | item_ctr = 1 139 | for s, date in tra_sess: 140 | seq = sess_clicks[s] 141 | outseq = [] 142 | for i in seq: 143 | if i in item_dict: 144 | outseq += [item_dict[i]] 145 | else: 146 | outseq += [item_ctr] 147 | item_dict[i] = item_ctr 148 | item_ctr += 1 149 | if len(outseq) < 2: # Doesn't occur 150 | continue 151 | train_ids += [s] 152 | train_dates += [date] 153 | train_seqs += [outseq] 154 | print(item_ctr) # 43098, 37484 155 | return train_ids, train_dates, train_seqs 156 | 157 | 158 | # Convert test sessions to sequences, ignoring items that do not appear in training set 159 | def obtian_tes(): 160 | test_ids = [] 161 | test_seqs = [] 162 | test_dates = [] 163 | for s, date in tes_sess: 164 | seq = sess_clicks[s] 165 | outseq = [] 166 | for i in seq: 167 | if i in item_dict: 168 | outseq += [item_dict[i]] 169 | if len(outseq) < 2: 170 | continue 171 | test_ids += [s] 172 | test_dates += [date] 173 | test_seqs += [outseq] 174 | return test_ids, test_dates, test_seqs 175 | 176 | 177 | tra_ids, tra_dates, tra_seqs = obtian_tra() 178 | tes_ids, tes_dates, tes_seqs = obtian_tes() 179 | 180 | 181 | def process_seqs(iseqs, idates): 182 | out_seqs = [] 183 | out_dates = [] 184 | labs = [] 185 | ids = [] 186 | for id, seq, date in zip(range(len(iseqs)), iseqs, idates): 187 | for i in range(1, len(seq)): 188 | tar = seq[-i] 189 | labs += [tar] 190 | out_seqs += [seq[:-i]] 191 | out_dates += [date] 192 | ids += [id] 193 | return out_seqs, out_dates, labs, ids 194 | 195 | 196 | tr_seqs, tr_dates, tr_labs, tr_ids = process_seqs(tra_seqs, tra_dates) 197 | te_seqs, te_dates, te_labs, te_ids = process_seqs(tes_seqs, tes_dates) 198 | tra = (tr_seqs, tr_labs) 199 | tes = (te_seqs, te_labs) 200 | print(len(tr_seqs)) 201 | print(len(te_seqs)) 202 | print(tr_seqs[:3], tr_dates[:3], tr_labs[:3]) 203 | print(te_seqs[:3], te_dates[:3], te_labs[:3]) 204 | all = 0 205 
| 206 | for seq in tra_seqs: 207 | all += len(seq) 208 | for seq in tes_seqs: 209 | all += len(seq) 210 | print('avg length: ', all/(len(tra_seqs) + len(tes_seqs) * 1.0)) 211 | if opt.dataset == 'diginetica': 212 | if not os.path.exists('diginetica'): 213 | os.makedirs('diginetica') 214 | pickle.dump(tra, open('diginetica/train.txt', 'wb')) 215 | pickle.dump(tes, open('diginetica/test.txt', 'wb')) 216 | pickle.dump(tra_seqs, open('diginetica/all_train_seq.txt', 'wb')) 217 | elif opt.dataset == 'yoochoose': 218 | if not os.path.exists('yoochoose1_4'): 219 | os.makedirs('yoochoose1_4') 220 | if not os.path.exists('yoochoose1_64'): 221 | os.makedirs('yoochoose1_64') 222 | pickle.dump(tes, open('yoochoose1_4/test.txt', 'wb')) 223 | pickle.dump(tes, open('yoochoose1_64/test.txt', 'wb')) 224 | 225 | split4, split64 = int(len(tr_seqs) / 4), int(len(tr_seqs) / 64) 226 | print(len(tr_seqs[-split4:])) 227 | print(len(tr_seqs[-split64:])) 228 | 229 | tra4, tra64 = (tr_seqs[-split4:], tr_labs[-split4:]), (tr_seqs[-split64:], tr_labs[-split64:]) 230 | seq4, seq64 = tra_seqs[tr_ids[-split4]:], tra_seqs[tr_ids[-split64]:] 231 | 232 | pickle.dump(tra4, open('yoochoose1_4/train.txt', 'wb')) 233 | pickle.dump(seq4, open('yoochoose1_4/all_train_seq.txt', 'wb')) 234 | 235 | pickle.dump(tra64, open('yoochoose1_64/train.txt', 'wb')) 236 | pickle.dump(seq64, open('yoochoose1_64/all_train_seq.txt', 'wb')) 237 | 238 | else: 239 | if not os.path.exists('sample'): 240 | os.makedirs('sample') 241 | pickle.dump(tra, open('sample/train.txt', 'wb')) 242 | pickle.dump(tes, open('sample/test.txt', 'wb')) 243 | pickle.dump(tra_seqs, open('sample/all_train_seq.txt', 'wb')) 244 | 245 | print('Done.') 246 | -------------------------------------------------------------------------------- /datasets/sample/all_train_seq.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/datasets/sample/all_train_seq.txt -------------------------------------------------------------------------------- /datasets/sample/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/datasets/sample/test.txt -------------------------------------------------------------------------------- /datasets/sample/train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/datasets/sample/train.txt -------------------------------------------------------------------------------- /datasets/test.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import time 3 | import operator 4 | import datetime 5 | import os 6 | import pickle 7 | 8 | with open('sample_train-item-views.csv', "r") as f: 9 | reader = csv.DictReader(f, delimiter=';') 10 | sess_clicks = {} #存储所有点击事件的会话的集合 11 | sess_date = {} #存储——{会话ID:会话的日期对应的秒数},后期用来进行数据拆分(测试集、训练集) 12 | ctr = 0 13 | curid = -1 14 | curdate = None #再循环内存储事件的日期 15 | for data in reader: 16 | sessid = data['session_id'] 17 | if curdate and not curid == sessid: #当前的id不是session的id,则增加该sessionID的日期转换为秒的结果 18 | date = '' 19 | 20 | date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) 21 | sess_date[curid] = date 22 | 
curid = sessid 23 | item = data['item_id'], int(data['timeframe']) 24 | curdate = '' 25 | 26 | curdate = data['eventdate'] 27 | 28 | if sessid in sess_clicks: #同一个会话增加点击的商品项目 29 | sess_clicks[sessid] += [item] 30 | else: 31 | sess_clicks[sessid] = [item] 32 | ctr += 1 33 | date = '' 34 | 35 | date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) 36 | for i in list(sess_clicks): 37 | sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1)) 38 | sess_clicks[i] = [c[0] for c in sorted_clicks] 39 | sess_date[curid] = date 40 | print("-- Reading data @ %ss" % datetime.datetime.now()) 41 | 42 | 43 | length_berore = len(sess_clicks) 44 | # Filter out length 1 sessions 45 | for s in list(sess_clicks): 46 | if len(sess_clicks[s]) == 1: 47 | del sess_clicks[s] 48 | del sess_date[s] 49 | 50 | # Count number of times each item appears 51 | iid_counts = {} 52 | for s in sess_clicks: 53 | seq = sess_clicks[s] 54 | for iid in seq: 55 | if iid in iid_counts: 56 | iid_counts[iid] += 1 57 | else: 58 | iid_counts[iid] = 1 59 | 60 | sorted_counts = sorted(iid_counts.items(), key=operator.itemgetter(1)) 61 | 62 | #过滤,首先得到去除出现小于5次的项目,然后长度小于2个的会话直接删除,否则替换成去除出现小于5次的项目的点击会话 63 | length=len(sess_clicks) 64 | for s in list(sess_clicks): #list(字典)得到的是键的列表 65 | curseq = sess_clicks[s] 66 | filseq = list(filter(lambda i: iid_counts[i] >= 5, curseq)) 67 | if len(filseq) < 2: 68 | del sess_clicks[s] 69 | del sess_date[s] 70 | else: 71 | sess_clicks[s] = filseq 72 | length_after = len(sess_clicks) 73 | 74 | print('The number of sessions has changed from %d to %d after filitered' %(length_berore,length_after)) 75 | 76 | # Split out test set based on dates 77 | dates = list(sess_date.items()) 78 | maxdate = dates[0][1] #Get the second corresponding to the maximum date. 79 | 80 | for _, date in dates: 81 | if maxdate < date: 82 | maxdate = date 83 | 84 | # 7 days for test 85 | splitdate = 0 86 | splitdate = maxdate - 86400 * 7 #A day contains 86400 seconds. 
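# A minimal, self-contained sketch (toy dates, not from the real dataset) of the
# date-based split performed just below: sessions newer than `splitdate` become the
# test set, older ones the training set; note that both comparisons are strict, so a
# session dated exactly at `splitdate` would be dropped from both sets.
import time, operator   # already imported at the top of this file
toy_sess_date = {'s1': time.mktime(time.strptime('2016-05-01', '%Y-%m-%d')),
                 's2': time.mktime(time.strptime('2016-05-10', '%Y-%m-%d')),
                 's3': time.mktime(time.strptime('2016-05-27', '%Y-%m-%d'))}
toy_dates = list(toy_sess_date.items())
toy_splitdate = max(d for _, d in toy_dates) - 86400 * 7   # seven days before the newest session
toy_train = sorted(filter(lambda x: x[1] < toy_splitdate, toy_dates), key=operator.itemgetter(1))
toy_test = sorted(filter(lambda x: x[1] > toy_splitdate, toy_dates), key=operator.itemgetter(1))
# toy_train -> [('s1', ...), ('s2', ...)]        toy_test -> [('s3', ...)]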
87 | 88 | print('Splitting date', splitdate) # Yoochoose: ('Split date', 1411930799.0) 89 | tra_sess = filter(lambda x: x[1] < splitdate, dates) 90 | tes_sess = filter(lambda x: x[1] > splitdate, dates) 91 | 92 | # Sort sessions by date 93 | tra_sess = sorted(tra_sess, key=operator.itemgetter(1)) # [(session_id, timestamp), (), ] 94 | tes_sess = sorted(tes_sess, key=operator.itemgetter(1)) # [(session_id, timestamp), (), ] 95 | print(len(tra_sess)) # 186670 # 7966257 96 | print(len(tes_sess)) # 15979 # 15324 97 | print(tra_sess[:3]) 98 | print(tes_sess[:3]) 99 | print("-- Splitting train set and test set @ %ss" % datetime.datetime.now()) 100 | 101 | item_dict = {} #The dictionary to record the items 102 | # Convert training sessions to sequences and renumber items to start from 1 103 | def obtian_tra(): 104 | train_ids = [] 105 | train_seqs = [] 106 | train_dates = [] 107 | item_ctr = 1 108 | for s, date in tra_sess: 109 | seq = sess_clicks[s] 110 | outseq = [] 111 | for i in seq: 112 | if i in item_dict: 113 | outseq += [item_dict[i]] 114 | else: 115 | outseq += [item_ctr] 116 | item_dict[i] = item_ctr 117 | item_ctr += 1 118 | if len(outseq) < 2: # Doesn't occur 119 | continue 120 | train_ids += [s] 121 | train_dates += [date] 122 | train_seqs += [outseq] 123 | print(item_ctr) # 43098, 37484 124 | return train_ids, train_dates, train_seqs 125 | 126 | 127 | # Convert test sessions to sequences, ignoring items that do not appear in training set 128 | def obtian_tes(): 129 | test_ids = [] 130 | test_seqs = [] 131 | test_dates = [] 132 | for s, date in tes_sess: 133 | seq = sess_clicks[s] 134 | outseq = [] 135 | for i in seq: 136 | if i in item_dict: 137 | outseq += [item_dict[i]] 138 | if len(outseq) < 2: 139 | continue 140 | test_ids += [s] 141 | test_dates += [date] 142 | test_seqs += [outseq] 143 | return test_ids, test_dates, test_seqs 144 | 145 | tra_ids, tra_dates, tra_seqs = obtian_tra() #sessionID,to which the data by second corresponding,to which the sequence corresponding. 146 | tes_ids, tes_dates, tes_seqs = obtian_tes() #test sets wihch are same as above of the contents. 
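# A small illustration (hypothetical raw item ids, not dataset values) of what
# obtian_tra()/obtian_tes() above just did: training items are renumbered to
# consecutive integers starting from 1 in first-seen order, and test clicks on
# items that never occur in any training session are silently dropped.
toy_item_dict = {}
toy_ctr = 1
for raw in ['i503', 'i77', 'i503', 'i120']:                 # one toy training session
    if raw not in toy_item_dict:
        toy_item_dict[raw] = toy_ctr
        toy_ctr += 1
# toy_item_dict -> {'i503': 1, 'i77': 2, 'i120': 3}
toy_test_seq = [toy_item_dict[raw] for raw in ['i77', 'i999', 'i503'] if raw in toy_item_dict]
# toy_test_seq -> [2, 1]   ('i999' was never seen during training, so it is ignored)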
147 | 148 | def process_seqs(iseqs, idates): #每个长度为n的序列拆分成n-1组输入和输出标签 149 | out_seqs = [] 150 | out_dates = [] 151 | labs = [] 152 | ids = [] 153 | for id, seq, date in zip(range(len(iseqs)), iseqs, idates): 154 | for i in range(1, len(seq)): 155 | tar = seq[-i] 156 | labs += [tar] 157 | out_seqs += [seq[:-i]] 158 | out_dates += [date] 159 | ids += [id] 160 | return out_seqs, out_dates, labs, ids 161 | 162 | tr_seqs, tr_dates, tr_labs, tr_ids = process_seqs(tra_seqs, tra_dates) 163 | te_seqs, te_dates, te_labs, te_ids = process_seqs(tes_seqs, tes_dates) 164 | tra = (tr_seqs, tr_labs) 165 | tes = (te_seqs, te_labs) 166 | print(len(tr_seqs)) 167 | print(len(te_seqs)) 168 | print(tr_seqs[:3], tr_dates[:3], tr_labs[:3]) 169 | print(te_seqs[:3], te_dates[:3], te_labs[:3]) 170 | all = 0 171 | 172 | for seq in tra_seqs: 173 | all += len(seq) 174 | for seq in tes_seqs: 175 | all += len(seq) 176 | print('avg length: ', all/(len(tra_seqs) + len(tes_seqs) * 1.0)) 177 | 178 | if not os.path.exists('sample'): 179 | os.makedirs('sample') 180 | pickle.dump(tra, open('sample/train.txt', 'wb')) #所有训练的序列+标签 181 | pickle.dump(tes, open('sample/test.txt', 'wb')) #所以测试的序列+标签 182 | pickle.dump(tra_seqs, open('sample/all_train_seq.txt', 'wb')) #所以训练的序列 183 | 184 | print('Done.') 185 | 186 | -------------------------------------------------------------------------------- /pytorch_code/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/pytorch_code/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pytorch_code/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/pytorch_code/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pytorch_code/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python36 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on July, 2018 5 | 6 | @author: Tangrizzly 7 | """ 8 | 9 | import argparse 10 | import pickle 11 | import time 12 | from utils import build_graph, Data, split_validation 13 | from model import * 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--dataset', default='sample', help='dataset name: diginetica/yoochoose1_4/yoochoose1_64/sample') 17 | parser.add_argument('--batchSize', type=int, default=100, help='input batch size') 18 | parser.add_argument('--hiddenSize', type=int, default=100, help='hidden state size') 19 | parser.add_argument('--epoch', type=int, default=30, help='the number of epochs to train for') 20 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') # [0.001, 0.0005, 0.0001] 21 | parser.add_argument('--lr_dc', type=float, default=0.1, help='learning rate decay rate') 22 | parser.add_argument('--lr_dc_step', type=int, default=3, help='the number of steps after which the learning rate decay') 23 | parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty') # [0.001, 0.0005, 0.0001, 0.00005, 0.00001] 24 | parser.add_argument('--step', type=int, default=1, help='gnn propogation steps') 25 | parser.add_argument('--patience', type=int, default=10, 
help='the number of epoch to wait before early stop ') 26 | parser.add_argument('--nonhybrid', action='store_true', help='only use the global preference to predict') 27 | parser.add_argument('--validation', action='store_true', help='validation') 28 | parser.add_argument('--valid_portion', type=float, default=0.1, help='split the portion of training set as validation set') 29 | opt = parser.parse_args() 30 | print(opt) 31 | 32 | 33 | def main(): 34 | train_data = pickle.load(open('../datasets/' + opt.dataset + '/train.txt', 'rb')) 35 | if opt.validation: 36 | train_data, valid_data = split_validation(train_data, opt.valid_portion) 37 | test_data = valid_data 38 | else: 39 | test_data = pickle.load(open('../datasets/' + opt.dataset + '/test.txt', 'rb')) 40 | # all_train_seq = pickle.load(open('../datasets/' + opt.dataset + '/all_train_seq.txt', 'rb')) 41 | # g = build_graph(all_train_seq) #测试数据调用len(g.node)=309就是下面n_node = 310的来源,为了得到构建GNN的嵌入层的输入节点数目 42 | train_data = Data(train_data, shuffle=True) 43 | test_data = Data(test_data, shuffle=False) 44 | # del all_train_seq, g 45 | if opt.dataset == 'diginetica': 46 | n_node = 43098 47 | elif opt.dataset == 'yoochoose1_64' or opt.dataset == 'yoochoose1_4': 48 | n_node = 37484 49 | else: 50 | n_node = 310 51 | 52 | model = trans_to_cuda(SessionGraph(opt, n_node)) #模型构建就靠这句话 53 | # print(SessionGraph(opt, n_node)) #测试打印下网络结构 54 | 55 | start = time.time() 56 | best_result = [0, 0] 57 | best_epoch = [0, 0] 58 | bad_counter = 0 59 | for epoch in range(opt.epoch): 60 | print('-------------------------------------------------------') 61 | print('epoch: ', epoch) 62 | hit, mrr = train_test(model, train_data, test_data) #模型训练就靠这句话 63 | flag = 0 64 | if hit >= best_result[0]: 65 | best_result[0] = hit 66 | best_epoch[0] = epoch 67 | flag = 1 68 | if mrr >= best_result[1]: 69 | best_result[1] = mrr 70 | best_epoch[1] = epoch 71 | flag = 1 72 | print('Best Result:') 73 | print('\tRecall@20:\t%.4f\tMMR@20:\t%.4f\tEpoch:\t%d,\t%d'% (best_result[0], best_result[1], best_epoch[0], best_epoch[1])) 74 | bad_counter += 1 - flag 75 | if bad_counter >= opt.patience: 76 | break 77 | print('-------------------------------------------------------') 78 | end = time.time() 79 | print("Run time: %f s" % (end - start)) 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /pytorch_code/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python36 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on July, 2018 5 | 6 | @author: Tangrizzly 7 | """ 8 | 9 | import datetime 10 | import math 11 | import numpy as np 12 | import torch 13 | from torch import nn 14 | from torch.nn import Module, Parameter 15 | import torch.nn.functional as F 16 | 17 | 18 | class GNN(Module): 19 | def __init__(self, hidden_size, step=1): #输入仅需确定隐状态数和步数 20 | super(GNN, self).__init__() 21 | self.step = step #gnn前向传播的步数 default=1 22 | self.hidden_size = hidden_size 23 | self.input_size = hidden_size * 2 24 | self.gate_size = 3 * hidden_size 25 | #有关Parameter函数的解释:首先可以把这个函数理解为类型转换函数,将一个不可训练的类型Tensor转换成可以训练的类型parameter 26 | #并将这个parameter绑定到这个module里面(net.parameter()中就有这个绑定的parameter,所以在参数优化的时候可以进行优化的), 27 | #所以经过类型转换这个self.XX变成了模型的一部分,成为了模型中根据训练可以改动的参数了。 28 | #使用这个函数的目的也是想让某些变量在学习的过程中不断的修改其值以达到最优化。——————https://www.jianshu.com/p/d8b77cc02410 29 | self.w_ih = Parameter(torch.Tensor(self.gate_size, self.input_size)) 30 | self.w_hh = 
Parameter(torch.Tensor(self.gate_size, self.hidden_size)) 31 | self.b_ih = Parameter(torch.Tensor(self.gate_size)) 32 | self.b_hh = Parameter(torch.Tensor(self.gate_size)) 33 | self.b_iah = Parameter(torch.Tensor(self.hidden_size)) 34 | self.b_oah = Parameter(torch.Tensor(self.hidden_size)) 35 | #有关nn.Linear的解释:torch.nn.Linear(in_features, out_features, bias=True),对输入数据做线性变换:y=Ax+b 36 | #形状:输入: (N,in_features) 输出: (N,out_features) 37 | self.linear_edge_in = nn.Linear(self.hidden_size, self.hidden_size, bias=True) 38 | self.linear_edge_out = nn.Linear(self.hidden_size, self.hidden_size, bias=True) 39 | self.linear_edge_f = nn.Linear(self.hidden_size, self.hidden_size, bias=True) 40 | 41 | def GNNCell(self, A, hidden): 42 | #A-->实际上是该批数据图矩阵的列表 eg:(100,5?,10?(即5?X2)) 43 | #hidden--> eg(100-batch_size,5?,100-embeding_size) 44 | #后面所有的5?代表这个维的长度是该批唯一最大类别长度(类别数目不足该长度的会话补零),根据不同批会变化 45 | #有关matmul的解释:矩阵相乘,多维会广播相乘 46 | input_in = torch.matmul(A[:, :, :A.shape[1]], self.linear_edge_in(hidden)) + self.b_iah #input_in-->(100,5?,100) 47 | input_out = torch.matmul(A[:, :, A.shape[1]: 2 * A.shape[1]], self.linear_edge_out(hidden)) + self.b_oah #input_out-->(100,5?,100) 48 | #在第2个轴将tensor连接起来 49 | inputs = torch.cat([input_in, input_out], 2) #inputs-->(100,5?,200) 50 | #关于functional.linear(input, weight, bias=None)的解释:y= xA^T + b 应用线性变换,返回Output: (N,∗,out_features) 51 | #[*代表任意其他的东西] 52 | gi = F.linear(inputs, self.w_ih, self.b_ih) #gi-->(100,5?,300) 53 | gh = F.linear(hidden, self.w_hh, self.b_hh) #gh-->(100,5?,300) 54 | #torch.chunk(tensor, chunks, dim=0):将tensor拆分成指定数量的块,比如下面就是沿着第2个轴拆分成3块 55 | i_r, i_i, i_n = gi.chunk(3, 2) #三个都是(100,5?,100) 56 | h_r, h_i, h_n = gh.chunk(3, 2) #三个都是(100,5?,100) 57 | resetgate = torch.sigmoid(i_r + h_r) #resetgate-->(100,5?,100) 原文公式(3) 58 | inputgate = torch.sigmoid(i_i + h_i) #inputgate-->(100,5?,100) 59 | newgate = torch.tanh(i_n + resetgate * h_n) #newgate-->(100,5?,100) 原文公式(4) 60 | hy = newgate + inputgate * (hidden - newgate) #hy-->(100,5?,100) 原文公式(5) 61 | return hy 62 | 63 | def forward(self, A, hidden): 64 | #A-->实际上是该批数据图矩阵的列表 eg:(100,5?,10?(即5?X2)) 5?代表这个维的长度是该批唯一最大类别长度(类别数目不足该长度的会话补零),根据不同批会变化 65 | #hidden--> eg:(100-batch_size,5?,100-embeding_size) 即数据图中节点类别对应低维嵌入的表示 66 | for i in range(self.step): 67 | hidden = self.GNNCell(A, hidden) 68 | return hidden 69 | 70 | 71 | class SessionGraph(Module): 72 | def __init__(self, opt, n_node): #opt-->可控输入参数, n_node-->嵌入层图的节点数目 73 | super(SessionGraph, self).__init__() 74 | self.hidden_size = opt.hiddenSize #opt.hiddenSize-->hidden state size 75 | self.n_node = n_node 76 | self.batch_size = opt.batchSize #opt.batch_siza-->input batch size *default=100 77 | self.nonhybrid = opt.nonhybrid #opt.nonhybrid-->only use the global preference to predicts 78 | self.embedding = nn.Embedding(self.n_node, self.hidden_size) 79 | self.gnn = GNN(self.hidden_size, step=opt.step) #opt.step-->gnn propogation steps 80 | self.linear_one = nn.Linear(self.hidden_size, self.hidden_size, bias=True) 81 | self.linear_two = nn.Linear(self.hidden_size, self.hidden_size, bias=True) 82 | self.linear_three = nn.Linear(self.hidden_size, 1, bias=False) 83 | self.linear_transform = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=True) 84 | self.loss_function = nn.CrossEntropyLoss() #交叉熵损失 85 | self.optimizer = torch.optim.Adam(self.parameters(), lr=opt.lr, weight_decay=opt.l2) #Adam优化算法 86 | #StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1) 将每个参数组的学习率设置为每个step_size epoch 87 | #由gamma衰减的初始lr。当last_epoch=-1时,将初始lr设置为lr。 88 | self.scheduler 
= torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=opt.lr_dc_step, gamma=opt.lr_dc) 89 | self.reset_parameters() #初始化权重参数 90 | 91 | def reset_parameters(self): 92 | stdv = 1.0 / math.sqrt(self.hidden_size) 93 | for weight in self.parameters(): 94 | weight.data.uniform_(-stdv, stdv) 95 | 96 | def compute_scores(self, hidden, mask): 97 | #hidden-->(100,16?,100) 其中16?代表该样本所有数据最长会话的长度(不同数据集会不同),单个样本其余部分补了0 98 | #mask-->(100,16?) 有序列的位置是[1],没有动作序列的位置是[0] 99 | ht = hidden[torch.arange(mask.shape[0]).long(), torch.sum(mask, 1) - 1] # batch_size x latent_size(100,100) 这是最后一个动作对应的位置,即文章中说的局部偏好 100 | q1 = self.linear_one(ht).view(ht.shape[0], 1, ht.shape[1]) # batch_size x 1 x latent_size(100,1,100) 局部偏好线性变换后改成能计算的维度 101 | q2 = self.linear_two(hidden) # batch_size x seq_length x latent_size (100,16?,100) 即全局偏好 102 | alpha = self.linear_three(torch.sigmoid(q1 + q2)) #(100,16,1) 103 | a = torch.sum(alpha * hidden * mask.view(mask.shape[0], -1, 1).float(), 1) #(100,100) 原文中公式(6) 104 | if not self.nonhybrid: 105 | a = self.linear_transform(torch.cat([a, ht], 1)) #原文中公式(7) 106 | b = self.embedding.weight[1:] # n_nodes x latent_size (309,100) 107 | scores = torch.matmul(a, b.transpose(1, 0)) #原文中公式(8) 108 | return scores #(100,309) 109 | 110 | def forward(self, inputs, A): 111 | #inputs-->单个点击动作序列的唯一类别并按照批最大唯一类别长度补全0列表(即图矩阵的元素的类别标签列表) A-->实际上是该批数据图矩阵的列表 112 | # print(inputs.size()) #测试打印下输入的维度 (100-batch_size,5?) 5?代表这个维的长度是该批唯一最大类别长度(类别数目不足该长度的会话补0),根据不同批会变化 113 | hidden = self.embedding(inputs) #返回的hidden的shape -->(100-batch_size,5?,100-embeding_size) 114 | hidden = self.gnn(A, hidden) 115 | return hidden #(100,5?,100) 116 | 117 | 118 | def trans_to_cuda(variable): 119 | if torch.cuda.is_available(): 120 | return variable.cuda() 121 | else: 122 | return variable 123 | 124 | 125 | def trans_to_cpu(variable): 126 | if torch.cuda.is_available(): 127 | return variable.cpu() 128 | else: 129 | return variable 130 | 131 | 132 | def forward(model, i, data): #传入模型model(SessionGraph), 数据批的索引i, 训练的数据data(Data) 133 | #返回:动作序列对应唯一动作集合的位置角标,该批数据图矩阵的列表,单个点击动作序列的唯一类别并按照批最大类别补全0列表,面罩,目标数据 134 | alias_inputs, A, items, mask, targets = data.get_slice(i) 135 | alias_inputs = trans_to_cuda(torch.Tensor(alias_inputs).long()) #(100,16?) 
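# A worked toy example (hypothetical single session [3, 7, 7, 3, 6], assuming this
# batch happens to pad unique items to max_n_node = 5) of what get_slice returned:
#   node            = [3, 6, 7]            unique items of the session, sorted
#   items[i]        = [3, 6, 7, 0, 0]      padded node list, fed to the embedding layer
#   alias_inputs[i] = [0, 2, 2, 0, 1]      position of every click inside `node`
#   A[i]            = [A_in | A_out], a 5 x 10 matrix built from the edges 3->7, 7->7, 7->3, 3->6
# After the GNN has produced one vector per node, hidden[i][alias_inputs[i]] (the
# `get` lambda below) re-expands those node embeddings back into click order, so
# seq_hidden has shape (batch_size, max_seq_len, hidden_size), e.g. (100, 16?, 100).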
136 | test_alias_inputs = alias_inputs.numpy() #测试查看alias_inputs的内容 137 | strange = torch.arange(len(alias_inputs)).long() #0到99 138 | items = trans_to_cuda(torch.Tensor(items).long()) 139 | A = trans_to_cuda(torch.Tensor(A).float()) 140 | mask = trans_to_cuda(torch.Tensor(mask).long()) 141 | hidden = model(items, A) #这里调用了SessionGraph的forward函数,返回维度数目(100,5?,100) 142 | get = lambda i: hidden[i][alias_inputs[i]] #选择第这一批第i个样本对应类别序列的函数 143 | test_get = get(0) # (16?,100) 144 | seq_hidden = torch.stack([get(i) for i in torch.arange(len(alias_inputs)).long()]) #(100,16?,100) 145 | return targets, model.compute_scores(seq_hidden, mask) 146 | 147 | 148 | def train_test(model, train_data, test_data): #传入模型SessionGraph,训练数据和测试数据Data 149 | model.scheduler.step() #调度设置优化器的参数 150 | print('start training: ', datetime.datetime.now()) 151 | model.train() # 指定模型为训练模式,计算梯度 152 | total_loss = 0.0 153 | slices = train_data.generate_batch(model.batch_size) 154 | for i, j in zip(slices, np.arange(len(slices))): #根据批的索引数据进行数据提取训练:i-->批索引, j-->第几批 155 | model.optimizer.zero_grad() #前一步的损失清零 156 | targets, scores = forward(model, i, train_data) # 157 | targets = trans_to_cuda(torch.Tensor(targets).long()) 158 | loss = model.loss_function(scores, targets - 1) 159 | loss.backward() # 反向传播 160 | model.optimizer.step() # 优化 161 | total_loss += loss 162 | if j % int(len(slices) / 5 + 1) == 0: 163 | print('[%d/%d] Loss: %.4f' % (j, len(slices), loss.item())) 164 | print('\tLoss:\t%.3f' % total_loss) 165 | 166 | print('start predicting: ', datetime.datetime.now()) 167 | model.eval() # 指定模型为计算模式 168 | hit, mrr = [], [] 169 | slices = test_data.generate_batch(model.batch_size) 170 | for i in slices: 171 | targets, scores = forward(model, i, test_data) 172 | sub_scores = scores.topk(20)[1] 173 | sub_scores = trans_to_cpu(sub_scores).detach().numpy() 174 | for score, target, mask in zip(sub_scores, targets, test_data.mask): 175 | hit.append(np.isin(target - 1, score)) 176 | if len(np.where(score == target - 1)[0]) == 0: 177 | mrr.append(0) 178 | else: 179 | mrr.append(1 / (np.where(score == target - 1)[0][0] + 1)) 180 | hit = np.mean(hit) * 100 181 | mrr = np.mean(mrr) * 100 182 | return hit, mrr 183 | -------------------------------------------------------------------------------- /pytorch_code/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 10 10:17:52 2019 4 | 5 | @author: dell 6 | """ 7 | 8 | import argparse 9 | import pickle 10 | import time 11 | from utils import build_graph, Data, split_validation 12 | from model import * 13 | import networkx as nx 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--dataset', default='sample', help='dataset name: diginetica/yoochoose1_4/yoochoose1_64/sample') 17 | parser.add_argument('--batchSize', type=int, default=100, help='input batch size') 18 | parser.add_argument('--hiddenSize', type=int, default=100, help='hidden state size') 19 | parser.add_argument('--epoch', type=int, default=30, help='the number of epochs to train for') 20 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') # [0.001, 0.0005, 0.0001] 21 | parser.add_argument('--lr_dc', type=float, default=0.1, help='learning rate decay rate') 22 | parser.add_argument('--lr_dc_step', type=int, default=3, help='the number of steps after which the learning rate decay') 23 | parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty') # [0.001, 0.0005, 0.0001, 
0.00005, 0.00001] 24 | parser.add_argument('--step', type=int, default=1, help='gnn propogation steps') 25 | parser.add_argument('--patience', type=int, default=10, help='the number of epoch to wait before early stop ') 26 | parser.add_argument('--nonhybrid', action='store_true', help='only use the global preference to predict') 27 | parser.add_argument('--validation', action='store_true', help='validation') 28 | parser.add_argument('--valid_portion', type=float, default=0.1, help='split the portion of training set as validation set') 29 | opt = parser.parse_args() 30 | print(opt) 31 | 32 | #test = SessionGraph(opt, 310) 33 | #print(test) #打印图神经网络的结构 34 | 35 | #测试显示会话图 36 | #model = trans_to_cuda(SessionGraph(opt, 310)) #模型构建就靠这句话 37 | 38 | #测试读取训练数据的图的节点 39 | #all_train_seq = pickle.load(open('../datasets/' + opt.dataset + '/all_train_seq.txt', 'rb')) 40 | #g = build_graph(all_train_seq) 41 | #print(len(g.node)) 42 | 43 | #测试 44 | train_data = pickle.load(open('../datasets/' + opt.dataset + '/train.txt', 'rb')) 45 | train_data_compare = Data(train_data, shuffle=True) 46 | slices = train_data_compare.generate_batch(100) 47 | slicesData = train_data_compare.get_slice(slices[1]) 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /pytorch_code/testEmbedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 12 10:57:22 2019 4 | 5 | @author: dell 6 | """ 7 | 8 | from pytorch import nn 9 | 10 | -------------------------------------------------------------------------------- /pytorch_code/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python36 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on July, 2018 5 | 6 | @author: Tangrizzly 7 | """ 8 | 9 | import networkx as nx 10 | import numpy as np 11 | 12 | 13 | def build_graph(train_data): 14 | graph = nx.DiGraph() 15 | for seq in train_data: 16 | for i in range(len(seq) - 1): 17 | if graph.get_edge_data(seq[i], seq[i + 1]) is None: 18 | weight = 1 19 | else: 20 | weight = graph.get_edge_data(seq[i], seq[i + 1])['weight'] + 1 21 | graph.add_edge(seq[i], seq[i + 1], weight=weight) 22 | for node in graph.nodes(): 23 | sum = 0 24 | for j, i in graph.in_edges(node): 25 | sum += graph.get_edge_data(j, i)['weight'] 26 | if sum != 0: 27 | for j, i in graph.in_edges(i): 28 | graph.add_edge(j, i, weight=graph.get_edge_data(j, i)['weight'] / sum) 29 | return graph 30 | 31 | 32 | def data_masks(all_usr_pois, item_tail): #输入:将所有输入序列all_usr_pois, 末尾补全的数据item_tail 33 | us_lens = [len(upois) for upois in all_usr_pois] #每一个输入序列的长度的列表 34 | len_max = max(us_lens) #得到输入序列的最大长度 35 | us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)] #将所有输入序列按照最长长度尾部补全 item_tail 36 | us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens] #有序列的位置是[1],没有动作序列的位置是[0] 37 | return us_pois, us_msks, len_max #输出:补全0后的序列us_pois, 面罩序列us_msks, 最大序列长度len_max 38 | 39 | 40 | def split_validation(train_set, valid_portion): 41 | train_set_x, train_set_y = train_set 42 | n_samples = len(train_set_x) 43 | sidx = np.arange(n_samples, dtype='int32') 44 | np.random.shuffle(sidx) 45 | n_train = int(np.round(n_samples * (1. 
- valid_portion))) 46 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 47 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 48 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 49 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 50 | 51 | return (train_set_x, train_set_y), (valid_set_x, valid_set_y) 52 | 53 | 54 | class Data(): 55 | def __init__(self, data, shuffle=False, graph=None): 56 | inputs = data[0] #输入序列的列表 57 | inputs, mask, len_max = data_masks(inputs, [0]) #详见函数 ---> data_masks() 这个函数使得所有会话按照最长的长度补0了! 58 | self.inputs = np.asarray(inputs) #补全0后的输入序列,并转化成array() 59 | self.mask = np.asarray(mask) #面罩序列,并转化成array() 60 | self.len_max = len_max #最大序列长度 61 | self.targets = np.asarray(data[1]) #预测的序列的列表 62 | self.length = len(inputs) #输入样本的大小 63 | self.shuffle = shuffle #是否打乱数据 64 | self.graph = graph #数据图 (?) 这个似乎没有用到 65 | 66 | def generate_batch(self, batch_size): #根据批的大小生成批数据的索引,如果shuffle则打乱数据 67 | if self.shuffle: #如果需要打乱数据 68 | shuffled_arg = np.arange(self.length) #生成array([0,1,...,样本长度-1]) 69 | np.random.shuffle(shuffled_arg) #随机打乱shuffled_arg的顺序 70 | self.inputs = self.inputs[shuffled_arg] #按照shuffled_arg来索引输入数据 71 | self.mask = self.mask[shuffled_arg] #按照shuffled_arg来索引面罩数据 72 | self.targets = self.targets[shuffled_arg] #按照shuffled_arg来索引预测目标数据 73 | n_batch = int(self.length / batch_size) #得到训练批数 74 | if self.length % batch_size != 0: #批数需要取向上取整 75 | n_batch += 1 76 | slices = np.split(np.arange(n_batch * batch_size), n_batch) #所有数据按照批进行拆分。eg:[0,..,99][100,..,199]... 77 | slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))] #最后一批有多少给多少。eg:[500,..506] 78 | return slices 79 | 80 | def get_slice(self, i): #根据索引i得到对应的数据 81 | inputs, mask, targets = self.inputs[i], self.mask[i], self.targets[i] #得到对应索引的输入,面罩,目标数据 82 | items, n_node, A, alias_inputs = [], [], [], [] 83 | for u_input in inputs: 84 | n_node.append(len(np.unique(u_input))) #n_node存储每个输入序列单独出现的点击动作类别的个数的列表 85 | max_n_node = np.max(n_node) #得到批最长唯一动作会话序列的长度 86 | for u_input in inputs: # u_input 为一个会话序列 87 | node = np.unique(u_input) #该循环的会话的唯一动作序列 88 | items.append(node.tolist() + (max_n_node - len(node)) * [0]) #单个点击动作序列的唯一类别并按照批最大类别补全0 89 | u_A = np.zeros((max_n_node, max_n_node)) #存储行为矩阵的二维向量(方阵),长度是最大唯一动作的数量 90 | for i in np.arange(len(u_input) - 1): #循环该序列的长度 91 | if u_input[i + 1] == 0: #循环到i的下一个动作时“0”动作时退出循环,因为0代表序列已经结束,后面都是补的动作0 92 | break 93 | u = np.where(node == u_input[i])[0][0] #该动作对应唯一动作集合的序号 94 | v = np.where(node == u_input[i + 1])[0][0] #下一个动作对应唯一动作集合的序号 95 | u_A[u][v] = 1 #前一个动作u_input[i]转移到后一个动作u_input[i + 1]的次数变成1 96 | u_sum_in = np.sum(u_A, 0) #矩阵列求和,最后变成一行 97 | u_sum_in[np.where(u_sum_in == 0)] = 1 98 | u_A_in = np.divide(u_A, u_sum_in) 99 | u_sum_out = np.sum(u_A, 1) #矩阵行求和,最后变成一列 100 | u_sum_out[np.where(u_sum_out == 0)] = 1 101 | u_A_out = np.divide(u_A.transpose(), u_sum_out) 102 | u_A = np.concatenate([u_A_in, u_A_out]).transpose() #得到一个会话的连接矩阵 103 | A.append(u_A) #存储该批数据图矩阵的列表,u_A方阵的长度相同——为该批最长唯一动作会话序列的长度 104 | alias_inputs.append([np.where(node == i)[0][0] for i in u_input]) #动作序列对应唯一动作集合的位置角标 105 | return alias_inputs, A, items, mask, targets 106 | #返回:动作序列对应唯一动作集合的位置角标,该批数据图矩阵的列表,单个点击动作序列的唯一类别并按照批最大类别补全0列表,面罩,目标数据 107 | 108 | -------------------------------------------------------------------------------- /tensorflow_code/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/tensorflow_code/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /tensorflow_code/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/userbehavioranalysis/SR-GNN-Chinese_Comment_edition/54ad512954ac9df513d6193f3942e2d5c7906256/tensorflow_code/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /tensorflow_code/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/10/17 5:40 4 | # @Author : {ZM7} 5 | # @File : main.py 6 | # @Software: PyCharm 7 | 8 | # 引入 division 模块,将整数除法转换为浮点数除法 9 | from __future__ import division 10 | import numpy as np 11 | # 导入模型和工具函数 12 | from model import * 13 | from utils import build_graph, Data, split_validation 14 | import pickle 15 | import argparse 16 | import datetime 17 | 18 | # 解析命令行参数 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--dataset', default='sample', help='dataset name: diginetica/yoochoose1_4/yoochoose1_64/sample') 21 | parser.add_argument('--method', type=str, default='ggnn', help='ggnn/gat/gcn') 22 | parser.add_argument('--validation', action='store_true', help='validation') 23 | parser.add_argument('--epoch', type=int, default=30, help='number of epochs to train for') 24 | parser.add_argument('--batchSize', type=int, default=100, help='input batch size') 25 | parser.add_argument('--hiddenSize', type=int, default=100, help='hidden state size') 26 | parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty') 27 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 28 | parser.add_argument('--step', type=int, default=1, help='gnn propogation steps') 29 | parser.add_argument('--nonhybrid', action='store_true', help='global preference') 30 | parser.add_argument('--lr_dc', type=float, default=0.1, help='learning rate decay rate') 31 | parser.add_argument('--lr_dc_step', type=int, default=3, help='the number of steps after which the learning rate decay') 32 | opt = parser.parse_args() 33 | 34 | # 加载训练集和测试集 35 | train_data = pickle.load(open('../datasets/' + opt.dataset + '/train.txt', 'rb')) 36 | test_data = pickle.load(open('../datasets/' + opt.dataset + '/test.txt', 'rb')) 37 | 38 | # 加载所有训练序列数据 39 | # all_train_seq = pickle.load(open('../datasets/' + opt.dataset + '/all_train_seq.txt', 'rb')) 40 | 41 | # 根据不同的数据集设定节点数量 42 | if opt.dataset == 'diginetica': 43 | n_node = 43098 44 | elif opt.dataset == 'yoochoose1_64' or opt.dataset == 'yoochoose1_4': 45 | n_node = 37484 46 | else: 47 | n_node = 310 48 | 49 | # 构建训练数据和测试数据的 Data 对象 50 | train_data = Data(train_data, sub_graph=True, method=opt.method, shuffle=True) 51 | test_data = Data(test_data, sub_graph=True, method=opt.method, shuffle=False) 52 | 53 | # 初始化模型对象 54 | model = GGNN(hidden_size=opt.hiddenSize, out_size=opt.hiddenSize, batch_size=opt.batchSize, n_node=n_node, 55 | lr=opt.lr, l2=opt.l2, step=opt.step, decay=opt.lr_dc_step * len(train_data.inputs) / opt.batchSize, lr_dc=opt.lr_dc, 56 | nonhybrid=opt.nonhybrid) 57 | 58 | # 打印参数 59 | print(opt) 60 | 61 | # 初始化最佳结果列表 62 | best_result = [0, 0] 63 | best_epoch = [0, 0] 64 | # 对模型进行多轮训练和测试 65 | for epoch in range(opt.epoch): 66 | 
print('epoch: ', epoch, '===========================================') 67 | # 生成训练数据的batch 68 | slices = train_data.generate_batch(model.batch_size) 69 | fetches = [model.opt, model.loss_train, model.global_step] 70 | print('start training: ', datetime.datetime.now()) 71 | loss_ = [] 72 | # 遍历所有训练数据batch进行训练 73 | for i, j in zip(slices, np.arange(len(slices))): 74 | # 获取当前训练数据batch中的数据 75 | adj_in, adj_out, alias, item, mask, targets = train_data.get_slice(i) 76 | # 进行一次训练 77 | _, loss, _ = model.run(fetches, targets, item, adj_in, adj_out, alias, mask) 78 | loss_.append(loss) 79 | # 计算当前epoch的平均训练loss 80 | loss = np.mean(loss_) 81 | 82 | # 生成测试数据的batch 83 | slices = test_data.generate_batch(model.batch_size) 84 | print('start predicting: ', datetime.datetime.now()) 85 | hit, mrr, test_loss_ = [], [],[] 86 | # 遍历所有测试数据batch进行预测 87 | for i, j in zip(slices, np.arange(len(slices))): 88 | # 获取当前测试数据batch中的数据 89 | adj_in, adj_out, alias, item, mask, targets = test_data.get_slice(i) 90 | # 进行一次预测 91 | scores, test_loss = model.run([model.score_test, model.loss_test], targets, item, adj_in, adj_out, alias, mask) 92 | # 保存当前测试batch的loss 93 | test_loss_.append(test_loss) 94 | # 对每个用户的推荐结果进行评估 95 | index = np.argsort(scores, 1)[:, -20:] 96 | for score, target in zip(index, targets): 97 | hit.append(np.isin(target - 1, score)) 98 | if len(np.where(score == target - 1)[0]) == 0: 99 | mrr.append(0) 100 | else: 101 | mrr.append(1 / (20-np.where(score == target - 1)[0][0])) 102 | # 计算当前epoch的平均测试loss、Recall@20和MMR@20 103 | hit = np.mean(hit)*100 104 | mrr = np.mean(mrr)*100 105 | test_loss = np.mean(test_loss_) 106 | # 更新最佳结果 107 | if hit >= best_result[0]: 108 | best_result[0] = hit 109 | best_epoch[0] = epoch 110 | if mrr >= best_result[1]: 111 | best_result[1] = mrr 112 | best_epoch[1]=epoch 113 | # 打印当前epoch的训练和测试结果 114 | print('train_loss:\t%.4f\ttest_loss:\t%4f\tRecall@20:\t%.4f\tMMR@20:\t%.4f\tEpoch:\t%d,\t%d'% 115 | (loss, test_loss, best_result[0], best_result[1], best_epoch[0], best_epoch[1])) 116 | 117 | -------------------------------------------------------------------------------- /tensorflow_code/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/10/16 4:36 4 | # @Author : {ZM7} 5 | # @File : model.py 6 | # @Software: PyCharm 7 | # 导入 TensorFlow 和 math 库 8 | import tensorflow as tf 9 | import math 10 | 11 | # 定义一个模型类 Model 12 | class Model(object): 13 | 14 | # 初始化模型的参数 15 | def __init__(self, hidden_size=100, out_size=100, batch_size=100, nonhybrid=True): 16 | 17 | # 隐藏层的大小、输出的大小、批次的大小 18 | self.hidden_size = hidden_size 19 | self.out_size = out_size 20 | self.batch_size = batch_size 21 | 22 | # 占位符 23 | self.mask = tf.placeholder(dtype=tf.float32) # 掩码矩阵 24 | self.alias = tf.placeholder(dtype=tf.int32) # 给每个输入重新编号的序列构成的矩阵 25 | self.item = tf.placeholder(dtype=tf.int32) # 序列构成的矩阵 26 | self.tar = tf.placeholder(dtype=tf.int32) # 目标序列 27 | 28 | # 是否使用非混合模式 29 | self.nonhybrid = nonhybrid 30 | 31 | # 标准差 32 | self.stdv = 1.0 / math.sqrt(self.hidden_size) 33 | 34 | # 定义 NASR 模型参数 35 | self.nasr_w1 = tf.get_variable('nasr_w1', [self.out_size, self.out_size], dtype=tf.float32, 36 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 37 | self.nasr_w2 = tf.get_variable('nasr_w2', [self.out_size, self.out_size], dtype=tf.float32, 38 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 39 | self.nasr_v = tf.get_variable('nasrv', [1, self.out_size], 
dtype=tf.float32, 40 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 41 | self.nasr_b = tf.get_variable('nasr_b', [self.out_size], dtype=tf.float32, initializer=tf.zeros_initializer()) 42 | 43 | # 前向传播过程 44 | def forward(self, re_embedding, train=True): 45 | # 计算每个序列的有效长度 46 | rm = tf.reduce_sum(self.mask, 1) 47 | # 获取每个序列的最后一个节点的id 48 | last_id = tf.gather_nd(self.alias, tf.stack([tf.range(self.batch_size), tf.to_int32(rm)-1], axis=1)) 49 | # 获取每个序列最后一个节点的嵌入表示 50 | last_h = tf.gather_nd(re_embedding, tf.stack([tf.range(self.batch_size), last_id], axis=1)) 51 | # 获取每个序列所有节点的嵌入表示 52 | seq_h = tf.stack([tf.nn.embedding_lookup(re_embedding[i], self.alias[i]) for i in range(self.batch_size)], 53 | axis=0) # batch_size*T*d 54 | # 计算注意力系数 55 | last = tf.matmul(last_h, self.nasr_w1) 56 | seq = tf.matmul(tf.reshape(seq_h, [-1, self.out_size]), self.nasr_w2) 57 | last = tf.reshape(last, [self.batch_size, 1, -1]) 58 | m = tf.nn.sigmoid(last + tf.reshape(seq, [self.batch_size, -1, self.out_size]) + self.nasr_b) 59 | coef = tf.matmul(tf.reshape(m, [-1, self.out_size]), self.nasr_v, transpose_b=True) * tf.reshape( 60 | self.mask, [-1, 1]) 61 | b = self.embedding[1:] 62 | if not self.nonhybrid: 63 | # 非纯注意力模型,将注意力系数和节点嵌入表示进行拼接 64 | ma = tf.concat([tf.reduce_sum(tf.reshape(coef, [self.batch_size, -1, 1]) * seq_h, 1), 65 | tf.reshape(last, [-1, self.out_size])], -1) 66 | # 将拼接后的结果通过全连接层转换 67 | self.B = tf.get_variable('B', [2 * self.out_size, self.out_size], 68 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 69 | y1 = tf.matmul(ma, self.B) 70 | # 计算预测结果 71 | logits = tf.matmul(y1, b, transpose_b=True) 72 | else: 73 | # 纯注意力模型,只使用注意力系数来计算预测结果 74 | ma = tf.reduce_sum(tf.reshape(coef, [self.batch_size, -1, 1]) * seq_h, 1) 75 | logits = tf.matmul(ma, b, transpose_b=True) 76 | # 计算损失 77 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.tar - 1, logits=logits)) 78 | # 获取所有可训练变量 79 | self.vars = tf.trainable_variables() 80 | if train: 81 | # 加入L2正则化 82 | lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in self.vars if v.name not 83 | in ['bias', 'gamma', 'b', 'g', 'beta']]) * self.L2 84 | loss = loss + lossL2 85 | return loss, logits 86 | 87 | def run(self, fetches, tar, item, adj_in, adj_out, alias, mask): 88 | # 运行模型 89 | return self.sess.run(fetches, feed_dict={self.tar: tar, self.item: item, self.adj 90 | 91 | 92 | 93 | class GGNN(Model): 94 | def __init__(self,hidden_size=100, out_size=100, batch_size=300, n_node=None, 95 | lr=None, l2=None, step=1, decay=None, lr_dc=0.1, nonhybrid=False): 96 | super(GGNN,self).__init__(hidden_size, out_size, batch_size, nonhybrid) 97 | self.embedding = tf.get_variable(shape=[n_node, hidden_size], name='embedding', dtype=tf.float32, 98 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 99 | self.adj_in = tf.placeholder(dtype=tf.float32, shape=[self.batch_size, None, None]) 100 | self.adj_out = tf.placeholder(dtype=tf.float32, shape=[self.batch_size, None, None]) 101 | self.n_node = n_node 102 | self.L2 = l2 103 | self.step = step 104 | self.nonhybrid = nonhybrid 105 | self.W_in = tf.get_variable('W_in', shape=[self.out_size, self.out_size], dtype=tf.float32, 106 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 107 | self.b_in = tf.get_variable('b_in', [self.out_size], dtype=tf.float32, 108 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 109 | self.W_out = tf.get_variable('W_out', [self.out_size, self.out_size], dtype=tf.float32, 110 | 
initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 111 | self.b_out = tf.get_variable('b_out', [self.out_size], dtype=tf.float32, 112 | initializer=tf.random_uniform_initializer(-self.stdv, self.stdv)) 113 | with tf.variable_scope('ggnn_model', reuse=None): 114 | self.loss_train, _ = self.forward(self.ggnn()) 115 | with tf.variable_scope('ggnn_model', reuse=True): 116 | self.loss_test, self.score_test = self.forward(self.ggnn(), train=False) 117 | self.global_step = tf.Variable(0) 118 | self.learning_rate = tf.train.exponential_decay(lr, global_step=self.global_step, decay_steps=decay, 119 | decay_rate=lr_dc, staircase=True) 120 | self.opt = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_train, global_step=self.global_step) 121 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) 122 | config = tf.ConfigProto(gpu_options=gpu_options) 123 | config.gpu_options.allow_growth = True 124 | self.sess = tf.Session(config=config) 125 | self.sess.run(tf.global_variables_initializer()) 126 | 127 | def ggnn(self): 128 | fin_state = tf.nn.embedding_lookup(self.embedding, self.item) 129 | cell = tf.nn.rnn_cell.GRUCell(self.out_size) 130 | with tf.variable_scope('gru'): 131 | for i in range(self.step): 132 | fin_state = tf.reshape(fin_state, [self.batch_size, -1, self.out_size]) 133 | fin_state_in = tf.reshape(tf.matmul(tf.reshape(fin_state, [-1, self.out_size]), 134 | self.W_in) + self.b_in, [self.batch_size, -1, self.out_size]) 135 | fin_state_out = tf.reshape(tf.matmul(tf.reshape(fin_state, [-1, self.out_size]), 136 | self.W_out) + self.b_out, [self.batch_size, -1, self.out_size]) 137 | av = tf.concat([tf.matmul(self.adj_in, fin_state_in), 138 | tf.matmul(self.adj_out, fin_state_out)], axis=-1) 139 | state_output, fin_state = \ 140 | tf.nn.dynamic_rnn(cell, tf.expand_dims(tf.reshape(av, [-1, 2*self.out_size]), axis=1), 141 | initial_state=tf.reshape(fin_state, [-1, self.out_size])) 142 | return tf.reshape(fin_state, [self.batch_size, -1, self.out_size]) 143 | 144 | 145 | -------------------------------------------------------------------------------- /tensorflow_code/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/9/23 2:52 4 | # @Author : {ZM7} 5 | # @File : utils.py 6 | # @Software: PyCharm 7 | 8 | import networkx as nx 9 | import numpy as np 10 | 11 | 12 | def build_graph(train_data): 13 | graph = nx.DiGraph() 14 | for seq in train_data: 15 | for i in range(len(seq) - 1): 16 | if graph.get_edge_data(seq[i], seq[i + 1]) is None: 17 | weight = 1 18 | else: 19 | weight = graph.get_edge_data(seq[i], seq[i + 1])['weight'] + 1 20 | graph.add_edge(seq[i], seq[i + 1], weight=weight) 21 | for node in graph.nodes: 22 | sum = 0 23 | for j, i in graph.in_edges(node): 24 | sum += graph.get_edge_data(j, i)['weight'] 25 | if sum != 0: 26 | for j, i in graph.in_edges(i): 27 | graph.add_edge(j, i, weight=graph.get_edge_data(j, i)['weight'] / sum) 28 | return graph 29 | 30 | 31 | def data_masks(all_usr_pois, item_tail): 32 | us_lens = [len(upois) for upois in all_usr_pois] 33 | len_max = max(us_lens) 34 | us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)] 35 | us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens] 36 | return us_pois, us_msks, len_max 37 | 38 | 39 | def split_validation(train_set, valid_portion): 40 | train_set_x, train_set_y = train_set 41 | n_samples = len(train_set_x) 42 | sidx = 
np.arange(n_samples, dtype='int32') 43 | np.random.shuffle(sidx) 44 | n_train = int(np.round(n_samples * (1. - valid_portion))) 45 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 46 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 47 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 48 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 49 | 50 | return (train_set_x, train_set_y), (valid_set_x, valid_set_y) 51 | 52 | 53 | # 定义一个Data类 54 | class Data(): 55 | 56 | # 初始化函数,传入数据、是否使用子图、使用的方法、是否使用稀疏矩阵、是否打乱数据 57 | def __init__(self, data, sub_graph=False, method='ggnn', sparse=False, shuffle=False): 58 | 59 | # 获取输入数据并进行mask处理 60 | inputs = data[0] 61 | inputs, mask, len_max = data_masks(inputs, [0]) 62 | self.inputs = np.asarray(inputs) # 转换为numpy数组 63 | self.mask = np.asarray(mask) # 转换为numpy数组 64 | self.len_max = len_max # 最大序列长度 65 | self.targets = np.asarray(data[1]) # 获取目标数据并转换为numpy数组 66 | self.length = len(inputs) # 获取数据长度 67 | self.shuffle = shuffle # 是否打乱数据 68 | self.sub_graph = sub_graph # 是否使用子图 69 | self.sparse = sparse # 是否使用稀疏矩阵 70 | self.method = method # 使用的方法 71 | 72 | # 生成batch数据的函数,传入batch_size参数 73 | def generate_batch(self, batch_size): 74 | 75 | # 如果打乱数据 76 | if self.shuffle: 77 | shuffled_arg = np.arange(self.length) 78 | np.random.shuffle(shuffled_arg) # 随机打乱索引 79 | self.inputs = self.inputs[shuffled_arg] # 根据打乱的索引更新输入数据 80 | self.mask = self.mask[shuffled_arg] # 根据打乱的索引更新mask数据 81 | self.targets = self.targets[shuffled_arg] # 根据打乱的索引更新目标数据 82 | 83 | # 计算batch数量 84 | n_batch = int(self.length / batch_size) 85 | if self.length % batch_size != 0: 86 | n_batch += 1 87 | 88 | # 将数据分成n_batch个batch,每个batch的数据量为batch_size 89 | slices = np.split(np.arange(n_batch * batch_size), n_batch) 90 | 91 | # 最后一个batch数据量不足batch_size,补齐 92 | slices[-1] = np.arange(self.length-batch_size, self.length) 93 | 94 | # 返回分好的batch数据 95 | return slices 96 | 97 | 98 | def get_slice(self, index): #这里将输入数据转换成为图的结构矩阵 99 | if 1: 100 | items, n_node, A_in, A_out, alias_inputs = [], [], [], [], [] 101 | for u_input in self.inputs[index]: 102 | n_node.append(len(np.unique(u_input))) 103 | max_n_node = np.max(n_node) 104 | if self.method == 'ggnn': 105 | for u_input in self.inputs[index]: 106 | node = np.unique(u_input) 107 | items.append(node.tolist() + (max_n_node - len(node)) * [0]) 108 | u_A = np.zeros((max_n_node, max_n_node)) 109 | for i in np.arange(len(u_input) - 1): 110 | if u_input[i + 1] == 0: 111 | break 112 | u = np.where(node == u_input[i])[0][0] 113 | v = np.where(node == u_input[i + 1])[0][0] 114 | u_A[u][v] = 1 115 | u_sum_in = np.sum(u_A, 0) 116 | u_sum_in[np.where(u_sum_in == 0)] = 1 117 | u_A_in = np.divide(u_A, u_sum_in) 118 | u_sum_out = np.sum(u_A, 1) 119 | u_sum_out[np.where(u_sum_out == 0)] = 1 120 | u_A_out = np.divide(u_A.transpose(), u_sum_out) 121 | 122 | A_in.append(u_A_in) 123 | A_out.append(u_A_out) 124 | alias_inputs.append([np.where(node == i)[0][0] for i in u_input]) 125 | return A_in, A_out, alias_inputs, items, self.mask[index], self.targets[index] 126 | elif self.method == 'gat': 127 | A_in = [] 128 | A_out = [] 129 | for u_input in self.inputs[index]: 130 | node = np.unique(u_input) 131 | items.append(node.tolist() + (max_n_node - len(node)) * [0]) 132 | u_A = np.eye(max_n_node) 133 | for i in np.arange(len(u_input) - 1): 134 | if u_input[i + 1] == 0: 135 | break 136 | u = np.where(node == u_input[i])[0][0] 137 | v = np.where(node == u_input[i + 1])[0][0] 138 | u_A[u][v] = 1 139 | A_in.append(-1e9 * (1 - u_A)) 140 | A_out.append(-1e9 * (1 - 
u_A.transpose())) 141 | alias_inputs.append([np.where(node == i)[0][0] for i in u_input]) 142 | return A_in, A_out, alias_inputs, items, self.mask[index], self.targets[index] 143 | 144 | else: 145 | return self.inputs[index], self.mask[index], self.targets[index] 146 | -------------------------------------------------------------------------------- /testData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 13 22:18:19 2019 4 | 5 | @author: dell 6 | """ 7 | 8 | import torch 9 | from torch_geometric.data import Data 10 | 11 | edge_index = torch.tensor([[0, 1, 1, 2], 12 | [1, 0, 2, 1]], dtype=torch.long) 13 | x = torch.tensor([[-1], [0], [1]], dtype=torch.float) 14 | 15 | data = Data(x=x, edge_index=edge_index) 16 | 17 | 18 | #TEST_X = [[3,7,7,3,6],[1,2,3,2,4] ] 19 | #TEST_Y = [4,5] 20 | # 21 | #for sequences, y in zip(TEST_X, TEST_Y): 22 | # i = 0 23 | # nodes = {} # dict{15: 0, 16: 1, 18: 2, ...} 24 | # senders = [] 25 | # x = [] 26 | # for node in sequences: 27 | # if node not in nodes: 28 | # nodes[node] = i 29 | # x.append([node]) 30 | # i += 1 31 | # senders.append(nodes[node]) 32 | # receivers = senders[:] 33 | # del senders[-1] # the last item is a receiver 34 | # del receivers[0] # the first item is a sender 35 | 36 | --------------------------------------------------------------------------------
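# A possible continuation (an assumption, not part of the original repository) of the
# commented-out sketch at the end of testData.py above: once `senders`/`receivers`
# hold the node indices of consecutive clicks, they can be stacked into `edge_index`
# and wrapped into a torch_geometric Data object, one graph per session.
import torch
from torch_geometric.data import Data

def session_to_graph(sequence):
    nodes, x, senders = {}, [], []
    for item in sequence:                       # renumber items to 0..n_nodes-1 in first-seen order
        if item not in nodes:
            nodes[item] = len(nodes)
            x.append([item])
        senders.append(nodes[item])
    receivers = senders[1:]                     # each click points to the next one
    senders = senders[:-1]
    edge_index = torch.tensor([senders, receivers], dtype=torch.long)
    return Data(x=torch.tensor(x, dtype=torch.float), edge_index=edge_index)

# session_to_graph([3, 7, 7, 3, 6]).edge_index -> tensor([[0, 1, 1, 0], [1, 1, 0, 2]])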