├── README.md └── train ├── data_process.py ├── foursquare_cut_one_day.pkl └── train.py /README.md: -------------------------------------------------------------------------------- 1 | # LSTPM 2 | Implementation of paper "Ke Sun, Tieyun Qian, Tong Chen, Yile Liang, Quoc Viet Hung Nguyen, Hongzhi Yin. Where to Go Next: Modeling Long- and Short-Term User Preferences for 3 | Point-of-Interest Recommendation." accepted by AAAI 2020. 4 | 5 | # Update 6 | * The "distance.pkl" is generated by the "caculate_poi_distance" function in the "train.py". 7 | * We have uploaded the data processing script "data_process.py" for generating "foursquare_cut_one_day.pkl". 8 | 9 | # Requirements 10 | * python 3.6 11 | * pytorch 1.0.1 12 | -------------------------------------------------------------------------------- /train/data_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import time 5 | import argparse 6 | import numpy as np 7 | import pickle 8 | # import cPickle as pickle 9 | from collections import Counter 10 | 11 | 12 | def entropy_spatial(sessions): 13 | locations = {} 14 | days = sorted(sessions.keys()) 15 | for d in days: 16 | session = sessions[d] 17 | for s in session: 18 | if s[0] not in locations: 19 | locations[s[0]] = 1 20 | else: 21 | locations[s[0]] += 1 22 | frequency = np.array([locations[loc] for loc in locations]) 23 | frequency = frequency / np.sum(frequency) 24 | entropy = - np.sum(frequency * np.log(frequency)) 25 | return entropy 26 | 27 | 28 | class DataFoursquare(object): 29 | def __init__(self, trace_min=10, global_visit=10, hour_gap=72, min_gap=10, session_min=2, session_max=10, 30 | sessions_min=2, train_split=0.8, embedding_len=50): 31 | tmp_path = "data/" 32 | self.TWITTER_PATH = tmp_path + 'foursquare/tweet_clean_all.txt' 33 | self.VENUES_PATH = tmp_path + 'foursquare/venues_all.txt' 34 | self.SAVE_PATH = tmp_path 35 | self.save_name = 'foursquare_cut_one_day' 36 | 37 | self.trace_len_min = trace_min 38 | self.location_global_visit_min = global_visit 39 | self.hour_gap = hour_gap 40 | self.min_gap = min_gap 41 | self.session_max = session_max 42 | self.filter_short_session = session_min 43 | self.sessions_count_min = sessions_min 44 | self.words_embeddings_len = embedding_len 45 | 46 | self.train_split = train_split 47 | 48 | self.data = {} 49 | self.venues = {} 50 | self.venues_cat = {} 51 | self.words_original = [] 52 | self.words_lens = [] 53 | self.dictionary = dict() 54 | self.words_dict = None 55 | self.data_filter = {} 56 | self.user_filter3 = None 57 | self.uid_list = {} 58 | self.vid_list = {'unk': [0, -1]} 59 | self.vid_list_lookup = {} 60 | self.vid_lookup = {} 61 | self.pid_loc_lat = {} 62 | self.data_neural = {} 63 | 64 | 65 | 66 | 67 | # ############# 1. read trajectory data from twitters 68 | def load_trajectory_from_tweets(self): 69 | with open(self.TWITTER_PATH,encoding='UTF-8') as fid: 70 | for i, line in enumerate(fid): 71 | _, uid, _, _, tim, _, _, tweet, pid, cat_id = line.strip('\r\n').split('') 72 | if uid not in self.data: 73 | self.data[uid] = [[pid, tim, cat_id]] 74 | else: 75 | self.data[uid].append([pid, tim, cat_id]) 76 | if pid not in self.venues: 77 | self.venues[pid] = 1 78 | else: 79 | self.venues[pid] += 1 80 | if cat_id not in self.venues_cat: 81 | self.venues_cat[cat_id] = 1 82 | else: 83 | self.venues_cat[cat_id] += 1 84 | 85 | # ########### 3.0 basically filter users based on visit length and other statistics 86 | def filter_users_by_length(self): 87 | uid_3 = [x for x in self.data if len(self.data[x]) > self.trace_len_min] 88 | xixi = [(x, len(self.data[x])) for x in uid_3] 89 | pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True) 90 | pid_3 = [x for x in self.venues if self.venues[x] > self.location_global_visit_min] 91 | pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True) 92 | pid_3 = dict(pid_pic3) 93 | 94 | session_len_list = [] 95 | for u in pick3: 96 | uid = u[0] 97 | info = self.data[uid] 98 | xixi = Counter([x[0] for x in info]) 99 | topk = Counter([x[0] for x in info]).most_common() 100 | topk1 = [x[0] for x in topk if x[1] > 1] 101 | sessions = {} 102 | for i, record in enumerate(info): 103 | poi, tmd, cat_id = record 104 | try: 105 | current_date = tmd.split(' ')[0] 106 | 107 | tid = int(time.mktime(time.strptime(tmd, "%Y-%m-%d %H:%M:%S"))) 108 | except Exception as e: 109 | print('error:{}'.format(e)) 110 | continue 111 | sid = len(sessions) 112 | if poi not in pid_3 and poi not in topk1: 113 | # if poi not in topk1: 114 | continue 115 | if i == 0 or len(sessions) == 0: 116 | sessions[sid] = [record] 117 | else: 118 | # if (tid - last_tid) / 3600 > self.hour_gap or len(sessions[sid - 1]) > self.session_max: 119 | if last_date != current_date: 120 | sessions[sid] = [record] 121 | elif (tid - last_tid) / 60 > self.min_gap: 122 | sessions[sid - 1].append(record) 123 | else: 124 | pass 125 | last_tid = tid 126 | last_date = current_date 127 | sessions_filter = {} 128 | for s in sessions: 129 | if len(sessions[s]) >= self.filter_short_session: 130 | sessions_filter[len(sessions_filter)] = sessions[s] 131 | session_len_list.append(len(sessions[s])) 132 | if len(sessions_filter) >= self.sessions_count_min: 133 | self.data_filter[uid] = {'sessions_count': len(sessions_filter), 'topk_count': len(topk), 'topk': topk, 134 | 'sessions': sessions_filter, 'raw_sessions': sessions} 135 | 136 | self.user_filter3 = [x for x in self.data_filter if 137 | self.data_filter[x]['sessions_count'] >= self.sessions_count_min] 138 | 139 | def filter_users_by_length_source(self): 140 | uid_3 = [x for x in self.data if len(self.data[x]) > self.trace_len_min] 141 | xixi = [(x, len(self.data[x])) for x in uid_3] 142 | pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True) 143 | pid_3 = [x for x in self.venues if self.venues[x] > self.location_global_visit_min] 144 | pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True) 145 | pid_3 = dict(pid_pic3) 146 | 147 | session_len_list = [] 148 | for u in pick3: 149 | uid = u[0] 150 | info = self.data[uid] 151 | xixi = Counter([x[0] for x in info]) 152 | topk = Counter([x[0] for x in info]).most_common() 153 | topk1 = [x[0] for x in topk if x[1] > 1] 154 | sessions = {} 155 | for i, record in enumerate(info): 156 | poi, tmd = record 157 | try: 158 | tid = int(time.mktime(time.strptime(tmd, "%Y-%m-%d %H:%M:%S"))) 159 | except Exception as e: 160 | print('error:{}'.format(e)) 161 | continue 162 | sid = len(sessions) 163 | if poi not in pid_3 and poi not in topk1: 164 | # if poi not in topk1: 165 | continue 166 | if i == 0 or len(sessions) == 0: 167 | sessions[sid] = [record] 168 | else: 169 | if (tid - last_tid) / 3600 > self.hour_gap or len(sessions[sid - 1]) > self.session_max: 170 | sessions[sid] = [record] 171 | elif (tid - last_tid) / 60 > self.min_gap: 172 | sessions[sid - 1].append(record) 173 | else: 174 | pass 175 | last_tid = tid 176 | sessions_filter = {} 177 | for s in sessions: 178 | if len(sessions[s]) >= self.filter_short_session: 179 | sessions_filter[len(sessions_filter)] = sessions[s] 180 | session_len_list.append(len(sessions[s])) 181 | if len(sessions_filter) >= self.sessions_count_min: 182 | self.data_filter[uid] = {'sessions_count': len(sessions_filter), 'topk_count': len(topk), 'topk': topk, 183 | 'sessions': sessions_filter, 'raw_sessions': sessions} 184 | 185 | self.user_filter3 = [x for x in self.data_filter if 186 | self.data_filter[x]['sessions_count'] >= self.sessions_count_min] 187 | 188 | # ########### 4. build dictionary for users and location 189 | def build_users_locations_dict(self): 190 | for u in self.user_filter3: 191 | sessions = self.data_filter[u]['sessions'] 192 | if u not in self.uid_list: 193 | self.uid_list[u] = [len(self.uid_list), len(sessions)] 194 | for sid in sessions: 195 | poi = [p[0] for p in sessions[sid]] 196 | for p in poi: 197 | if p not in self.vid_list: 198 | self.vid_list_lookup[len(self.vid_list)] = p 199 | self.vid_list[p] = [len(self.vid_list), 1] 200 | else: 201 | self.vid_list[p][1] += 1 202 | 203 | # support for radius of gyration 204 | def load_venues(self): 205 | with open(self.TWITTER_PATH, 'r',encoding='UTF-8') as fid: 206 | for line in fid: 207 | _, uid, lon, lat, tim, _, _, tweet, pid, cid = line.strip('\r\n').split('') 208 | self.pid_loc_lat[pid] = [float(lon), float(lat)] 209 | 210 | def venues_lookup(self): 211 | for vid in self.vid_list_lookup: 212 | pid = self.vid_list_lookup[vid] 213 | lon_lat = self.pid_loc_lat[pid] 214 | self.vid_lookup[vid] = lon_lat 215 | 216 | # ########## 5.0 prepare training data for neural network 217 | @staticmethod 218 | def tid_list(tmd): 219 | tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S") 220 | tid = tm.tm_wday * 24 + tm.tm_hour 221 | return tid 222 | 223 | @staticmethod 224 | def tid_list_48(tmd): 225 | tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S") 226 | timeStamp = (time.mktime(tm)) 227 | # tid = tm.tm_hour 228 | if tm.tm_wday in [0, 1, 2, 3, 4]: 229 | tid = tm.tm_hour 230 | else: 231 | tid = tm.tm_hour + 24 232 | return [timeStamp, tid] 233 | 234 | def prepare_neural_data(self): 235 | for u in self.uid_list: 236 | sessions = self.data_filter[u]['sessions'] 237 | sessions_tran = {} 238 | sessions_id = [] 239 | for sid in sessions: 240 | sessions_tran[sid] = [[self.vid_list[p[0]][0], p[1], p[-1]] for p in 241 | sessions[sid]] 242 | sessions_id.append(sid) 243 | split_id = int(np.floor(self.train_split * len(sessions_id))) 244 | train_id = sessions_id[:split_id] 245 | test_id = sessions_id[split_id:] 246 | pred_len = sum([len(sessions_tran[i]) - 1 for i in train_id]) 247 | valid_len = sum([len(sessions_tran[i]) - 1 for i in test_id]) 248 | train_loc = {} 249 | for i in train_id: 250 | for sess in sessions_tran[i]: 251 | if sess[0] in train_loc: 252 | train_loc[sess[0]] += 1 253 | else: 254 | train_loc[sess[0]] = 1 255 | # calculate entropy 256 | entropy = entropy_spatial(sessions) 257 | 258 | # calculate location ratio 259 | train_location = [] 260 | for i in train_id: 261 | train_location.extend([s[0] for s in sessions[i]]) 262 | train_location_set = set(train_location) 263 | test_location = [] 264 | for i in test_id: 265 | test_location.extend([s[0] for s in sessions[i]]) 266 | test_location_set = set(test_location) 267 | whole_location = train_location_set | test_location_set 268 | test_unique = whole_location - train_location_set 269 | location_ratio = len(test_unique) / len(whole_location) 270 | 271 | # calculate radius of gyration 272 | lon_lat = [] 273 | for pid in train_location: 274 | try: 275 | lon_lat.append(self.pid_loc_lat[pid]) 276 | except: 277 | print(pid) 278 | print('error') 279 | lon_lat = np.array(lon_lat) 280 | center = np.mean(lon_lat, axis=0, keepdims=True) 281 | center = np.repeat(center, axis=0, repeats=len(lon_lat)) 282 | rg = np.sqrt(np.mean(np.sum((lon_lat - center) ** 2, axis=1, keepdims=True), axis=0))[0] 283 | 284 | self.data_neural[self.uid_list[u][0]] = {'sessions': sessions_tran, 'train': train_id, 'test': test_id, 285 | 'pred_len': pred_len, 'valid_len': valid_len, 286 | 'train_loc': train_loc, 'explore': location_ratio, 287 | 'entropy': entropy, 'rg': rg} 288 | 289 | # ############# 6. save variables 290 | def get_parameters(self): 291 | parameters = {} 292 | parameters['TWITTER_PATH'] = self.TWITTER_PATH 293 | parameters['SAVE_PATH'] = self.SAVE_PATH 294 | 295 | parameters['trace_len_min'] = self.trace_len_min 296 | parameters['location_global_visit_min'] = self.location_global_visit_min 297 | parameters['hour_gap'] = self.hour_gap 298 | parameters['min_gap'] = self.min_gap 299 | parameters['session_max'] = self.session_max 300 | parameters['filter_short_session'] = self.filter_short_session 301 | parameters['sessions_min'] = self.sessions_count_min 302 | parameters['train_split'] = self.train_split 303 | 304 | return parameters 305 | 306 | def save_variables(self): 307 | foursquare_dataset = {'data_neural': self.data_neural, 'vid_list': self.vid_list, 'uid_list': self.uid_list, 308 | 'parameters': self.get_parameters(), 'data_filter': self.data_filter, 309 | 'vid_lookup': self.vid_lookup} 310 | pickle.dump(foursquare_dataset, open(self.SAVE_PATH + self.save_name + '.pk', 'wb')) 311 | 312 | 313 | def parse_args(): 314 | parser = argparse.ArgumentParser() 315 | parser.add_argument('--trace_min', type=int, default=10, help="raw trace length filter threshold") 316 | parser.add_argument('--global_visit', type=int, default=10, help="location global visit threshold") 317 | parser.add_argument('--hour_gap', type=int, default=72, help="maximum interval of two trajectory points") 318 | parser.add_argument('--min_gap', type=int, default=0, help="minimum interval of two trajectory points") 319 | parser.add_argument('--session_max', type=int, default=10, help="control the length of session not too long") 320 | parser.add_argument('--session_min', type=int, default=3, help="control the length of session not too short") 321 | parser.add_argument('--sessions_min', type=int, default=5, help="the minimum amount of the good user's sessions") 322 | parser.add_argument('--train_split', type=float, default=0.8, help="train/test ratio") 323 | return parser.parse_args() 324 | 325 | 326 | if __name__ == '__main__': 327 | args = parse_args() 328 | data_generator = DataFoursquare(trace_min=args.trace_min, global_visit=args.global_visit, 329 | hour_gap=args.hour_gap, min_gap=args.min_gap, 330 | session_min=args.session_min, session_max=args.session_max, 331 | sessions_min=args.sessions_min, train_split=args.train_split) 332 | parameters = data_generator.get_parameters() 333 | print('############PARAMETER SETTINGS:\n' + '\n'.join([p + ':' + str(parameters[p]) for p in parameters])) 334 | print('############START PROCESSING:') 335 | print('load trajectory from {}'.format(data_generator.TWITTER_PATH)) 336 | data_generator.load_trajectory_from_tweets() 337 | print('filter users') 338 | data_generator.filter_users_by_length() 339 | print('build users/locations dictionary') 340 | data_generator.build_users_locations_dict() 341 | data_generator.load_venues() 342 | data_generator.venues_lookup() 343 | print('prepare data for neural network') 344 | data_generator.prepare_neural_data() 345 | print('save prepared data') 346 | data_generator.save_variables() 347 | print('raw users:{} raw locations:{}'.format( 348 | len(data_generator.data), len(data_generator.venues))) 349 | print('final users:{} final locations:{}'.format( 350 | len(data_generator.data_neural), len(data_generator.vid_list))) 351 | -------------------------------------------------------------------------------- /train/foursquare_cut_one_day.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLPWM-WHU/LSTPM/bce2384f33ec3efe08c106033632987d14738abe/train/foursquare_cut_one_day.pkl -------------------------------------------------------------------------------- /train/train.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torch.autograd import Variable 4 | import pickle 5 | import numpy as np 6 | import torch 7 | from collections import defaultdict 8 | import gc 9 | import os 10 | from math import radians, cos, sin, asin, sqrt 11 | from collections import deque,Counter 12 | def shuffle(*arrays, **kwargs): 13 | require_indices = kwargs.get('indices', False) 14 | if len(set(len(x) for x in arrays)) != 1: 15 | raise ValueError('All inputs to shuffle must have ' 16 | 'the same length.') 17 | shuffle_indices = np.arange(len(arrays[0])) 18 | np.random.shuffle(shuffle_indices) 19 | if len(arrays) == 1: 20 | result = arrays[0][shuffle_indices] 21 | else: 22 | result = tuple(x[shuffle_indices] for x in arrays) 23 | if require_indices: 24 | return result, shuffle_indices 25 | else: 26 | return result 27 | 28 | def minibatch(*tensors, **kwargs): 29 | batch_size = kwargs.get('batch_size', 128) 30 | if len(tensors) == 1: 31 | tensor = tensors[0] 32 | for i in range(0, len(tensor), batch_size): 33 | yield tensor[i:i + batch_size] 34 | else: 35 | for i in range(0, len(tensors[0]), batch_size): 36 | yield tuple(x[i:i + batch_size] for x in tensors) 37 | 38 | def pad_batch_of_lists_masks(batch_of_lists, max_len): 39 | padded = [l + [0] * (max_len - len(l)) for l in batch_of_lists] 40 | padded_mask = [[1.0]*(len(l) - 1) + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists] 41 | padde_mask_non_local = [[1.0] * (len(l)) + [0.0] * (max_len - len(l)) for l in batch_of_lists] 42 | return padded, padded_mask, padde_mask_non_local 43 | 44 | def pad_batch_of_lists_masks_test(batch_of_lists, max_len): 45 | padded = [l + [0] * (max_len - len(l)) for l in batch_of_lists] 46 | padded2 = [l[:-1] + [0] * (max_len - len(l) + 1) for l in batch_of_lists] 47 | padded_mask = [[0.0]*(len(l) - 2) + [1.0] + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists] 48 | padde_mask_non_local = [[1.0] * (len(l) - 1) + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists] 49 | return padded, padded2, padded_mask, padde_mask_non_local 50 | 51 | class Model(nn.Module): 52 | def __init__(self, n_users, n_items, emb_size=500, hidden_units=500, dropout=0.8, user_dropout=0.5, data_neural = None, tim_sim_matrix = None): 53 | super(self.__class__, self).__init__() 54 | self.n_users = n_users 55 | self.n_items = n_items 56 | self.hidden_units = hidden_units 57 | if emb_size == None: 58 | emb_size = hidden_units 59 | self.emb_size = emb_size 60 | ## todo why embeding? 61 | self.item_emb = nn.Embedding(n_items, emb_size) 62 | self.emb_tim = nn.Embedding(48, 10) 63 | self.lstmcell = nn.LSTM(input_size=emb_size, hidden_size=hidden_units) 64 | self.lstmcell_history = nn.LSTM(input_size=emb_size, hidden_size=hidden_units) 65 | self.linear = nn.Linear(hidden_units*2 , n_items) 66 | self.dropout = nn.Dropout(0.0) 67 | self.user_dropout = nn.Dropout(user_dropout) 68 | self.data_neural = data_neural 69 | self.tim_sim_matrix = tim_sim_matrix 70 | self.dilated_rnn = nn.LSTMCell(input_size=emb_size, hidden_size=hidden_units)# could be the same as self.lstmcell 71 | self.linear1 = nn.Linear(hidden_units, hidden_units) 72 | self.init_weights() 73 | 74 | def init_weights(self): 75 | ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name) 76 | hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name) 77 | b = (param.data for name, param in self.named_parameters() if 'bias' in name) 78 | for t in ih: 79 | nn.init.xavier_uniform(t) 80 | for t in hh: 81 | nn.init.orthogonal(t) 82 | for t in b: 83 | nn.init.constant(t, 0) 84 | 85 | def forward(self, user_vectors, item_vectors, mask_batch_ix_non_local, session_id_batch, sequence_tim_batch, is_train, poi_distance_matrix, sequence_dilated_rnn_index_batch): 86 | batch_size = item_vectors.size()[0] 87 | sequence_size = item_vectors.size()[1] 88 | items = self.item_emb(item_vectors) 89 | item_vectors = item_vectors.cpu() 90 | x = items 91 | x = x.transpose(0, 1) 92 | h1 = Variable(torch.zeros(1, batch_size, self.hidden_units)).cuda() 93 | c1 = Variable(torch.zeros(1, batch_size, self.hidden_units)).cuda() 94 | out, (h1, c1) = self.lstmcell(x, (h1, c1)) 95 | out = out.transpose(0, 1)#batch_size * sequence_length * embedding_dim 96 | x1 = items 97 | # ########################################################### 98 | user_batch = np.array(user_vectors.cpu()) 99 | y_list = [] 100 | out_hie = [] 101 | for ii in range(batch_size): 102 | ########################################## 103 | current_session_input_dilated_rnn_index = sequence_dilated_rnn_index_batch[ii] 104 | hiddens_current = x1[ii] 105 | dilated_lstm_outs_h = [] 106 | dilated_lstm_outs_c = [] 107 | for index_dilated in range(len(current_session_input_dilated_rnn_index)): 108 | index_dilated_explicit = current_session_input_dilated_rnn_index[index_dilated] 109 | hidden_current = hiddens_current[index_dilated].unsqueeze(0) 110 | if index_dilated == 0: 111 | h = Variable(torch.zeros(1, self.hidden_units)).cuda() 112 | c = Variable(torch.zeros(1, self.hidden_units)).cuda() 113 | (h, c) = self.dilated_rnn(hidden_current, (h, c)) 114 | dilated_lstm_outs_h.append(h) 115 | dilated_lstm_outs_c.append(c) 116 | else: 117 | (h, c) = self.dilated_rnn(hidden_current, (dilated_lstm_outs_h[index_dilated_explicit], dilated_lstm_outs_c[index_dilated_explicit])) 118 | dilated_lstm_outs_h.append(h) 119 | dilated_lstm_outs_c.append(c) 120 | dilated_lstm_outs_h.append(hiddens_current[len(current_session_input_dilated_rnn_index):]) 121 | dilated_out = torch.cat(dilated_lstm_outs_h, dim = 0).unsqueeze(0) 122 | out_hie.append(dilated_out) 123 | user_id_current = user_batch[ii] 124 | current_session_timid = sequence_tim_batch[ii][:-1] 125 | current_session_poiid = item_vectors[ii][:len(current_session_timid)] 126 | session_id_current = session_id_batch[ii] 127 | current_session_embed = out[ii] 128 | current_session_mask = mask_batch_ix_non_local[ii].unsqueeze(1) 129 | sequence_length = int(sum(np.array(current_session_mask.cpu()))[0]) 130 | current_session_represent_list = [] 131 | if is_train: 132 | for iii in range(sequence_length-1): 133 | current_session_represent = torch.sum(current_session_embed * current_session_mask, dim=0).unsqueeze(0)/sum(current_session_mask) 134 | current_session_represent_list.append(current_session_represent) 135 | else: 136 | for iii in range(sequence_length-1): 137 | current_session_represent_rep_item = current_session_embed[0:iii+1] 138 | current_session_represent_rep_item = torch.sum(current_session_represent_rep_item, dim = 0).unsqueeze(0)/(iii + 1) 139 | current_session_represent_list.append(current_session_represent_rep_item) 140 | 141 | current_session_represent = torch.cat(current_session_represent_list, dim = 0) 142 | list_for_sessions = [] 143 | list_for_avg_distance = [] 144 | h2 = Variable(torch.zeros(1, 1, self.hidden_units)).cuda()###whole sequence 145 | c2 = Variable(torch.zeros(1, 1, self.hidden_units)).cuda() 146 | for jj in range(session_id_current): 147 | sequence = [s[0] for s in self.data_neural[user_id_current]['sessions'][jj]] 148 | sequence = Variable(torch.LongTensor(np.array(sequence))).cuda() 149 | sequence_emb = self.item_emb(sequence).unsqueeze(1) 150 | sequence = sequence.cpu() 151 | sequence_emb, (h2, c2) = self.lstmcell_history(sequence_emb, (h2, c2)) 152 | sequence_tim_id = [s[1] for s in self.data_neural[user_id_current]['sessions'][jj]] 153 | jaccard_sim_row = Variable(torch.FloatTensor(self.tim_sim_matrix[current_session_timid]),requires_grad=False).cuda() 154 | jaccard_sim_expicit = jaccard_sim_row[:,sequence_tim_id] 155 | distance_row = poi_distance_matrix[current_session_poiid] 156 | distance_row_expicit = Variable(torch.FloatTensor(distance_row[:,sequence]),requires_grad=False).cuda() 157 | distance_row_expicit_avg = torch.mean(distance_row_expicit, dim = 1) 158 | jaccard_sim_expicit_last = F.softmax(jaccard_sim_expicit) 159 | hidden_sequence_for_current1 = torch.mm(jaccard_sim_expicit_last, sequence_emb.squeeze(1)) 160 | hidden_sequence_for_current = hidden_sequence_for_current1 161 | list_for_sessions.append(hidden_sequence_for_current.unsqueeze(0)) 162 | list_for_avg_distance.append(distance_row_expicit_avg.unsqueeze(0)) 163 | avg_distance = torch.cat(list_for_avg_distance, dim = 0).transpose(0,1) 164 | sessions_represent = torch.cat(list_for_sessions, dim=0).transpose(0,1) ##current_items * history_session_length * embedding_size 165 | current_session_represent = current_session_represent.unsqueeze(2) ### current_items * embedding_size * 1 166 | sims = F.softmax(sessions_represent.bmm(current_session_represent).squeeze(2), dim = 1).unsqueeze(1) ##==> current_items * 1 * history_session_length 167 | #out_y_current = sims.bmm(sessions_represent).squeeze(1) 168 | out_y_current =torch.selu(self.linear1(sims.bmm(sessions_represent).squeeze(1))) 169 | ##############layer_2 170 | #layer_2_current = (lambda*out_y_current + (1-lambda)*current_session_embed[:sequence_length-1]).unsqueeze(2) #lambda from [0.1-0.9] better performance 171 | # layer_2_current = (out_y_current + current_session_embed[:sequence_length-1]).unsqueeze(2)##==>current_items * embedding_size * 1 172 | layer_2_current = (0.5 *out_y_current + 0.5 * current_session_embed[:sequence_length - 1]).unsqueeze(2) 173 | layer_2_sims = F.softmax(sessions_represent.bmm(layer_2_current).squeeze(2) * 1.0/avg_distance, dim = 1).unsqueeze(1)##==>>current_items * 1 * history_session_length 174 | out_layer_2 = layer_2_sims.bmm(sessions_represent).squeeze(1) 175 | out_y_current_padd = Variable(torch.FloatTensor(sequence_size - sequence_length + 1, self.emb_size).zero_(),requires_grad=False).cuda() 176 | out_layer_2_list = [] 177 | out_layer_2_list.append(out_layer_2) 178 | out_layer_2_list.append(out_y_current_padd) 179 | out_layer_2 = torch.cat(out_layer_2_list,dim = 0).unsqueeze(0) 180 | y_list.append(out_layer_2) 181 | y = torch.selu(torch.cat(y_list,dim=0)) 182 | out_hie = F.selu(torch.cat(out_hie, dim = 0)) 183 | out = F.selu(out) 184 | out = (out + out_hie) * 0.5 185 | out_put_emb_v1 = torch.cat([y, out], dim=2) 186 | output_ln = self.linear(out_put_emb_v1) 187 | output = F.log_softmax(output_ln, dim=-1) 188 | return output 189 | 190 | 191 | 192 | 193 | def caculate_time_sim(data_neural): 194 | time_checkin_set = defaultdict(set) 195 | for uid in data_neural: 196 | uid_sessions = data_neural[uid] 197 | for sid in uid_sessions['sessions']: 198 | session_current = uid_sessions['sessions'][sid] 199 | for checkin in session_current: 200 | timid = checkin[1] 201 | locid = checkin[0] 202 | if timid not in time_checkin_set: 203 | time_checkin_set[timid] = set() 204 | time_checkin_set[timid].add(locid) 205 | sim_matrix = np.zeros((48,48)) 206 | for i in range(48): 207 | for j in range(48): 208 | set_i = time_checkin_set[i] 209 | set_j = time_checkin_set[j] 210 | jaccard_ij = len(set_i & set_j)/len(set_i | set_j) 211 | sim_matrix[i][j] = jaccard_ij 212 | return sim_matrix 213 | 214 | def caculate_poi_distance(poi_coors): 215 | print("distance matrix") 216 | sim_matrix = np.zeros((len(poi_coors) + 1, len(poi_coors) + 1)) 217 | for i in range(len(poi_coors)): 218 | for j in range(i , len(poi_coors)): 219 | poi_current = i + 1 220 | poi_target = j + 1 221 | poi_current_coor = poi_coors[poi_current] 222 | poi_target_coor = poi_coors[poi_target] 223 | distance_between = geodistance(poi_current_coor[1], poi_current_coor[0], poi_target_coor[1], poi_target_coor[0]) 224 | if distance_between<1: 225 | distance_between = 1 226 | sim_matrix[poi_current][poi_target] = distance_between 227 | sim_matrix[poi_target][poi_current] = distance_between 228 | pickle.dump(sim_matrix, open('distance.pkl', 'wb')) 229 | return sim_matrix 230 | 231 | def generate_input_history(data_neural, mode, candidate=None): 232 | data_train = {} 233 | train_idx = {} 234 | if candidate is None: 235 | candidate = data_neural.keys() 236 | for u in candidate: 237 | sessions = data_neural[u]['sessions'] 238 | train_id = data_neural[u][mode] 239 | data_train[u] = {} 240 | for c, i in enumerate(train_id): 241 | if mode == 'train' and c == 0: 242 | continue 243 | session = sessions[i] 244 | trace = {} 245 | loc_np = np.reshape(np.array([s[0] for s in session[:-1]]), (len(session[:-1]), 1)) 246 | tim_np = np.reshape(np.array([s[1] for s in session[:-1]]), (len(session[:-1]), 1)) 247 | target = np.array([s[0] for s in session[1:]]) 248 | trace['loc'] = Variable(torch.LongTensor(loc_np)) 249 | trace['target'] = Variable(torch.LongTensor(target)) 250 | trace['tim'] = Variable(torch.LongTensor(tim_np)) 251 | history = [] 252 | if mode == 'test': 253 | test_id = data_neural[u]['train'] 254 | for tt in test_id: 255 | history.extend([(s[0], s[1]) for s in sessions[tt]]) 256 | for j in range(c): 257 | history.extend([(s[0], s[1]) for s in sessions[train_id[j]]]) 258 | history = sorted(history, key=lambda x: x[1], reverse=False) 259 | history_loc = np.reshape(np.array([s[0] for s in history]), (len(history), 1)) 260 | history_tim = np.reshape(np.array([s[1] for s in history]), (len(history), 1)) 261 | trace['history_loc'] = Variable(torch.LongTensor(history_loc)) 262 | trace['history_tim'] = Variable(torch.LongTensor(history_tim)) 263 | data_train[u][i] = trace 264 | train_idx[u] = train_id 265 | return data_train, train_idx 266 | 267 | def generate_input_long_history(data_neural, mode, candidate=None): 268 | data_train = {} 269 | train_idx = {} 270 | if candidate is None: 271 | candidate = data_neural.keys() 272 | for u in candidate: 273 | sessions = data_neural[u]['sessions'] 274 | train_id = data_neural[u][mode] 275 | data_train[u] = {} 276 | for c, i in enumerate(train_id): 277 | trace = {} 278 | if mode == 'train' and c == 0: 279 | continue 280 | session = sessions[i] 281 | target = np.array([s[0] for s in session[1:]]) 282 | history = [] 283 | if mode == 'test': 284 | test_id = data_neural[u]['train'] 285 | for tt in test_id: 286 | history.extend([(s[0], s[1]) for s in sessions[tt]]) 287 | for j in range(c): 288 | history.extend([(s[0], s[1]) for s in sessions[train_id[j]]]) 289 | history_tim = [t[1] for t in history] 290 | history_count = [1] 291 | last_t = history_tim[0] 292 | count = 1 293 | for t in history_tim[1:]: 294 | if t == last_t: 295 | count += 1 296 | else: 297 | history_count[-1] = count 298 | history_count.append(1) 299 | last_t = t 300 | count = 1 301 | history_loc = np.reshape(np.array([s[0] for s in history]), (len(history), 1)) 302 | history_tim = np.reshape(np.array([s[1] for s in history]), (len(history), 1)) 303 | trace['history_loc'] = Variable(torch.LongTensor(history_loc)) 304 | trace['history_tim'] = Variable(torch.LongTensor(history_tim)) 305 | trace['history_count'] = history_count 306 | loc_tim = history 307 | loc_tim.extend([(s[0], s[1]) for s in session[:-1]]) 308 | loc_np = np.reshape(np.array([s[0] for s in loc_tim]), (len(loc_tim), 1)) 309 | tim_np = np.reshape(np.array([s[1] for s in loc_tim]), (len(loc_tim), 1)) 310 | trace['loc'] = Variable(torch.LongTensor(loc_np)) 311 | trace['tim'] = Variable(torch.LongTensor(tim_np)) 312 | trace['target'] = Variable(torch.LongTensor(target)) 313 | data_train[u][i] = trace 314 | train_idx[u] = train_id 315 | return data_train, train_idx 316 | 317 | def generate_queue(train_idx, mode, mode2): 318 | user = list(train_idx.keys()) 319 | train_queue = list() 320 | if mode == 'random': 321 | initial_queue = {} 322 | for u in user: 323 | if mode2 == 'train': 324 | initial_queue[u] = deque(train_idx[u][1:]) 325 | else: 326 | initial_queue[u] = deque(train_idx[u]) 327 | queue_left = 1 328 | while queue_left > 0: 329 | for j, u in enumerate(user): 330 | if len(initial_queue[u]) > 0: 331 | train_queue.append((u, initial_queue[u].popleft())) 332 | queue_left = sum([1 for x in initial_queue if len(initial_queue[x]) > 0]) 333 | elif mode == 'normal': 334 | for u in user: 335 | for i in train_idx[u]: 336 | train_queue.append((u, i)) 337 | return train_queue 338 | 339 | 340 | def create_dilated_rnn_input(session_sequence_current, poi_distance_matrix): 341 | sequence_length = len(session_sequence_current) 342 | session_sequence_current.reverse() 343 | session_dilated_rnn_input_index = [0] * sequence_length 344 | for i in range(sequence_length - 1): 345 | current_poi = [session_sequence_current[i]] 346 | poi_before = session_sequence_current[i + 1 :] 347 | distance_row = poi_distance_matrix[current_poi] 348 | distance_row_explicit = distance_row[:, poi_before][0] 349 | index_closet = np.argmin(distance_row_explicit) 350 | session_dilated_rnn_input_index[sequence_length - i - 1] = sequence_length-2-index_closet-i 351 | session_sequence_current.reverse() 352 | return session_dilated_rnn_input_index 353 | 354 | 355 | 356 | def generate_detailed_batch_data(one_train_batch): 357 | session_id_batch = [] 358 | user_id_batch = [] 359 | sequence_batch = [] 360 | sequences_lens_batch = [] 361 | sequences_tim_batch = [] 362 | sequences_dilated_input_batch = [] 363 | for sample in one_train_batch: 364 | user_id_batch.append(sample[0]) 365 | session_id_batch.append(sample[1]) 366 | session_sequence_current = [s[0] for s in data_neural[sample[0]]['sessions'][sample[1]]] 367 | session_sequence_tim_current = [s[1] for s in data_neural[sample[0]]['sessions'][sample[1]]] 368 | session_sequence_dilated_input = create_dilated_rnn_input(session_sequence_current, poi_distance_matrix) 369 | sequence_batch.append(session_sequence_current) 370 | sequences_lens_batch.append(len(session_sequence_current)) 371 | sequences_tim_batch.append(session_sequence_tim_current) 372 | sequences_dilated_input_batch.append(session_sequence_dilated_input) 373 | return user_id_batch, session_id_batch, sequence_batch, sequences_lens_batch, sequences_tim_batch, sequences_dilated_input_batch 374 | 375 | 376 | def train_network(network, num_epoch=40 ,batch_size = 32,criterion = None): 377 | candidate = data_neural.keys() 378 | data_train, train_idx = generate_input_history(data_neural, 'train', candidate=candidate) 379 | for epoch in range(num_epoch): 380 | network.train(True) 381 | i = 0 382 | run_queue = generate_queue(train_idx, 'random', 'train') 383 | for one_train_batch in minibatch(run_queue, batch_size = batch_size): 384 | user_id_batch, session_id_batch, sequence_batch, sequences_lens_batch, sequence_tim_batch, sequence_dilated_rnn_index_batch = generate_detailed_batch_data(one_train_batch) 385 | max_len = max(sequences_lens_batch) 386 | padded_sequence_batch, mask_batch_ix, mask_batch_ix_non_local = pad_batch_of_lists_masks(sequence_batch, 387 | max_len) 388 | padded_sequence_batch = Variable(torch.LongTensor(np.array(padded_sequence_batch))).to(device) 389 | mask_batch_ix = Variable(torch.FloatTensor(np.array(mask_batch_ix))).to(device) 390 | mask_batch_ix_non_local = Variable(torch.FloatTensor(np.array(mask_batch_ix_non_local))).to(device) 391 | user_id_batch = Variable(torch.LongTensor(np.array(user_id_batch))).to(device) 392 | logp_seq = network(user_id_batch, padded_sequence_batch, mask_batch_ix_non_local, session_id_batch, sequence_tim_batch, True, poi_distance_matrix, sequence_dilated_rnn_index_batch) 393 | predictions_logp = logp_seq[:, :-1] * mask_batch_ix[:, :-1, None] 394 | actual_next_tokens = padded_sequence_batch[:, 1:] 395 | logp_next = torch.gather(predictions_logp, dim=2, index=actual_next_tokens[:, :, None]) 396 | loss = -logp_next.sum() / mask_batch_ix[:, :-1].sum() 397 | # train with backprop 398 | opt.zero_grad() 399 | loss.backward() 400 | nn.utils.clip_grad_norm_(network.parameters(), 5.0) 401 | opt.step() 402 | if (i + 1) % 20 == 0: 403 | print("epoch" + str(epoch) + ": loss: " + str(loss)) 404 | i += 1 405 | results = evaluate(network, 1) 406 | print("Scores: ", results) 407 | 408 | 409 | def get_acc(target, scores): 410 | target = target.data.cpu().numpy() 411 | val, idxx = scores.data.topk(10, 1) 412 | predx = idxx.cpu().numpy() 413 | acc = np.zeros((3, 1)) 414 | ndcg = np.zeros((3, 1)) 415 | for i, p in enumerate(predx): 416 | t = target[i] 417 | if t != 0: 418 | if t in p[:10] and t > 0: 419 | acc[0] += 1 420 | rank_list = list(p[:10]) 421 | rank_index = rank_list.index(t) 422 | ndcg[0] += 1.0 / np.log2(rank_index + 2) 423 | if t in p[:5] and t > 0: 424 | acc[1] += 1 425 | rank_list = list(p[:5]) 426 | rank_index = rank_list.index(t) 427 | ndcg[1] += 1.0 / np.log2(rank_index + 2) 428 | if t == p[0] and t > 0: 429 | acc[2] += 1 430 | rank_list = list(p[:1]) 431 | rank_index = rank_list.index(t) 432 | ndcg[2] += 1.0 / np.log2(rank_index + 2) 433 | else: 434 | break 435 | return acc.tolist(), ndcg.tolist() 436 | 437 | def evaluate(network, batch_size = 2): 438 | network.train(False) 439 | candidate = data_neural.keys() 440 | data_test, test_idx = generate_input_long_history(data_neural, 'test', candidate=candidate) 441 | users_acc = {} 442 | with torch.no_grad(): 443 | run_queue = generate_queue(test_idx, 'normal', 'test') 444 | for one_test_batch in minibatch(run_queue, batch_size=batch_size): 445 | user_id_batch, session_id_batch, sequence_batch, sequences_lens_batch, sequence_tim_batch, sequence_dilated_rnn_index_batch = generate_detailed_batch_data( 446 | one_test_batch) 447 | user_id_batch_test = user_id_batch 448 | max_len = max(sequences_lens_batch) 449 | padded_sequence_batch, mask_batch_ix, mask_batch_ix_non_local = pad_batch_of_lists_masks(sequence_batch, 450 | max_len) 451 | padded_sequence_batch = Variable(torch.LongTensor(np.array(padded_sequence_batch))).to(device) 452 | mask_batch_ix = Variable(torch.FloatTensor(np.array(mask_batch_ix))).to(device) 453 | mask_batch_ix_non_local = Variable(torch.FloatTensor(np.array(mask_batch_ix_non_local))).to(device) 454 | user_id_batch = Variable(torch.LongTensor(np.array(user_id_batch))).to(device) 455 | logp_seq = network(user_id_batch, padded_sequence_batch, mask_batch_ix_non_local, session_id_batch, sequence_tim_batch, False, poi_distance_matrix, sequence_dilated_rnn_index_batch) 456 | predictions_logp = logp_seq[:, :-1] * mask_batch_ix[:, :-1, None] 457 | actual_next_tokens = padded_sequence_batch[:, 1:] 458 | for ii, u_current in enumerate(user_id_batch_test): 459 | if u_current not in users_acc: 460 | users_acc[u_current] = [0, 0, 0, 0, 0, 0, 0] 461 | acc, ndcg = get_acc(actual_next_tokens[ii], predictions_logp[ii]) 462 | users_acc[u_current][1] += acc[2][0]#@1 463 | users_acc[u_current][2] += acc[1][0]#@5 464 | users_acc[u_current][3] += acc[0][0]#@10 465 | ###ndcg 466 | users_acc[u_current][4] += ndcg[2][0] # @1 467 | users_acc[u_current][5] += ndcg[1][0] # @5 468 | users_acc[u_current][6] += ndcg[0][0] # @10 469 | users_acc[u_current][0] += (sequences_lens_batch[ii]-1) 470 | tmp_acc = [0.0,0.0,0.0, 0.0, 0.0, 0.0]##last 3 ndcg 471 | sum_test_samples = 0.0 472 | for u in users_acc: 473 | tmp_acc[0] = users_acc[u][1] + tmp_acc[0] 474 | tmp_acc[1] = users_acc[u][2] + tmp_acc[1] 475 | tmp_acc[2] = users_acc[u][3] + tmp_acc[2] 476 | 477 | tmp_acc[3] = users_acc[u][4] + tmp_acc[3] 478 | tmp_acc[4] = users_acc[u][5] + tmp_acc[4] 479 | tmp_acc[5] = users_acc[u][6] + tmp_acc[5] 480 | sum_test_samples = sum_test_samples + users_acc[u][0] 481 | avg_acc = (np.array(tmp_acc)/sum_test_samples).tolist() 482 | return avg_acc 483 | 484 | def geodistance(lng1,lat1,lng2,lat2): 485 | lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)]) 486 | dlon=lng2-lng1 487 | dlat=lat2-lat1 488 | a=sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 489 | distance=2*asin(sqrt(a))*6371*1000 490 | distance=round(distance/1000,3) 491 | return distance 492 | 493 | 494 | if __name__ == '__main__': 495 | np.random.seed(1) 496 | torch.manual_seed(1) 497 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 498 | data = pickle.load(open('foursquare_cut_one_day.pkl', 'rb'), encoding='iso-8859-1') 499 | vid_list = data['vid_list'] 500 | uid_list = data['uid_list'] 501 | data_neural = data['data_neural'] 502 | poi_coordinate = data['vid_lookup'] 503 | loc_size = len(vid_list) 504 | uid_size = len(uid_list) 505 | time_sim_matrix = caculate_time_sim(data_neural) 506 | # poi_distance_matrix = caculate_poi_distance(poi_coordinate) 507 | poi_distance_matrix = pickle.load(open('distance.pkl', 'rb'), encoding='iso-8859-1') 508 | torch.cuda.empty_cache() 509 | gc.collect() 510 | device = torch.device("cuda") 511 | n_users = uid_size 512 | n_items = loc_size 513 | session_id_sequences = None 514 | user_id_session = None 515 | network = Model(n_users=n_users, n_items=n_items, data_neural=data_neural, tim_sim_matrix=time_sim_matrix).to( 516 | device) 517 | opt = torch.optim.Adam(filter(lambda p: p.requires_grad, network.parameters()), lr=0.0001, 518 | weight_decay=1 * 1e-6) 519 | criterion = nn.NLLLoss().cuda() 520 | train_network(network,criterion=criterion) 521 | --------------------------------------------------------------------------------