├── README.md
└── train
    ├── data_process.py
    ├── foursquare_cut_one_day.pkl
    └── train.py


/README.md:
--------------------------------------------------------------------------------
 1 | # LSTPM
 2 | Implementation of paper "Ke Sun, Tieyun Qian, Tong Chen, Yile Liang, Quoc Viet Hung Nguyen, Hongzhi Yin. Where to Go Next: Modeling Long- and Short-Term User Preferences for
 3 | Point-of-Interest Recommendation." accepted by AAAI 2020.
 4 | 
 5 | # Update
 6 | * "distance.pkl" is generated by the "caculate_poi_distance" function in "train.py".
 7 | * We have uploaded the data processing script "data_process.py" for generating "foursquare_cut_one_day.pkl".
 8 | 
 9 | # Requirements
10 | * python 3.6
11 | * pytorch 1.0.1
12 | 


--------------------------------------------------------------------------------
/train/data_process.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | 
  4 | import time
  5 | import argparse
  6 | import numpy as np
  7 | import pickle
  8 | # import cPickle as pickle
  9 | from collections import Counter
 10 | 
 11 | 
 12 | def entropy_spatial(sessions):
 13 |     locations = {}
 14 |     days = sorted(sessions.keys())
 15 |     for d in days:
 16 |         session = sessions[d]
 17 |         for s in session:
 18 |             if s[0] not in locations:
 19 |                 locations[s[0]] = 1
 20 |             else:
 21 |                 locations[s[0]] += 1
 22 |     frequency = np.array([locations[loc] for loc in locations])
 23 |     frequency = frequency / np.sum(frequency)
 24 |     entropy = - np.sum(frequency * np.log(frequency))
 25 |     return entropy
 26 | 
 27 | 
 28 | class DataFoursquare(object):
 29 |     def __init__(self, trace_min=10, global_visit=10, hour_gap=72, min_gap=10, session_min=2, session_max=10,
 30 |                  sessions_min=2, train_split=0.8, embedding_len=50):
 31 |         tmp_path = "data/"
 32 |         self.TWITTER_PATH = tmp_path + 'foursquare/tweet_clean_all.txt'
 33 |         self.VENUES_PATH = tmp_path + 'foursquare/venues_all.txt'
 34 |         self.SAVE_PATH = tmp_path
 35 |         self.save_name = 'foursquare_cut_one_day'
 36 | 
 37 |         self.trace_len_min = trace_min
 38 |         self.location_global_visit_min = global_visit
 39 |         self.hour_gap = hour_gap
 40 |         self.min_gap = min_gap
 41 |         self.session_max = session_max
 42 |         self.filter_short_session = session_min
 43 |         self.sessions_count_min = sessions_min
 44 |         self.words_embeddings_len = embedding_len
 45 | 
 46 |         self.train_split = train_split
 47 | 
 48 |         self.data = {}
 49 |         self.venues = {}
 50 |         self.venues_cat = {}
 51 |         self.words_original = []
 52 |         self.words_lens = []
 53 |         self.dictionary = dict()
 54 |         self.words_dict = None
 55 |         self.data_filter = {}
 56 |         self.user_filter3 = None
 57 |         self.uid_list = {}
 58 |         self.vid_list = {'unk': [0, -1]}
 59 |         self.vid_list_lookup = {}
 60 |         self.vid_lookup = {}
 61 |         self.pid_loc_lat = {}
 62 |         self.data_neural = {}
 63 | 
 64 | 
 65 | 
 66 | 
 67 |     # ############# 1. read trajectory data from twitters
 68 |     def load_trajectory_from_tweets(self):
 69 |         with open(self.TWITTER_PATH,encoding='UTF-8') as fid:
 70 |             for i, line in enumerate(fid):
 71 |                 _, uid, _, _, tim, _, _, tweet, pid, cat_id = line.strip('\r\n').split('')
 72 |                 if uid not in self.data:
 73 |                     self.data[uid] = [[pid, tim, cat_id]]
 74 |                 else:
 75 |                     self.data[uid].append([pid, tim, cat_id])
 76 |                 if pid not in self.venues:
 77 |                     self.venues[pid] = 1
 78 |                 else:
 79 |                     self.venues[pid] += 1
 80 |                 if cat_id not in self.venues_cat:
 81 |                     self.venues_cat[cat_id] = 1
 82 |                 else:
 83 |                     self.venues_cat[cat_id] += 1
 84 | 
 85 |     # ########### 3.0 basically filter users based on visit length and other statistics
 86 |     def filter_users_by_length(self):
 87 |         uid_3 = [x for x in self.data if len(self.data[x]) > self.trace_len_min]
 88 |         xixi = [(x, len(self.data[x])) for x in uid_3]
 89 |         pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True)
 90 |         pid_3 = [x for x in self.venues if self.venues[x] > self.location_global_visit_min]
 91 |         pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True)
 92 |         pid_3 = dict(pid_pic3)
 93 | 
 94 |         session_len_list = []
 95 |         for u in pick3:
 96 |             uid = u[0]
 97 |             info = self.data[uid]
 98 |             xixi = Counter([x[0] for x in info])
 99 |             topk = Counter([x[0] for x in info]).most_common()
100 |             topk1 = [x[0] for x in topk if x[1] > 1]
101 |             sessions = {}
102 |             for i, record in enumerate(info):
103 |                 poi, tmd, cat_id = record
104 |                 try:
105 |                     current_date = tmd.split(' ')[0]
106 | 
107 |                     tid = int(time.mktime(time.strptime(tmd, "%Y-%m-%d %H:%M:%S")))
108 |                 except Exception as e:
109 |                     print('error:{}'.format(e))
110 |                     continue
111 |                 sid = len(sessions)
112 |                 if poi not in pid_3 and poi not in topk1:
113 |                     # if poi not in topk1:
114 |                     continue
115 |                 if i == 0 or len(sessions) == 0:
116 |                     sessions[sid] = [record]
117 |                 else:
118 |                     # if (tid - last_tid) / 3600 > self.hour_gap or len(sessions[sid - 1]) > self.session_max:
119 |                     if last_date != current_date:
120 |                         sessions[sid] = [record]
121 |                     elif (tid - last_tid) / 60 > self.min_gap:
122 |                         sessions[sid - 1].append(record)
123 |                     else:
124 |                         pass
125 |                 last_tid = tid
126 |                 last_date = current_date
127 |             sessions_filter = {}
128 |             for s in sessions:
129 |                 if len(sessions[s]) >= self.filter_short_session:
130 |                     sessions_filter[len(sessions_filter)] = sessions[s]
131 |                     session_len_list.append(len(sessions[s]))
132 |             if len(sessions_filter) >= self.sessions_count_min:
133 |                 self.data_filter[uid] = {'sessions_count': len(sessions_filter), 'topk_count': len(topk), 'topk': topk,
134 |                                          'sessions': sessions_filter, 'raw_sessions': sessions}
135 | 
136 |         self.user_filter3 = [x for x in self.data_filter if
137 |                              self.data_filter[x]['sessions_count'] >= self.sessions_count_min]
138 | 
139 |     def filter_users_by_length_source(self):
140 |         uid_3 = [x for x in self.data if len(self.data[x]) > self.trace_len_min]
141 |         xixi = [(x, len(self.data[x])) for x in uid_3]
142 |         pick3 = sorted([(x, len(self.data[x])) for x in uid_3], key=lambda x: x[1], reverse=True)
143 |         pid_3 = [x for x in self.venues if self.venues[x] > self.location_global_visit_min]
144 |         pid_pic3 = sorted([(x, self.venues[x]) for x in pid_3], key=lambda x: x[1], reverse=True)
145 |         pid_3 = dict(pid_pic3)
146 | 
147 |         session_len_list = []
148 |         for u in pick3:
149 |             uid = u[0]
150 |             info = self.data[uid]
151 |             xixi = Counter([x[0] for x in info])
152 |             topk = Counter([x[0] for x in info]).most_common()
153 |             topk1 = [x[0] for x in topk if x[1] > 1]
154 |             sessions = {}
155 |             for i, record in enumerate(info):
156 |                 poi, tmd = record
157 |                 try:
158 |                     tid = int(time.mktime(time.strptime(tmd, "%Y-%m-%d %H:%M:%S")))
159 |                 except Exception as e:
160 |                     print('error:{}'.format(e))
161 |                     continue
162 |                 sid = len(sessions)
163 |                 if poi not in pid_3 and poi not in topk1:
164 |                     # if poi not in topk1:
165 |                     continue
166 |                 if i == 0 or len(sessions) == 0:
167 |                     sessions[sid] = [record]
168 |                 else:
169 |                     if (tid - last_tid) / 3600 > self.hour_gap or len(sessions[sid - 1]) > self.session_max:
170 |                         sessions[sid] = [record]
171 |                     elif (tid - last_tid) / 60 > self.min_gap:
172 |                         sessions[sid - 1].append(record)
173 |                     else:
174 |                         pass
175 |                 last_tid = tid
176 |             sessions_filter = {}
177 |             for s in sessions:
178 |                 if len(sessions[s]) >= self.filter_short_session:
179 |                     sessions_filter[len(sessions_filter)] = sessions[s]
180 |                     session_len_list.append(len(sessions[s]))
181 |             if len(sessions_filter) >= self.sessions_count_min:
182 |                 self.data_filter[uid] = {'sessions_count': len(sessions_filter), 'topk_count': len(topk), 'topk': topk,
183 |                                          'sessions': sessions_filter, 'raw_sessions': sessions}
184 | 
185 |         self.user_filter3 = [x for x in self.data_filter if
186 |                              self.data_filter[x]['sessions_count'] >= self.sessions_count_min]
187 | 
188 |     # ########### 4. build dictionary for users and location
189 |     def build_users_locations_dict(self):
190 |         for u in self.user_filter3:
191 |             sessions = self.data_filter[u]['sessions']
192 |             if u not in self.uid_list:
193 |                 self.uid_list[u] = [len(self.uid_list), len(sessions)]
194 |             for sid in sessions:
195 |                 poi = [p[0] for p in sessions[sid]]
196 |                 for p in poi:
197 |                     if p not in self.vid_list:
198 |                         self.vid_list_lookup[len(self.vid_list)] = p
199 |                         self.vid_list[p] = [len(self.vid_list), 1]
200 |                     else:
201 |                         self.vid_list[p][1] += 1
202 | 
203 |     # support for radius of gyration
204 |     def load_venues(self):
205 |         with open(self.TWITTER_PATH, 'r',encoding='UTF-8') as fid:
206 |             for line in fid:
207 |                 _, uid, lon, lat, tim, _, _, tweet, pid, cid = line.strip('\r\n').split('')
208 |                 self.pid_loc_lat[pid] = [float(lon), float(lat)]
209 | 
210 |     def venues_lookup(self):
211 |         for vid in self.vid_list_lookup:
212 |             pid = self.vid_list_lookup[vid]
213 |             lon_lat = self.pid_loc_lat[pid]
214 |             self.vid_lookup[vid] = lon_lat
215 | 
216 |     # ########## 5.0 prepare training data for neural network
217 |     @staticmethod
218 |     def tid_list(tmd):
219 |         tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S")
220 |         tid = tm.tm_wday * 24 + tm.tm_hour
221 |         return tid
222 | 
223 |     @staticmethod
224 |     def tid_list_48(tmd):
225 |         tm = time.strptime(tmd, "%Y-%m-%d %H:%M:%S")
226 |         timeStamp = (time.mktime(tm))
227 |         # tid = tm.tm_hour
228 |         if tm.tm_wday in [0, 1, 2, 3, 4]:
229 |             tid = tm.tm_hour
230 |         else:
231 |             tid = tm.tm_hour + 24
232 |         return [timeStamp, tid]
233 | 
234 |     def prepare_neural_data(self):
235 |         for u in self.uid_list:
236 |             sessions = self.data_filter[u]['sessions']
237 |             sessions_tran = {}
238 |             sessions_id = []
239 |             for sid in sessions:
240 |                 sessions_tran[sid] = [[self.vid_list[p[0]][0], p[1], p[-1]] for p in
241 |                                       sessions[sid]]
242 |                 sessions_id.append(sid)
243 |             split_id = int(np.floor(self.train_split * len(sessions_id)))
244 |             train_id = sessions_id[:split_id]
245 |             test_id = sessions_id[split_id:]
246 |             pred_len = sum([len(sessions_tran[i]) - 1 for i in train_id])
247 |             valid_len = sum([len(sessions_tran[i]) - 1 for i in test_id])
248 |             train_loc = {}
249 |             for i in train_id:
250 |                 for sess in sessions_tran[i]:
251 |                     if sess[0] in train_loc:
252 |                         train_loc[sess[0]] += 1
253 |                     else:
254 |                         train_loc[sess[0]] = 1
255 |             # calculate entropy
256 |             entropy = entropy_spatial(sessions)
257 | 
258 |             # calculate location ratio
259 |             train_location = []
260 |             for i in train_id:
261 |                 train_location.extend([s[0] for s in sessions[i]])
262 |             train_location_set = set(train_location)
263 |             test_location = []
264 |             for i in test_id:
265 |                 test_location.extend([s[0] for s in sessions[i]])
266 |             test_location_set = set(test_location)
267 |             whole_location = train_location_set | test_location_set
268 |             test_unique = whole_location - train_location_set
269 |             location_ratio = len(test_unique) / len(whole_location)
270 | 
271 |             # calculate radius of gyration
272 |             lon_lat = []
273 |             for pid in train_location:
274 |                 try:
275 |                     lon_lat.append(self.pid_loc_lat[pid])
276 |                 except:
277 |                     print(pid)
278 |                     print('error')
279 |             lon_lat = np.array(lon_lat)
280 |             center = np.mean(lon_lat, axis=0, keepdims=True)
281 |             center = np.repeat(center, axis=0, repeats=len(lon_lat))
282 |             rg = np.sqrt(np.mean(np.sum((lon_lat - center) ** 2, axis=1, keepdims=True), axis=0))[0]
283 | 
284 |             self.data_neural[self.uid_list[u][0]] = {'sessions': sessions_tran, 'train': train_id, 'test': test_id,
285 |                                                      'pred_len': pred_len, 'valid_len': valid_len,
286 |                                                      'train_loc': train_loc, 'explore': location_ratio,
287 |                                                      'entropy': entropy, 'rg': rg}
288 | 
289 |     # ############# 6. save variables
290 |     def get_parameters(self):
291 |         parameters = {}
292 |         parameters['TWITTER_PATH'] = self.TWITTER_PATH
293 |         parameters['SAVE_PATH'] = self.SAVE_PATH
294 | 
295 |         parameters['trace_len_min'] = self.trace_len_min
296 |         parameters['location_global_visit_min'] = self.location_global_visit_min
297 |         parameters['hour_gap'] = self.hour_gap
298 |         parameters['min_gap'] = self.min_gap
299 |         parameters['session_max'] = self.session_max
300 |         parameters['filter_short_session'] = self.filter_short_session
301 |         parameters['sessions_min'] = self.sessions_count_min
302 |         parameters['train_split'] = self.train_split
303 | 
304 |         return parameters
305 | 
306 |     def save_variables(self):
307 |         foursquare_dataset = {'data_neural': self.data_neural, 'vid_list': self.vid_list, 'uid_list': self.uid_list,
308 |                               'parameters': self.get_parameters(), 'data_filter': self.data_filter,
309 |                               'vid_lookup': self.vid_lookup}
310 |         pickle.dump(foursquare_dataset, open(self.SAVE_PATH + self.save_name + '.pk', 'wb'))
311 | 
312 | 
def parse_args():
    """Parse the command-line thresholds for the preprocessing run.

    Returns the ``argparse`` namespace; every option is optional and falls
    back to the default listed in the table below.
    """
    # (flag, type, default, help)
    option_table = (
        ('--trace_min', int, 10, "raw trace length filter threshold"),
        ('--global_visit', int, 10, "location global visit threshold"),
        ('--hour_gap', int, 72, "maximum interval of two trajectory points"),
        ('--min_gap', int, 0, "minimum interval of two trajectory points"),
        ('--session_max', int, 10, "control the length of session not too long"),
        ('--session_min', int, 3, "control the length of session not too short"),
        ('--sessions_min', int, 5, "the minimum amount of the good user's sessions"),
        ('--train_split', float, 0.8, "train/test ratio"),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)
    return cli.parse_args()
324 | 
325 | 
if __name__ == '__main__':
    # Script entry point: run the whole preprocessing pipeline once.
    cli_args = parse_args()
    constructor_options = {
        option: getattr(cli_args, option)
        for option in ('trace_min', 'global_visit', 'hour_gap', 'min_gap',
                       'session_min', 'session_max', 'sessions_min', 'train_split')
    }
    generator = DataFoursquare(**constructor_options)

    settings = generator.get_parameters()
    setting_lines = [key + ':' + str(value) for key, value in settings.items()]
    print('############PARAMETER SETTINGS:\n' + '\n'.join(setting_lines))
    print('############START PROCESSING:')

    print('load trajectory from {}'.format(generator.TWITTER_PATH))
    generator.load_trajectory_from_tweets()

    print('filter users')
    generator.filter_users_by_length()

    print('build users/locations dictionary')
    generator.build_users_locations_dict()
    # Coordinates are needed before the neural data can be prepared.
    generator.load_venues()
    generator.venues_lookup()

    print('prepare data for neural network')
    generator.prepare_neural_data()

    print('save prepared data')
    generator.save_variables()

    raw_users, raw_locations = len(generator.data), len(generator.venues)
    print('raw users:{} raw locations:{}'.format(raw_users, raw_locations))
    final_users, final_locations = len(generator.data_neural), len(generator.vid_list)
    print('final users:{} final locations:{}'.format(final_users, final_locations))
351 | 


--------------------------------------------------------------------------------
/train/foursquare_cut_one_day.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLPWM-WHU/LSTPM/bce2384f33ec3efe08c106033632987d14738abe/train/foursquare_cut_one_day.pkl


--------------------------------------------------------------------------------
/train/train.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.nn.functional as F
  3 | from torch.autograd import Variable
  4 | import pickle
  5 | import numpy as np
  6 | import torch
  7 | from collections import defaultdict
  8 | import gc
  9 | import os
 10 | from math import radians, cos, sin, asin, sqrt
 11 | from collections import deque,Counter
 12 | def shuffle(*arrays, **kwargs):
 13 |     require_indices = kwargs.get('indices', False)
 14 |     if len(set(len(x) for x in arrays)) != 1:
 15 |         raise ValueError('All inputs to shuffle must have '
 16 |                          'the same length.')
 17 |     shuffle_indices = np.arange(len(arrays[0]))
 18 |     np.random.shuffle(shuffle_indices)
 19 |     if len(arrays) == 1:
 20 |         result = arrays[0][shuffle_indices]
 21 |     else:
 22 |         result = tuple(x[shuffle_indices] for x in arrays)
 23 |     if require_indices:
 24 |         return result, shuffle_indices
 25 |     else:
 26 |         return result
 27 | 
 28 | def minibatch(*tensors, **kwargs):
 29 |     batch_size = kwargs.get('batch_size', 128)
 30 |     if len(tensors) == 1:
 31 |         tensor = tensors[0]
 32 |         for i in range(0, len(tensor), batch_size):
 33 |             yield tensor[i:i + batch_size]
 34 |     else:
 35 |         for i in range(0, len(tensors[0]), batch_size):
 36 |             yield tuple(x[i:i + batch_size] for x in tensors)
 37 | 
 38 | def pad_batch_of_lists_masks(batch_of_lists, max_len):
 39 |     padded = [l + [0] * (max_len - len(l)) for l in batch_of_lists]
 40 |     padded_mask = [[1.0]*(len(l) - 1) + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists]
 41 |     padde_mask_non_local = [[1.0] * (len(l)) + [0.0] * (max_len - len(l)) for l in batch_of_lists]
 42 |     return padded, padded_mask, padde_mask_non_local
 43 | 
 44 | def pad_batch_of_lists_masks_test(batch_of_lists, max_len):
 45 |     padded = [l + [0] * (max_len - len(l)) for l in batch_of_lists]
 46 |     padded2 = [l[:-1] + [0] * (max_len - len(l) + 1) for l in batch_of_lists]
 47 |     padded_mask = [[0.0]*(len(l) - 2) + [1.0] + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists]
 48 |     padde_mask_non_local = [[1.0] * (len(l) - 1) + [0.0] * (max_len - len(l) + 1) for l in batch_of_lists]
 49 |     return padded, padded2, padded_mask, padde_mask_non_local
 50 | 
 51 | class Model(nn.Module):
 52 |     def __init__(self, n_users, n_items, emb_size=500, hidden_units=500, dropout=0.8, user_dropout=0.5, data_neural = None, tim_sim_matrix = None):
 53 |         super(self.__class__, self).__init__()
 54 |         self.n_users = n_users
 55 |         self.n_items = n_items
 56 |         self.hidden_units = hidden_units
 57 |         if emb_size == None:
 58 |             emb_size = hidden_units
 59 |         self.emb_size = emb_size
 60 |         ## todo why embeding?
 61 |         self.item_emb = nn.Embedding(n_items, emb_size)
 62 |         self.emb_tim = nn.Embedding(48, 10)
 63 |         self.lstmcell = nn.LSTM(input_size=emb_size, hidden_size=hidden_units)
 64 |         self.lstmcell_history = nn.LSTM(input_size=emb_size, hidden_size=hidden_units)
 65 |         self.linear = nn.Linear(hidden_units*2 , n_items)
 66 |         self.dropout = nn.Dropout(0.0)
 67 |         self.user_dropout = nn.Dropout(user_dropout)
 68 |         self.data_neural = data_neural
 69 |         self.tim_sim_matrix = tim_sim_matrix
 70 |         self.dilated_rnn = nn.LSTMCell(input_size=emb_size, hidden_size=hidden_units)# could be the same as self.lstmcell
 71 |         self.linear1 = nn.Linear(hidden_units, hidden_units)
 72 |         self.init_weights()
 73 | 
 74 |     def init_weights(self):
 75 |         ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
 76 |         hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
 77 |         b = (param.data for name, param in self.named_parameters() if 'bias' in name)
 78 |         for t in ih:
 79 |             nn.init.xavier_uniform(t)
 80 |         for t in hh:
 81 |             nn.init.orthogonal(t)
 82 |         for t in b:
 83 |             nn.init.constant(t, 0)
 84 | 
 85 |     def forward(self, user_vectors, item_vectors, mask_batch_ix_non_local, session_id_batch, sequence_tim_batch, is_train, poi_distance_matrix, sequence_dilated_rnn_index_batch):
 86 |         batch_size = item_vectors.size()[0]
 87 |         sequence_size = item_vectors.size()[1]
 88 |         items = self.item_emb(item_vectors)
 89 |         item_vectors = item_vectors.cpu()
 90 |         x = items
 91 |         x = x.transpose(0, 1)
 92 |         h1 = Variable(torch.zeros(1, batch_size, self.hidden_units)).cuda()
 93 |         c1 = Variable(torch.zeros(1, batch_size, self.hidden_units)).cuda()
 94 |         out, (h1, c1) = self.lstmcell(x, (h1, c1))
 95 |         out = out.transpose(0, 1)#batch_size * sequence_length * embedding_dim
 96 |         x1 = items
 97 |         # ###########################################################
 98 |         user_batch = np.array(user_vectors.cpu())
 99 |         y_list = []
100 |         out_hie = []
101 |         for ii in range(batch_size):
102 |             ##########################################
103 |             current_session_input_dilated_rnn_index = sequence_dilated_rnn_index_batch[ii]
104 |             hiddens_current = x1[ii]
105 |             dilated_lstm_outs_h = []
106 |             dilated_lstm_outs_c = []
107 |             for index_dilated in range(len(current_session_input_dilated_rnn_index)):
108 |                 index_dilated_explicit = current_session_input_dilated_rnn_index[index_dilated]
109 |                 hidden_current = hiddens_current[index_dilated].unsqueeze(0)
110 |                 if index_dilated == 0:
111 |                     h = Variable(torch.zeros(1, self.hidden_units)).cuda()
112 |                     c = Variable(torch.zeros(1, self.hidden_units)).cuda()
113 |                     (h, c) = self.dilated_rnn(hidden_current, (h, c))
114 |                     dilated_lstm_outs_h.append(h)
115 |                     dilated_lstm_outs_c.append(c)
116 |                 else:
117 |                     (h, c) = self.dilated_rnn(hidden_current, (dilated_lstm_outs_h[index_dilated_explicit], dilated_lstm_outs_c[index_dilated_explicit]))
118 |                     dilated_lstm_outs_h.append(h)
119 |                     dilated_lstm_outs_c.append(c)
120 |             dilated_lstm_outs_h.append(hiddens_current[len(current_session_input_dilated_rnn_index):])
121 |             dilated_out = torch.cat(dilated_lstm_outs_h, dim = 0).unsqueeze(0)
122 |             out_hie.append(dilated_out)
123 |             user_id_current = user_batch[ii]
124 |             current_session_timid = sequence_tim_batch[ii][:-1]
125 |             current_session_poiid = item_vectors[ii][:len(current_session_timid)]
126 |             session_id_current = session_id_batch[ii]
127 |             current_session_embed = out[ii]
128 |             current_session_mask = mask_batch_ix_non_local[ii].unsqueeze(1)
129 |             sequence_length = int(sum(np.array(current_session_mask.cpu()))[0])
130 |             current_session_represent_list = []
131 |             if is_train:
132 |                 for iii in range(sequence_length-1):
133 |                     current_session_represent = torch.sum(current_session_embed * current_session_mask, dim=0).unsqueeze(0)/sum(current_session_mask)
134 |                     current_session_represent_list.append(current_session_represent)
135 |             else:
136 |                 for iii in range(sequence_length-1):
137 |                     current_session_represent_rep_item = current_session_embed[0:iii+1]
138 |                     current_session_represent_rep_item = torch.sum(current_session_represent_rep_item, dim = 0).unsqueeze(0)/(iii + 1)
139 |                     current_session_represent_list.append(current_session_represent_rep_item)
140 | 
141 |             current_session_represent = torch.cat(current_session_represent_list, dim = 0)
142 |             list_for_sessions = []
143 |             list_for_avg_distance = []
144 |             h2 = Variable(torch.zeros(1, 1, self.hidden_units)).cuda()###whole sequence
145 |             c2 = Variable(torch.zeros(1, 1, self.hidden_units)).cuda()
146 |             for jj in range(session_id_current):
147 |                 sequence = [s[0] for s in self.data_neural[user_id_current]['sessions'][jj]]
148 |                 sequence = Variable(torch.LongTensor(np.array(sequence))).cuda()
149 |                 sequence_emb = self.item_emb(sequence).unsqueeze(1)
150 |                 sequence = sequence.cpu()
151 |                 sequence_emb, (h2, c2) = self.lstmcell_history(sequence_emb, (h2, c2))
152 |                 sequence_tim_id = [s[1] for s in self.data_neural[user_id_current]['sessions'][jj]]
153 |                 jaccard_sim_row = Variable(torch.FloatTensor(self.tim_sim_matrix[current_session_timid]),requires_grad=False).cuda()
154 |                 jaccard_sim_expicit = jaccard_sim_row[:,sequence_tim_id]
155 |                 distance_row = poi_distance_matrix[current_session_poiid]
156 |                 distance_row_expicit = Variable(torch.FloatTensor(distance_row[:,sequence]),requires_grad=False).cuda()
157 |                 distance_row_expicit_avg = torch.mean(distance_row_expicit, dim = 1)
158 |                 jaccard_sim_expicit_last = F.softmax(jaccard_sim_expicit)
159 |                 hidden_sequence_for_current1 = torch.mm(jaccard_sim_expicit_last, sequence_emb.squeeze(1))
160 |                 hidden_sequence_for_current =  hidden_sequence_for_current1
161 |                 list_for_sessions.append(hidden_sequence_for_current.unsqueeze(0))
162 |                 list_for_avg_distance.append(distance_row_expicit_avg.unsqueeze(0))
163 |             avg_distance = torch.cat(list_for_avg_distance, dim = 0).transpose(0,1)
164 |             sessions_represent = torch.cat(list_for_sessions, dim=0).transpose(0,1) ##current_items * history_session_length * embedding_size
165 |             current_session_represent = current_session_represent.unsqueeze(2) ### current_items * embedding_size * 1
166 |             sims = F.softmax(sessions_represent.bmm(current_session_represent).squeeze(2), dim = 1).unsqueeze(1) ##==> current_items * 1 * history_session_length
167 |             #out_y_current = sims.bmm(sessions_represent).squeeze(1)
168 |             out_y_current =torch.selu(self.linear1(sims.bmm(sessions_represent).squeeze(1)))
169 |             ##############layer_2
170 |             #layer_2_current = (lambda*out_y_current + (1-lambda)*current_session_embed[:sequence_length-1]).unsqueeze(2) #lambda from [0.1-0.9] better performance
171 |             # layer_2_current = (out_y_current + current_session_embed[:sequence_length-1]).unsqueeze(2)##==>current_items * embedding_size * 1
172 |             layer_2_current = (0.5 *out_y_current + 0.5 * current_session_embed[:sequence_length - 1]).unsqueeze(2)
173 |             layer_2_sims =  F.softmax(sessions_represent.bmm(layer_2_current).squeeze(2) * 1.0/avg_distance, dim = 1).unsqueeze(1)##==>>current_items * 1 * history_session_length
174 |             out_layer_2 = layer_2_sims.bmm(sessions_represent).squeeze(1)
175 |             out_y_current_padd = Variable(torch.FloatTensor(sequence_size - sequence_length + 1, self.emb_size).zero_(),requires_grad=False).cuda()
176 |             out_layer_2_list = []
177 |             out_layer_2_list.append(out_layer_2)
178 |             out_layer_2_list.append(out_y_current_padd)
179 |             out_layer_2 = torch.cat(out_layer_2_list,dim = 0).unsqueeze(0)
180 |             y_list.append(out_layer_2)
181 |         y = torch.selu(torch.cat(y_list,dim=0))
182 |         out_hie = F.selu(torch.cat(out_hie, dim = 0))
183 |         out = F.selu(out)
184 |         out = (out + out_hie) * 0.5
185 |         out_put_emb_v1 = torch.cat([y, out], dim=2)
186 |         output_ln = self.linear(out_put_emb_v1)
187 |         output = F.log_softmax(output_ln, dim=-1)
188 |         return output
189 | 
190 | 
191 | 
192 | 
def caculate_time_sim(data_neural):
    """Return the 48x48 Jaccard similarity between time slots.

    ``data_neural[uid]['sessions'][sid]`` is a list of check-ins whose first
    two fields are location id and time slot.  For every slot the set of
    locations visited in that slot is collected; entry ``[i][j]`` is
    ``|S_i & S_j| / |S_i | S_j|``.

    When neither slot has any check-in the union is empty; the similarity is
    then defined as 0.0 instead of raising ``ZeroDivisionError``.
    """
    time_checkin_set = defaultdict(set)
    for uid in data_neural:
        user_sessions = data_neural[uid]['sessions']
        for sid in user_sessions:
            for checkin in user_sessions[sid]:
                time_checkin_set[checkin[1]].add(checkin[0])

    sim_matrix = np.zeros((48, 48))
    for i in range(48):
        set_i = time_checkin_set[i]
        # Jaccard similarity is symmetric: compute the upper triangle and mirror it.
        for j in range(i, 48):
            set_j = time_checkin_set[j]
            union_size = len(set_i | set_j)
            if union_size == 0:
                continue  # both slots empty; leave the pre-filled 0.0
            jaccard_ij = len(set_i & set_j) / union_size
            sim_matrix[i][j] = jaccard_ij
            sim_matrix[j][i] = jaccard_ij
    return sim_matrix
213 | 
def caculate_poi_distance(poi_coors):
    """Build the symmetric pairwise POI distance matrix and save it to disk.

    POI ids are looked up in ``poi_coors`` as ``1 .. len(poi_coors)``; row and
    column 0 of the result are left at zero. Every distance (including the
    diagonal) is clamped to a minimum of 1. The matrix is pickled to
    ``distance.pkl`` in the current working directory.

    Args:
        poi_coors: mapping ``poi_id -> coordinate pair``. Element ``[1]`` is
            passed to ``geodistance`` as longitude and element ``[0]`` as
            latitude.

    Returns:
        ``numpy.ndarray`` of shape ``(len(poi_coors) + 1, len(poi_coors) + 1)``.
    """
    print("distance matrix")
    n_pois = len(poi_coors)
    sim_matrix = np.zeros((n_pois + 1, n_pois + 1))
    for poi_current in range(1, n_pois + 1):
        poi_current_coor = poi_coors[poi_current]
        # Only the upper triangle is computed; the value is mirrored below.
        for poi_target in range(poi_current, n_pois + 1):
            poi_target_coor = poi_coors[poi_target]
            distance_between = geodistance(poi_current_coor[1], poi_current_coor[0],
                                           poi_target_coor[1], poi_target_coor[0])
            if distance_between < 1:
                distance_between = 1
            sim_matrix[poi_current][poi_target] = distance_between
            sim_matrix[poi_target][poi_current] = distance_between
    # Context manager guarantees the file is flushed and closed.
    with open('distance.pkl', 'wb') as out_file:
        pickle.dump(sim_matrix, out_file)
    return sim_matrix
230 | 
def generate_input_history(data_neural, mode, candidate=None):
    """Build per-session input tensors together with a time-sorted history.

    For each user in ``candidate`` (all users when ``None``) and each session
    id listed in ``data_neural[u][mode]`` a dict is produced with:

    * ``loc`` / ``tim``: column tensors of the session without its last
      check-in (elements ``[0]`` and ``[1]`` of each check-in),
    * ``target``: 1-D tensor of element ``[0]`` of every check-in but the first,
    * ``history_loc`` / ``history_tim``: column tensors of the earlier
      check-ins, stably sorted by element ``[1]``.

    In ``'train'`` mode the first listed session is skipped. In ``'test'``
    mode all sessions listed under ``'train'`` are prepended to the history.

    Returns:
        ``(samples, ids)`` where ``samples[u][session_id]`` is the dict above
        and ``ids[u]`` is ``data_neural[u][mode]``.
    """
    users = data_neural.keys() if candidate is None else candidate
    samples = {}
    ids_by_user = {}

    def as_column(values):
        # (n, 1) LongTensor built the same way for every field.
        return Variable(torch.LongTensor(np.reshape(np.array(values), (len(values), 1))))

    for u in users:
        user_sessions = data_neural[u]['sessions']
        session_ids = data_neural[u][mode]
        samples[u] = {}
        for position, session_id in enumerate(session_ids):
            if position == 0 and mode == 'train':
                continue
            visits = user_sessions[session_id]
            inputs = visits[:-1]
            sample = {
                'loc': as_column([v[0] for v in inputs]),
                'target': Variable(torch.LongTensor(np.array([v[0] for v in visits[1:]]))),
                'tim': as_column([v[1] for v in inputs]),
            }
            past = []
            if mode == 'test':
                for train_session_id in data_neural[u]['train']:
                    past += [(v[0], v[1]) for v in user_sessions[train_session_id]]
            for earlier_id in session_ids[:position]:
                past += [(v[0], v[1]) for v in user_sessions[earlier_id]]
            # list.sort is stable, matching sorted(..., reverse=False).
            past.sort(key=lambda pair: pair[1])
            sample['history_loc'] = as_column([pair[0] for pair in past])
            sample['history_tim'] = as_column([pair[1] for pair in past])
            samples[u][session_id] = sample
        ids_by_user[u] = session_ids
    return samples, ids_by_user
266 | 
def _long_column(values):
    """Return ``values`` as an ``(n, 1)`` LongTensor (valid for ``n == 0`` too)."""
    column = np.array(values, dtype=np.int64).reshape(len(values), 1)
    return Variable(torch.LongTensor(column))


def generate_input_long_history(data_neural, mode, candidate=None):
    """Build per-session inputs where the history is prepended to the session.

    For each user in ``candidate`` (all users when ``None``) and each session
    id listed in ``data_neural[u][mode]`` a dict is produced with:

    * ``history_loc`` / ``history_tim``: column tensors of the earlier
      check-ins in their original order (elements ``[0]`` / ``[1]``),
    * ``history_count``: run lengths of consecutive equal time ids in the
      history,
    * ``loc`` / ``tim``: history followed by the session without its last
      check-in,
    * ``target``: element ``[0]`` of every session check-in but the first.

    In ``'train'`` mode the first listed session is skipped. In ``'test'``
    mode all sessions listed under ``'train'`` are prepended to the history.

    Fixes over the previous version: the length of the final run is now
    recorded in ``history_count`` (it was always left at 1), an empty history
    yields empty tensors / an empty count list instead of ``IndexError``, and
    the history list is no longer mutated through an alias.

    Returns:
        ``(samples, ids)`` where ``samples[u][session_id]`` is the dict above
        and ``ids[u]`` is ``data_neural[u][mode]``.
    """
    data_train = {}
    train_idx = {}
    if candidate is None:
        candidate = data_neural.keys()
    for u in candidate:
        sessions = data_neural[u]['sessions']
        train_id = data_neural[u][mode]
        data_train[u] = {}
        for c, i in enumerate(train_id):
            if mode == 'train' and c == 0:
                continue
            session = sessions[i]
            history = []
            if mode == 'test':
                for tt in data_neural[u]['train']:
                    history.extend((s[0], s[1]) for s in sessions[tt])
            for j in range(c):
                history.extend((s[0], s[1]) for s in sessions[train_id[j]])

            # Run-length encode the time ids; every run, including the last
            # one, ends up with its full length.
            history_count = []
            last_t = None
            for _, t in history:
                if history_count and t == last_t:
                    history_count[-1] += 1
                else:
                    history_count.append(1)
                    last_t = t

            loc_tim = history + [(s[0], s[1]) for s in session[:-1]]
            trace = {
                'history_loc': _long_column([s[0] for s in history]),
                'history_tim': _long_column([s[1] for s in history]),
                'history_count': history_count,
                'loc': _long_column([s[0] for s in loc_tim]),
                'tim': _long_column([s[1] for s in loc_tim]),
                'target': Variable(torch.LongTensor(
                    np.array([s[0] for s in session[1:]], dtype=np.int64))),
            }
            data_train[u][i] = trace
        train_idx[u] = train_id
    return data_train, train_idx
316 | 
def generate_queue(train_idx, mode, mode2):
    """Flatten ``{user: [session_id, ...]}`` into a list of ``(user, session_id)``.

    * ``mode == 'random'``: round-robin over the users, taking one pending
      session per user on each pass until every user is exhausted. When
      ``mode2 == 'train'`` the first session of each user is dropped.
    * ``mode == 'normal'``: all sessions of the first user, then all of the
      second, and so on (``mode2`` is ignored).
    * any other ``mode``: an empty list.
    """
    users = list(train_idx.keys())
    queue = []
    if mode == 'normal':
        for u in users:
            queue.extend((u, session_id) for session_id in train_idx[u])
        return queue
    if mode != 'random':
        return queue

    skip_first = mode2 == 'train'
    pending = {u: deque(train_idx[u][1:]) if skip_first else deque(train_idx[u])
               for u in users}
    # One pass is always made, then repeated while any user has sessions left.
    while True:
        for u in users:
            if pending[u]:
                queue.append((u, pending[u].popleft()))
        if not any(len(remaining) > 0 for remaining in pending.values()):
            break
    return queue
338 | 
339 | 
def create_dilated_rnn_input(session_sequence_current, poi_distance_matrix):
    """For each position, find the index of the nearest earlier POI.

    Entry ``p`` (``p >= 1``) of the result is the index, within
    ``session_sequence_current``, of the earlier element whose distance to
    element ``p`` in ``poi_distance_matrix`` is smallest. Ties go to the
    latest earlier position. Entry 0 is always 0.

    Args:
        session_sequence_current: list of POI ids used as matrix indices.
        poi_distance_matrix: 2-D array indexed ``[poi_a, poi_b]``.

    Returns:
        List with one index per element of the input sequence.
    """
    length = len(session_sequence_current)
    nearest_earlier = [0] * length
    for position in range(length - 1, 0, -1):
        # Walk the predecessors latest-first so argmin's "first minimum"
        # rule prefers the most recent one.
        predecessors = session_sequence_current[position - 1::-1]
        gaps = poi_distance_matrix[session_sequence_current[position], predecessors]
        nearest_earlier[position] = position - 1 - np.argmin(gaps)
    return nearest_earlier
353 | 
354 | 
355 | 
def generate_detailed_batch_data(one_train_batch):
    """Expand a batch of ``(user_id, session_id)`` pairs into per-sample lists.

    Reads the module-level ``data_neural`` and ``poi_distance_matrix``.

    Returns:
        Six parallel lists: user ids, session ids, location sequences
        (element ``[0]`` of each check-in), sequence lengths, time sequences
        (element ``[1]`` of each check-in) and the index lists produced by
        ``create_dilated_rnn_input``.
    """
    user_id_batch, session_id_batch = [], []
    sequence_batch, sequences_lens_batch = [], []
    sequences_tim_batch, sequences_dilated_input_batch = [], []
    for sample in one_train_batch:
        uid, sid = sample[0], sample[1]
        user_id_batch.append(uid)
        session_id_batch.append(sid)
        checkins = data_neural[uid]['sessions'][sid]
        locations = [visit[0] for visit in checkins]
        times = [visit[1] for visit in checkins]
        dilated_index = create_dilated_rnn_input(locations, poi_distance_matrix)
        sequence_batch.append(locations)
        sequences_lens_batch.append(len(locations))
        sequences_tim_batch.append(times)
        sequences_dilated_input_batch.append(dilated_index)
    return (user_id_batch, session_id_batch, sequence_batch,
            sequences_lens_batch, sequences_tim_batch, sequences_dilated_input_batch)
374 | 
375 | 
def train_network(network, num_epoch=40 ,batch_size = 32,criterion = None):
    """Train ``network`` for ``num_epoch`` epochs, evaluating after each one.

    Relies on the module-level ``data_neural``, ``poi_distance_matrix``,
    ``device`` and ``opt``. The loss is the negative masked log-probability of
    each next location, averaged over the unmasked positions; ``criterion``
    is accepted but not used by this function.

    Args:
        network: model called with the batch tensors; expected to return
            per-step log-probabilities (presumably batch x steps x items —
            TODO confirm against the model).
        num_epoch: number of passes over the training queue.
        batch_size: number of ``(user, session)`` samples per step.
        criterion: unused.
    """
    all_users = data_neural.keys()
    # Only the session-id index is needed here; the tensors are discarded.
    _, train_idx = generate_input_history(data_neural, 'train', candidate=all_users)
    for epoch in range(num_epoch):
        network.train(True)
        epoch_queue = generate_queue(train_idx, 'random', 'train')
        for step, batch in enumerate(minibatch(epoch_queue, batch_size = batch_size)):
            (user_ids, session_ids, sequences,
             lengths, time_sequences, dilated_index) = generate_detailed_batch_data(batch)
            padded, step_mask, non_local_mask = pad_batch_of_lists_masks(sequences, max(lengths))
            padded = Variable(torch.LongTensor(np.array(padded))).to(device)
            step_mask = Variable(torch.FloatTensor(np.array(step_mask))).to(device)
            non_local_mask = Variable(torch.FloatTensor(np.array(non_local_mask))).to(device)
            user_ids = Variable(torch.LongTensor(np.array(user_ids))).to(device)

            log_probs = network(user_ids, padded, non_local_mask, session_ids,
                                time_sequences, True, poi_distance_matrix, dilated_index)
            # Position t predicts the token at t + 1, so drop the last step
            # of the predictions and the first step of the targets.
            masked_log_probs = log_probs[:, :-1] * step_mask[:, :-1, None]
            next_tokens = padded[:, 1:]
            picked = torch.gather(masked_log_probs, dim=2, index=next_tokens[:, :, None])
            loss = -picked.sum() / step_mask[:, :-1].sum()

            # train with backprop
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(network.parameters(), 5.0)
            opt.step()
            if (step + 1) % 20 == 0:
                print("epoch" + str(epoch) + ": loss: " + str(loss))
        results = evaluate(network, 1)
        print("Scores: ", results)
407 | 
408 | 
def get_acc(target, scores):
    """Count top-k hits and accumulate NDCG for one sequence of predictions.

    Rows are processed in order and processing stops at the first row whose
    target is 0. Rows with a non-positive target contribute nothing.

    Args:
        target: 1-D tensor of true item ids, one per row of ``scores``.
        scores: 2-D tensor of item scores with at least 10 columns.

    Returns:
        ``(acc, ndcg)``, each a list of three one-element lists holding the
        totals for top-10, top-5 and top-1, in that order.
    """
    labels = target.data.cpu().numpy()
    _, top_indices = scores.data.topk(10, 1)
    ranked = top_indices.cpu().numpy()
    acc = np.zeros((3, 1))
    ndcg = np.zeros((3, 1))
    cutoffs = (10, 5, 1)
    for row, predictions in enumerate(ranked):
        label = labels[row]
        if label == 0:
            break
        if not label > 0:
            continue
        for slot, k in enumerate(cutoffs):
            top_k = list(predictions[:k])
            if label in top_k:
                acc[slot] += 1
                ndcg[slot] += 1.0 / np.log2(top_k.index(label) + 2)
    return acc.tolist(), ndcg.tolist()
436 | 
def evaluate(network, batch_size = 2):
    """Score ``network`` on the test sessions of every user.

    Relies on the module-level ``data_neural``, ``poi_distance_matrix`` and
    ``device``. Hits and NDCG from ``get_acc`` are summed per user, then over
    all users, and divided by the total of ``sequence_length - 1`` over the
    evaluated sessions.

    Returns:
        List of six floats: accuracy@1, @5, @10 followed by NDCG@1, @5, @10.
    """
    network.train(False)
    all_users = data_neural.keys()
    _, test_idx = generate_input_long_history(data_neural, 'test', candidate=all_users)
    # user -> [n_targets, acc@1, acc@5, acc@10, ndcg@1, ndcg@5, ndcg@10]
    per_user = {}
    with torch.no_grad():
        test_queue = generate_queue(test_idx, 'normal', 'test')
        for batch in minibatch(test_queue, batch_size=batch_size):
            (raw_user_ids, session_ids, sequences,
             lengths, time_sequences, dilated_index) = generate_detailed_batch_data(batch)
            padded, step_mask, non_local_mask = pad_batch_of_lists_masks(sequences, max(lengths))
            padded = Variable(torch.LongTensor(np.array(padded))).to(device)
            step_mask = Variable(torch.FloatTensor(np.array(step_mask))).to(device)
            non_local_mask = Variable(torch.FloatTensor(np.array(non_local_mask))).to(device)
            user_ids = Variable(torch.LongTensor(np.array(raw_user_ids))).to(device)

            log_probs = network(user_ids, padded, non_local_mask, session_ids,
                                time_sequences, False, poi_distance_matrix, dilated_index)
            masked_log_probs = log_probs[:, :-1] * step_mask[:, :-1, None]
            next_tokens = padded[:, 1:]
            for row, user in enumerate(raw_user_ids):
                if user not in per_user:
                    per_user[user] = [0, 0, 0, 0, 0, 0, 0]
                stats = per_user[user]
                acc, ndcg = get_acc(next_tokens[row], masked_log_probs[row])
                # get_acc orders its results top-10, top-5, top-1.
                stats[1] += acc[2][0]
                stats[2] += acc[1][0]
                stats[3] += acc[0][0]
                stats[4] += ndcg[2][0]
                stats[5] += ndcg[1][0]
                stats[6] += ndcg[0][0]
                stats[0] += (lengths[row] - 1)
        totals = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        sum_test_samples = 0.0
        for user in per_user:
            stats = per_user[user]
            for slot in range(6):
                totals[slot] = stats[slot + 1] + totals[slot]
            sum_test_samples = sum_test_samples + stats[0]
        return (np.array(totals) / sum_test_samples).tolist()
483 | 
def geodistance(lng1,lat1,lng2,lat2):
    """Return the haversine distance between two points, rounded to 3 decimals.

    Uses a sphere of radius 6371, so the result is in kilometres.

    Args:
        lng1, lat1: longitude and latitude of the first point, in degrees
            (anything accepted by ``float``).
        lng2, lat2: longitude and latitude of the second point, in degrees.

    Returns:
        Distance as a ``float`` rounded to three decimal places.
    """
    lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding error (or out-of-range latitudes) can push the haversine term
    # marginally outside [0, 1], where sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    distance = 2 * asin(sqrt(a)) * 6371 * 1000
    return round(distance / 1000, 3)
492 | 
493 | 
if __name__ == '__main__':
    # Script entry point: fix the seeds, load the preprocessed data and the
    # cached distance matrix, build the model and optimiser, then train.
    # The names assigned here are module globals read by the functions above.
    np.random.seed(1)
    torch.manual_seed(1)
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    # NOTE: pickle can execute arbitrary code while loading; only point these
    # paths at files you generated yourself.
    with open('foursquare_cut_one_day.pkl', 'rb') as data_file:
        data = pickle.load(data_file, encoding='iso-8859-1')
    vid_list = data['vid_list']
    uid_list = data['uid_list']
    data_neural = data['data_neural']
    poi_coordinate = data['vid_lookup']
    loc_size = len(vid_list)
    uid_size = len(uid_list)
    time_sim_matrix = caculate_time_sim(data_neural)
    # poi_distance_matrix = caculate_poi_distance(poi_coordinate)
    with open('distance.pkl', 'rb') as distance_file:
        poi_distance_matrix = pickle.load(distance_file, encoding='iso-8859-1')
    torch.cuda.empty_cache()
    gc.collect()
    device = torch.device("cuda")
    n_users = uid_size
    n_items = loc_size
    session_id_sequences = None
    user_id_session = None
    network = Model(n_users=n_users, n_items=n_items, data_neural=data_neural, tim_sim_matrix=time_sim_matrix).to(
        device)
    # Only parameters that require gradients are handed to the optimiser.
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad, network.parameters()), lr=0.0001,
                               weight_decay=1 * 1e-6)
    criterion = nn.NLLLoss().cuda()
    train_network(network,criterion=criterion)
521 | 


--------------------------------------------------------------------------------