├── paper.pdf
├── data
│   └── foursquare
│       └── dataset_TSMC2014_NYC.txt
├── README.md
├── Bi-STDDP.py
└── Bi-STDDP-preprocess.py
/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xidongbo/AAAI19_Bi-STDDP/HEAD/paper.pdf
--------------------------------------------------------------------------------
/data/foursquare/dataset_TSMC2014_NYC.txt:
--------------------------------------------------------------------------------
1 | Please download the data from https://sites.google.com/site/yangdingqi/home/foursquaredataset
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AAAI19_Bi-STDDP
2 | Keras implementation of the Bi-directional Spatio-Temporal Dependence and Users’ Dynamic Preferences model (Bi-STDDP)
3 |
4 | Code for the paper:
5 |
6 | Dongbo Xi, Fuzhen Zhuang, Yanchi Liu, Jingjing Gu, Hui Xiong, Qing He: Modelling of Bi-directional Spatio-Temporal Dependence and Users' Dynamic Preferences for Missing POI Check-in Identification. AAAI 2019: 5458-5465
7 |
8 | Please cite our AAAI'19 paper if you use our code. Thanks!
9 |
10 | Author: Dongbo Xi
11 |
12 | # Requirement
13 | python==2.7
14 | Keras==2.1.0
15 | tensorflow-gpu==1.2.1
16 |
17 | # Example to run the code
18 | ```
19 | python Bi-STDDP.py --embedded_dim 64 --hidden_unit 256 --length 5 --batch_size 256 --dropout 0.5 --lr 0.001 --nb_epoch 50 --earlystop 1 --model_name STDDP.model --dataset NYC
20 | ```
21 |
22 | The command-line arguments are documented in the code (see the parse_args function).
23 |
24 | # Dataset
25 | We use the real-world LBSN dataset from Foursquare: https://sites.google.com/site/yangdingqi/home/foursquaredataset
26 |
27 | Split the data into train/validation/test files before running the code (see Bi-STDDP-preprocess.py).
28 |
29 | # Reference
30 | If you are interested in the code, please cite our paper:
31 | ```
32 | Xi D, Zhuang F, Liu Y, et al. Modelling of bi-directional spatio-temporal dependence and users’ dynamic preferences for missing poi check-in identification[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2019, 33(01): 5458-5465.
33 | ```
34 | or in BibTeX style:
35 | ```
36 | @inproceedings{xi2019modelling,
37 |   title={Modelling of bi-directional spatio-temporal dependence and users’ dynamic preferences for missing poi check-in identification},
38 |   author={Xi, Dongbo and Zhuang, Fuzhen and Liu, Yanchi and Gu, Jingjing and Xiong, Hui and He, Qing},
39 |   booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
40 |   volume={33},
41 |   number={01},
42 |   pages={5458--5465},
43 |   year={2019}
44 | }
45 | ```
46 |
47 | Last Update Date: July 28, 2019
48 |
--------------------------------------------------------------------------------
/Bi-STDDP.py:
--------------------------------------------------------------------------------
1 | #encoding:utf8
2 | '''
3 | Keras implementation of the Bi-directional Spatio-Temporal Dependence and Users’ Dynamic
4 | Preferences Model (Bi-STDDP), based on the TensorFlow backend.
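Typical usage, assuming the raw Foursquare check-in file has been downloaded to
data/foursquare/ and split into train/dev/test files with Bi-STDDP-preprocess.py first:
    python Bi-STDDP-preprocess.py
    python Bi-STDDP.py --embedded_dim 64 --hidden_unit 256 --length 5 --batch_size 256 --dropout 0.5 --lr 0.001 --nb_epoch 50 --earlystop 1 --model_name STDDP.model --dataset NYC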
5 | 6 | @author: 7 | xidongbo17s@ict.ac.cn 8 | ''' 9 | from keras.layers import Dense, Embedding, merge, Input,Reshape,Activation,Subtract,Multiply,Lambda,Dropout 10 | from keras.models import Model 11 | from keras.optimizers import adam 12 | from keras.callbacks import EarlyStopping,ModelCheckpoint 13 | from sklearn.externals import joblib 14 | from sklearn import preprocessing as process 15 | import numpy as np 16 | import os 17 | import tensorflow as tf 18 | import argparse 19 | import keras.backend.tensorflow_backend as k 20 | from itertools import izip 21 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2' 22 | os.environ["CUDA_VISIBLE_DEVICES"]='0' 23 | gpu_options = tf.GPUOptions(allow_growth=True) 24 | sess=tf.InteractiveSession( 25 | config=tf.ConfigProto( 26 | gpu_options=gpu_options)) 27 | k.set_session(sess) 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser(description="Run STDDP.") 31 | parser.add_argument('--embedded_dim', type=int, default=64, 32 | help='Embedding dim.') 33 | parser.add_argument('--hidden_unit', type=int, default=256, 34 | help='Number of hidden units.') 35 | parser.add_argument('--length', type=int, default=1, 36 | help='Length of POI sequence.') 37 | parser.add_argument('--batch_size', type=int, default=128, 38 | help='Batch size.') 39 | parser.add_argument('--dropout', type=float, default=0.5, 40 | help='Dropout rate.') 41 | parser.add_argument('--lr', type=float, default=0.001, 42 | help='Learning rate.') 43 | parser.add_argument('--nb_epoch', type=int, default=50, 44 | help='Number of epoch.') 45 | parser.add_argument('--earlystop', type=int, default=1, 46 | help='Earlystop to avoid overfitting.') 47 | parser.add_argument('--model_name', type=str, default='STDDP.model', 48 | help='Name of best model to save.') 49 | parser.add_argument('--dataset', type=str, default='NYC', 50 | help='Which dataset to use.') 51 | parser.add_argument('--basepath', type=str, default="data/foursquare/", 52 | help='Dataset path.') 53 | return parser.parse_args() 54 | 55 | class STDDP(object): 56 | def __init__(self,embedded_dim,hidden_unit,length,dropout,batch_size,lr,nb_epoch,earlystop,model_name,dataset,basepath): 57 | self.embedded_dim=embedded_dim 58 | self.hidden_unit=hidden_unit 59 | self.length=length 60 | self.dropout=dropout 61 | self.batch_size=batch_size 62 | self.lr=lr 63 | self.nb_epoch=nb_epoch 64 | self.earlystop=earlystop 65 | self.model_name=model_name 66 | self.dataset=dataset 67 | self.basepath=basepath 68 | self.para = joblib.load(os.path.join(self.basepath, "{}.para".format(self.dataset))) 69 | self.poi_index = joblib.load(os.path.join(self.basepath, "poi_index_{}.pkl".format(self.dataset))) 70 | self.poi_loc = joblib.load(os.path.join(self.basepath, 'poi_loc.{}'.format(self.dataset))) 71 | self.maxlen = self.para['maxlen'] 72 | self.nb_poi = self.para['nb_poi'] 73 | self.nb_train = self.para['nb_train'] 74 | self.nb_dev = self.para['nb_dev'] 75 | self.nb_test = self.para['nb_test'] 76 | self.nb_user = self.para['nb_user'] 77 | print('maxlen:' + str(self.maxlen)) 78 | # nb: Number 79 | print('nb_poi:' + str(self.nb_poi)) 80 | print('nb_train:' + str(self.nb_train)) 81 | print('nb_dev:' + str(self.nb_dev)) 82 | print('nb_test:' + str(self.nb_test)) 83 | print('nb_user:' + str(self.nb_user)) 84 | self._init_model() 85 | 86 | def _init_model(self): 87 | # Model 88 | ###################user embedding############# 89 | user = Input(shape=(1,), dtype='int32') 90 | user_vec = Embedding(input_dim=self.nb_user + 1, 91 | output_dim=self.embedded_dim)(user) 
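# user is a (batch, 1) tensor of user indices; the Embedding output has shape
# (batch, 1, embedded_dim) and is flattened to (batch, embedded_dim) below.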
92 | user_vec = Reshape((self.embedded_dim,))(user_vec) 93 | ############################################### 94 | 95 | ###################poi embedding############# 96 | x1 = Input(shape=(self.length,), dtype='int32') # prior POI sequence 97 | x2 = Input(shape=(self.length,), dtype='int32') # next POI sequence 98 | # share embedding vector 99 | embedding = Embedding( 100 | input_dim=self.nb_poi + 1, 101 | output_dim=self.embedded_dim) 102 | prior_pois = embedding(x1) 103 | next_pois = embedding(x2) 104 | # None,length,dim->None,length*dim 105 | prior_pois = Reshape((self.length * self.embedded_dim,))(prior_pois) 106 | next_pois = Reshape((self.length * self.embedded_dim,))(next_pois) 107 | ############################################ 108 | 109 | ###################Bi-STDDP############# 110 | x1_t = Input(shape=(1,), dtype='float32') # prior POI's time(seconds) 111 | x2_t = Input(shape=(1,), dtype='float32') # next POI's time(seconds) 112 | x1_d = Input(shape=(self.nb_poi,), dtype='float32') # prior POI's distance vec of all candidate POIs 113 | x2_d = Input(shape=(self.nb_poi,), dtype='float32') # next POI's distance vec of all candidate POIs 114 | y_t1 = Input(shape=(7,), dtype='float32') # target time pattern 115 | y_t2 = Input(shape=(1,), dtype='float32') # target time 116 | 117 | # --Bi-STD: Bi-directional Spatio-Temporal Dependence 118 | sub1 = Subtract()([y_t2, x1_t]) 119 | sub2 = Subtract()([x2_t, y_t2]) 120 | sub1 = Dense(self.nb_poi, activation='tanh')(Lambda(lambda x: x / 3600.)(sub1)) 121 | sub2 = Dense(self.nb_poi, activation='tanh')(Lambda(lambda x: x / 3600.)(sub2)) 122 | x1_dis = Multiply()([sub1, x1_d]) # STD_{t-1} 123 | x2_dis = Multiply()([sub2, x2_d]) # STD_{t+1} 124 | # --DP: Dynamic Preference 125 | wt = Dense(self.hidden_unit, activation='tanh')(y_t1) 126 | wp1 = Dense(self.hidden_unit, activation='tanh')(prior_pois) 127 | wp2 = Dense(self.hidden_unit, activation='tanh')(next_pois) 128 | user_vec = Dense(self.hidden_unit, activation='tanh')(user_vec) 129 | dynamic_preference = merge([wp1, wp2, user_vec, wt], mode='sum', output_shape=(self.hidden_unit,)) 130 | 131 | output = Dense(self.nb_poi)(dynamic_preference) 132 | output=Dropout(self.dropout)(output) 133 | output = merge([output, x1_dis, x2_dis], mode='sum') 134 | output = Activation('softmax')(output) 135 | 136 | self.model = Model(inputs=[x1, x2, x1_t, x2_t, x1_d, x2_d, y_t1, y_t2, user], outputs=output) 137 | self.model.compile( 138 | loss='categorical_crossentropy', 139 | optimizer=adam( 140 | lr=self.lr), 141 | metrics=[ 142 | self.acc_top1, 143 | self.acc_top5, 144 | self.acc_top10]) 145 | self.early_stopping = EarlyStopping(monitor='val_loss', patience=self.earlystop) 146 | self.checkpoint = ModelCheckpoint(self.model_name, monitor='val_loss', verbose=0, save_best_only=True, mode='min') 147 | 148 | def fit(self): 149 | self. 
model.fit_generator( 150 | generator=self.generate_data_from_file(type='train'), 151 | steps_per_epoch=np.ceil(self.nb_train/self.batch_size).astype(int), epochs=self.nb_epoch, verbose=1, 152 | callbacks=[self.early_stopping, self.checkpoint], 153 | validation_data=self.generate_data_from_file(type='dev'), 154 | validation_steps=np.ceil(self.nb_dev/self.batch_size).astype(int), max_queue_size=20, workers=1) 155 | 156 | def test(self): 157 | self.model.load_weights(self.model_name, {'acc_top1': self.acc_top1, 'acc_top5': self.acc_top5, 'acc_top10': self.acc_top10}) 158 | print(self.model.evaluate_generator(generator=self.generate_data_from_file(type='test'), 159 | steps=np.ceil(self.nb_test/self.batch_size).astype(int), max_queue_size=20, workers=1)) 160 | 161 | count = 0 162 | MAP = 0 163 | step = 0 164 | steps = np.ceil(self.nb_test/self.batch_size).astype(int) 165 | for x, y_true in self.generate_data_from_file(type='test'): 166 | step += 1 167 | y_priord = self.model.predict(x) 168 | for i in range(len(y_true)): 169 | count += 1 170 | ziped = zip(y_true[i], y_priord[i]) 171 | ziped.sort(key=lambda x: x[1], reverse=True) 172 | p = [t[0] for t in ziped] 173 | rank = np.argmax(p) + 1 174 | MAP += 1.0 / rank 175 | if step >= steps: 176 | break 177 | print('test samples:%d' % count) 178 | print('MAP:%.4f' % (MAP / count)) 179 | 180 | def acc_topk(self,y_true, y_pred, k): 181 | topk = tf.nn.top_k(y_pred, k).indices 182 | y = tf.argmax(y_true, axis=-1) 183 | y = tf.reshape(y, (-1, 1)) 184 | y = tf.cast(y, dtype=tf.int32) 185 | acc = tf.equal(y, topk) 186 | return tf.reduce_mean(tf.cast(acc, dtype=tf.float32)) * acc.shape[1].value 187 | 188 | def acc_top1(self,y_true, y_priord): 189 | return self.acc_topk(y_true, y_priord, k=1) 190 | 191 | def acc_top5(self,y_true, y_priord): 192 | return self.acc_topk(y_true, y_priord, k=5) 193 | 194 | def acc_top10(self,y_true, y_priord): 195 | return self.acc_topk(y_true, y_priord, k=10) 196 | 197 | 198 | 199 | def generate_data_from_file(self,type='train'): 200 | if type == 'train': 201 | x_path = 'train.{}'.format(self.dataset) 202 | x_st_path = 'train_st.{}'.format(self.dataset) 203 | y_path = 'train_label_poi.{}'.format(self.dataset) 204 | elif type == 'dev': 205 | x_path = 'dev.{}'.format(self.dataset) 206 | x_st_path = 'dev_st.{}'.format(self.dataset) 207 | y_path = 'dev_label_poi.{}'.format(self.dataset) 208 | elif type == 'test': 209 | x_path = 'test.{}'.format(self.dataset) 210 | x_st_path = 'test_st.{}'.format(self.dataset) 211 | y_path = 'test_label_poi.{}'.format(self.dataset) 212 | else: 213 | print('data type error') 214 | exit() 215 | flag = True 216 | # POI sequence information and user. Split by space. 217 | # Each line: forward_POI_sequence(self.maxlen) + backward_POI_sequence(self.maxlen)+user 218 | x_path = os.path.join(self.basepath, x_path) 219 | 220 | # Neighbor POI's temporal and spatio information. Split by space. 221 | # Each line: prior POI' visit time+next POI' visit time 222 | x_st_path = os.path.join(self.basepath, x_st_path) 223 | 224 | # the missing POI(label) information. Split by space. 
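# (written by Bi-STDDP-preprocess.py as *_label_poi.<dataset>; the 7-dim time pattern comes from its encode_time_loc: 2 weekend/weekday bits + 5 time-of-day slots)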
225 | # Each line: missing_POI_name+target_time_pattern(7-dim)+target_time(seconds) 226 | y_path = os.path.join(self.basepath, y_path) 227 | while True: 228 | with open(x_path) as f1, open(x_st_path) as f2, open(y_path) as f3: 229 | count = 0 230 | for x_line, x_st_line, y_line in izip(f1, f2, f3): 231 | y_line=y_line.split() 232 | label_poi = y_line[0] # missing_POI 233 | label_t = np.array(y_line[1:8], dtype=np.float32) # target_time_pattern(7-dim) 234 | label_second = float(y_line[8]) # target_time(seconds) 235 | onehot_poi = np.zeros(len(self.poi_index) + 1, dtype=np.int) 236 | #if self.poi_index.has_key(label_poi): 237 | pindex = self.poi_index.get(label_poi,0) 238 | onehot_poi[pindex] = 1 239 | if flag: 240 | # x1 and x2 means forward and backward, s means spatio, t means temporal 241 | x1, x2, x1_t, x2_t, x1_s, x2_s, y, y_t1, y_t2, user = [], [], [], [], [], [], [], [], [], [] 242 | #loc1_batch,loc2_batch=[],[] 243 | flag = False 244 | count += 1 245 | x_line = x_line.split() 246 | x1.append(x_line[self.maxlen - self.length:self.maxlen]) # the forward POI sequence 247 | x2.append(x_line[self.maxlen:self.maxlen + self.length]) # the backward POI sequence 248 | user.append([x_line[-1]]) 249 | 250 | x_st_line = x_st_line.split() 251 | x1_t.append([float(x_st_line[0])]) # the prior POI's visit time 252 | x2_t.append([float(x_st_line[1])]) # the next POI's visit time 253 | ################################################### 254 | x1_s.append(self.poi_loc[int(x1[-1][-1])])# prior POI's space vector 255 | x2_s.append(self.poi_loc[int(x2[-1][0])])# next POI's space vector 256 | y.append(onehot_poi) 257 | y_t1.append(label_t) 258 | y_t2.append([label_second]) 259 | if count >= self.batch_size: 260 | count = 0 261 | flag = True 262 | x1 = np.array(x1, dtype=np.int) 263 | x2 = np.array(x2, dtype=np.int) 264 | x1_t = np.array(x1_t, dtype=np.float32) 265 | x2_t = np.array(x2_t, dtype=np.float32) 266 | x1_s = np.array(x1_s, dtype=np.float32) 267 | x2_s = np.array(x2_s, dtype=np.float32) 268 | y_t1 = np.array(y_t1, dtype=np.float32) 269 | y_t2 = np.array(y_t2, dtype=np.float32) 270 | user = np.array(user, dtype=np.int) 271 | y = np.array(y, dtype=np.int) 272 | yield ([x1, x2, x1_t, x2_t, x1_s, x2_s, y_t1, y_t2, user], y) 273 | if not flag: 274 | flag = True 275 | x1 = np.array(x1, dtype=np.int) 276 | x2 = np.array(x2, dtype=np.int) 277 | x1_t = np.array(x1_t, dtype=np.float32) 278 | x2_t = np.array(x2_t, dtype=np.float32) 279 | x1_s = np.array(x1_s, dtype=np.float32) 280 | x2_s = np.array(x2_s, dtype=np.float32) 281 | y_t1 = np.array(y_t1, dtype=np.float32) 282 | y_t2 = np.array(y_t2, dtype=np.float32) 283 | user = np.array(user, dtype=np.int) 284 | y = np.array(y, dtype=np.int) 285 | yield ([x1, x2, x1_t, x2_t, x1_s, x2_s, y_t1, y_t2, user], y) 286 | 287 | if __name__ == '__main__': 288 | args = parse_args() 289 | stddp=STDDP(embedded_dim=args.embedded_dim,hidden_unit=args.hidden_unit,length=args.length,batch_size=args.batch_size, 290 | lr=args.lr,nb_epoch=args.nb_epoch,earlystop=args.earlystop,model_name=args.model_name,dataset=args.dataset, 291 | basepath=args.basepath) 292 | stddp.fit() 293 | stddp.test() 294 | -------------------------------------------------------------------------------- /Bi-STDDP-preprocess.py: -------------------------------------------------------------------------------- 1 | #encoding:utf8 2 | ''' 3 | Preprocess code for Bi-directional Spatio-Temporal Dependence and Users’ Dynamic Preferences Model( Bi-STDDP) . 
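Reads data/foursquare/dataset_TSMC2014_<dataset>.txt and writes the files consumed by
Bi-STDDP.py: train/dev/test.<dataset> (padded POI index sequences + user index),
train/dev/test_st.<dataset> (times of the neighbouring check-ins),
train/dev/test_label_poi.<dataset> (missing POI, 7-dim time pattern, timestamp),
<dataset>.para (dataset sizes), poi_index_<dataset>.pkl / index_poi_<dataset>.pkl
(POI vocabulary) and poi_loc.<dataset> (scaled POI-to-POI distance matrix).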
4 | @author: 5 | xidongbo17s@ict.ac.cn 6 | ''' 7 | import os 8 | import pandas as pd 9 | import time 10 | from sklearn.externals import joblib 11 | import numpy as np 12 | from keras.preprocessing.text import Tokenizer 13 | from keras.preprocessing.sequence import pad_sequences 14 | from sklearn import preprocessing as process 15 | from scipy.spatial.distance import cdist 16 | class DataHelp(object): 17 | def __init__(self): 18 | self.url = r"data/" 19 | self.basepath = os.path.join(self.url, "foursquare/") 20 | # user whose number of check-in less this will be removed 21 | self.min_poi_num = 10 22 | self.min_left_right_length = 4 23 | # each sample length,0 padding 24 | self.max_len = 10 25 | self.nb_train, self.nb_dev, self.nb_test = 0, 0, 0 26 | self.data_list = [] 27 | self.data = [] 28 | 29 | def preprocess_foursquare(self, data='NYC'): 30 | print ('preprocess {} data'.format(data)) 31 | 32 | datapath = os.path.join( 33 | self.basepath, "dataset_TSMC2014_{}.txt".format(data)) 34 | data_df = pd.read_csv(datapath, header=None, sep='\t', encoding='utf8') 35 | # 0:user_id, 1:poi_id, 2:class_id, 3:class_name, 4:lati, 5:long, 6:offset, 7:utc_time 36 | # NYC_dict.data: 37 | # format:map{user_id:map{'poi_id':[],'class_name':[],'loc':[(1,2)],'time':[]}} 38 | self.nb_user=len(data_df[0].unique()) 39 | print('user:%d' % self.nb_user) 40 | print('poi:%d' % len(data_df[1].unique())) 41 | 42 | train_data_dict = dict() 43 | dev_data_dict = dict() 44 | test_data_dict = dict() 45 | print ('user:%d' % len(data_df[0].unique())) 46 | print ('poi:%d' % len(data_df[1].unique())) 47 | # transfer to timestamp 48 | data_df[7] = [time.mktime(time.strptime( 49 | data_df[7][index], '%a %b %d %X +0000 %Y')) + int(data_df[6][index]) * 60 for index in data_df[7].index] 50 | #sort by time 51 | data_df = data_df.sort_values(by=7,axis=0,ascending=True) 52 | # NYC:1-1083 53 | user_id_list = list(data_df[0]) 54 | poi_id_list = list(data_df[1]) 55 | lati_list = list(data_df[4]) 56 | long_list = list(data_df[5]) 57 | uct_time_list = list(data_df[7]) 58 | nb_check_in = len(user_id_list) 59 | print ('total %d check_ins' % nb_check_in) 60 | # 8:1:1split 61 | dev_split = '2012-12-14 12:00:00' # '2012-08-14 12:00:00' # 62 | test_split = '2013-01-15 12:00:00' # '2012-11-15 12:00:00' # 63 | dev_split_timetamp = time.mktime( 64 | time.strptime(dev_split, "%Y-%m-%d %H:%M:%S")) 65 | test_split_timetamp = time.mktime( 66 | time.strptime(test_split, "%Y-%m-%d %H:%M:%S")) 67 | for i in range(nb_check_in): 68 | check_in_timetamp = uct_time_list[i] 69 | user_id = user_id_list[i] 70 | if check_in_timetamp < dev_split_timetamp: # train data 71 | if user_id not in train_data_dict: 72 | train_data_dict[user_id] = dict() 73 | train_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 74 | train_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 75 | train_data_dict[user_id]['time'] = [check_in_timetamp] 76 | else: 77 | train_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 78 | train_data_dict[user_id]['loc'].append( 79 | (float(lati_list[i]), float(long_list[i]))) 80 | train_data_dict[user_id]['time'].append(check_in_timetamp) 81 | elif check_in_timetamp > test_split_timetamp: # test data 82 | if user_id not in test_data_dict: 83 | test_data_dict[user_id] = dict() 84 | test_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 85 | test_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 86 | test_data_dict[user_id]['time'] = [check_in_timetamp] 87 | else: 88 | 
test_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 89 | test_data_dict[user_id]['loc'].append( 90 | (float(lati_list[i]), float(long_list[i]))) 91 | test_data_dict[user_id]['time'].append(check_in_timetamp) 92 | else: # dev data 93 | if user_id not in dev_data_dict: 94 | dev_data_dict[user_id] = dict() 95 | dev_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 96 | dev_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 97 | dev_data_dict[user_id]['time'] = [check_in_timetamp] 98 | else: 99 | dev_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 100 | dev_data_dict[user_id]['loc'].append( 101 | (float(lati_list[i]), float(long_list[i]))) 102 | dev_data_dict[user_id]['time'].append(check_in_timetamp) 103 | 104 | # remove less self.min_poi_num sequence 105 | train_temp = list(train_data_dict.keys()) 106 | dev_temp = list(dev_data_dict.keys()) 107 | test_temp = list(test_data_dict.keys()) 108 | print ('total train %d users' % len(train_data_dict)) 109 | print ('total dev %d users' % len(dev_data_dict)) 110 | print ('total test %d users' % len(test_data_dict)) 111 | print ( 112 | 'user whose number of check-in less %d will be removed' % 113 | self.min_poi_num) 114 | for key in train_temp: 115 | if len(train_data_dict[key]['poi_id']) < self.min_poi_num: 116 | train_data_dict.pop(key) 117 | for key in dev_temp: 118 | if len(dev_data_dict[key]['poi_id']) < self.min_poi_num: 119 | dev_data_dict.pop(key) 120 | for key in test_temp: 121 | if len(test_data_dict[key]['poi_id']) < self.min_poi_num: 122 | test_data_dict.pop(key) 123 | 124 | print ('total train %d users' % len(train_data_dict)) 125 | print ('total dev %d users' % len(dev_data_dict)) 126 | print ('total test %d users' % len(test_data_dict)) 127 | joblib.dump([train_data_dict, dev_data_dict, test_data_dict], os.path.join( 128 | self.basepath, "{}_dict.data".format(data)), compress=3) 129 | 130 | def load_preprocessed_foursquare(self, data='NYC'): 131 | print ('load {} data'.format(data)) 132 | datapath = os.path.join(self.basepath, "{}_dict.data".format(data)) 133 | # data 134 | # format:map{user_id:map{'poi_id':[],'loc':[(1,2)],'time':[]}} 135 | train_data_dict, dev_data_dict, test_data_dict = joblib.load(datapath) 136 | #train_data_list's each entry is a list:[poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 137 | train_data_list = [] 138 | #key is user 139 | for key in train_data_dict: 140 | #python 3 not use str() 141 | train_data_list.append([train_data_dict[key]['poi_id'],train_data_dict[key]['loc'],train_data_dict[key]['time'],key]) 142 | dev_data_list = [] 143 | for key in dev_data_dict: 144 | dev_data_list.append([dev_data_dict[key]['poi_id'],dev_data_dict[key]['loc'],dev_data_dict[key]['time'],key]) 145 | test_data_list = [] 146 | for key in test_data_dict: 147 | test_data_list.append([test_data_dict[key]['poi_id'],test_data_dict[key]['loc'],test_data_dict[key]['time'],key]) 148 | self.nb_train, self.nb_dev, self.nb_test = len( 149 | train_data_list), len(dev_data_list), len(test_data_list) 150 | print ('before sample:train:%d, dev:%d, test:%d' % 151 | (self.nb_train, self.nb_dev, self.nb_test)) 152 | self.train_data_list = train_data_list 153 | self.dev_data_list = dev_data_list 154 | self.test_data_list = test_data_list 155 | 156 | def encode_padding(self, data='NYC'): 157 | self.load_preprocessed_foursquare(data=data) 158 | counts = [] 159 | data_list1 = [] 160 | data_list2 = [] 161 | # list1+label+list2 162 | data_list_lmr = [] 163 | labels_poi = [] 164 | left_neighbor_narray 
=[] 165 | right_neighbor_narray =[] 166 | label_narray=[] 167 | 168 | # train_data_list's each entry is a list: 169 | # [poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 170 | users=[] 171 | train_user=set() 172 | for data_list in [ 173 | self.train_data_list, 174 | self.dev_data_list, 175 | self.test_data_list]: 176 | count = 0 177 | for pois in data_list: 178 | #[poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 179 | sequence = pois[0] 180 | loc_sequence = pois[1] 181 | time_sequence = pois[2] 182 | user=pois[3] 183 | if data_list==self.train_data_list: 184 | train_user.add(user) 185 | elif user not in train_user: 186 | #not in the train user is oov,index is 0 187 | user=0 188 | for label_rand_index in range(self.min_left_right_length,len(sequence)-self.min_left_right_length): 189 | count+=1 190 | #user for user_vec 191 | users.append(user) 192 | left_sequence = sequence[max(label_rand_index-self.max_len,0):label_rand_index] 193 | right_sequence = sequence[label_rand_index+1:min(label_rand_index+self.max_len+1,len(sequence))] 194 | 195 | # time : 1 196 | left_neighbor_narray.append([time_sequence[label_rand_index - 1]],) 197 | right_neighbor_narray.append([time_sequence[label_rand_index + 1]]) 198 | label_time=self.encode_time_loc(time_sequence[label_rand_index]) 199 | label_time.append(time_sequence[label_rand_index]) 200 | label_narray.append(label_time) 201 | 202 | sequence_lmr = sequence[ 203 | max(label_rand_index - self.max_len, 0):min(label_rand_index + self.max_len + 1, 204 | len(sequence))] 205 | data_list1.append(str(' '.join(left_sequence))) 206 | data_list2.append(str(' '.join(right_sequence))) 207 | data_list_lmr.append(str(' '.join(sequence_lmr))) 208 | labels_poi.append(sequence[label_rand_index]) 209 | counts.append(count) 210 | left_neighbor_narray=np.array(left_neighbor_narray) 211 | right_neighbor_narray=np.array(right_neighbor_narray) 212 | label_narray=np.array(label_narray) 213 | 214 | self.nb_train, self.nb_dev, self.nb_test = counts 215 | print('after sample:train:%d, dev:%d, test:%d' % 216 | (self.nb_train, self.nb_dev, self.nb_test)) 217 | # set nb_words will result in random 218 | #for poi sequence, string split using ' ' 219 | tokenizer = Tokenizer( 220 | nb_words=None, 221 | filters='', 222 | lower=False, 223 | split=' ', 224 | char_level=False) 225 | self.poi_data=data_list_lmr 226 | #only count in train,not in train is oov,index is 0 227 | tokenizer.fit_on_texts(self.poi_data[:self.nb_train]) 228 | self.poi_index = tokenizer.word_index 229 | joblib.dump(self.poi_index, os.path.join( 230 | self.basepath, 'poi_index_{}.pkl'.format(data)), compress=3) 231 | print('Found %d unique poi in train.' 
% len(self.poi_index)) 232 | left_sequences = tokenizer.texts_to_sequences(data_list1) 233 | right_sequences = tokenizer.texts_to_sequences(data_list2) 234 | 235 | index_poi = dict(zip(self.poi_index.values(), self.poi_index.keys())) 236 | joblib.dump(index_poi, os.path.join( 237 | self.basepath,'index_poi_{}.pkl'.format(data)), compress=3) 238 | train_dev_num = self.nb_train + self.nb_dev 239 | 240 | left_lens = sorted([len(sequence) for sequence in left_sequences]) 241 | left_zero_count = 0 242 | for length in left_lens: 243 | if length == 0: 244 | left_zero_count += 1 245 | print('left_samples minlen:%d,maxlen:%d,avglen:%d,zero_len_num:%d' % 246 | (left_lens[0], left_lens[-1], np.mean(left_lens),left_zero_count)) 247 | 248 | right_lens = sorted([len(sequence) for sequence in right_sequences]) 249 | right_zero_count = 0 250 | for length in right_lens: 251 | if length == 0: 252 | right_zero_count += 1 253 | print('right_samples minlen:%d,maxlen:%d,avglen:%d,zero_len_num:%d' % 254 | (right_lens[0], right_lens[-1], np.mean(right_lens),right_zero_count)) 255 | self.nb_poi = len(self.poi_index)+1 256 | params={'maxlen':self.max_len,'nb_poi':self.nb_poi, 257 | 'nb_train':self.nb_train,'nb_dev':self.nb_dev,'nb_test':self.nb_test,'nb_user':self.nb_user} 258 | print("pad...") 259 | # default maxlen is the length of the longest sequence 260 | #len(left_sequences)<=maxlen 261 | #post padding!!! 262 | data1 = pad_sequences(left_sequences, maxlen=self.max_len,padding='pre') 263 | data2 = pad_sequences(right_sequences, maxlen=self.max_len,padding='post') 264 | 265 | print('Shape of all left_data tensor:', data1.shape) 266 | print('Shape of all right_data tensor:', data2.shape) 267 | print('Shape of all left_neighbor_data tensor:', left_neighbor_narray.shape) 268 | print('Shape of all right_neighbor_data tensor:', right_neighbor_narray.shape) 269 | print('Shape of all label_narray tensor:', label_narray.shape) 270 | 271 | le=process.LabelEncoder() 272 | users = np.array(users) 273 | users=le.fit_transform(users) 274 | users = np.reshape(users, (-1, 1)) 275 | ######################shuffle train######################### 276 | train_users = users[:self.nb_train] 277 | 278 | # each line:poi_name,time_vec,loc_vec 279 | labels_poi = np.concatenate([np.reshape(np.array(labels_poi), (-1, 1)), label_narray], axis=-1) 280 | train_labels_poi = labels_poi[:self.nb_train] 281 | train_data_poi1 = data1[:self.nb_train] 282 | train_data_poi2 = data2[:self.nb_train] 283 | train_left_neighbor_narray=left_neighbor_narray[:self.nb_train] 284 | train_right_neighbor_narray = right_neighbor_narray[:self.nb_train] 285 | 286 | shuffle_index = range(self.nb_train) 287 | np.random.shuffle(shuffle_index) 288 | train_users = list(train_users[shuffle_index]) 289 | 290 | train_labels_poi = train_labels_poi[shuffle_index] 291 | train_data_poi1 = train_data_poi1[shuffle_index] 292 | train_data_poi2 = train_data_poi2[shuffle_index] 293 | train_left_neighbor_narray=train_left_neighbor_narray[shuffle_index] 294 | train_right_neighbor_narray=train_right_neighbor_narray[shuffle_index] 295 | 296 | #labels_poi=np.concatenate([train_labels_poi,labels_poi[self.nb_train:]],axis=0) 297 | np.savetxt(os.path.join(self.basepath, "train_label_poi.{}".format(data)),train_labels_poi,fmt='%s') 298 | np.savetxt(os.path.join(self.basepath, "dev_label_poi.{}".format(data)),labels_poi[self.nb_train:train_dev_num],fmt='%s') 299 | np.savetxt(os.path.join(self.basepath, "test_label_poi.{}".format(data)),labels_poi[train_dev_num:],fmt='%s') 300 | 
############################################################### 301 | np.savetxt(os.path.join(self.basepath, 'train.{}'.format(data)), 302 | np.concatenate((train_data_poi1,train_data_poi2,train_users), 303 | axis=-1), fmt='%d') 304 | np.savetxt(os.path.join(self.basepath, 'train_st.{}'.format(data)), 305 | np.concatenate((train_left_neighbor_narray, train_right_neighbor_narray), 306 | axis=-1), fmt='%.6f') 307 | np.savetxt(os.path.join(self.basepath, 'dev.{}'.format(data)), 308 | np.concatenate((data1[self.nb_train:train_dev_num], data2[self.nb_train:train_dev_num],users[self.nb_train:train_dev_num]), 309 | axis=-1), fmt='%d') 310 | np.savetxt(os.path.join(self.basepath, 'dev_st.{}'.format(data)), 311 | np.concatenate((left_neighbor_narray[self.nb_train:train_dev_num], right_neighbor_narray[self.nb_train:train_dev_num]), 312 | axis=-1), fmt='%.6f') 313 | np.savetxt(os.path.join(self.basepath, 'test.{}'.format(data)), 314 | np.concatenate((data1[train_dev_num:], data2[train_dev_num:],users[train_dev_num:]), 315 | axis=-1), fmt='%d') 316 | np.savetxt(os.path.join(self.basepath, 'test_st.{}'.format(data)), 317 | np.concatenate((left_neighbor_narray[train_dev_num:], 318 | right_neighbor_narray[train_dev_num:]), 319 | axis=-1), fmt='%.6f') 320 | joblib.dump(params, os.path.join(self.basepath, "{}.para".format(data)), compress=3) 321 | self.poi_loc(data) 322 | 323 | def poi_loc(self,data): 324 | # nb_poi*2 325 | p_loc = np.zeros((self.nb_poi, 2), dtype=np.float32)## 326 | print('p_loc matrix shape:', p_loc.shape) 327 | # only count data in train ,rather than dev and test data 328 | # index 0 is oov poi 329 | for user in self.train_data_list: 330 | # [poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 331 | pois = user[0] 332 | loc = user[1] 333 | for index in range(len(pois)): 334 | cindex = self.poi_index[pois[index]] 335 | if p_loc[cindex][0] == 0.: 336 | p_loc[cindex] = loc[index] 337 | else:# average loc for mulit same POI 338 | p_loc[cindex] = (p_loc[cindex]+loc[index])/2. 
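# Index 0 is reserved for out-of-vocabulary POIs (not seen in training); give it the mean location of all known POIs as a neutral fallback.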
339 | p_loc[0]=np.mean(p_loc[1:],axis=0) 340 | # nb_poi*nb_poi 341 | loc_matrix=cdist(p_loc, p_loc, 'euclidean') 342 | loc_matrix = process.scale(loc_matrix, with_mean=False, axis=-1) # with_mean=True 343 | np.savetxt(os.path.join(self.basepath, "poi_loc_{}.txt".format(data)),loc_matrix,fmt='%.6f') 344 | joblib.dump(loc_matrix, os.path.join(self.basepath, "poi_loc.{}".format(data)), compress=3) 345 | 346 | def encode_time_loc(self,poi_time): 347 | #int 348 | #encode weekend and weekday in one week and 5 time slot in one day 349 | #each poi_time to be a 2+5 dim vector 350 | onehot_time=[0]*7 351 | #timestamp to week and time 352 | time_str=time.strftime("%w:%H%M",time.localtime(poi_time)) 353 | time_str=time_str.split(':') 354 | if int(time_str[0]) in [0,6]: 355 | week_index=0 356 | elif int(time_str[0]) in [1,2,3,4,5]: 357 | week_index = 1 358 | else: 359 | print('int(time_str[0])'+str(time_str[0])) 360 | exit() 361 | onehot_time[week_index]=1 362 | hour_minute=int(time_str[1]) 363 | if 800<=hour_minute<1130: 364 | index=2 365 | elif 1130<=hour_minute<1400: 366 | index=3 367 | elif 1400<=hour_minute<1730: 368 | index=4 369 | elif 1730<=hour_minute<2200: 370 | index=5 371 | elif hour_minute>=2200 or hour_minute<800: 372 | index=6 373 | else: 374 | print('time error:'+str(hour_minute)) 375 | exit() 376 | onehot_time[index]=1 377 | return onehot_time 378 | 379 | if __name__ == '__main__': 380 | datahelp=DataHelp() 381 | datahelp.preprocess_foursquare('NYC') 382 | datahelp.encode_padding('NYC') 383 | --------------------------------------------------------------------------------
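For reference, a minimal sketch (not part of the repository) of what the 7-dim target time pattern from encode_time_loc looks like for one check-in; the date below is an arbitrary Saturday-morning example:
```
import time

# A Saturday at 09:15 local time: the weekend indicator (index 0) and the
# 08:00-11:30 slot (index 2) are set, all other entries stay 0.
ts = time.mktime(time.strptime('2013-01-12 09:15:00', '%Y-%m-%d %H:%M:%S'))
# DataHelp().encode_time_loc(ts) -> [1, 0, 1, 0, 0, 0, 0]
```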