├── paper.pdf
├── data
│   └── foursquare
│       └── dataset_TSMC2014_NYC.txt
├── README.md
├── Bi-STDDP.py
└── Bi-STDDP-preprocess.py
/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xidongbo/AAAI19_Bi-STDDP/HEAD/paper.pdf
--------------------------------------------------------------------------------
/data/foursquare/dataset_TSMC2014_NYC.txt:
--------------------------------------------------------------------------------
1 | Please download the data from https://sites.google.com/site/yangdingqi/home/foursquaredataset
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AAAI19_Bi-STDDP
2 | Keras implementation of the Bi-directional Spatio-Temporal Dependence and Users’ Dynamic Preferences model (Bi-STDDP)
3 |
4 | Code for the paper:
5 |
6 | Dongbo Xi, Fuzhen Zhuang, Yanchi Liu, Jingjing Gu, Hui Xiong, Qing He: Modelling of Bi-directional Spatio-Temporal Dependence and Users' Dynamic Preferences for Missing POI Check-in Identification. AAAI 2019: 5458-5465
7 |
8 | Please cite our AAAI'19 paper if you use our code. Thanks!
9 |
10 | Author: Dongbo Xi
11 |
12 | # Requirement
13 | python==2.7
14 | Keras==2.1.0
15 | tensorflow-gpu==1.2.1
16 |
17 | # Example to run the code
18 | ```
19 | python Bi-STDDP.py --embedded_dim 64 --hidden_unit 256 --length 5 --batch_size 256 --dropout 0.5 --lr 0.001 --nb_epoch 50 --earlystop 1 --model_name STDDP.model --dataset NYC
20 | ```
21 |
22 | The command-line arguments are documented in the code (see the parse_args function).
23 |
24 | # Dataset
25 | We use the real-world LBSN dataset from Foursquare: https://sites.google.com/site/yangdingqi/home/foursquaredataset
26 |
27 | Split the data into train/validation/test files before running the code (see Bi-STDDP-preprocess.py).
28 |
29 | # Reference
30 | If you are interested in the code, please cite our paper:
31 | ```
32 | Xi D, Zhuang F, Liu Y, et al. Modelling of bi-directional spatio-temporal dependence and users’ dynamic preferences for missing poi check-in identification[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2019, 33(01): 5458-5465.
33 | ```
34 | or in BibTeX style:
35 | ```
36 | @inproceedings{xi2019modelling,
37 |   title={Modelling of bi-directional spatio-temporal dependence and users’ dynamic preferences for missing poi check-in identification},
38 |   author={Xi, Dongbo and Zhuang, Fuzhen and Liu, Yanchi and Gu, Jingjing and Xiong, Hui and He, Qing},
39 |   booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
40 |   volume={33},
41 |   number={01},
42 |   pages={5458--5465},
43 |   year={2019}
44 | }
45 | ```
46 |
47 | Last Update Date: July 28, 2019
48 |
--------------------------------------------------------------------------------
/Bi-STDDP.py:
--------------------------------------------------------------------------------
1 | #encoding:utf8
2 | '''
3 | Keras implementation of the Bi-directional Spatio-Temporal Dependence and Users’ Dynamic
4 | Preferences Model (Bi-STDDP), based on the TensorFlow backend.
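Typical usage, assuming the raw Foursquare check-in file has been downloaded to
data/foursquare/ and split into train/dev/test files with Bi-STDDP-preprocess.py first:
    python Bi-STDDP-preprocess.py
    python Bi-STDDP.py --embedded_dim 64 --hidden_unit 256 --length 5 --batch_size 256 --dropout 0.5 --lr 0.001 --nb_epoch 50 --earlystop 1 --model_name STDDP.model --dataset NYC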
5 | 6 | @author: 7 | xidongbo17s@ict.ac.cn 8 | ''' 9 | from keras.layers import Dense, Embedding, merge, Input,Reshape,Activation,Subtract,Multiply,Lambda,Dropout 10 | from keras.models import Model 11 | from keras.optimizers import adam 12 | from keras.callbacks import EarlyStopping,ModelCheckpoint 13 | from sklearn.externals import joblib 14 | from sklearn import preprocessing as process 15 | import numpy as np 16 | import os 17 | import tensorflow as tf 18 | import argparse 19 | import keras.backend.tensorflow_backend as k 20 | from itertools import izip 21 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2' 22 | os.environ["CUDA_VISIBLE_DEVICES"]='0' 23 | gpu_options = tf.GPUOptions(allow_growth=True) 24 | sess=tf.InteractiveSession( 25 | config=tf.ConfigProto( 26 | gpu_options=gpu_options)) 27 | k.set_session(sess) 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser(description="Run STDDP.") 31 | parser.add_argument('--embedded_dim', type=int, default=64, 32 | help='Embedding dim.') 33 | parser.add_argument('--hidden_unit', type=int, default=256, 34 | help='Number of hidden units.') 35 | parser.add_argument('--length', type=int, default=1, 36 | help='Length of POI sequence.') 37 | parser.add_argument('--batch_size', type=int, default=128, 38 | help='Batch size.') 39 | parser.add_argument('--dropout', type=float, default=0.5, 40 | help='Dropout rate.') 41 | parser.add_argument('--lr', type=float, default=0.001, 42 | help='Learning rate.') 43 | parser.add_argument('--nb_epoch', type=int, default=50, 44 | help='Number of epoch.') 45 | parser.add_argument('--earlystop', type=int, default=1, 46 | help='Earlystop to avoid overfitting.') 47 | parser.add_argument('--model_name', type=str, default='STDDP.model', 48 | help='Name of best model to save.') 49 | parser.add_argument('--dataset', type=str, default='NYC', 50 | help='Which dataset to use.') 51 | parser.add_argument('--basepath', type=str, default="data/foursquare/", 52 | help='Dataset path.') 53 | return parser.parse_args() 54 | 55 | class STDDP(object): 56 | def __init__(self,embedded_dim,hidden_unit,length,dropout,batch_size,lr,nb_epoch,earlystop,model_name,dataset,basepath): 57 | self.embedded_dim=embedded_dim 58 | self.hidden_unit=hidden_unit 59 | self.length=length 60 | self.dropout=dropout 61 | self.batch_size=batch_size 62 | self.lr=lr 63 | self.nb_epoch=nb_epoch 64 | self.earlystop=earlystop 65 | self.model_name=model_name 66 | self.dataset=dataset 67 | self.basepath=basepath 68 | self.para = joblib.load(os.path.join(self.basepath, "{}.para".format(self.dataset))) 69 | self.poi_index = joblib.load(os.path.join(self.basepath, "poi_index_{}.pkl".format(self.dataset))) 70 | self.poi_loc = joblib.load(os.path.join(self.basepath, 'poi_loc.{}'.format(self.dataset))) 71 | self.maxlen = self.para['maxlen'] 72 | self.nb_poi = self.para['nb_poi'] 73 | self.nb_train = self.para['nb_train'] 74 | self.nb_dev = self.para['nb_dev'] 75 | self.nb_test = self.para['nb_test'] 76 | self.nb_user = self.para['nb_user'] 77 | print('maxlen:' + str(self.maxlen)) 78 | # nb: Number 79 | print('nb_poi:' + str(self.nb_poi)) 80 | print('nb_train:' + str(self.nb_train)) 81 | print('nb_dev:' + str(self.nb_dev)) 82 | print('nb_test:' + str(self.nb_test)) 83 | print('nb_user:' + str(self.nb_user)) 84 | self._init_model() 85 | 86 | def _init_model(self): 87 | # Model 88 | ###################user embedding############# 89 | user = Input(shape=(1,), dtype='int32') 90 | user_vec = Embedding(input_dim=self.nb_user + 1, 91 | output_dim=self.embedded_dim)(user) 
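# user is a (batch, 1) tensor of user indices; the Embedding output has shape
# (batch, 1, embedded_dim) and is flattened to (batch, embedded_dim) below.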
92 | user_vec = Reshape((self.embedded_dim,))(user_vec) 93 | ############################################### 94 | 95 | ###################poi embedding############# 96 | x1 = Input(shape=(self.length,), dtype='int32') # prior POI sequence 97 | x2 = Input(shape=(self.length,), dtype='int32') # next POI sequence 98 | # share embedding vector 99 | embedding = Embedding( 100 | input_dim=self.nb_poi + 1, 101 | output_dim=self.embedded_dim) 102 | prior_pois = embedding(x1) 103 | next_pois = embedding(x2) 104 | # None,length,dim->None,length*dim 105 | prior_pois = Reshape((self.length * self.embedded_dim,))(prior_pois) 106 | next_pois = Reshape((self.length * self.embedded_dim,))(next_pois) 107 | ############################################ 108 | 109 | ###################Bi-STDDP############# 110 | x1_t = Input(shape=(1,), dtype='float32') # prior POI's time(seconds) 111 | x2_t = Input(shape=(1,), dtype='float32') # next POI's time(seconds) 112 | x1_d = Input(shape=(self.nb_poi,), dtype='float32') # prior POI's distance vec of all candidate POIs 113 | x2_d = Input(shape=(self.nb_poi,), dtype='float32') # next POI's distance vec of all candidate POIs 114 | y_t1 = Input(shape=(7,), dtype='float32') # target time pattern 115 | y_t2 = Input(shape=(1,), dtype='float32') # target time 116 | 117 | # --Bi-STD: Bi-directional Spatio-Temporal Dependence 118 | sub1 = Subtract()([y_t2, x1_t]) 119 | sub2 = Subtract()([x2_t, y_t2]) 120 | sub1 = Dense(self.nb_poi, activation='tanh')(Lambda(lambda x: x / 3600.)(sub1)) 121 | sub2 = Dense(self.nb_poi, activation='tanh')(Lambda(lambda x: x / 3600.)(sub2)) 122 | x1_dis = Multiply()([sub1, x1_d]) # STD_{t-1} 123 | x2_dis = Multiply()([sub2, x2_d]) # STD_{t+1} 124 | # --DP: Dynamic Preference 125 | wt = Dense(self.hidden_unit, activation='tanh')(y_t1) 126 | wp1 = Dense(self.hidden_unit, activation='tanh')(prior_pois) 127 | wp2 = Dense(self.hidden_unit, activation='tanh')(next_pois) 128 | user_vec = Dense(self.hidden_unit, activation='tanh')(user_vec) 129 | dynamic_preference = merge([wp1, wp2, user_vec, wt], mode='sum', output_shape=(self.hidden_unit,)) 130 | 131 | output = Dense(self.nb_poi)(dynamic_preference) 132 | output=Dropout(self.dropout)(output) 133 | output = merge([output, x1_dis, x2_dis], mode='sum') 134 | output = Activation('softmax')(output) 135 | 136 | self.model = Model(inputs=[x1, x2, x1_t, x2_t, x1_d, x2_d, y_t1, y_t2, user], outputs=output) 137 | self.model.compile( 138 | loss='categorical_crossentropy', 139 | optimizer=adam( 140 | lr=self.lr), 141 | metrics=[ 142 | self.acc_top1, 143 | self.acc_top5, 144 | self.acc_top10]) 145 | self.early_stopping = EarlyStopping(monitor='val_loss', patience=self.earlystop) 146 | self.checkpoint = ModelCheckpoint(self.model_name, monitor='val_loss', verbose=0, save_best_only=True, mode='min') 147 | 148 | def fit(self): 149 | self. 
model.fit_generator( 150 | generator=self.generate_data_from_file(type='train'), 151 | steps_per_epoch=np.ceil(self.nb_train/self.batch_size).astype(int), epochs=self.nb_epoch, verbose=1, 152 | callbacks=[self.early_stopping, self.checkpoint], 153 | validation_data=self.generate_data_from_file(type='dev'), 154 | validation_steps=np.ceil(self.nb_dev/self.batch_size).astype(int), max_queue_size=20, workers=1) 155 | 156 | def test(self): 157 | self.model.load_weights(self.model_name, {'acc_top1': self.acc_top1, 'acc_top5': self.acc_top5, 'acc_top10': self.acc_top10}) 158 | print(self.model.evaluate_generator(generator=self.generate_data_from_file(type='test'), 159 | steps=np.ceil(self.nb_test/self.batch_size).astype(int), max_queue_size=20, workers=1)) 160 | 161 | count = 0 162 | MAP = 0 163 | step = 0 164 | steps = np.ceil(self.nb_test/self.batch_size).astype(int) 165 | for x, y_true in self.generate_data_from_file(type='test'): 166 | step += 1 167 | y_priord = self.model.predict(x) 168 | for i in range(len(y_true)): 169 | count += 1 170 | ziped = zip(y_true[i], y_priord[i]) 171 | ziped.sort(key=lambda x: x[1], reverse=True) 172 | p = [t[0] for t in ziped] 173 | rank = np.argmax(p) + 1 174 | MAP += 1.0 / rank 175 | if step >= steps: 176 | break 177 | print('test samples:%d' % count) 178 | print('MAP:%.4f' % (MAP / count)) 179 | 180 | def acc_topk(self,y_true, y_pred, k): 181 | topk = tf.nn.top_k(y_pred, k).indices 182 | y = tf.argmax(y_true, axis=-1) 183 | y = tf.reshape(y, (-1, 1)) 184 | y = tf.cast(y, dtype=tf.int32) 185 | acc = tf.equal(y, topk) 186 | return tf.reduce_mean(tf.cast(acc, dtype=tf.float32)) * acc.shape[1].value 187 | 188 | def acc_top1(self,y_true, y_priord): 189 | return self.acc_topk(y_true, y_priord, k=1) 190 | 191 | def acc_top5(self,y_true, y_priord): 192 | return self.acc_topk(y_true, y_priord, k=5) 193 | 194 | def acc_top10(self,y_true, y_priord): 195 | return self.acc_topk(y_true, y_priord, k=10) 196 | 197 | 198 | 199 | def generate_data_from_file(self,type='train'): 200 | if type == 'train': 201 | x_path = 'train.{}'.format(self.dataset) 202 | x_st_path = 'train_st.{}'.format(self.dataset) 203 | y_path = 'train_label_poi.{}'.format(self.dataset) 204 | elif type == 'dev': 205 | x_path = 'dev.{}'.format(self.dataset) 206 | x_st_path = 'dev_st.{}'.format(self.dataset) 207 | y_path = 'dev_label_poi.{}'.format(self.dataset) 208 | elif type == 'test': 209 | x_path = 'test.{}'.format(self.dataset) 210 | x_st_path = 'test_st.{}'.format(self.dataset) 211 | y_path = 'test_label_poi.{}'.format(self.dataset) 212 | else: 213 | print('data type error') 214 | exit() 215 | flag = True 216 | # POI sequence information and user. Split by space. 217 | # Each line: forward_POI_sequence(self.maxlen) + backward_POI_sequence(self.maxlen)+user 218 | x_path = os.path.join(self.basepath, x_path) 219 | 220 | # Neighbor POI's temporal and spatio information. Split by space. 221 | # Each line: prior POI' visit time+next POI' visit time 222 | x_st_path = os.path.join(self.basepath, x_st_path) 223 | 224 | # the missing POI(label) information. Split by space. 
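# (written by Bi-STDDP-preprocess.py as *_label_poi.<dataset>; the 7-dim time pattern comes from its encode_time_loc: 2 weekend/weekday bits + 5 time-of-day slots)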
225 | # Each line: missing_POI_name+target_time_pattern(7-dim)+target_time(seconds) 226 | y_path = os.path.join(self.basepath, y_path) 227 | while True: 228 | with open(x_path) as f1, open(x_st_path) as f2, open(y_path) as f3: 229 | count = 0 230 | for x_line, x_st_line, y_line in izip(f1, f2, f3): 231 | y_line=y_line.split() 232 | label_poi = y_line[0] # missing_POI 233 | label_t = np.array(y_line[1:8], dtype=np.float32) # target_time_pattern(7-dim) 234 | label_second = float(y_line[8]) # target_time(seconds) 235 | onehot_poi = np.zeros(len(self.poi_index) + 1, dtype=np.int) 236 | #if self.poi_index.has_key(label_poi): 237 | pindex = self.poi_index.get(label_poi,0) 238 | onehot_poi[pindex] = 1 239 | if flag: 240 | # x1 and x2 means forward and backward, s means spatio, t means temporal 241 | x1, x2, x1_t, x2_t, x1_s, x2_s, y, y_t1, y_t2, user = [], [], [], [], [], [], [], [], [], [] 242 | #loc1_batch,loc2_batch=[],[] 243 | flag = False 244 | count += 1 245 | x_line = x_line.split() 246 | x1.append(x_line[self.maxlen - self.length:self.maxlen]) # the forward POI sequence 247 | x2.append(x_line[self.maxlen:self.maxlen + self.length]) # the backward POI sequence 248 | user.append([x_line[-1]]) 249 | 250 | x_st_line = x_st_line.split() 251 | x1_t.append([float(x_st_line[0])]) # the prior POI's visit time 252 | x2_t.append([float(x_st_line[1])]) # the next POI's visit time 253 | ################################################### 254 | x1_s.append(self.poi_loc[int(x1[-1][-1])])# prior POI's space vector 255 | x2_s.append(self.poi_loc[int(x2[-1][0])])# next POI's space vector 256 | y.append(onehot_poi) 257 | y_t1.append(label_t) 258 | y_t2.append([label_second]) 259 | if count >= self.batch_size: 260 | count = 0 261 | flag = True 262 | x1 = np.array(x1, dtype=np.int) 263 | x2 = np.array(x2, dtype=np.int) 264 | x1_t = np.array(x1_t, dtype=np.float32) 265 | x2_t = np.array(x2_t, dtype=np.float32) 266 | x1_s = np.array(x1_s, dtype=np.float32) 267 | x2_s = np.array(x2_s, dtype=np.float32) 268 | y_t1 = np.array(y_t1, dtype=np.float32) 269 | y_t2 = np.array(y_t2, dtype=np.float32) 270 | user = np.array(user, dtype=np.int) 271 | y = np.array(y, dtype=np.int) 272 | yield ([x1, x2, x1_t, x2_t, x1_s, x2_s, y_t1, y_t2, user], y) 273 | if not flag: 274 | flag = True 275 | x1 = np.array(x1, dtype=np.int) 276 | x2 = np.array(x2, dtype=np.int) 277 | x1_t = np.array(x1_t, dtype=np.float32) 278 | x2_t = np.array(x2_t, dtype=np.float32) 279 | x1_s = np.array(x1_s, dtype=np.float32) 280 | x2_s = np.array(x2_s, dtype=np.float32) 281 | y_t1 = np.array(y_t1, dtype=np.float32) 282 | y_t2 = np.array(y_t2, dtype=np.float32) 283 | user = np.array(user, dtype=np.int) 284 | y = np.array(y, dtype=np.int) 285 | yield ([x1, x2, x1_t, x2_t, x1_s, x2_s, y_t1, y_t2, user], y) 286 | 287 | if __name__ == '__main__': 288 | args = parse_args() 289 | stddp=STDDP(embedded_dim=args.embedded_dim,hidden_unit=args.hidden_unit,length=args.length,batch_size=args.batch_size, 290 | lr=args.lr,nb_epoch=args.nb_epoch,earlystop=args.earlystop,model_name=args.model_name,dataset=args.dataset, 291 | basepath=args.basepath) 292 | stddp.fit() 293 | stddp.test() 294 | -------------------------------------------------------------------------------- /Bi-STDDP-preprocess.py: -------------------------------------------------------------------------------- 1 | #encoding:utf8 2 | ''' 3 | Preprocess code for Bi-directional Spatio-Temporal Dependence and Users’ Dynamic Preferences Model( Bi-STDDP) . 
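Reads data/foursquare/dataset_TSMC2014_<dataset>.txt and writes the files consumed by
Bi-STDDP.py: train/dev/test.<dataset> (padded POI index sequences + user index),
train/dev/test_st.<dataset> (times of the neighbouring check-ins),
train/dev/test_label_poi.<dataset> (missing POI, 7-dim time pattern, timestamp),
<dataset>.para (dataset sizes), poi_index_<dataset>.pkl / index_poi_<dataset>.pkl
(POI vocabulary) and poi_loc.<dataset> (scaled POI-to-POI distance matrix).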
4 | @author: 5 | xidongbo17s@ict.ac.cn 6 | ''' 7 | import os 8 | import pandas as pd 9 | import time 10 | from sklearn.externals import joblib 11 | import numpy as np 12 | from keras.preprocessing.text import Tokenizer 13 | from keras.preprocessing.sequence import pad_sequences 14 | from sklearn import preprocessing as process 15 | from scipy.spatial.distance import cdist 16 | class DataHelp(object): 17 | def __init__(self): 18 | self.url = r"data/" 19 | self.basepath = os.path.join(self.url, "foursquare/") 20 | # user whose number of check-in less this will be removed 21 | self.min_poi_num = 10 22 | self.min_left_right_length = 4 23 | # each sample length,0 padding 24 | self.max_len = 10 25 | self.nb_train, self.nb_dev, self.nb_test = 0, 0, 0 26 | self.data_list = [] 27 | self.data = [] 28 | 29 | def preprocess_foursquare(self, data='NYC'): 30 | print ('preprocess {} data'.format(data)) 31 | 32 | datapath = os.path.join( 33 | self.basepath, "dataset_TSMC2014_{}.txt".format(data)) 34 | data_df = pd.read_csv(datapath, header=None, sep='\t', encoding='utf8') 35 | # 0:user_id, 1:poi_id, 2:class_id, 3:class_name, 4:lati, 5:long, 6:offset, 7:utc_time 36 | # NYC_dict.data: 37 | # format:map{user_id:map{'poi_id':[],'class_name':[],'loc':[(1,2)],'time':[]}} 38 | self.nb_user=len(data_df[0].unique()) 39 | print('user:%d' % self.nb_user) 40 | print('poi:%d' % len(data_df[1].unique())) 41 | 42 | train_data_dict = dict() 43 | dev_data_dict = dict() 44 | test_data_dict = dict() 45 | print ('user:%d' % len(data_df[0].unique())) 46 | print ('poi:%d' % len(data_df[1].unique())) 47 | # transfer to timestamp 48 | data_df[7] = [time.mktime(time.strptime( 49 | data_df[7][index], '%a %b %d %X +0000 %Y')) + int(data_df[6][index]) * 60 for index in data_df[7].index] 50 | #sort by time 51 | data_df = data_df.sort_values(by=7,axis=0,ascending=True) 52 | # NYC:1-1083 53 | user_id_list = list(data_df[0]) 54 | poi_id_list = list(data_df[1]) 55 | lati_list = list(data_df[4]) 56 | long_list = list(data_df[5]) 57 | uct_time_list = list(data_df[7]) 58 | nb_check_in = len(user_id_list) 59 | print ('total %d check_ins' % nb_check_in) 60 | # 8:1:1split 61 | dev_split = '2012-12-14 12:00:00' # '2012-08-14 12:00:00' # 62 | test_split = '2013-01-15 12:00:00' # '2012-11-15 12:00:00' # 63 | dev_split_timetamp = time.mktime( 64 | time.strptime(dev_split, "%Y-%m-%d %H:%M:%S")) 65 | test_split_timetamp = time.mktime( 66 | time.strptime(test_split, "%Y-%m-%d %H:%M:%S")) 67 | for i in range(nb_check_in): 68 | check_in_timetamp = uct_time_list[i] 69 | user_id = user_id_list[i] 70 | if check_in_timetamp < dev_split_timetamp: # train data 71 | if user_id not in train_data_dict: 72 | train_data_dict[user_id] = dict() 73 | train_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 74 | train_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 75 | train_data_dict[user_id]['time'] = [check_in_timetamp] 76 | else: 77 | train_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 78 | train_data_dict[user_id]['loc'].append( 79 | (float(lati_list[i]), float(long_list[i]))) 80 | train_data_dict[user_id]['time'].append(check_in_timetamp) 81 | elif check_in_timetamp > test_split_timetamp: # test data 82 | if user_id not in test_data_dict: 83 | test_data_dict[user_id] = dict() 84 | test_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 85 | test_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 86 | test_data_dict[user_id]['time'] = [check_in_timetamp] 87 | else: 88 | 
test_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 89 | test_data_dict[user_id]['loc'].append( 90 | (float(lati_list[i]), float(long_list[i]))) 91 | test_data_dict[user_id]['time'].append(check_in_timetamp) 92 | else: # dev data 93 | if user_id not in dev_data_dict: 94 | dev_data_dict[user_id] = dict() 95 | dev_data_dict[user_id]['poi_id'] = [poi_id_list[i]] 96 | dev_data_dict[user_id]['loc'] = [(float(lati_list[i]), float(long_list[i]))] 97 | dev_data_dict[user_id]['time'] = [check_in_timetamp] 98 | else: 99 | dev_data_dict[user_id]['poi_id'].append(poi_id_list[i]) 100 | dev_data_dict[user_id]['loc'].append( 101 | (float(lati_list[i]), float(long_list[i]))) 102 | dev_data_dict[user_id]['time'].append(check_in_timetamp) 103 | 104 | # remove less self.min_poi_num sequence 105 | train_temp = list(train_data_dict.keys()) 106 | dev_temp = list(dev_data_dict.keys()) 107 | test_temp = list(test_data_dict.keys()) 108 | print ('total train %d users' % len(train_data_dict)) 109 | print ('total dev %d users' % len(dev_data_dict)) 110 | print ('total test %d users' % len(test_data_dict)) 111 | print ( 112 | 'user whose number of check-in less %d will be removed' % 113 | self.min_poi_num) 114 | for key in train_temp: 115 | if len(train_data_dict[key]['poi_id']) < self.min_poi_num: 116 | train_data_dict.pop(key) 117 | for key in dev_temp: 118 | if len(dev_data_dict[key]['poi_id']) < self.min_poi_num: 119 | dev_data_dict.pop(key) 120 | for key in test_temp: 121 | if len(test_data_dict[key]['poi_id']) < self.min_poi_num: 122 | test_data_dict.pop(key) 123 | 124 | print ('total train %d users' % len(train_data_dict)) 125 | print ('total dev %d users' % len(dev_data_dict)) 126 | print ('total test %d users' % len(test_data_dict)) 127 | joblib.dump([train_data_dict, dev_data_dict, test_data_dict], os.path.join( 128 | self.basepath, "{}_dict.data".format(data)), compress=3) 129 | 130 | def load_preprocessed_foursquare(self, data='NYC'): 131 | print ('load {} data'.format(data)) 132 | datapath = os.path.join(self.basepath, "{}_dict.data".format(data)) 133 | # data 134 | # format:map{user_id:map{'poi_id':[],'loc':[(1,2)],'time':[]}} 135 | train_data_dict, dev_data_dict, test_data_dict = joblib.load(datapath) 136 | #train_data_list's each entry is a list:[poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 137 | train_data_list = [] 138 | #key is user 139 | for key in train_data_dict: 140 | #python 3 not use str() 141 | train_data_list.append([train_data_dict[key]['poi_id'],train_data_dict[key]['loc'],train_data_dict[key]['time'],key]) 142 | dev_data_list = [] 143 | for key in dev_data_dict: 144 | dev_data_list.append([dev_data_dict[key]['poi_id'],dev_data_dict[key]['loc'],dev_data_dict[key]['time'],key]) 145 | test_data_list = [] 146 | for key in test_data_dict: 147 | test_data_list.append([test_data_dict[key]['poi_id'],test_data_dict[key]['loc'],test_data_dict[key]['time'],key]) 148 | self.nb_train, self.nb_dev, self.nb_test = len( 149 | train_data_list), len(dev_data_list), len(test_data_list) 150 | print ('before sample:train:%d, dev:%d, test:%d' % 151 | (self.nb_train, self.nb_dev, self.nb_test)) 152 | self.train_data_list = train_data_list 153 | self.dev_data_list = dev_data_list 154 | self.test_data_list = test_data_list 155 | 156 | def encode_padding(self, data='NYC'): 157 | self.load_preprocessed_foursquare(data=data) 158 | counts = [] 159 | data_list1 = [] 160 | data_list2 = [] 161 | # list1+label+list2 162 | data_list_lmr = [] 163 | labels_poi = [] 164 | left_neighbor_narray 
=[] 165 | right_neighbor_narray =[] 166 | label_narray=[] 167 | 168 | # train_data_list's each entry is a list: 169 | # [poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 170 | users=[] 171 | train_user=set() 172 | for data_list in [ 173 | self.train_data_list, 174 | self.dev_data_list, 175 | self.test_data_list]: 176 | count = 0 177 | for pois in data_list: 178 | #[poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 179 | sequence = pois[0] 180 | loc_sequence = pois[1] 181 | time_sequence = pois[2] 182 | user=pois[3] 183 | if data_list==self.train_data_list: 184 | train_user.add(user) 185 | elif user not in train_user: 186 | #not in the train user is oov,index is 0 187 | user=0 188 | for label_rand_index in range(self.min_left_right_length,len(sequence)-self.min_left_right_length): 189 | count+=1 190 | #user for user_vec 191 | users.append(user) 192 | left_sequence = sequence[max(label_rand_index-self.max_len,0):label_rand_index] 193 | right_sequence = sequence[label_rand_index+1:min(label_rand_index+self.max_len+1,len(sequence))] 194 | 195 | # time : 1 196 | left_neighbor_narray.append([time_sequence[label_rand_index - 1]],) 197 | right_neighbor_narray.append([time_sequence[label_rand_index + 1]]) 198 | label_time=self.encode_time_loc(time_sequence[label_rand_index]) 199 | label_time.append(time_sequence[label_rand_index]) 200 | label_narray.append(label_time) 201 | 202 | sequence_lmr = sequence[ 203 | max(label_rand_index - self.max_len, 0):min(label_rand_index + self.max_len + 1, 204 | len(sequence))] 205 | data_list1.append(str(' '.join(left_sequence))) 206 | data_list2.append(str(' '.join(right_sequence))) 207 | data_list_lmr.append(str(' '.join(sequence_lmr))) 208 | labels_poi.append(sequence[label_rand_index]) 209 | counts.append(count) 210 | left_neighbor_narray=np.array(left_neighbor_narray) 211 | right_neighbor_narray=np.array(right_neighbor_narray) 212 | label_narray=np.array(label_narray) 213 | 214 | self.nb_train, self.nb_dev, self.nb_test = counts 215 | print('after sample:train:%d, dev:%d, test:%d' % 216 | (self.nb_train, self.nb_dev, self.nb_test)) 217 | # set nb_words will result in random 218 | #for poi sequence, string split using ' ' 219 | tokenizer = Tokenizer( 220 | nb_words=None, 221 | filters='', 222 | lower=False, 223 | split=' ', 224 | char_level=False) 225 | self.poi_data=data_list_lmr 226 | #only count in train,not in train is oov,index is 0 227 | tokenizer.fit_on_texts(self.poi_data[:self.nb_train]) 228 | self.poi_index = tokenizer.word_index 229 | joblib.dump(self.poi_index, os.path.join( 230 | self.basepath, 'poi_index_{}.pkl'.format(data)), compress=3) 231 | print('Found %d unique poi in train.' 
% len(self.poi_index)) 232 | left_sequences = tokenizer.texts_to_sequences(data_list1) 233 | right_sequences = tokenizer.texts_to_sequences(data_list2) 234 | 235 | index_poi = dict(zip(self.poi_index.values(), self.poi_index.keys())) 236 | joblib.dump(index_poi, os.path.join( 237 | self.basepath,'index_poi_{}.pkl'.format(data)), compress=3) 238 | train_dev_num = self.nb_train + self.nb_dev 239 | 240 | left_lens = sorted([len(sequence) for sequence in left_sequences]) 241 | left_zero_count = 0 242 | for length in left_lens: 243 | if length == 0: 244 | left_zero_count += 1 245 | print('left_samples minlen:%d,maxlen:%d,avglen:%d,zero_len_num:%d' % 246 | (left_lens[0], left_lens[-1], np.mean(left_lens),left_zero_count)) 247 | 248 | right_lens = sorted([len(sequence) for sequence in right_sequences]) 249 | right_zero_count = 0 250 | for length in right_lens: 251 | if length == 0: 252 | right_zero_count += 1 253 | print('right_samples minlen:%d,maxlen:%d,avglen:%d,zero_len_num:%d' % 254 | (right_lens[0], right_lens[-1], np.mean(right_lens),right_zero_count)) 255 | self.nb_poi = len(self.poi_index)+1 256 | params={'maxlen':self.max_len,'nb_poi':self.nb_poi, 257 | 'nb_train':self.nb_train,'nb_dev':self.nb_dev,'nb_test':self.nb_test,'nb_user':self.nb_user} 258 | print("pad...") 259 | # default maxlen is the length of the longest sequence 260 | #len(left_sequences)<=maxlen 261 | #post padding!!! 262 | data1 = pad_sequences(left_sequences, maxlen=self.max_len,padding='pre') 263 | data2 = pad_sequences(right_sequences, maxlen=self.max_len,padding='post') 264 | 265 | print('Shape of all left_data tensor:', data1.shape) 266 | print('Shape of all right_data tensor:', data2.shape) 267 | print('Shape of all left_neighbor_data tensor:', left_neighbor_narray.shape) 268 | print('Shape of all right_neighbor_data tensor:', right_neighbor_narray.shape) 269 | print('Shape of all label_narray tensor:', label_narray.shape) 270 | 271 | le=process.LabelEncoder() 272 | users = np.array(users) 273 | users=le.fit_transform(users) 274 | users = np.reshape(users, (-1, 1)) 275 | ######################shuffle train######################### 276 | train_users = users[:self.nb_train] 277 | 278 | # each line:poi_name,time_vec,loc_vec 279 | labels_poi = np.concatenate([np.reshape(np.array(labels_poi), (-1, 1)), label_narray], axis=-1) 280 | train_labels_poi = labels_poi[:self.nb_train] 281 | train_data_poi1 = data1[:self.nb_train] 282 | train_data_poi2 = data2[:self.nb_train] 283 | train_left_neighbor_narray=left_neighbor_narray[:self.nb_train] 284 | train_right_neighbor_narray = right_neighbor_narray[:self.nb_train] 285 | 286 | shuffle_index = range(self.nb_train) 287 | np.random.shuffle(shuffle_index) 288 | train_users = list(train_users[shuffle_index]) 289 | 290 | train_labels_poi = train_labels_poi[shuffle_index] 291 | train_data_poi1 = train_data_poi1[shuffle_index] 292 | train_data_poi2 = train_data_poi2[shuffle_index] 293 | train_left_neighbor_narray=train_left_neighbor_narray[shuffle_index] 294 | train_right_neighbor_narray=train_right_neighbor_narray[shuffle_index] 295 | 296 | #labels_poi=np.concatenate([train_labels_poi,labels_poi[self.nb_train:]],axis=0) 297 | np.savetxt(os.path.join(self.basepath, "train_label_poi.{}".format(data)),train_labels_poi,fmt='%s') 298 | np.savetxt(os.path.join(self.basepath, "dev_label_poi.{}".format(data)),labels_poi[self.nb_train:train_dev_num],fmt='%s') 299 | np.savetxt(os.path.join(self.basepath, "test_label_poi.{}".format(data)),labels_poi[train_dev_num:],fmt='%s') 300 | 
############################################################### 301 | np.savetxt(os.path.join(self.basepath, 'train.{}'.format(data)), 302 | np.concatenate((train_data_poi1,train_data_poi2,train_users), 303 | axis=-1), fmt='%d') 304 | np.savetxt(os.path.join(self.basepath, 'train_st.{}'.format(data)), 305 | np.concatenate((train_left_neighbor_narray, train_right_neighbor_narray), 306 | axis=-1), fmt='%.6f') 307 | np.savetxt(os.path.join(self.basepath, 'dev.{}'.format(data)), 308 | np.concatenate((data1[self.nb_train:train_dev_num], data2[self.nb_train:train_dev_num],users[self.nb_train:train_dev_num]), 309 | axis=-1), fmt='%d') 310 | np.savetxt(os.path.join(self.basepath, 'dev_st.{}'.format(data)), 311 | np.concatenate((left_neighbor_narray[self.nb_train:train_dev_num], right_neighbor_narray[self.nb_train:train_dev_num]), 312 | axis=-1), fmt='%.6f') 313 | np.savetxt(os.path.join(self.basepath, 'test.{}'.format(data)), 314 | np.concatenate((data1[train_dev_num:], data2[train_dev_num:],users[train_dev_num:]), 315 | axis=-1), fmt='%d') 316 | np.savetxt(os.path.join(self.basepath, 'test_st.{}'.format(data)), 317 | np.concatenate((left_neighbor_narray[train_dev_num:], 318 | right_neighbor_narray[train_dev_num:]), 319 | axis=-1), fmt='%.6f') 320 | joblib.dump(params, os.path.join(self.basepath, "{}.para".format(data)), compress=3) 321 | self.poi_loc(data) 322 | 323 | def poi_loc(self,data): 324 | # nb_poi*2 325 | p_loc = np.zeros((self.nb_poi, 2), dtype=np.float32)## 326 | print('p_loc matrix shape:', p_loc.shape) 327 | # only count data in train ,rather than dev and test data 328 | # index 0 is oov poi 329 | for user in self.train_data_list: 330 | # [poi_sequences_list,poi_sequences_loc_list,poi_sequences_time_list,user] 331 | pois = user[0] 332 | loc = user[1] 333 | for index in range(len(pois)): 334 | cindex = self.poi_index[pois[index]] 335 | if p_loc[cindex][0] == 0.: 336 | p_loc[cindex] = loc[index] 337 | else:# average loc for mulit same POI 338 | p_loc[cindex] = (p_loc[cindex]+loc[index])/2. 
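# Index 0 is reserved for out-of-vocabulary POIs (not seen in training); give it the mean location of all known POIs as a neutral fallback.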
339 | p_loc[0]=np.mean(p_loc[1:],axis=0) 340 | # nb_poi*nb_poi 341 | loc_matrix=cdist(p_loc, p_loc, 'euclidean') 342 | loc_matrix = process.scale(loc_matrix, with_mean=False, axis=-1) # with_mean=True 343 | np.savetxt(os.path.join(self.basepath, "poi_loc_{}.txt".format(data)),loc_matrix,fmt='%.6f') 344 | joblib.dump(loc_matrix, os.path.join(self.basepath, "poi_loc.{}".format(data)), compress=3) 345 | 346 | def encode_time_loc(self,poi_time): 347 | #int 348 | #encode weekend and weekday in one week and 5 time slot in one day 349 | #each poi_time to be a 2+5 dim vector 350 | onehot_time=[0]*7 351 | #timestamp to week and time 352 | time_str=time.strftime("%w:%H%M",time.localtime(poi_time)) 353 | time_str=time_str.split(':') 354 | if int(time_str[0]) in [0,6]: 355 | week_index=0 356 | elif int(time_str[0]) in [1,2,3,4,5]: 357 | week_index = 1 358 | else: 359 | print('int(time_str[0])'+str(time_str[0])) 360 | exit() 361 | onehot_time[week_index]=1 362 | hour_minute=int(time_str[1]) 363 | if 800<=hour_minute<1130: 364 | index=2 365 | elif 1130<=hour_minute<1400: 366 | index=3 367 | elif 1400<=hour_minute<1730: 368 | index=4 369 | elif 1730<=hour_minute<2200: 370 | index=5 371 | elif hour_minute>=2200 or hour_minute<800: 372 | index=6 373 | else: 374 | print('time error:'+str(hour_minute)) 375 | exit() 376 | onehot_time[index]=1 377 | return onehot_time 378 | 379 | if __name__ == '__main__': 380 | datahelp=DataHelp() 381 | datahelp.preprocess_foursquare('NYC') 382 | datahelp.encode_padding('NYC') 383 | --------------------------------------------------------------------------------
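For reference, a minimal sketch (not part of the repository) of what the 7-dim target time pattern from encode_time_loc looks like for one check-in; the date below is an arbitrary Saturday-morning example:
```
import time

# A Saturday at 09:15 local time: the weekend indicator (index 0) and the
# 08:00-11:30 slot (index 2) are set, all other entries stay 0.
ts = time.mktime(time.strptime('2013-01-12 09:15:00', '%Y-%m-%d %H:%M:%S'))
# DataHelp().encode_time_loc(ts) -> [1, 0, 1, 0, 0, 0, 0]
```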