├── README.md
├── autocomplete
│   ├── evaluate.py
│   └── train.py
├── config_evaluate.json
├── config_train.json
└── utilities
    ├── beamsearch.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
# NQAC
A Neural Query Auto Completion system based on a neural language model enriched with time sensitivity and user information

**This repository is under construction**

Currently, the repository provides the core code that was used for the experiments in our paper, in particular:
```
autocomplete/train.py
autocomplete/evaluate.py
```

As is, the code is not yet runnable. We are working on adding a single entry point for both training and evaluation, as well as providing resources (or pointers to resources) to reproduce our experiments. Ultimately, the objective is to demonstrate how the NQAC network works, but the repository first needs some cleanup.
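
## Input encoding (sketch)

For orientation while the repository is being cleaned up, the minimal sketch below shows how a single input timestep is laid out, following `string_to_sequences` in `autocomplete/train.py` and `utilities/beamsearch.py`. The concatenation order is: one-hot character, word2vec of the last completed word (or the random UNK/INC fallback vectors), user2vec, and four cyclic time features. The character-vocabulary size and the example values here are placeholders; the real sizes come from the training data and the configuration.

```python
import numpy as np

len_chars = 60                                  # placeholder: size of the character vocabulary
w2v_size, u2v_size, time_size = 300, 30, 4      # word2vec, user2vec and time feature sizes

x_t = np.zeros(len_chars + w2v_size + u2v_size + time_size, dtype=np.float32)
x_t[10] = 1.0                                                     # one-hot index of the current character
x_t[len_chars:len_chars + w2v_size] = 0.1                         # word2vec of the last completed word, or UNK/INC
x_t[len_chars + w2v_size:len_chars + w2v_size + u2v_size] = 0.2   # user2vec embedding of the query's author
x_t[-time_size:] = [1.0, 0.0, 1.0, 0.0]                           # cos/sin of time-of-day and weekday
```

The network (two GRU layers followed by a time-distributed softmax) reads one such vector per character and predicts the next character; beam search over those predictions produces the completions.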
--------------------------------------------------------------------------------

/autocomplete/evaluate.py:
--------------------------------------------------------------------------------
from keras.models import load_model
import pickle
import gensim
from beamsearch import BeamSearch
import os
from random import randint
import time
import json
import sys

def createExamples(text):
    examples = []
    lines = text.split("\n")

    for line in lines:
        linesp = line.split("\t")
        if len(linesp) == 3:
            u = linesp[0]
            q = linesp[1]
            t = float(linesp[2].strip())
            sp = q.split(" ")
            if len(sp) > 1:
                start = q.find(" ")+1  # +1 because the space character is part of the first prefix: the user wants a completion
                for j in range(start, len(q)):
                    # Generate all prefixes
                    x = q[:j]
                    examples.append([x, q, u, t])
    return examples

def loadData(generate):
    bg = {}
    qvocab = {}
    users = {}
    print(" BG data")
    with open("./queries/queries_"+study+".txt") as f:
        for line in f:
            sp = line.split("\t")
            query = sp[1]
            user = sp[0]
            words = query.split(" ")
            for word in words:
                if word not in qvocab:
                    qvocab[word] = 0
                qvocab[word] += 1
            if query not in bg:
                bg[query] = 0
            if user not in users:
                users[user] = 0
            bg[query] += 1
    if generate:
        print(" prefix generation")
        examples = createExamples(open("./queries/queries_"+study+"t_sample.txt", 'r').read())
        with open("./queries/queries_"+study+"_prefixes.txt", "w") as fw:
            for quad in examples:
                fw.write(quad[0]+"\t"+quad[1]+"\t"+quad[2]+"\t"+str(quad[3])+"\n")
    else:
        print(" prefix loading")
        examples = []
        with open("./queries/queries_"+study+"_prefixes.txt") as f:
            for line in f:
                examples.append(line.strip().split("\t"))
    # with open("results/seen-unseen_users_"+study+".txt", "w") as fw:
    #     with open("results/seen-unseen_queries_"+study+".txt", "w") as f:
    #         for quad in examples:
    #             if quad[1] in bg:
    #                 f.write("1\n")
    #             else:
    #                 f.write("0\n")
    #             if quad[2] in users:
    #                 fw.write("1\n")
    #             else:
    #                 fw.write("0\n")
    return examples, qvocab, bg

def RR(prefix, solution, raw_results, diverse):
    inverse_rp = 0
    inverse_rp_partial = 0
    found = ""
    try:
        # Make a list, strip \n and order according to output probability
        candidates = sorted(raw_results, key=raw_results.get, reverse=True)
    except AttributeError:
        # Or just take the list if that is what was passed in
        candidates = raw_results
    for i, c in enumerate(candidates):
        stripped = c.strip()
        if solution == stripped:
            found = stripped
            inverse_rp = 1/(i+1)
            if inverse_rp_partial == 0:
                inverse_rp_partial = 1/(i+1)
            break
        if solution.startswith(stripped+" ") and found == "":
            found = stripped
            inverse_rp_partial = 1/(i+1)
    d = ""
    if diverse:
        d = "_d"
    with open("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+"_scores.txt", "a") as f:
        f.write(prefix+"\t"+str(inverse_rp)+"\t"+str(inverse_rp_partial)+"\n")
    return inverse_rp, inverse_rp_partial
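
# Illustrative example: if the candidates, ordered by probability, are
# ["new york", "new york times", "news"] and the solution is "new york times",
# the exact match sits at rank 2, so RR = 1/2, while "new york" at rank 1 is a
# prefix of the solution, so the partial reciprocal rank is 1/1.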

def MRR(prefixes, bs, batchlen, diverse=False):
    total = 0
    total_partial = 0
    iteration = 0
    batch = []
    solutions_recorded = []
    t = 0
    # For each prefix-solution pair
    for ps in prefixes:
        prefix = ps[0]
        solution = ps[1]
        user = None
        if use_u2v:
            user = ps[2]
        timestamp = None
        if use_timestamps:
            timestamp = ps[3]
        # Run the net and beam search
        start_time = time.time()
        raw_results = bs.search(prefix, user, timestamp, diverse)
        t = time.time() - start_time
        d = ""
        if diverse:
            d = "_d"
        with open("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+".txt", "a") as f:
            for c, s in raw_results.items():
                f.write(c.strip()+"\t"+str(s)+"\t"+prefix+"\t"+solution+"\n")
        addtotal, addpartial = RR(prefix, solution, raw_results, diverse)
        total += addtotal
        total_partial += addpartial
        iteration += 1
        print(str(iteration)+" ~"+str(t)+" seconds per beam search ", end="\r")
    return total/len(prefixes), total_partial/len(prefixes)

def MPC(prefix, bg):
    # Limit BG queries to the given prefix
    candidates = {}
    for q, v in bg.items():
        if q.startswith(prefix) and q != "prefix":
            candidates[q] = v

    # Convert occurrences to probabilities (estimated by relative frequency)
    total = sum(candidates.values())
    for q, v in candidates.items():
        candidates[q] = v/total
    return sorted(candidates, key=candidates.get, reverse=True)[:10]

def MPCEval(prefixes, bg):
    total = 0
    total_partial = 0
    iteration = 0
    for ps in prefixes:
        prefix = ps[0]
        solution = ps[1]
        candidates = MPC(prefix, bg)
        addtotal, addpartial = RR(prefix, solution, candidates, False)
        total += addtotal
        total_partial += addpartial
        iteration += 1
        print(str(iteration), end="\r")
    total /= len(prefixes)
    total_partial /= len(prefixes)
    print("MRR: "+str(total)+", PMRR: "+str(total_partial))

config = json.load(open("config_evaluate.json"))
os.environ['CUDA_VISIBLE_DEVICES'] = config["gpuID"]

#### PARAMS
print("Loading params")
modelname = config["modelname"]
study = config["study"]
maxlen = 100
epoch = config["epoch"]
sample = config["sample"]

# Some various setups
use_w2v = config["use_w2v"]
w2v_size = 300  # the GoogleNews vectors (and the UNK/INC fallbacks) are 300-dimensional
use_u2v = config["use_u2v"]
u2v_size = 30
use_timestamps = config["use_timestamps"]
timestamp_size = 4
print("Done")
############

###### Do not touch unless you want to resample the test set
generatePrefixes = False

print("Loading data")
examples, qvocab, bg = loadData(generatePrefixes)
if modelname == "MPC":
    # RR() appends to files under results/<modelname>, so make sure the directory exists
    if not os.path.exists("results/"+modelname):
        os.makedirs("results/"+modelname)
    MPCEval(examples, bg)
    sys.exit()
fname = "models/"+modelname+"/epoch"+epoch+"_sample"+sample+".h5"
print(" model "+modelname+" "+epoch+"s"+sample)
model = load_model(fname)
print(model.summary())
print("Done.")

print("Loading embeddings")
U = pickle.load(open("./pkl/u_"+study+".pkl", 'rb'))
UNK = pickle.load(open("./pkl/unk_"+study+".pkl", 'rb'))
INC = pickle.load(open("./pkl/inc_"+study+".pkl", 'rb'))
char_indices = pickle.load(open("./pkl/char_indices_"+study, 'rb'))
indices_char = pickle.load(open("./pkl/indices_char_"+study, 'rb'))
if use_w2v:
    w2v = gensim.models.KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)
else:
    w2v = {}
if use_u2v:
    u2v = gensim.models.Doc2Vec.load('vectors/user2vec_d30_2.model')
else:
    u2v = {}
print("Done.")

print("Initializing beam searcher")
data = {
    "ci": char_indices,
    "ic": indices_char,
    "INC": INC,
    "UNK": UNK,
    "U": U,
    "w2v": w2v,
    "u2v": u2v,
    "qvocab": qvocab
}
features = {"chars": len(char_indices)}
if use_w2v:
    features["w2v"] = w2v_size
if use_u2v:
    features["u2v"] = u2v_size
if use_timestamps:
    features["timestamp"] = timestamp_size
bs = BeamSearch(model, 10, maxlen, data, features)
print("Done")

diverse = False
d = ""
if diverse:
    d = "_d"
suggestions = bs.search("www ", diverse=diverse)
for s, p in suggestions.items():
    print(s.strip()+":"+str(p))
print()
if not os.path.exists("results/"+modelname):
    os.makedirs("results/"+modelname)
try:
    os.remove("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+"_scores.txt")
except OSError:
    pass
try:
    os.remove("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+".txt")
except OSError:
    pass
print("Number of tests: "+str(len(examples)))
print()
print(MRR(examples, bs, 1, diverse))
--------------------------------------------------------------------------------

/autocomplete/train.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dropout, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os
import gensim
import pickle
from beamsearch import BeamSearch
from keras.models import load_model
import dateutil.parser
import utils
import json

config = json.load(open("config_train.json"))
os.environ['CUDA_VISIBLE_DEVICES'] = config["gpuID"]

#### PARAMS
studyID = config["studyID"]
maxlen = 100
update = config["update"]
lr = config["lr"]
batch_size = config["batch_size"]
epochs = config["epochs"]

# Some various setups
use_w2v = config["use_w2v"]
w2v_size = 300  # the GoogleNews vectors (and the UNK/INC fallbacks) are 300-dimensional
use_u2v = config["use_u2v"]
u2v_size = 30
use_timestamps = config["use_timestamps"]
timestamp_size = 4
############

path = "queries/train.txt"
U_file = "pkl/u.pkl"
UNK_file = "pkl/unk.pkl"
INC_file = "pkl/inc.pkl"
ci = "pkl/char_indices"
ic = "pkl/indices_char"
run_id = "run_"+studyID
if not os.path.exists("models/"+run_id):
    os.makedirs("models/"+run_id)

print("Loading embeddings")
if use_w2v:
    w2v = gensim.models.KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)
else:
    w2v = {}
if use_u2v:
    u2v = gensim.models.Doc2Vec.load('vectors/user2vec_d30_2.model')
else:
    u2v = {}
print("Done.")

def createVocabulary(string):
    # Create the query vocabulary, used to filter out words that appear fewer than 5 times in the logs
    qvocab = {}
    words = string.split()
    for word in words:
        if word not in qvocab:
            qvocab[word] = 0
        qvocab[word] += 1
    return qvocab

def string_to_sequences(string, qvocab, seq_maxlen, char_idx, features):
    len_chars = len(char_idx)
    w2vs = 0
    if "w2v" in features:
        w2vs = features["w2v"]
    u2vs = 0
    if "u2v" in features:
        u2vs = features["u2v"]
    ts = 0
    if "timestamp" in features:
        ts = features["timestamp"]

    print(" sequencing")
    ses = string.split("\n")
    users = []
    sequences = []
    timestamps = []
    for s in ses:
        sp = s.split("\t")
        if len(sp) == 3:
            sequences.append(sp[1])
            if u2vs > 0:
                users.append(sp[0])
            if ts > 0:
                unformatted_date = sp[2]
                date_pieces = unformatted_date.split(" ")
                formatted_date = date_pieces[0]+"-"+date_pieces[1]+"-"+date_pieces[2]+" "+date_pieces[3]+":"+date_pieces[4]+":"+date_pieces[5]
                weekday = dateutil.parser.parse(formatted_date).weekday()
                cos_s, sin_s = utils.time_to_real(int(date_pieces[3]), int(date_pieces[4]), int(date_pieces[5]))
                cos_d, sin_d = utils.weekday_to_real(weekday)
                timestamps.append([cos_s, sin_s, cos_d, sin_d])

    print(" io init")
    X = np.zeros((len(sequences), seq_maxlen, sum(features.values())), dtype=np.float32)
    Y = np.zeros((len(sequences), seq_maxlen, len_chars), dtype=np.float32)
    for i, seq in enumerate(sequences):
        last_word = ""
        last_word_t = 0
        for t, char in enumerate(seq):
            X[i, t, char_idx[char]] = 1
            if u2vs > 0:
                try:
                    X[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = u2v[users[i]]
                except: pass
                # X[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = U
            if ts > 0:
                X[i, t, len_chars+w2vs+u2vs:len_chars+w2vs+u2vs+ts] = timestamps[i]
            if char == " ":
                last_word = seq[last_word_t:t]
                last_word_t = t+1
                try:
                    embed = UNK
                    if qvocab[last_word] >= 5:
                        embed = w2v[last_word]
                    X[i, t, len_chars:len_chars+w2vs] = embed
                except:
                    X[i, t, len_chars:len_chars+w2vs] = UNK
            else:
                X[i, t, len_chars:len_chars+w2vs] = INC
            try:
                Y[i, t, char_idx[seq[t+1]]] = 1
            except:
                Y[i, t, char_idx["\n"]] = 1
    return X, Y, qvocab

queries = ""
with open(path) as f:
    queries = f.read()  # [:10000000]

# Calculate static vectors
if not update:
    print('Loading previous char_indices')
    char_indices = pickle.load(open(ci, 'rb'))
    indices_char = pickle.load(open(ic, 'rb'))
    print('Loading UNK and INC and U')
    U = pickle.load(open(U_file, 'rb'))
    UNK = pickle.load(open(UNK_file, 'rb'))
    INC = pickle.load(open(INC_file, 'rb'))
else:
    print("Calculating char_indices")
    chars = sorted(list(set(queries)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    print('Generating UNK and INC and U')
    UNK = np.random.uniform(-0.25, 0.25, 300)
    INC = np.random.uniform(-0.25, 0.25, 300)
    U = np.random.uniform(-0.25, 0.25, 30)
    pickle.dump(U, open(U_file, 'wb'))
    pickle.dump(UNK, open(UNK_file, 'wb'))
    pickle.dump(INC, open(INC_file, 'wb'))
    pickle.dump(char_indices, open(ci, 'wb'))
    pickle.dump(indices_char, open(ic, 'wb'))
    print("Done.")

print('Done.')

features = {"chars": len(char_indices)}
if use_w2v:
    features["w2v"] = w2v_size
if use_u2v:
    features["u2v"] = u2v_size
if use_timestamps:
    features["timestamp"] = timestamp_size

# Net
print("Building network")
model = Sequential()
model.add(GRU(1024, return_sequences=True, input_shape=(maxlen, sum(features.values())), activation="relu"))
model.add(Dropout(0.5))
model.add(GRU(1024, return_sequences=True, activation="relu"))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(len(char_indices), activation='softmax')))
optimizer = Adam(lr=lr, clipnorm=0.5)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"])
print(model.summary())
print(run_id)
print("Done.")

iterations = 30
qlines = queries.split("\n")
split_size = int(len(qlines)/iterations)
print("Init vocabulary")
qvocab = createVocabulary(queries)
print("Done.")

data = {
    "ci": char_indices,
    "ic": indices_char,
    "INC": INC,
    "UNK": UNK,
    "U": U,
    "w2v": w2v,
    "u2v": u2v,
    "qvocab": qvocab
}

def on_epoch_end(epoch, logs):
    print()
    bs = BeamSearch(model, 10, maxlen, data, features)
    timestamp = None
    user = None
    if use_timestamps:
        timestamp = 1148693365.0
    if use_u2v:
        user = "9032971"
    suggestions = bs.search("www ", user, timestamp)
    for s, p in suggestions.items():
        print(s.strip()+":"+str(p))
    print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

for i in range(epochs):
    for j in range(iterations):
        print("Preparing data for next sample - sample "+str(j)+" of epoch "+str(i))
        start = j*split_size
        end = len(qlines) if j == iterations-1 else (j+1)*split_size
        qsplit = "\n".join(qlines[start:end])
        X, Y, _ = string_to_sequences(qsplit, qvocab, maxlen, char_indices, features)
        print("Done.")
        model.fit(X, Y, validation_split=0.2, batch_size=batch_size, epochs=1, callbacks=[print_callback])
        model.save("models/"+run_id+"/epoch"+str(i)+"_sample"+str(j)+".h5")
    model.save("models/"+run_id+"/epoch"+str(i)+".h5")
model.save("models/"+run_id+"/final.h5")
--------------------------------------------------------------------------------

/config_evaluate.json:
--------------------------------------------------------------------------------
{
    "gpuID": "2",
    "modelname": "pubmed31",
    "study": "aol",
    "epoch": "1",
    "sample": "3",
    "use_w2v": true,
    "use_u2v": true,
    "use_timestamps": true
}
--------------------------------------------------------------------------------

/config_train.json:
--------------------------------------------------------------------------------
{
    "gpuID": "1",
    "studyID": "31",
    "update": false,
    "lr": 0.0005,
    "batch_size": 512,
    "epochs": 10,
    "use_w2v": true,
    "use_u2v": true,
    "use_timestamps": true
}
--------------------------------------------------------------------------------

/utilities/beamsearch.py:
--------------------------------------------------------------------------------
import numpy as np
import sys
from collections import Counter
import utils
import operator

class BeamSearch():
    def __init__(self, model, beam, maxlen, data, features):
        self.m = model
        self.beam = beam
        self.maxlen = maxlen
        self.char_indices = data["ci"]
        self.indices_char = data["ic"]
        self.INC = data["INC"]
        self.UNK = data["UNK"]
        self.U = data["U"]
        self.w2v = data["w2v"]
        self.u2v = data["u2v"]
        self.qvocab = data["qvocab"]
        self.features = features

    def search(self, sequence, user=None, timestamp=None, diverse=False):
        suggestions = {sequence: 0}

        unfinished = True
        while unfinished:
            unfinished = False
            new_suggestions = {}
            for seq, prob in suggestions.items():
                if seq[-1] == "\n" or len(seq) == self.maxlen:
                    new_suggestions[seq] = prob
                else:
                    unfinished = True
                    predictions = self.predictCharacters(seq, user, timestamp, diverse)
                    # Add diversity, ref: Vijayakumar et al., 2016
                    if diverse:
                        initlen = len(sequence)
                        # key-val tuples sorted by value
                        sorted_predictions = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)
                        # First group is the argmax of the beam search
                        new_suggestions[seq + sorted_predictions[0][0]] = suggestions[seq] + np.log(sorted_predictions[0][1])
                        avgWeight = 0
                        for g in range(1, len(sorted_predictions)):
                            # For each remaining candidate, we minimize the similarity with previous suggestions
                            # To this end, we use a normalized Levenshtein distance
                            current_prob = suggestions[seq] + np.log(sorted_predictions[g][1])
                            newseq = seq + sorted_predictions[g][0]
                            avgDistance = 0
                            for previous in new_suggestions.keys():
                                avgDistance += utils.levenshtein_distance(newseq[initlen:], previous[initlen:])/max(len(newseq[initlen:]), len(previous[initlen:]))
                            avgDistance /= len(new_suggestions.keys())
                            to_add = 0.26*np.log(avgDistance)
                            avgWeight += to_add
                            new_suggestions[newseq] = current_prob + to_add
                        # Rebalance
                        new_suggestions[seq + sorted_predictions[0][0]] += avgWeight/(self.beam-1)
                    else:
                        for c, p in predictions.items():
                            new_suggestions[seq + c] = suggestions[seq] + np.log(p)
            new_suggestions = dict(Counter(new_suggestions).most_common(self.beam))
            suggestions = new_suggestions
        return suggestions
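
    # Illustrative note on the diverse scoring above: the top expansion of each
    # candidate keeps its plain accumulated log-probability, each remaining
    # expansion c adds log p(c) + 0.26 * log(average normalized Levenshtein
    # distance between c and the suggestions already kept) to the running score,
    # and the top expansion is then adjusted by the mean of those penalty terms.
    # This steers the beam towards lexically different completions
    # (cf. Vijayakumar et al., 2016).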

    def predictCharacters(self, sequence, user=None, timestamp=None, diverse=False):
        len_chars = len(self.char_indices)

        # Features
        w2vs = 0
        if "w2v" in self.features:
            w2vs = self.features["w2v"]
        u2vs = 0
        if "u2v" in self.features:
            u2vs = self.features["u2v"]
        ts = 0
        if "timestamp" in self.features:
            ts = self.features["timestamp"]

        # Init input
        x = np.zeros((1, self.maxlen, sum(self.features.values())))
        last_word = ""
        last_word_i = 0
        for t, char in enumerate(sequence):
            if u2vs > 0:
                try:
                    x[0, t, len_chars+w2vs:len_chars+w2vs+u2vs] = self.u2v[user]
                except: pass
                # x[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = self.U
            if ts > 0:
                try:
                    cos_s, sin_s, cos_d, sin_d = utils.timestamp_to_features(timestamp)
                    time_features = [cos_s, sin_s, cos_d, sin_d]
                    x[0, t, len_chars+w2vs+u2vs:len_chars+w2vs+u2vs+ts] = time_features
                except: pass
            x[0, t, self.char_indices[char]] = 1
            if char == " ":
                last_word = sequence[last_word_i:t]
                last_word_i = t+1
                try:
                    embed = self.UNK
                    if last_word in self.qvocab and self.qvocab[last_word] >= 5:
                        embed = self.w2v[last_word]
                    x[0, t, len_chars:len_chars+w2vs] = embed
                except:
                    x[0, t, len_chars:len_chars+w2vs] = self.UNK
            else:
                x[0, t, len_chars:len_chars+w2vs] = self.INC
        chars = {}
        predictions = self.m.predict(x)[0]
        preds = predictions[len(sequence)-1]
        indices = np.argpartition(preds, -self.beam)[-self.beam:]
        for i in indices:
            chars[self.indices_char[i]] = preds[i]
        return chars
--------------------------------------------------------------------------------

/utilities/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import dateutil.parser
from datetime import datetime

def time_to_real(hours, minutes, seconds):
    seconds_in_day = 24*60*60
    seconds = seconds + minutes*60 + hours*3600
    sin_time = np.sin(2*np.pi*seconds/seconds_in_day)
    cos_time = np.cos(2*np.pi*seconds/seconds_in_day)
    return cos_time, sin_time

def weekday_to_real(day):
    days_in_week = 7
    sin_time = np.sin(2*np.pi*day/days_in_week)
    cos_time = np.cos(2*np.pi*day/days_in_week)
    return cos_time, sin_time

def timestamp_to_features(timestamp):
    time = datetime.fromtimestamp(float(timestamp))
    weekday = time.weekday()
    cos_s, sin_s = time_to_real(time.hour, time.minute, time.second)
    cos_d, sin_d = weekday_to_real(weekday)
    return cos_s, sin_s, cos_d, sin_d
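
# Illustrative example of the cyclic encoding above: 18:00:00 is three quarters
# of the way through the day, so time_to_real(18, 0, 0) returns (cos, sin) ≈ (0.0, -1.0),
# and a Saturday (weekday 5) gives weekday_to_real(5) ≈ (-0.22, -0.97).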

def levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]
--------------------------------------------------------------------------------