├── README.md
├── autocomplete
│   ├── evaluate.py
│   └── train.py
├── config_evaluate.json
├── config_train.json
└── utilities
    ├── beamsearch.py
    └── utils.py

/README.md:
--------------------------------------------------------------------------------
# NQAC
A Neural Query Auto Completion system based on a neural language model enriched with time sensitivity and user information

**This repository is under construction**

Currently, the repository provides the core code that was used for the experiments in our paper, in particular:
```
autocomplete/train.py
autocomplete/evaluate.py
```

As is, the code is not yet runnable. We are working on adding a single entry point for both training and evaluation, as well as providing resources (or pointers to resources) to reproduce our experiments. Ultimately, the objective is to demonstrate how the NQAC network works, but the repository first needs some cleanup.
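
## Input encoding (sketch)

For orientation while the repository is being cleaned up, the minimal sketch below shows how a single input timestep is laid out, following `string_to_sequences` in `autocomplete/train.py` and `utilities/beamsearch.py`. The concatenation order is: one-hot character, word2vec of the last completed word (or the random UNK/INC fallback vectors), user2vec, and four cyclic time features. The character-vocabulary size and the example values here are placeholders; the real sizes come from the training data and the configuration.

```python
import numpy as np

len_chars = 60                                  # placeholder: size of the character vocabulary
w2v_size, u2v_size, time_size = 300, 30, 4      # word2vec, user2vec and time feature sizes

x_t = np.zeros(len_chars + w2v_size + u2v_size + time_size, dtype=np.float32)
x_t[10] = 1.0                                                     # one-hot index of the current character
x_t[len_chars:len_chars + w2v_size] = 0.1                         # word2vec of the last completed word, or UNK/INC
x_t[len_chars + w2v_size:len_chars + w2v_size + u2v_size] = 0.2   # user2vec embedding of the query's author
x_t[-time_size:] = [1.0, 0.0, 1.0, 0.0]                           # cos/sin of time-of-day and weekday
```

The network (two GRU layers followed by a time-distributed softmax) reads one such vector per character and predicts the next character; beam search over those predictions produces the completions.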
--------------------------------------------------------------------------------

/autocomplete/evaluate.py:
--------------------------------------------------------------------------------
from keras.models import load_model
import pickle
import gensim
from beamsearch import BeamSearch
import os
from random import randint
import time
import json
import sys

def createExamples(text):
    examples = []
    lines = text.split("\n")

    for line in lines:
        linesp = line.split("\t")
        if len(linesp) == 3:
            u = linesp[0]
            q = linesp[1]
            t = float(linesp[2].strip())
            sp = q.split(" ")
            if len(sp) > 1:
                start = q.find(" ")+1  # +1 because the space character is part of the first prefix: the user wants a completion
                for j in range(start, len(q)):
                    # Generate all prefixes
                    x = q[:j]
                    examples.append([x, q, u, t])
    return examples

def loadData(generate):
    bg = {}
    qvocab = {}
    users = {}
    print(" BG data")
    with open("./queries/queries_"+study+".txt") as f:
        for line in f:
            sp = line.split("\t")
            query = sp[1]
            user = sp[0]
            words = query.split(" ")
            for word in words:
                if word not in qvocab:
                    qvocab[word] = 0
                qvocab[word] += 1
            if query not in bg:
                bg[query] = 0
            if user not in users:
                users[user] = 0
            bg[query] += 1
    if generate:
        print(" prefix generation")
        examples = createExamples(open("./queries/queries_"+study+"t_sample.txt", 'r').read())
        with open("./queries/queries_"+study+"_prefixes.txt", "w") as fw:
            for quad in examples:
                fw.write(quad[0]+"\t"+quad[1]+"\t"+quad[2]+"\t"+str(quad[3])+"\n")
    else:
        print(" prefix loading")
        examples = []
        with open("./queries/queries_"+study+"_prefixes.txt") as f:
            for line in f:
                examples.append(line.strip().split("\t"))
    # with open("results/seen-unseen_users_"+study+".txt", "w") as fw:
    #     with open("results/seen-unseen_queries_"+study+".txt", "w") as f:
    #         for quad in examples:
    #             if quad[1] in bg:
    #                 f.write("1\n")
    #             else:
    #                 f.write("0\n")
    #             if quad[2] in users:
    #                 fw.write("1\n")
    #             else:
    #                 fw.write("0\n")
    return examples, qvocab, bg

def RR(prefix, solution, raw_results, diverse):
    inverse_rp = 0
    inverse_rp_partial = 0
    found = ""
    try:
        # Make a list, strip \n and order according to output probability
        candidates = sorted(raw_results, key=raw_results.get, reverse=True)
    except AttributeError:
        # Or just take the list if that is what was passed in
        candidates = raw_results
    for i, c in enumerate(candidates):
        stripped = c.strip()
        if solution == stripped:
            found = stripped
            inverse_rp = 1/(i+1)
            if inverse_rp_partial == 0:
                inverse_rp_partial = 1/(i+1)
            break
        if solution.startswith(stripped+" ") and found == "":
            found = stripped
            inverse_rp_partial = 1/(i+1)
    d = ""
    if diverse:
        d = "_d"
    with open("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+"_scores.txt", "a") as f:
        f.write(prefix+"\t"+str(inverse_rp)+"\t"+str(inverse_rp_partial)+"\n")
    return inverse_rp, inverse_rp_partial
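
# Illustrative example: if the candidates, ordered by probability, are
# ["new york", "new york times", "news"] and the solution is "new york times",
# the exact match sits at rank 2, so RR = 1/2, while "new york" at rank 1 is a
# prefix of the solution, so the partial reciprocal rank is 1/1.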

def MRR(prefixes, bs, batchlen, diverse=False):
    total = 0
    total_partial = 0
    iteration = 0
    batch = []
    solutions_recorded = []
    t = 0
    # For each prefix-solution pair
    for ps in prefixes:
        prefix = ps[0]
        solution = ps[1]
        user = None
        if use_u2v:
            user = ps[2]
        timestamp = None
        if use_timestamps:
            timestamp = ps[3]
        # Run the net and beam search
        start_time = time.time()
        raw_results = bs.search(prefix, user, timestamp, diverse)
        t = time.time() - start_time
        d = ""
        if diverse:
            d = "_d"
        with open("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+".txt", "a") as f:
            for c, s in raw_results.items():
                f.write(c.strip()+"\t"+str(s)+"\t"+prefix+"\t"+solution+"\n")
        addtotal, addpartial = RR(prefix, solution, raw_results, diverse)
        total += addtotal
        total_partial += addpartial
        iteration += 1
        print(str(iteration)+" ~"+str(t)+" seconds per beam search ", end="\r")
    return total/len(prefixes), total_partial/len(prefixes)

def MPC(prefix, bg):
    # Limit BG queries to the given prefix
    candidates = {}
    for q, v in bg.items():
        if q.startswith(prefix) and q != "prefix":
            candidates[q] = v

    # Convert occurrences to probabilities (estimated by relative frequency)
    total = sum(candidates.values())
    for q, v in candidates.items():
        candidates[q] = v/total
    return sorted(candidates, key=candidates.get, reverse=True)[:10]

def MPCEval(prefixes, bg):
    total = 0
    total_partial = 0
    iteration = 0
    for ps in prefixes:
        prefix = ps[0]
        solution = ps[1]
        candidates = MPC(prefix, bg)
        addtotal, addpartial = RR(prefix, solution, candidates, False)
        total += addtotal
        total_partial += addpartial
        iteration += 1
        print(str(iteration), end="\r")
    total /= len(prefixes)
    total_partial /= len(prefixes)
    print("MRR: "+str(total)+", PMRR: "+str(total_partial))

config = json.load(open("config_evaluate.json"))
os.environ['CUDA_VISIBLE_DEVICES'] = config["gpuID"]

#### PARAMS
print("Loading params")
modelname = config["modelname"]
study = config["study"]
maxlen = 100
epoch = config["epoch"]
sample = config["sample"]

# Some various setups
use_w2v = config["use_w2v"]
w2v_size = 300  # the GoogleNews vectors (and the UNK/INC fallbacks) are 300-dimensional
use_u2v = config["use_u2v"]
u2v_size = 30
use_timestamps = config["use_timestamps"]
timestamp_size = 4
print("Done")
############

###### Do not touch unless you want to resample the test set
generatePrefixes = False

print("Loading data")
examples, qvocab, bg = loadData(generatePrefixes)
if modelname == "MPC":
    # RR() appends to files under results/<modelname>, so make sure the directory exists
    if not os.path.exists("results/"+modelname):
        os.makedirs("results/"+modelname)
    MPCEval(examples, bg)
    sys.exit()
fname = "models/"+modelname+"/epoch"+epoch+"_sample"+sample+".h5"
print(" model "+modelname+" "+epoch+"s"+sample)
model = load_model(fname)
print(model.summary())
print("Done.")

print("Loading embeddings")
U = pickle.load(open("./pkl/u_"+study+".pkl", 'rb'))
UNK = pickle.load(open("./pkl/unk_"+study+".pkl", 'rb'))
INC = pickle.load(open("./pkl/inc_"+study+".pkl", 'rb'))
char_indices = pickle.load(open("./pkl/char_indices_"+study, 'rb'))
indices_char = pickle.load(open("./pkl/indices_char_"+study, 'rb'))
if use_w2v:
    w2v = gensim.models.KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)
else:
    w2v = {}
if use_u2v:
    u2v = gensim.models.Doc2Vec.load('vectors/user2vec_d30_2.model')
else:
    u2v = {}
print("Done.")

print("Initializing beam searcher")
data = {
    "ci": char_indices,
    "ic": indices_char,
    "INC": INC,
    "UNK": UNK,
    "U": U,
    "w2v": w2v,
    "u2v": u2v,
    "qvocab": qvocab
}
features = {"chars": len(char_indices)}
if use_w2v:
    features["w2v"] = w2v_size
if use_u2v:
    features["u2v"] = u2v_size
if use_timestamps:
    features["timestamp"] = timestamp_size
bs = BeamSearch(model, 10, maxlen, data, features)
print("Done")

diverse = False
d = ""
if diverse:
    d = "_d"
suggestions = bs.search("www ", diverse=diverse)
for s, p in suggestions.items():
    print(s.strip()+":"+str(p))
print()
if not os.path.exists("results/"+modelname):
    os.makedirs("results/"+modelname)
try:
    os.remove("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+"_scores.txt")
except OSError:
    pass
try:
    os.remove("results/"+modelname+"/epoch"+epoch+"_sample"+sample+d+".txt")
except OSError:
    pass
print("Number of tests: "+str(len(examples)))
print()
print(MRR(examples, bs, 1, diverse))
--------------------------------------------------------------------------------

/autocomplete/train.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dropout, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os
import gensim
import pickle
from beamsearch import BeamSearch
from keras.models import load_model
import dateutil.parser
import utils
import json

config = json.load(open("config_train.json"))
os.environ['CUDA_VISIBLE_DEVICES'] = config["gpuID"]

#### PARAMS
studyID = config["studyID"]
maxlen = 100
update = config["update"]
lr = config["lr"]
batch_size = config["batch_size"]
epochs = config["epochs"]

# Some various setups
use_w2v = config["use_w2v"]
w2v_size = 300  # the GoogleNews vectors (and the UNK/INC fallbacks) are 300-dimensional
use_u2v = config["use_u2v"]
u2v_size = 30
use_timestamps = config["use_timestamps"]
timestamp_size = 4
############

path = "queries/train.txt"
U_file = "pkl/u.pkl"
UNK_file = "pkl/unk.pkl"
INC_file = "pkl/inc.pkl"
ci = "pkl/char_indices"
ic = "pkl/indices_char"
run_id = "run_"+studyID
if not os.path.exists("models/"+run_id):
    os.makedirs("models/"+run_id)

print("Loading embeddings")
if use_w2v:
    w2v = gensim.models.KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)
else:
    w2v = {}
if use_u2v:
    u2v = gensim.models.Doc2Vec.load('vectors/user2vec_d30_2.model')
else:
    u2v = {}
print("Done.")

def createVocabulary(string):
    # Create the query vocabulary, used to filter out words that appear fewer than 5 times in the logs
    qvocab = {}
    words = string.split()
    for word in words:
        if word not in qvocab:
            qvocab[word] = 0
        qvocab[word] += 1
    return qvocab

def string_to_sequences(string, qvocab, seq_maxlen, char_idx, features):
    len_chars = len(char_idx)
    w2vs = 0
    if "w2v" in features:
        w2vs = features["w2v"]
    u2vs = 0
    if "u2v" in features:
        u2vs = features["u2v"]
    ts = 0
    if "timestamp" in features:
        ts = features["timestamp"]

    print(" sequencing")
    ses = string.split("\n")
    users = []
    sequences = []
    timestamps = []
    for s in ses:
        sp = s.split("\t")
        if len(sp) == 3:
            sequences.append(sp[1])
            if u2vs > 0:
                users.append(sp[0])
            if ts > 0:
                unformatted_date = sp[2]
                date_pieces = unformatted_date.split(" ")
                formatted_date = date_pieces[0]+"-"+date_pieces[1]+"-"+date_pieces[2]+" "+date_pieces[3]+":"+date_pieces[4]+":"+date_pieces[5]
                weekday = dateutil.parser.parse(formatted_date).weekday()
                cos_s, sin_s = utils.time_to_real(int(date_pieces[3]), int(date_pieces[4]), int(date_pieces[5]))
                cos_d, sin_d = utils.weekday_to_real(weekday)
                timestamps.append([cos_s, sin_s, cos_d, sin_d])

    print(" io init")
    X = np.zeros((len(sequences), seq_maxlen, sum(features.values())), dtype=np.float32)
    Y = np.zeros((len(sequences), seq_maxlen, len_chars), dtype=np.float32)
    for i, seq in enumerate(sequences):
        last_word = ""
        last_word_t = 0
        for t, char in enumerate(seq):
            X[i, t, char_idx[char]] = 1
            if u2vs > 0:
                try:
                    X[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = u2v[users[i]]
                except: pass
                # X[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = U
            if ts > 0:
                X[i, t, len_chars+w2vs+u2vs:len_chars+w2vs+u2vs+ts] = timestamps[i]
            if char == " ":
                last_word = seq[last_word_t:t]
                last_word_t = t+1
                try:
                    embed = UNK
                    if qvocab[last_word] >= 5:
                        embed = w2v[last_word]
                    X[i, t, len_chars:len_chars+w2vs] = embed
                except:
                    X[i, t, len_chars:len_chars+w2vs] = UNK
            else:
                X[i, t, len_chars:len_chars+w2vs] = INC
            try:
                Y[i, t, char_idx[seq[t+1]]] = 1
            except:
                Y[i, t, char_idx["\n"]] = 1
    return X, Y, qvocab

queries = ""
with open(path) as f:
    queries = f.read()  # [:10000000]

# Calculate static vectors
if not update:
    print('Loading previous char_indices')
    char_indices = pickle.load(open(ci, 'rb'))
    indices_char = pickle.load(open(ic, 'rb'))
    print('Loading UNK and INC and U')
    U = pickle.load(open(U_file, 'rb'))
    UNK = pickle.load(open(UNK_file, 'rb'))
    INC = pickle.load(open(INC_file, 'rb'))
else:
    print("Calculating char_indices")
    chars = sorted(list(set(queries)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    print('Generating UNK and INC and U')
    UNK = np.random.uniform(-0.25, 0.25, 300)
    INC = np.random.uniform(-0.25, 0.25, 300)
    U = np.random.uniform(-0.25, 0.25, 30)
    pickle.dump(U, open(U_file, 'wb'))
    pickle.dump(UNK, open(UNK_file, 'wb'))
    pickle.dump(INC, open(INC_file, 'wb'))
    pickle.dump(char_indices, open(ci, 'wb'))
    pickle.dump(indices_char, open(ic, 'wb'))
    print("Done.")

print('Done.')

features = {"chars": len(char_indices)}
if use_w2v:
    features["w2v"] = w2v_size
if use_u2v:
    features["u2v"] = u2v_size
if use_timestamps:
    features["timestamp"] = timestamp_size

# Net
print("Building network")
model = Sequential()
model.add(GRU(1024, return_sequences=True, input_shape=(maxlen, sum(features.values())), activation="relu"))
model.add(Dropout(0.5))
model.add(GRU(1024, return_sequences=True, activation="relu"))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(len(char_indices), activation='softmax')))
optimizer = Adam(lr=lr, clipnorm=0.5)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"])
print(model.summary())
print(run_id)
print("Done.")

iterations = 30
qlines = queries.split("\n")
split_size = int(len(qlines)/iterations)
print("Init vocabulary")
qvocab = createVocabulary(queries)
print("Done.")

data = {
    "ci": char_indices,
    "ic": indices_char,
    "INC": INC,
    "UNK": UNK,
    "U": U,
    "w2v": w2v,
    "u2v": u2v,
    "qvocab": qvocab
}

def on_epoch_end(epoch, logs):
    print()
    bs = BeamSearch(model, 10, maxlen, data, features)
    timestamp = None
    user = None
    if use_timestamps:
        timestamp = 1148693365.0
    if use_u2v:
        user = "9032971"
    suggestions = bs.search("www ", user, timestamp)
    for s, p in suggestions.items():
        print(s.strip()+":"+str(p))
    print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

for i in range(epochs):
    for j in range(iterations):
        print("Preparing data for next sample - sample "+str(j)+" of epoch "+str(i))
        start = j*split_size
        end = len(qlines) if j == iterations-1 else (j+1)*split_size
        qsplit = "\n".join(qlines[start:end])
        X, Y, _ = string_to_sequences(qsplit, qvocab, maxlen, char_indices, features)
        print("Done.")
        model.fit(X, Y, validation_split=0.2, batch_size=batch_size, epochs=1, callbacks=[print_callback])
        model.save("models/"+run_id+"/epoch"+str(i)+"_sample"+str(j)+".h5")
    model.save("models/"+run_id+"/epoch"+str(i)+".h5")
model.save("models/"+run_id+"/final.h5")
--------------------------------------------------------------------------------

/config_evaluate.json:
--------------------------------------------------------------------------------
{
    "gpuID": "2",
    "modelname": "pubmed31",
    "study": "aol",
    "epoch": "1",
    "sample": "3",
    "use_w2v": true,
    "use_u2v": true,
    "use_timestamps": true
}
--------------------------------------------------------------------------------

/config_train.json:
--------------------------------------------------------------------------------
{
    "gpuID": "1",
    "studyID": "31",
    "update": false,
    "lr": 0.0005,
    "batch_size": 512,
    "epochs": 10,
    "use_w2v": true,
    "use_u2v": true,
    "use_timestamps": true
}
--------------------------------------------------------------------------------

/utilities/beamsearch.py:
--------------------------------------------------------------------------------
import numpy as np
import sys
from collections import Counter
import utils
import operator

class BeamSearch():
    def __init__(self, model, beam, maxlen, data, features):
        self.m = model
        self.beam = beam
        self.maxlen = maxlen
        self.char_indices = data["ci"]
        self.indices_char = data["ic"]
        self.INC = data["INC"]
        self.UNK = data["UNK"]
        self.U = data["U"]
        self.w2v = data["w2v"]
        self.u2v = data["u2v"]
        self.qvocab = data["qvocab"]
        self.features = features

    def search(self, sequence, user=None, timestamp=None, diverse=False):
        suggestions = {sequence: 0}

        unfinished = True
        while unfinished:
            unfinished = False
            new_suggestions = {}
            for seq, prob in suggestions.items():
                if seq[-1] == "\n" or len(seq) == self.maxlen:
                    new_suggestions[seq] = prob
                else:
                    unfinished = True
                    predictions = self.predictCharacters(seq, user, timestamp, diverse)
                    # Add diversity, ref: Vijayakumar et al., 2016
                    if diverse:
                        initlen = len(sequence)
                        # key-val tuples sorted by value
                        sorted_predictions = sorted(predictions.items(), key=operator.itemgetter(1), reverse=True)
                        # First group is the argmax of the beam search
                        new_suggestions[seq + sorted_predictions[0][0]] = suggestions[seq] + np.log(sorted_predictions[0][1])
                        avgWeight = 0
                        for g in range(1, len(sorted_predictions)):
                            # For each remaining candidate, we minimize the similarity with previous suggestions
                            # To this end, we use a normalized Levenshtein distance
                            current_prob = suggestions[seq] + np.log(sorted_predictions[g][1])
                            newseq = seq + sorted_predictions[g][0]
                            avgDistance = 0
                            for previous in new_suggestions.keys():
                                avgDistance += utils.levenshtein_distance(newseq[initlen:], previous[initlen:])/max(len(newseq[initlen:]), len(previous[initlen:]))
                            avgDistance /= len(new_suggestions.keys())
                            to_add = 0.26*np.log(avgDistance)
                            avgWeight += to_add
                            new_suggestions[newseq] = current_prob + to_add
                        # Rebalance
                        new_suggestions[seq + sorted_predictions[0][0]] += avgWeight/(self.beam-1)
                    else:
                        for c, p in predictions.items():
                            new_suggestions[seq + c] = suggestions[seq] + np.log(p)
            new_suggestions = dict(Counter(new_suggestions).most_common(self.beam))
            suggestions = new_suggestions
        return suggestions
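
    # Illustrative note on the diverse scoring above: the top expansion of each
    # candidate keeps its plain accumulated log-probability, each remaining
    # expansion c adds log p(c) + 0.26 * log(average normalized Levenshtein
    # distance between c and the suggestions already kept) to the running score,
    # and the top expansion is then adjusted by the mean of those penalty terms.
    # This steers the beam towards lexically different completions
    # (cf. Vijayakumar et al., 2016).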

    def predictCharacters(self, sequence, user=None, timestamp=None, diverse=False):
        len_chars = len(self.char_indices)

        # Features
        w2vs = 0
        if "w2v" in self.features:
            w2vs = self.features["w2v"]
        u2vs = 0
        if "u2v" in self.features:
            u2vs = self.features["u2v"]
        ts = 0
        if "timestamp" in self.features:
            ts = self.features["timestamp"]

        # Init input
        x = np.zeros((1, self.maxlen, sum(self.features.values())))
        last_word = ""
        last_word_i = 0
        for t, char in enumerate(sequence):
            if u2vs > 0:
                try:
                    x[0, t, len_chars+w2vs:len_chars+w2vs+u2vs] = self.u2v[user]
                except: pass
                # x[i, t, len_chars+w2vs:len_chars+w2vs+u2vs] = self.U
            if ts > 0:
                try:
                    cos_s, sin_s, cos_d, sin_d = utils.timestamp_to_features(timestamp)
                    time_features = [cos_s, sin_s, cos_d, sin_d]
                    x[0, t, len_chars+w2vs+u2vs:len_chars+w2vs+u2vs+ts] = time_features
                except: pass
            x[0, t, self.char_indices[char]] = 1
            if char == " ":
                last_word = sequence[last_word_i:t]
                last_word_i = t+1
                try:
                    embed = self.UNK
                    if last_word in self.qvocab and self.qvocab[last_word] >= 5:
                        embed = self.w2v[last_word]
                    x[0, t, len_chars:len_chars+w2vs] = embed
                except:
                    x[0, t, len_chars:len_chars+w2vs] = self.UNK
            else:
                x[0, t, len_chars:len_chars+w2vs] = self.INC
        chars = {}
        predictions = self.m.predict(x)[0]
        preds = predictions[len(sequence)-1]
        indices = np.argpartition(preds, -self.beam)[-self.beam:]
        for i in indices:
            chars[self.indices_char[i]] = preds[i]
        return chars
--------------------------------------------------------------------------------

/utilities/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import dateutil.parser
from datetime import datetime

def time_to_real(hours, minutes, seconds):
    seconds_in_day = 24*60*60
    seconds = seconds + minutes*60 + hours*3600
    sin_time = np.sin(2*np.pi*seconds/seconds_in_day)
    cos_time = np.cos(2*np.pi*seconds/seconds_in_day)
    return cos_time, sin_time

def weekday_to_real(day):
    days_in_week = 7
    sin_time = np.sin(2*np.pi*day/days_in_week)
    cos_time = np.cos(2*np.pi*day/days_in_week)
    return cos_time, sin_time

def timestamp_to_features(timestamp):
    time = datetime.fromtimestamp(float(timestamp))
    weekday = time.weekday()
    cos_s, sin_s = time_to_real(time.hour, time.minute, time.second)
    cos_d, sin_d = weekday_to_real(weekday)
    return cos_s, sin_s, cos_d, sin_d
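
# Illustrative example of the cyclic encoding above: 18:00:00 is three quarters
# of the way through the day, so time_to_real(18, 0, 0) returns (cos, sin) ≈ (0.0, -1.0),
# and a Saturday (weekday 5) gives weekday_to_real(5) ≈ (-0.22, -0.97).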

def levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]
--------------------------------------------------------------------------------