├── NN_pipeline.py ├── README.md ├── bigtrain_fasttext_esim.py ├── bigtrain_w2v_esim.py ├── bigtrain_w2v_rnn.py ├── chizhu_rnn.py ├── fasttext_cos.py ├── finetuning_fasttext_esim.py ├── finetuning_w2v_esim.py ├── finetuning_w2v_rnn.py ├── gen_feature.py ├── get_corpus.py ├── train_fasttext.py ├── train_w2v.py └── w2v_cos.py /NN_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | tqdm.pandas() 28 | np.random.seed(1017) 29 | rn.seed(1017) 30 | tf.set_random_seed(1017) 31 | path = "/home/kesci/input/bytedance/" 32 | out = '/home/kesci/work/chizhu/' 33 | print(os.listdir(path)) 34 | 35 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label']) 36 | test = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title']) 37 | 38 | train['title']=train['title'].apply(lambda x:str(x).replace("\t",""),1) 39 | test['title']=test['title'].apply(lambda x:str(x).replace("\t",""),1) 40 | data_all=pd.concat([train,test],ignore_index=True) 41 | del train,test 42 | gc.collect() 43 | 44 | # 构造特征集 f1 45 | def get_union_data(row): 46 | title_list = row['title'].split(' ') 47 | query_list = row['query'].split(' ') 48 | return len(list(set(title_list).intersection(set(query_list)))) 49 | 50 | def same_1(row): 51 | title_list = row['title'].split(' ') 52 | query_list = row['query'].split(' ') 53 | if title_list[0] == query_list[0]: 54 | return 1 55 | else: 56 | return 0 57 | 58 | def same_2(row): 59 | title_list = row['title'].split(' ') 60 | query_list = row['query'].split(' ') 61 | if ' '.join(title_list[:2]) == ' '.join(query_list[:2]): 62 | return 1 63 | else: 64 | return 0 65 | 66 | def same_3(row): 67 | title_list = row['title'].split(' ') 68 | query_list = row['query'].split(' ') 69 | if ' '.join(title_list[:3]) == ' '.join(query_list[:3]): 70 | return 1 71 | else: 72 | return 0 73 | 74 | def is_all_in(row): 75 | if row['query'] in row['title']: 76 | return 1 77 | else: 78 | return 0 79 | 80 | feature = pd.DataFrame() 81 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' '))) 82 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' '))) 83 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度'] 84 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1) 85 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1) 86 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8) 
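# Quick sanity example (toy values, not from the dataset): for a pair like
#   row = {'query': '1 9 117', 'title': '3 9 117 120'}
# the helpers above give
#   get_union_data(row) -> 2   (shared terms '9' and '117')
#   same_1(row)         -> 0   (first terms '1' vs '3' differ)
#   is_all_in(row)      -> 0   ('1 9 117' is not a substring of the title)
# which is what the intersection / prefix / containment feature columns in this block record.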
87 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8) 88 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1) 89 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1) 90 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1) 91 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1) 92 | feature.to_csv(out + 'f1.csv', index=False) 93 | 94 | 95 | # 构造特征集 f2 96 | def pos_1(row): 97 | title_list = row['title'].split(' ') 98 | query_list = row['query'].split(' ') 99 | value = -1 100 | try: 101 | value = title_list.index(query_list[0]) 102 | except Exception: 103 | value = -1 104 | return value 105 | 106 | def pos_2(row): 107 | title_list = row['title'].split(' ') 108 | query_list = row['query'].split(' ') 109 | if len(query_list) <=1 : 110 | return -1 111 | try: 112 | value = title_list.index(query_list[1]) 113 | except Exception: 114 | value = -1 115 | return value 116 | 117 | def pos_3(row): 118 | title_list = row['title'].split(' ') 119 | query_list = row['query'].split(' ') 120 | if len(query_list) <=2 : 121 | return -1 122 | try: 123 | value = title_list.index(query_list[2]) 124 | except Exception: 125 | value = -1 126 | return value 127 | 128 | feature = pd.DataFrame() 129 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1) 130 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1) 131 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1) 132 | feature.to_csv(out + 'f2.csv', index=False) 133 | 134 | feature = pd.DataFrame() 135 | feature['标题求组合后词语'] = data_all.groupby('title').query.transform('nunique') 136 | # feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique') 137 | feature.to_csv(out + 'f3.csv', index=False) 138 | 139 | # data_all = data_all.fillna(-1) 140 | # data_all.to_csv(out+"data.csv", index=False) 141 | 142 | # data_all = pd.read_csv(out+"data.csv") 143 | 144 | # f5 word2vec本身相似度 145 | from gensim.models import Word2Vec 146 | import gensim 147 | import logging 148 | feature = pd.DataFrame() 149 | w2v = Word2Vec.load(out + 'w2v.model') 150 | def get_new_w2v(seq1, seq2): 151 | seq1 = seq1.split(' ') 152 | seq2 = seq2.split(' ') 153 | try: 154 | return w2v.n_similarity(seq1, seq2) 155 | except: 156 | return -1 157 | 158 | f3 = pd.read_csv(out + 'f3.csv') 159 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1) 160 | f3.to_csv(out + 'f3.csv', index=False) 161 | 162 | f1 = pd.read_csv(out + 'f1.csv') 163 | f2 = pd.read_csv(out + 'f2.csv') 164 | f3 = pd.read_csv(out + 'f3.csv') 165 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 166 | del f1, f2, f3 167 | gc.collect() 168 | 169 | train = data_all[data_all['label'] != -1] 170 | test = data_all[data_all['label'] == -1] 171 | del data_all 172 | gc.collect() 173 | train_feature = feature[:len(train)] 174 | test_feature = feature[len(train):] 175 | train.index = range(len(train)) 176 | test.index = range(len(test)) 177 | train_feature.index = range(len(train_feature)) 178 | test_feature.index = range(len(test_feature)) 179 | del feature 180 | gc.collect() 181 | 182 | embed_size = 300 # how big is each word vector 183 | # how many unique words to use (i.e num rows in embedding vector) 184 | max_features = None 185 | maxlen1 = 8 186 | maxlen2 = 20 # max number of words in a question to use 
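# Sketch of what the Tokenizer + pad_sequences calls below produce (the token ids here are
# made up for illustration): a query like "1 9 117" becomes e.g. [4, 2, 56] via
# texts_to_sequences, and pad_sequences(..., maxlen=8) left-pads it to
# [0, 0, 0, 0, 0, 4, 2, 56]; titles are handled the same way with maxlen=20.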
187 | 188 | train_X1 = train["query"].fillna("0").values 189 | test_X1 = test["query"].fillna("0").values 190 | 191 | train_X2 = train["title"].fillna("0").values 192 | test_X2 = test["title"].fillna("0").values 193 | print("token...") 194 | tokenizer = Tokenizer(num_words=max_features) 195 | tokenizer.fit_on_texts(list(train_X1)+list(test_X1) + 196 | list(train_X2)+list(test_X2)) 197 | train_X1 = tokenizer.texts_to_sequences(train_X1) 198 | test_X1 = tokenizer.texts_to_sequences(test_X1) 199 | ## Pad the sentences 200 | print("padding") 201 | train_X1 = pad_sequences(train_X1, maxlen=maxlen1) 202 | test_X1 = pad_sequences(test_X1, maxlen=maxlen1) 203 | 204 | train_X2 = tokenizer.texts_to_sequences(train_X2) 205 | test_X2 = tokenizer.texts_to_sequences(test_X2) 206 | ## Pad the sentences 207 | train_X2 = pad_sequences(train_X2, maxlen=maxlen2) 208 | test_X2 = pad_sequences(test_X2, maxlen=maxlen2) 209 | ## Get the target values 210 | 211 | train_y = train['label'].values 212 | 213 | word_index = tokenizer.word_index 214 | gc.collect() 215 | 216 | text_list = train['query'].values.tolist() 217 | text_list.extend(test['query'].values.tolist()) 218 | text_list.extend(train['title'].values.tolist()) 219 | text_list.extend(test['title'].values.tolist()) 220 | del train,test 221 | gc.collect() 222 | import time 223 | time.sleep(10) 224 | text_list = [[word for word in str(document).split(' ') ] for document in text_list] 225 | logging.basicConfig( 226 | format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO) 227 | w2v = Word2Vec(text_list, size=300, window=7, iter=30, seed=10, workers=4, min_count=3) 228 | w2v.save(out+"w2v.model") 229 | w2v.wv.save_word2vec_format(out+'new_w2v_300.txt') 230 | print("w2v model done") 231 | del w2v, text_list, texts 232 | gc.collect() 233 | 234 | 235 | def get_embedding_matrix(word_index, embed_size=embed_size, Emed_path=out+"new_w2v_300.txt"): 236 | embeddings_index = gensim.models.KeyedVectors.load_word2vec_format( 237 | Emed_path, binary=False) 238 | nb_words = len(word_index)+1 239 | embedding_matrix = np.zeros((nb_words, embed_size)) 240 | count = 0 241 | for word, i in tqdm(word_index.items()): 242 | if i >= nb_words: 243 | continue 244 | try: 245 | embedding_vector = embeddings_index[word] 246 | except: 247 | embedding_vector = np.zeros(embed_size) 248 | count += 1 249 | if embedding_vector is not None: 250 | embedding_matrix[i] = embedding_vector 251 | 252 | print("null cnt", count) 253 | return embedding_matrix 254 | 255 | 256 | embedding_matrix = get_embedding_matrix(word_index) 257 | 258 | 259 | class AdamW(Optimizer): 260 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 261 | epsilon=1e-8, decay=0., **kwargs): 262 | super(AdamW, self).__init__(**kwargs) 263 | with K.name_scope(self.__class__.__name__): 264 | self.iterations = K.variable(0, dtype='int64', name='iterations') 265 | self.lr = K.variable(lr, name='lr') 266 | self.beta_1 = K.variable(beta_1, name='beta_1') 267 | self.beta_2 = K.variable(beta_2, name='beta_2') 268 | self.decay = K.variable(decay, name='decay') 269 | # decoupled weight decay (2/4) 270 | self.wd = K.variable(weight_decay, name='weight_decay') 271 | self.epsilon = epsilon 272 | self.initial_decay = decay 273 | 274 | @interfaces.legacy_get_updates_support 275 | def get_updates(self, loss, params): 276 | grads = self.get_gradients(loss, params) 277 | self.updates = [K.update_add(self.iterations, 1)] 278 | wd = self.wd # decoupled weight decay (3/4) 279 | 280 | 
lr = self.lr 281 | if self.initial_decay > 0: 282 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 283 | K.dtype(self.decay)))) 284 | 285 | t = K.cast(self.iterations, K.floatx()) + 1 286 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 287 | (1. - K.pow(self.beta_1, t))) 288 | 289 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 290 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 291 | self.weights = [self.iterations] + ms + vs 292 | 293 | for p, g, m, v in zip(params, grads, ms, vs): 294 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 295 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 296 | # decoupled weight decay (4/4) 297 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 298 | 299 | self.updates.append(K.update(m, m_t)) 300 | self.updates.append(K.update(v, v_t)) 301 | new_p = p_t 302 | 303 | # Apply constraints. 304 | if getattr(p, 'constraint', None) is not None: 305 | new_p = p.constraint(new_p) 306 | 307 | self.updates.append(K.update(p, new_p)) 308 | return self.updates 309 | 310 | def get_config(self): 311 | config = {'lr': float(K.get_value(self.lr)), 312 | 'beta_1': float(K.get_value(self.beta_1)), 313 | 'beta_2': float(K.get_value(self.beta_2)), 314 | 'decay': float(K.get_value(self.decay)), 315 | 'weight_decay': float(K.get_value(self.wd)), 316 | 'epsilon': self.epsilon} 317 | base_config = super(AdamW, self).get_config() 318 | return dict(list(base_config.items()) + list(config.items())) 319 | 320 | 321 | class Attention(Layer): 322 | def __init__(self, step_dim, 323 | W_regularizer=None, b_regularizer=None, 324 | W_constraint=None, b_constraint=None, 325 | bias=True, **kwargs): 326 | self.supports_masking = True 327 | self.init = initializers.get('glorot_uniform') 328 | 329 | self.W_regularizer = regularizers.get(W_regularizer) 330 | self.b_regularizer = regularizers.get(b_regularizer) 331 | 332 | self.W_constraint = constraints.get(W_constraint) 333 | self.b_constraint = constraints.get(b_constraint) 334 | 335 | self.bias = bias 336 | self.step_dim = step_dim 337 | self.features_dim = 0 338 | super(Attention, self).__init__(**kwargs) 339 | 340 | def build(self, input_shape): 341 | assert len(input_shape) == 3 342 | 343 | self.W = self.add_weight((input_shape[-1],), 344 | initializer=self.init, 345 | name='{}_W'.format(self.name), 346 | regularizer=self.W_regularizer, 347 | constraint=self.W_constraint) 348 | self.features_dim = input_shape[-1] 349 | 350 | if self.bias: 351 | self.b = self.add_weight((input_shape[1],), 352 | initializer='zero', 353 | name='{}_b'.format(self.name), 354 | regularizer=self.b_regularizer, 355 | constraint=self.b_constraint) 356 | else: 357 | self.b = None 358 | 359 | self.built = True 360 | 361 | def compute_mask(self, input, input_mask=None): 362 | return None 363 | 364 | def call(self, x, mask=None): 365 | features_dim = self.features_dim 366 | step_dim = self.step_dim 367 | 368 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 369 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 370 | 371 | if self.bias: 372 | eij += self.b 373 | 374 | eij = K.tanh(eij) 375 | 376 | a = K.exp(eij) 377 | 378 | if mask is not None: 379 | a *= K.cast(mask, K.floatx()) 380 | 381 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 382 | 383 | a = K.expand_dims(a) 384 | weighted_input = x * a 385 | return K.sum(weighted_input, axis=1) 386 | 387 | def compute_output_shape(self, input_shape): 388 | return input_shape[0], self.features_dim 389 | 390 
| # AUC for a binary classifier 391 | def auc(y_true, y_pred): 392 | ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0) 393 | pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0) 394 | pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0) 395 | binSizes = -(pfas[1:]-pfas[:-1]) 396 | s = ptas*binSizes 397 | return K.sum(s, axis=0) 398 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 399 | # PFA, prob false alert for binary classifier 400 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 401 | y_pred = K.cast(y_pred >= threshold, 'float32') 402 | # N = total number of negative labels 403 | N = K.sum(1 - y_true) 404 | # FP = total number of false alerts, alerts from the negative class labels 405 | FP = K.sum(y_pred - y_pred * y_true) 406 | return FP/N 407 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 408 | # P_TA prob true alerts for binary classifier 409 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 410 | y_pred = K.cast(y_pred >= threshold, 'float32') 411 | # P = total number of positive labels 412 | P = K.sum(y_true) 413 | # TP = total number of correct alerts, alerts from the positive class labels 414 | TP = K.sum(y_pred * y_true) 415 | return TP/P 416 | 417 | 418 | val = train[99000000:] 419 | train = train[:99000000] 420 | val_X1 = train_X1[99000000:] 421 | val_X2 = train_X2[99000000:] 422 | train_X1 = train_X1[:99000000] 423 | train_X2 = train_X2[:99000000] 424 | val_feature = train_feature[99000000:] 425 | train_feature = train_feature[:99000000] 426 | 427 | class ManDist(keras.layers.Layer): # 封装成keras层的曼哈顿距离计算 428 | 429 | # 初始化ManDist层,此时不需要任何参数输入 430 | def __init__(self, **kwargs): 431 | self.result = None 432 | super(ManDist, self).__init__(**kwargs) 433 | 434 | # 自动建立ManDist层 435 | def build(self, input_shape): 436 | super(ManDist, self).build(input_shape) 437 | 438 | # 计算曼哈顿距离 439 | def call(self, x, **kwargs): 440 | self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)) 441 | return self.result 442 | 443 | # 返回结果 444 | def compute_output_shape(self, input_shape): 445 | return K.int_shape(self.result) 446 | 447 | 448 | sc = StandardScaler() 449 | col_len = len(train_feature.columns) 450 | sc.fit(pd.concat([train_feature, val_feature, test_feature])) 451 | train_feature = sc.transform(train_feature) 452 | val_feature = sc.transform(val_feature) 453 | test_feature = sc.transform(test_feature) 454 | 455 | def get_model(embedding_matrix): 456 | 457 | K.clear_session() 458 | #The embedding layer containing the word vectors 459 | emb_layer = Embedding( 460 | input_dim=embedding_matrix.shape[0], 461 | output_dim=embedding_matrix.shape[1], 462 | weights=[embedding_matrix], 463 | trainable=False 464 | ) 465 | sdrop=SpatialDropout1D(rate=0.2) 466 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 467 | kernel_initializer=glorot_uniform(seed = 123))) 468 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 469 | kernel_initializer=glorot_uniform(seed = 123))) 470 | 471 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 472 | 473 | # Define inputs 474 | seq1 = Input(shape=(maxlen1,)) 475 | x1 = emb_layer(seq1) 476 | x1 = sdrop(x1) 477 | lstm1 = lstm_layer(x1) 478 | gru1 = gru_layer(lstm1) 479 
| att_1 = Attention(maxlen1)(lstm1) 480 | att_3 = Attention(maxlen1)(gru1) 481 | cnn1 = cnn1d_layer(lstm1) 482 | 483 | avg_pool = GlobalAveragePooling1D() 484 | max_pool = GlobalMaxPooling1D() 485 | 486 | seq2 = Input(shape=(maxlen2,)) 487 | x2 = emb_layer(seq2) 488 | x2 = sdrop(x2) 489 | lstm2 = lstm_layer(x2) 490 | gru2 = gru_layer(lstm2) 491 | att_2 = Attention(maxlen2)(lstm2) 492 | att_4 = Attention(maxlen2)(gru2) 493 | cnn2 = cnn1d_layer(lstm2) 494 | 495 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 496 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 497 | 498 | merge = Multiply()([x1, x2]) 499 | merge = Dropout(0.2)(merge) 500 | 501 | hin = Input(shape=(col_len,)) 502 | # htime = Dense(col_len,activation='relu')(hin) 503 | x = Concatenate()([merge,hin]) 504 | # The MLP that determines the outcome 505 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 506 | # x = Dropout(0.2)(x) 507 | # x = BatchNormalization()(x) 508 | 509 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 510 | 511 | 512 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 513 | 514 | model.compile(loss='binary_crossentropy', 515 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 516 | metrics=["accuracy",auc]) 517 | # model.summary() 518 | return model 519 | 520 | 521 | ####模型训练 522 | 523 | print("train...") 524 | print("###"*30) 525 | gc.collect() 526 | K.clear_session() 527 | model = get_model(embedding_matrix) 528 | # model = esim() 529 | model.summary() 530 | early_stopping = EarlyStopping( 531 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 532 | reduce_lr = ReduceLROnPlateau( 533 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 534 | bst_model_path = out+'chizhurnn_chizhu_weight.h5' 535 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 536 | save_best_only=True, verbose=1, save_weights_only=True) 537 | callbacks = [checkpoint, reduce_lr, early_stopping] 538 | print("load weight....") 539 | # model.load_weights(bst_model_path) 540 | 541 | hist = model.fit([train_X1,train_X2,train_feature],train['label'], 542 | validation_data=([val_X1,val_X2,val_feature], val['label']), 543 | epochs=30, batch_size=2048, 544 | # class_weight="auto", 545 | callbacks=callbacks,verbose=1 546 | 547 | ) 548 | 549 | model.load_weights(bst_model_path) 550 | 551 | res = np.squeeze(model.predict( 552 | [val_X1, val_X2, val_feature], batch_size=2048, verbose=1)) 553 | 554 | print("val auc:{}".format(roc_auc_score(val['label'], res))) 555 | val['prob'] = res 556 | 557 | 558 | def perauc(df): 559 | temp = pd.DataFrame(index=range(1)) 560 | temp['query_id'] = df['query_id'].values[0] 561 | try: 562 | temp['auc'] = roc_auc_score(df['label'].values.astype(int), df['prob']) 563 | except: 564 | temp['auc'] = 0.5 565 | return temp 566 | 567 | 568 | eval_df = val.groupby("query_id", as_index=False).apply(lambda x: perauc(x)) 569 | eval_df.index = range(len(eval_df)) 570 | print("qauc:", eval_df['auc'].mean()) 571 | 572 | test_prob = np.squeeze(model.predict( 573 | [test_X1, test_X2, test_feature], batch_size=2048, verbose=1)) 574 | 575 | 576 | sub = test[['query_id', 'query_title_id']] 577 | sub['prediction'] = test_prob 578 | sub.to_csv(out+"/submit_rnn.csv", index=False, header=False) 579 | 580 | 581 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | ### 高校赛解决方案 2 | #### 赛题介绍 3 | * **数据** 4 | 5 | 提供10亿量级的数据,根据query和title预测query下doc点击率。数据已经脱敏并且分好词。 6 | 7 | | 列名 | 类型 | 示例 | 8 | | ------ | ------ | ------ | 9 | | query_id | int | 3 | 10 | | query | hash string,term空格分割 | 1 9 117 | 11 | | query_title_id | title在query下的唯一标识 | 2 | 12 | | title | hash string,term空格分割 | 3 9 120 | 13 | | label | int,取值{0, 1} | 0 | 14 | * **任务分析** 15 | 二分类问题。文本相似度+ctr点击预测 16 | * **难点** 17 | 18 | * 数据量大 19 | * 数据脱敏 20 | 21 | #### 解决方案 22 | ##### 特征工程(FE) 23 | * 问题长度 24 | * 标题长度 25 | * 标题长度-问题长度 26 | * 问题是否全部在标题里面 27 | * 标题和问题的共词个数 28 | * 标题问题词语的共词个数/问题长度 29 | * 标题问题词语的共词个数/标题长度 30 | * 编辑距离 31 | * 前一个词语是否相同 32 | * 前二个词语是否相同 33 | * 前三个词语是否相同 34 | * 第一个词语在标题里面出现位置 35 | * 第二个词语在标题里面出现位置 36 | * 第三个词语在标题里面出现位置 37 | * 标题求组合后词语 38 | * 词语求组合后标题 39 | * w2v_n_similarity 40 | * fasttext的余弦相似度 41 | * word2vec的余弦相似度 42 | 43 | (共19个特征,放入LGB模型lb是0.597) 44 | ##### NN模型 45 | * 孪生RNN 46 | * query+title双输入+FE特征 47 | * 使用最后一亿的数据(前9.9千万条数据训练+后1百万数据验证) 48 | * 网络结构 49 | ```python 50 | def get_model(embedding_matrix): 51 | K.clear_session() 52 | #The embedding layer containing the word vectors 53 | emb_layer = Embedding( 54 | input_dim=embedding_matrix.shape[0], 55 | output_dim=embedding_matrix.shape[1], 56 | weights=[embedding_matrix], 57 | trainable=False 58 | ) 59 | sdrop=SpatialDropout1D(rate=0.2) 60 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123))) 61 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123))) 62 | 63 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 64 | 65 | # Define inputs 66 | seq1 = Input(shape=(maxlen_query,)) 67 | x1 = emb_layer(seq1) 68 | x1 = sdrop(x1) 69 | lstm1 = lstm_layer(x1) 70 | gru1 = gru_layer(lstm1) 71 | att_1 = Attention(maxlen_query)(lstm1) 72 | att_3 = Attention(maxlen_query)(gru1) 73 | cnn1 = cnn1d_layer(lstm1) 74 | 75 | avg_pool = GlobalAveragePooling1D() 76 | max_pool = GlobalMaxPooling1D() 77 | 78 | seq2 = Input(shape=(maxlen_answer,)) 79 | x2 = emb_layer(seq2) 80 | x2 = sdrop(x2) 81 | lstm2 = lstm_layer(x2) 82 | gru2 = gru_layer(lstm2) 83 | att_2 = Attention(maxlen_answer)(lstm2) 84 | att_4 = Attention(maxlen_answer)(gru2) 85 | cnn2 = cnn1d_layer(lstm2) 86 | 87 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 88 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 89 | 90 | merge = Multiply()([x1, x2]) 91 | merge = Dropout(0.2)(merge) 92 | 93 | hin = Input(shape=(19,)) 94 | # htime = Dense(col_len,activation='relu')(hin) 95 | x = Concatenate()([merge,hin]) 96 | # The MLP that determines the outcome 97 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 98 | # x = Dropout(0.2)(x) 99 | # x = BatchNormalization()(x) 100 | 101 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 102 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 103 | model.compile(loss='binary_crossentropy', 104 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 105 | metrics=["accuracy",auc]) 106 | # model.summary() 107 | return model 108 | ``` 109 | 110 | * 使用AdamW优化器加快训练过程 111 | * 使用最新刚出的lookahead 优化器(reference:Lookahead Optimizer: k steps forward, 1 step back(https://arxiv.org/abs/1907.08610)) 112 | Lookahead 算法的性能显著优于 SGD 和 Adam,它迭代地更新两组权重。直观来说,Lookahead 算法通过提前观察另一个优化器生成的「fast 
weights」序列,来选择搜索方向。该研究发现,Lookahead 算法能够提升学习稳定性,不仅降低了调参需要的功夫,同时还能提升收敛速度与效果。 113 | * 线上效果 114 | **lb 0.6214** 115 | * **fine-tuning(亮点)** 116 | * 思考:官方提供10亿的数据量?先验知识告诉我们,数据越多效果越好,那么如何充分利用数据? 117 | * 解决方法 118 | * 先用10亿数据训练一个不加任何特征的裸NN,保存权重(如何能训练10亿?) 119 | > 文件流处理数据+分批次训练(训练10亿数据最大占用内存才10G) 120 | * 加载裸NN模型,获得倒二层的feature map作为输出,加入新的FE特征输入,然后把基模型的feature map和FE特征拼接最后送入全连接层。用最后一亿的数据fine-tuning 整个网络。 121 | (再次展示预训练在NLP领域的举足轻重不可动摇的地位) 122 | 123 | * fine-tuning用到的模型(整体参数都是改小了的,因为只有单卡机器,如果可以多卡训练,放开参数估计单模可以0.64+) 124 | * word2vec300维+孪生RNN(小参数) **lb 0.6248** 125 | * word2vec300维+ESIM(极小参数,最后时刻怕跑不完) **lb 0.626** 126 | * fasttext100维+ESIM(小参数) **lb 0.6336 单模都可以在A榜排到第三** 127 | * fine-tuning 网络结构 128 | ```python 129 | def aux_esim_model(embed_matrix,model_weight_path): 130 | base_model = esim(embed_matrix) 131 | base_model.load_weights(model_weight_path) 132 | input_q, input_a = base_model.inputs 133 | input_f = Input((19,)) 134 | hidden_esim = base_model.get_layer(index=28).output 135 | merged = Concatenate()([hidden_esim, input_f]) 136 | #dense = BatchNormalization()(merged) 137 | dense = Dense(512, activation='relu')(merged) 138 | #dense = BatchNormalization()(dense) 139 | dense = Dropout(0.5)(dense) 140 | dense = Dense(256, activation='relu')(dense) 141 | #dense = BatchNormalization()(dense) 142 | dense = Dropout(0.5)(dense) 143 | out_ = Dense(1, activation='sigmoid')(dense) 144 | 145 | model = Model(inputs=[input_q,input_a,input_f], outputs=out_) 146 | model.compile(loss='binary_crossentropy', 147 | optimizer=AdamW(lr=0.0003,weight_decay=0.02), 148 | metrics=["accuracy"]) 149 | return model 150 | ``` 151 | * ESIM 网络结构 152 | ```python 153 | def esim(embedding_matrix, 154 | maxlen=20, 155 | lstm_dim=64, 156 | dense_dim=128, 157 | dense_dropout=0.5): 158 | # Based on arXiv:1609.06038 159 | q1 = Input(name='q1', shape=(8,)) 160 | q2 = Input(name='q2', shape=(20,)) 161 | 162 | # Embedding 163 | embedding = create_pretrained_embedding( 164 | embedding_matrix, mask_zero=False) 165 | bn = BatchNormalization(axis=2) 166 | q1_embed = bn(embedding(q1)) 167 | q2_embed = bn(embedding(q2)) 168 | 169 | # Encode 170 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 171 | q1_encoded = encode(q1_embed) 172 | q2_encoded = encode(q2_embed) 173 | 174 | # Attention 175 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 176 | 177 | # Compose 178 | q1_combined = Concatenate()( 179 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 180 | q2_combined = Concatenate()( 181 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 182 | 183 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 184 | q1_compare = compose(q1_combined) 185 | q2_compare = compose(q2_combined) 186 | 187 | # Aggregate 188 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 189 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 190 | 191 | 192 | merged = Concatenate()([q1_rep, q2_rep]) 193 | 194 | dense = BatchNormalization()(merged) 195 | dense = Dense(dense_dim, activation='elu')(dense) 196 | dense = BatchNormalization()(dense) 197 | dense = Dropout(dense_dropout) (dense) 198 | dense = Dense(dense_dim, activation='elu')(dense) 199 | dense = BatchNormalization()(dense) 200 | dense = Dropout(dense_dropout)(dense) 201 | out_ = Dense(1, activation='sigmoid')(dense) 202 | 203 | model = Model(inputs=[q1, q2], outputs=out_) 204 | model.compile(loss='binary_crossentropy', 205 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,), 206 | 
metrics=["accuracy",auc]) 207 | return model 208 | ``` 209 | 210 | 211 | #### 线上提交 212 | * finetuning_fasttext_esim(**0.6336**)*0.6+\ 213 | finetuning_w2v_esim(**0.626**)*0.2+\ 214 | finetuning_w2v_esim(**0.6248**)*0.2=**lb 0.6366** 215 |
216 | 217 | * finetuning_fasttext_esim(**0.6336**)*0.5+\ 218 | finetuning_w2v_esim(**0.626**)*0.2+\ 219 | finetuning_w2v_esim(**0.6248**)*0.2+\ 220 | 孪生RNN(**0.6214**)*0.1=ensemble_NN 221 | 222 | lgb(**0.597**)*0.1+ensemble_NN*0.9= **lb 0.6371** 223 | 224 | 225 | 226 | 227 | #### 我们的优势 228 | * 工业可部署 229 | > 真实的线上业务也是庞大的数据量,如何充分利用数据是个难题。我们的方案适用于大数据量(流式训练全量数据内存小+finetuing迁移学习效果佳) 230 | 231 | * 简单而实用 232 | > 我们总共才19个特征,不需要提取大量的手工特征,所以可以说不依赖于LGB模型,LGB模型是全量模型,要么只能选用小数据集提特征要么大数据量提取不了特征,不易迭代。我们的方案流式处理,易于迭代更新。 233 | 234 | 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /bigtrain_fasttext_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = fasttext.load_model(out+'corpus.fasttext.model') 37 | word2index = {word: index+1 for index, word in enumerate(w2v.words)} 38 | index2word = {index+1: word for index, word in enumerate(w2v.words)} 39 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 40 | maxlen_query=8): 41 | if label_tag: 42 | _, _q, _, _a, _label = line.strip().split(',') 43 | else: 44 | _, _q, _, _a = line.strip().split(',') 45 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 46 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 47 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 48 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 49 | if label_tag: 50 | return q_pad, a_pad, int(_label) 51 | return q_pad, a_pad 52 | 53 | 54 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 55 | while True: 56 | fin = open(path, 'r') 57 | batch_q, batch_a, batch_label = [], [], [] 58 | for line in fin: 59 | if len(batch_q) == chunk_size*batch_size: 60 | batch_q = np.array(batch_q) 61 | batch_a = np.array(batch_a) 62 | if label_tag: 63 | batch_label = np.array(batch_label) 64 | idx = list(range(chunk_size*batch_size)) 65 | if shuffle: 66 | np.random.shuffle(idx) 67 | for i in range(chunk_size): 68 | if label_tag: 69 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 70 | else: 71 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 
np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 72 | batch_q, batch_a, batch_label = [], [], [] 73 | if label_tag: 74 | q, a, l = gen_feature_help(line, label_tag=label_tag) 75 | else: 76 | q, a = gen_feature_help(line, label_tag=label_tag) 77 | l = 0 78 | batch_q.append(q) 79 | batch_a.append(a) 80 | if label_tag: 81 | batch_label.append(l) 82 | 83 | batch_q = np.array(batch_q) 84 | batch_a = np.array(batch_a) 85 | 86 | if label_tag: 87 | batch_label = np.array(batch_label) 88 | idx = list(range(len(batch_q))) 89 | if shuffle: 90 | np.random.shuffle(idx) 91 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 92 | if label_tag: 93 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 94 | else: 95 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 96 | fin.close() 97 | 98 | 99 | def get_embedding_matrix(): 100 | m = np.zeros(shape=(len(index2word)+1, 100)) 101 | for i, w in index2word.items(): 102 | m[i, :] = w2v[w] 103 | return m 104 | 105 | 106 | embed_matrix = get_embedding_matrix() 107 | maxlen_query = 8 108 | maxlen_answer = 20 109 | 110 | 111 | class AdamW(Optimizer): 112 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 113 | epsilon=1e-8, decay=0., **kwargs): 114 | super(AdamW, self).__init__(**kwargs) 115 | with K.name_scope(self.__class__.__name__): 116 | self.iterations = K.variable(0, dtype='int64', name='iterations') 117 | self.lr = K.variable(lr, name='lr') 118 | self.beta_1 = K.variable(beta_1, name='beta_1') 119 | self.beta_2 = K.variable(beta_2, name='beta_2') 120 | self.decay = K.variable(decay, name='decay') 121 | # decoupled weight decay (2/4) 122 | self.wd = K.variable(weight_decay, name='weight_decay') 123 | self.epsilon = epsilon 124 | self.initial_decay = decay 125 | 126 | @interfaces.legacy_get_updates_support 127 | def get_updates(self, loss, params): 128 | grads = self.get_gradients(loss, params) 129 | self.updates = [K.update_add(self.iterations, 1)] 130 | wd = self.wd # decoupled weight decay (3/4) 131 | 132 | lr = self.lr 133 | if self.initial_decay > 0: 134 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 135 | K.dtype(self.decay)))) 136 | 137 | t = K.cast(self.iterations, K.floatx()) + 1 138 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 139 | (1. - K.pow(self.beta_1, t))) 140 | 141 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 142 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 143 | self.weights = [self.iterations] + ms + vs 144 | 145 | for p, g, m, v in zip(params, grads, ms, vs): 146 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 147 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 148 | # decoupled weight decay (4/4) 149 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 150 | 151 | self.updates.append(K.update(m, m_t)) 152 | self.updates.append(K.update(v, v_t)) 153 | new_p = p_t 154 | 155 | # Apply constraints. 
156 | if getattr(p, 'constraint', None) is not None: 157 | new_p = p.constraint(new_p) 158 | 159 | self.updates.append(K.update(p, new_p)) 160 | return self.updates 161 | 162 | def get_config(self): 163 | config = {'lr': float(K.get_value(self.lr)), 164 | 'beta_1': float(K.get_value(self.beta_1)), 165 | 'beta_2': float(K.get_value(self.beta_2)), 166 | 'decay': float(K.get_value(self.decay)), 167 | 'weight_decay': float(K.get_value(self.wd)), 168 | 'epsilon': self.epsilon} 169 | base_config = super(AdamW, self).get_config() 170 | return dict(list(base_config.items()) + list(config.items())) 171 | 172 | 173 | class Attention(Layer): 174 | def __init__(self, step_dim, 175 | W_regularizer=None, b_regularizer=None, 176 | W_constraint=None, b_constraint=None, 177 | bias=True, **kwargs): 178 | self.supports_masking = True 179 | self.init = initializers.get('glorot_uniform') 180 | 181 | self.W_regularizer = regularizers.get(W_regularizer) 182 | self.b_regularizer = regularizers.get(b_regularizer) 183 | 184 | self.W_constraint = constraints.get(W_constraint) 185 | self.b_constraint = constraints.get(b_constraint) 186 | 187 | self.bias = bias 188 | self.step_dim = step_dim 189 | self.features_dim = 0 190 | super(Attention, self).__init__(**kwargs) 191 | 192 | def build(self, input_shape): 193 | assert len(input_shape) == 3 194 | 195 | self.W = self.add_weight((input_shape[-1],), 196 | initializer=self.init, 197 | name='{}_W'.format(self.name), 198 | regularizer=self.W_regularizer, 199 | constraint=self.W_constraint) 200 | self.features_dim = input_shape[-1] 201 | 202 | if self.bias: 203 | self.b = self.add_weight((input_shape[1],), 204 | initializer='zero', 205 | name='{}_b'.format(self.name), 206 | regularizer=self.b_regularizer, 207 | constraint=self.b_constraint) 208 | else: 209 | self.b = None 210 | 211 | self.built = True 212 | 213 | def compute_mask(self, input, input_mask=None): 214 | return None 215 | 216 | def call(self, x, mask=None): 217 | features_dim = self.features_dim 218 | step_dim = self.step_dim 219 | 220 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 221 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 222 | 223 | if self.bias: 224 | eij += self.b 225 | 226 | eij = K.tanh(eij) 227 | 228 | a = K.exp(eij) 229 | 230 | if mask is not None: 231 | a *= K.cast(mask, K.floatx()) 232 | 233 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 234 | 235 | a = K.expand_dims(a) 236 | weighted_input = x * a 237 | return K.sum(weighted_input, axis=1) 238 | 239 | def compute_output_shape(self, input_shape): 240 | return input_shape[0], self.features_dim 241 | 242 | # AUC for a binary classifier 243 | 244 | 245 | def auc(y_true, y_pred): 246 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 247 | for k in np.linspace(0, 1, 1000)], axis=0) 248 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 249 | for k in np.linspace(0, 1, 1000)], axis=0) 250 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 251 | binSizes = -(pfas[1:]-pfas[:-1]) 252 | s = ptas*binSizes 253 | return K.sum(s, axis=0) 254 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 255 | # PFA, prob false alert for binary classifier 256 | 257 | 258 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 259 | y_pred = K.cast(y_pred >= threshold, 'float32') 260 | # N = total number of negative labels 261 | N = K.sum(1 - y_true) 262 | # FP = total number of false alerts, alerts from the negative 
class labels 263 | FP = K.sum(y_pred - y_pred * y_true) 264 | return FP/N 265 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 266 | # P_TA prob true alerts for binary classifier 267 | 268 | 269 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 270 | y_pred = K.cast(y_pred >= threshold, 'float32') 271 | # P = total number of positive labels 272 | P = K.sum(y_true) 273 | # TP = total number of correct alerts, alerts from the positive class labels 274 | TP = K.sum(y_pred * y_true) 275 | return TP/P 276 | 277 | 278 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 279 | "Create embedding layer from a pretrained weights array" 280 | in_dim, out_dim = pretrained_weights.shape 281 | embedding = Embedding(in_dim, out_dim, weights=[ 282 | pretrained_weights], trainable=False, **kwargs) 283 | return embedding 284 | 285 | 286 | def unchanged_shape(input_shape): 287 | "Function for Lambda layer" 288 | return input_shape 289 | 290 | 291 | def substract(input_1, input_2): 292 | "Substract element-wise" 293 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 294 | out_ = Add()([input_1, neg_input_2]) 295 | return out_ 296 | 297 | 298 | def submult(input_1, input_2): 299 | "Get multiplication and subtraction then concatenate results" 300 | mult = Multiply()([input_1, input_2]) 301 | sub = substract(input_1, input_2) 302 | out_ = Concatenate()([sub, mult]) 303 | return out_ 304 | 305 | 306 | def apply_multiple(input_, layers): 307 | "Apply layers to input then concatenate result" 308 | if not len(layers) > 1: 309 | raise ValueError('Layers list should contain more than 1 layer') 310 | else: 311 | agg_ = [] 312 | for layer in layers: 313 | agg_.append(layer(input_)) 314 | out_ = Concatenate()(agg_) 315 | return out_ 316 | 317 | 318 | def time_distributed(input_, layers): 319 | "Apply a list of layers in TimeDistributed mode" 320 | out_ = [] 321 | node_ = input_ 322 | for layer_ in layers: 323 | node_ = TimeDistributed(layer_)(node_) 324 | out_ = node_ 325 | return out_ 326 | 327 | 328 | def soft_attention_alignment(input_1, input_2): 329 | "Align text representation with neural soft attention" 330 | attention = Dot(axes=-1)([input_1, input_2]) 331 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 332 | output_shape=unchanged_shape)(attention) 333 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 334 | output_shape=unchanged_shape)(attention)) 335 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 336 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 337 | return in1_aligned, in2_aligned 338 | 339 | 340 | def decomposable_attention(pretrained_weights, 341 | num_shape, 342 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 343 | compare_dim=500, compare_dropout=0.2, 344 | dense_dim=300, dense_dropout=0.2, 345 | lr=1e-3, activation='elu', maxlen=20): 346 | # Based on: https://arxiv.org/abs/1606.01933 347 | 348 | q1 = Input(name='q1', shape=(maxlen,)) 349 | q2 = Input(name='q2', shape=(maxlen,)) 350 | 351 | # Embedding 352 | embedding = create_pretrained_embedding(pretrained_weights, 353 | mask_zero=False) 354 | q1_embed = embedding(q1) 355 | q2_embed = embedding(q2) 356 | 357 | # Projection 358 | projection_layers = [] 359 | if projection_hidden > 0: 360 | projection_layers.extend([ 361 | Dense(projection_hidden, activation=activation), 362 | Dropout(rate=projection_dropout), 363 | ]) 364 | 
projection_layers.extend([ 365 | Dense(projection_dim, activation=None), 366 | Dropout(rate=projection_dropout), 367 | ]) 368 | q1_encoded = time_distributed(q1_embed, projection_layers) 369 | q2_encoded = time_distributed(q2_embed, projection_layers) 370 | 371 | # Attention 372 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 373 | 374 | # Compare 375 | q1_combined = Concatenate()( 376 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 377 | q2_combined = Concatenate()( 378 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 379 | compare_layers = [ 380 | Dense(compare_dim, activation=activation), 381 | Dropout(compare_dropout), 382 | Dense(compare_dim, activation=activation), 383 | Dropout(compare_dropout), 384 | ] 385 | q1_compare = time_distributed(q1_combined, compare_layers) 386 | q2_compare = time_distributed(q2_combined, compare_layers) 387 | 388 | # Aggregate 389 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 390 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 391 | 392 | # Classifier 393 | merged = Concatenate()([q1_rep, q2_rep]) 394 | dense = BatchNormalization()(merged) 395 | dense = Dense(dense_dim, activation=activation)(dense) 396 | dense = Dropout(dense_dropout)(dense) 397 | dense = BatchNormalization()(dense) 398 | dense = Dense(dense_dim, activation=activation)(dense) 399 | dense = Dropout(dense_dropout)(dense) 400 | out_ = Dense(1, activation='sigmoid')(dense) 401 | 402 | model = Model(inputs=[q1, q2], outputs=out_) 403 | model.compile(loss='binary_crossentropy', 404 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 405 | metrics=["accuracy", auc]) 406 | return model 407 | 408 | 409 | def esim(embedding_matrix, 410 | maxlen=20, 411 | lstm_dim=30, 412 | dense_dim=30, 413 | dense_dropout=0.5): 414 | # Based on arXiv:1609.06038 415 | q1 = Input(name='q1', shape=(8,)) 416 | q2 = Input(name='q2', shape=(20,)) 417 | 418 | # Embedding 419 | embedding = create_pretrained_embedding( 420 | embedding_matrix, mask_zero=False) 421 | bn = BatchNormalization(axis=2) 422 | q1_embed = bn(embedding(q1)) 423 | q2_embed = bn(embedding(q2)) 424 | 425 | # Encode 426 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 427 | q1_encoded = encode(q1_embed) 428 | q2_encoded = encode(q2_embed) 429 | 430 | # Attention 431 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 432 | 433 | # Compose 434 | q1_combined = Concatenate()( 435 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 436 | q2_combined = Concatenate()( 437 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 438 | 439 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 440 | q1_compare = compose(q1_combined) 441 | q2_compare = compose(q2_combined) 442 | 443 | # Aggregate 444 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 445 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 446 | 447 | # leaks_input = Input(shape=(num_shape,)) 448 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 449 | 450 | # Classifier 451 | merged = Concatenate()([q1_rep, q2_rep]) 452 | 453 | dense = BatchNormalization()(merged) 454 | dense = Dense(dense_dim, activation='elu')(dense) 455 | dense = BatchNormalization()(dense) 456 | dense = Dropout(dense_dropout)(dense) 457 | dense = Dense(dense_dim, activation='elu')(dense) 458 | dense = BatchNormalization()(dense) 459 | dense = Dropout(dense_dropout)(dense) 
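# Rough shape check for the head above (assuming this file's defaults lstm_dim=30,
# dense_dim=30): each compose BiLSTM emits 2*30 = 60 features per timestep,
# avg+max pooling turns that into 120 features per sentence, so `merged` is a
# 240-dim vector that the two Dense(30) blocks reduce before the sigmoid output below.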
460 | out_ = Dense(1, activation='sigmoid')(dense) 461 | 462 | model = Model(inputs=[q1, q2], outputs=out_) 463 | model.compile(loss='binary_crossentropy', 464 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 465 | metrics=["accuracy"]) 466 | return model 467 | 468 | 469 | ####模型训练 470 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 471 | batch_size=4096, label_tag=True, chunk_size=1000) 472 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 473 | batch_size=4096, label_tag=True, chunk_size=1000) 474 | print("train...") 475 | print("###"*30) 476 | gc.collect() 477 | K.clear_session() 478 | model = esim(embed_matrix) 479 | model.summary() 480 | early_stopping = EarlyStopping( 481 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 482 | reduce_lr = ReduceLROnPlateau( 483 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 484 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 485 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 486 | save_best_only=False, 487 | verbose=1, save_weights_only=True, period=1) 488 | callbacks = [checkpoint, reduce_lr, early_stopping] 489 | # print("load weight....") 490 | 491 | 492 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 493 | epochs=10, verbose=1, callbacks=callbacks, 494 | validation_data=val_gen, validation_steps=int( 495 | np.ceil(1000000/2048)), 496 | max_queue_size=10, workers=1, use_multiprocessing=False) 497 | 498 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 499 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 500 | val_prob = model.predict_generator( 501 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 502 | 503 | f = open('/home/kesci/zhifeng/val.csv', 'r') 504 | q, a, l = [], [], [] 505 | for line in f: 506 | qid, _, aid, _, label = line.strip().split(',') 507 | q.append(qid) 508 | a.append(aid) 509 | l.append(int(label)) 510 | 511 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 512 | val_df['prob'] = val_prob.flatten() 513 | 514 | roc_auc_score(val_df['label'], val_df['prob']) 515 | 516 | 517 | def perauc(df): 518 | temp = pd.Series() 519 | try: 520 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 521 | except: 522 | temp['auc'] = 0.5 523 | return temp 524 | 525 | 526 | eval_df = val_df.groupby("qid").apply(perauc) 527 | eval_df.index = range(len(eval_df)) 528 | print("qauc:", eval_df['auc'].mean()) 529 | 530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 531 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 532 | prob = model.predict_generator( 533 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 534 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 535 | names=['qid', 'aid', 'prob']) 536 | sub['prob'] = prob.flatten() 537 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv', 538 | index=False, header=False) 539 | 540 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 541 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 542 | prob = model.predict_generator( 543 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 544 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 545 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 546 | final['prob'] = prob.flatten() 547 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 548 | 
index=False, header=False) 549 | -------------------------------------------------------------------------------- /bigtrain_w2v_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 37 | 38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 40 | 41 | 42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 43 | maxlen_query=8): 44 | if label_tag: 45 | _, _q, _, _a, _label = line.strip().split(',') 46 | else: 47 | _, _q, _, _a = line.strip().split(',') 48 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 49 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 52 | if label_tag: 53 | return q_pad, a_pad, int(_label) 54 | return q_pad, a_pad 55 | 56 | 57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 58 | while True: 59 | fin = open(path, 'r') 60 | batch_q, batch_a, batch_label = [], [], [] 61 | for line in fin: 62 | if len(batch_q) == chunk_size*batch_size: 63 | batch_q = np.array(batch_q) 64 | batch_a = np.array(batch_a) 65 | if label_tag: 66 | batch_label = np.array(batch_label) 67 | idx = list(range(chunk_size*batch_size)) 68 | if shuffle: 69 | np.random.shuffle(idx) 70 | for i in range(chunk_size): 71 | if label_tag: 72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 73 | else: 74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 75 | batch_q, batch_a, batch_label = [], [], [] 76 | if label_tag: 77 | q, a, l = gen_feature_help(line, label_tag=label_tag) 78 | else: 79 | q, a = gen_feature_help(line, label_tag=label_tag) 80 | l = 0 81 | batch_q.append(q) 82 | batch_a.append(a) 83 | if label_tag: 84 | batch_label.append(l) 85 | 86 | batch_q = np.array(batch_q) 87 | batch_a = np.array(batch_a) 88 | 89 | if label_tag: 90 | batch_label = 
np.array(batch_label) 91 | idx = list(range(len(batch_q))) 92 | if shuffle: 93 | np.random.shuffle(idx) 94 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 95 | if label_tag: 96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 97 | else: 98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 99 | fin.close() 100 | 101 | 102 | def get_embedding_matrix(): 103 | m = np.zeros(shape=(len(index2word)+1, 300)) 104 | for i, w in index2word.items(): 105 | m[i, :] = w2v[w] 106 | return m 107 | 108 | 109 | embed_matrix = get_embedding_matrix() 110 | maxlen_query = 8 111 | maxlen_answer = 20 112 | 113 | 114 | class AdamW(Optimizer): 115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 116 | epsilon=1e-8, decay=0., **kwargs): 117 | super(AdamW, self).__init__(**kwargs) 118 | with K.name_scope(self.__class__.__name__): 119 | self.iterations = K.variable(0, dtype='int64', name='iterations') 120 | self.lr = K.variable(lr, name='lr') 121 | self.beta_1 = K.variable(beta_1, name='beta_1') 122 | self.beta_2 = K.variable(beta_2, name='beta_2') 123 | self.decay = K.variable(decay, name='decay') 124 | # decoupled weight decay (2/4) 125 | self.wd = K.variable(weight_decay, name='weight_decay') 126 | self.epsilon = epsilon 127 | self.initial_decay = decay 128 | 129 | @interfaces.legacy_get_updates_support 130 | def get_updates(self, loss, params): 131 | grads = self.get_gradients(loss, params) 132 | self.updates = [K.update_add(self.iterations, 1)] 133 | wd = self.wd # decoupled weight decay (3/4) 134 | 135 | lr = self.lr 136 | if self.initial_decay > 0: 137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 138 | K.dtype(self.decay)))) 139 | 140 | t = K.cast(self.iterations, K.floatx()) + 1 141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 142 | (1. - K.pow(self.beta_1, t))) 143 | 144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 146 | self.weights = [self.iterations] + ms + vs 147 | 148 | for p, g, m, v in zip(params, grads, ms, vs): 149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 151 | # decoupled weight decay (4/4) 152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 153 | 154 | self.updates.append(K.update(m, m_t)) 155 | self.updates.append(K.update(v, v_t)) 156 | new_p = p_t 157 | 158 | # Apply constraints. 
159 | if getattr(p, 'constraint', None) is not None: 160 | new_p = p.constraint(new_p) 161 | 162 | self.updates.append(K.update(p, new_p)) 163 | return self.updates 164 | 165 | def get_config(self): 166 | config = {'lr': float(K.get_value(self.lr)), 167 | 'beta_1': float(K.get_value(self.beta_1)), 168 | 'beta_2': float(K.get_value(self.beta_2)), 169 | 'decay': float(K.get_value(self.decay)), 170 | 'weight_decay': float(K.get_value(self.wd)), 171 | 'epsilon': self.epsilon} 172 | base_config = super(AdamW, self).get_config() 173 | return dict(list(base_config.items()) + list(config.items())) 174 | 175 | 176 | class Attention(Layer): 177 | def __init__(self, step_dim, 178 | W_regularizer=None, b_regularizer=None, 179 | W_constraint=None, b_constraint=None, 180 | bias=True, **kwargs): 181 | self.supports_masking = True 182 | self.init = initializers.get('glorot_uniform') 183 | 184 | self.W_regularizer = regularizers.get(W_regularizer) 185 | self.b_regularizer = regularizers.get(b_regularizer) 186 | 187 | self.W_constraint = constraints.get(W_constraint) 188 | self.b_constraint = constraints.get(b_constraint) 189 | 190 | self.bias = bias 191 | self.step_dim = step_dim 192 | self.features_dim = 0 193 | super(Attention, self).__init__(**kwargs) 194 | 195 | def build(self, input_shape): 196 | assert len(input_shape) == 3 197 | 198 | self.W = self.add_weight((input_shape[-1],), 199 | initializer=self.init, 200 | name='{}_W'.format(self.name), 201 | regularizer=self.W_regularizer, 202 | constraint=self.W_constraint) 203 | self.features_dim = input_shape[-1] 204 | 205 | if self.bias: 206 | self.b = self.add_weight((input_shape[1],), 207 | initializer='zero', 208 | name='{}_b'.format(self.name), 209 | regularizer=self.b_regularizer, 210 | constraint=self.b_constraint) 211 | else: 212 | self.b = None 213 | 214 | self.built = True 215 | 216 | def compute_mask(self, input, input_mask=None): 217 | return None 218 | 219 | def call(self, x, mask=None): 220 | features_dim = self.features_dim 221 | step_dim = self.step_dim 222 | 223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 225 | 226 | if self.bias: 227 | eij += self.b 228 | 229 | eij = K.tanh(eij) 230 | 231 | a = K.exp(eij) 232 | 233 | if mask is not None: 234 | a *= K.cast(mask, K.floatx()) 235 | 236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 237 | 238 | a = K.expand_dims(a) 239 | weighted_input = x * a 240 | return K.sum(weighted_input, axis=1) 241 | 242 | def compute_output_shape(self, input_shape): 243 | return input_shape[0], self.features_dim 244 | 245 | # AUC for a binary classifier 246 | 247 | 248 | def auc(y_true, y_pred): 249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 250 | for k in np.linspace(0, 1, 1000)], axis=0) 251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 252 | for k in np.linspace(0, 1, 1000)], axis=0) 253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 254 | binSizes = -(pfas[1:]-pfas[:-1]) 255 | s = ptas*binSizes 256 | return K.sum(s, axis=0) 257 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 258 | # PFA, prob false alert for binary classifier 259 | 260 | 261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 262 | y_pred = K.cast(y_pred >= threshold, 'float32') 263 | # N = total number of negative labels 264 | N = K.sum(1 - y_true) 265 | # FP = total number of false alerts, alerts from the negative 
class labels 266 | FP = K.sum(y_pred - y_pred * y_true) 267 | return FP/N 268 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 269 | # P_TA prob true alerts for binary classifier 270 | 271 | 272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 273 | y_pred = K.cast(y_pred >= threshold, 'float32') 274 | # P = total number of positive labels 275 | P = K.sum(y_true) 276 | # TP = total number of correct alerts, alerts from the positive class labels 277 | TP = K.sum(y_pred * y_true) 278 | return TP/P 279 | 280 | 281 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 282 | "Create embedding layer from a pretrained weights array" 283 | in_dim, out_dim = pretrained_weights.shape 284 | embedding = Embedding(in_dim, out_dim, weights=[ 285 | pretrained_weights], trainable=False, **kwargs) 286 | return embedding 287 | 288 | 289 | def unchanged_shape(input_shape): 290 | "Function for Lambda layer" 291 | return input_shape 292 | 293 | 294 | def substract(input_1, input_2): 295 | "Substract element-wise" 296 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 297 | out_ = Add()([input_1, neg_input_2]) 298 | return out_ 299 | 300 | 301 | def submult(input_1, input_2): 302 | "Get multiplication and subtraction then concatenate results" 303 | mult = Multiply()([input_1, input_2]) 304 | sub = substract(input_1, input_2) 305 | out_ = Concatenate()([sub, mult]) 306 | return out_ 307 | 308 | 309 | def apply_multiple(input_, layers): 310 | "Apply layers to input then concatenate result" 311 | if not len(layers) > 1: 312 | raise ValueError('Layers list should contain more than 1 layer') 313 | else: 314 | agg_ = [] 315 | for layer in layers: 316 | agg_.append(layer(input_)) 317 | out_ = Concatenate()(agg_) 318 | return out_ 319 | 320 | 321 | def time_distributed(input_, layers): 322 | "Apply a list of layers in TimeDistributed mode" 323 | out_ = [] 324 | node_ = input_ 325 | for layer_ in layers: 326 | node_ = TimeDistributed(layer_)(node_) 327 | out_ = node_ 328 | return out_ 329 | 330 | 331 | def soft_attention_alignment(input_1, input_2): 332 | "Align text representation with neural soft attention" 333 | attention = Dot(axes=-1)([input_1, input_2]) 334 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 335 | output_shape=unchanged_shape)(attention) 336 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 337 | output_shape=unchanged_shape)(attention)) 338 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 339 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 340 | return in1_aligned, in2_aligned 341 | 342 | 343 | def decomposable_attention(pretrained_weights, 344 | num_shape, 345 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 346 | compare_dim=500, compare_dropout=0.2, 347 | dense_dim=300, dense_dropout=0.2, 348 | lr=1e-3, activation='elu', maxlen=20): 349 | # Based on: https://arxiv.org/abs/1606.01933 350 | 351 | q1 = Input(name='q1', shape=(maxlen,)) 352 | q2 = Input(name='q2', shape=(maxlen,)) 353 | 354 | # Embedding 355 | embedding = create_pretrained_embedding(pretrained_weights, 356 | mask_zero=False) 357 | q1_embed = embedding(q1) 358 | q2_embed = embedding(q2) 359 | 360 | # Projection 361 | projection_layers = [] 362 | if projection_hidden > 0: 363 | projection_layers.extend([ 364 | Dense(projection_hidden, activation=activation), 365 | Dropout(rate=projection_dropout), 366 | ]) 367 | 
projection_layers.extend([ 368 | Dense(projection_dim, activation=None), 369 | Dropout(rate=projection_dropout), 370 | ]) 371 | q1_encoded = time_distributed(q1_embed, projection_layers) 372 | q2_encoded = time_distributed(q2_embed, projection_layers) 373 | 374 | # Attention 375 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 376 | 377 | # Compare 378 | q1_combined = Concatenate()( 379 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 380 | q2_combined = Concatenate()( 381 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 382 | compare_layers = [ 383 | Dense(compare_dim, activation=activation), 384 | Dropout(compare_dropout), 385 | Dense(compare_dim, activation=activation), 386 | Dropout(compare_dropout), 387 | ] 388 | q1_compare = time_distributed(q1_combined, compare_layers) 389 | q2_compare = time_distributed(q2_combined, compare_layers) 390 | 391 | # Aggregate 392 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 393 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 394 | 395 | # Classifier 396 | merged = Concatenate()([q1_rep, q2_rep]) 397 | dense = BatchNormalization()(merged) 398 | dense = Dense(dense_dim, activation=activation)(dense) 399 | dense = Dropout(dense_dropout)(dense) 400 | dense = BatchNormalization()(dense) 401 | dense = Dense(dense_dim, activation=activation)(dense) 402 | dense = Dropout(dense_dropout)(dense) 403 | out_ = Dense(1, activation='sigmoid')(dense) 404 | 405 | model = Model(inputs=[q1, q2], outputs=out_) 406 | model.compile(loss='binary_crossentropy', 407 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 408 | metrics=["accuracy", auc]) 409 | return model 410 | 411 | 412 | def esim(embedding_matrix, 413 | maxlen=20, 414 | lstm_dim=30, 415 | dense_dim=30, 416 | dense_dropout=0.5): 417 | # Based on arXiv:1609.06038 418 | q1 = Input(name='q1', shape=(8,)) 419 | q2 = Input(name='q2', shape=(20,)) 420 | 421 | # Embedding 422 | embedding = create_pretrained_embedding( 423 | embedding_matrix, mask_zero=False) 424 | bn = BatchNormalization(axis=2) 425 | q1_embed = bn(embedding(q1)) 426 | q2_embed = bn(embedding(q2)) 427 | 428 | # Encode 429 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 430 | q1_encoded = encode(q1_embed) 431 | q2_encoded = encode(q2_embed) 432 | 433 | # Attention 434 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 435 | 436 | # Compose 437 | q1_combined = Concatenate()( 438 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 439 | q2_combined = Concatenate()( 440 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 441 | 442 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 443 | q1_compare = compose(q1_combined) 444 | q2_compare = compose(q2_combined) 445 | 446 | # Aggregate 447 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 448 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 449 | 450 | # leaks_input = Input(shape=(num_shape,)) 451 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 452 | 453 | # Classifier 454 | merged = Concatenate()([q1_rep, q2_rep]) 455 | 456 | dense = BatchNormalization()(merged) 457 | dense = Dense(dense_dim, activation='elu')(dense) 458 | dense = BatchNormalization()(dense) 459 | dense = Dropout(dense_dropout)(dense) 460 | dense = Dense(dense_dim, activation='elu')(dense) 461 | dense = BatchNormalization()(dense) 462 | dense = Dropout(dense_dropout)(dense) 
463 | out_ = Dense(1, activation='sigmoid')(dense) 464 | 465 | model = Model(inputs=[q1, q2], outputs=out_) 466 | model.compile(loss='binary_crossentropy', 467 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 468 | metrics=["accuracy"]) 469 | return model 470 | 471 | 472 | #### Model training 473 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 474 | batch_size=4096, label_tag=True, chunk_size=1000) 475 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 476 | batch_size=4096, label_tag=True, chunk_size=1000) 477 | print("train...") 478 | print("###"*30) 479 | gc.collect() 480 | K.clear_session() 481 | model = esim(embed_matrix) 482 | model.summary() 483 | early_stopping = EarlyStopping( 484 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 485 | reduce_lr = ReduceLROnPlateau( 486 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 487 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 488 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 489 | save_best_only=False, 490 | verbose=1, save_weights_only=True, period=1) 491 | callbacks = [checkpoint, reduce_lr, early_stopping] 492 | # print("load weight....") 493 | 494 | 495 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 496 | epochs=10, verbose=1, callbacks=callbacks, 497 | validation_data=val_gen, validation_steps=int( 498 | np.ceil(1000000/2048)), 499 | max_queue_size=10, workers=1, use_multiprocessing=False) 500 | 501 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 502 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 503 | val_prob = model.predict_generator( 504 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 505 | 506 | f = open('/home/kesci/zhifeng/val.csv','r') 507 | q,a,l=[],[],[] 508 | for line in f: 509 | qid,_,aid,_,label = line.strip().split(',') 510 | q.append(qid) 511 | a.append(aid) 512 | l.append(int(label)) 513 | 514 | val_df = pd.DataFrame({'qid':q,'aid':a,'label':l}) 515 | val_df['prob'] = val_prob.flatten() 516 | 517 | roc_auc_score(val_df['label'], val_df['prob']) 518 | 519 | def perauc(df): 520 | temp=pd.Series() 521 | try: 522 | temp['auc']=roc_auc_score(df['label'],df['prob']) 523 | except: 524 | temp['auc']=0.5 525 | return temp 526 | eval_df=val_df.groupby("qid").apply(perauc) 527 | eval_df.index=range(len(eval_df)) 528 | print("qauc:",eval_df['auc'].mean()) 529 | 530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 531 | batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 532 | prob = model.predict_generator(test_gen,steps=int(np.ceil(20000000/4096)),verbose=1) 533 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',names=['qid','aid','prob']) 534 | sub['prob'] = prob.flatten() 535 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv',index=False,header=False) 536 | 537 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 538 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 539 | prob = model.predict_generator( 540 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 541 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 542 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 543 | final['prob'] = prob.flatten() 544 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 545 | index=False, header=False) 546 |
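# ---------------------------------------------------------------
# Illustrative sketch of the per-query AUC ("qauc") metric reported above:
# ROC-AUC is computed within each query id and then averaged, falling back
# to 0.5 for queries whose labels are all positive or all negative. This is
# a minimal, self-contained version; the qid/label/prob column names mirror
# val_df above, and the toy values are assumptions made purely for illustration.
import pandas as pd
from sklearn.metrics import roc_auc_score

def qauc(df):
    def _one(group):
        try:
            return roc_auc_score(group['label'], group['prob'])
        except ValueError:  # only one class present for this query
            return 0.5
    return df.groupby('qid').apply(_one).mean()

toy = pd.DataFrame({'qid':   ['q1', 'q1', 'q1', 'q2', 'q2'],
                    'label': [1, 0, 0, 1, 0],
                    'prob':  [0.9, 0.2, 0.4, 0.3, 0.6]})
print(qauc(toy))  # q1 -> 1.0, q2 -> 0.0, qauc -> 0.5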
-------------------------------------------------------------------------------- /bigtrain_w2v_rnn.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 37 | 38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 40 | 41 | 42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 43 | maxlen_query=8): 44 | if label_tag: 45 | _, _q, _, _a, _label = line.strip().split(',') 46 | else: 47 | _, _q, _, _a = line.strip().split(',') 48 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 49 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 52 | if label_tag: 53 | return q_pad, a_pad, int(_label) 54 | return q_pad, a_pad 55 | 56 | 57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 58 | while True: 59 | fin = open(path, 'r') 60 | batch_q, batch_a, batch_label = [], [], [] 61 | for line in fin: 62 | if len(batch_q) == chunk_size*batch_size: 63 | batch_q = np.array(batch_q) 64 | batch_a = np.array(batch_a) 65 | if label_tag: 66 | batch_label = np.array(batch_label) 67 | idx = list(range(chunk_size*batch_size)) 68 | if shuffle: 69 | np.random.shuffle(idx) 70 | for i in range(chunk_size): 71 | if label_tag: 72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 73 | else: 74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 75 | batch_q, batch_a, batch_label = [], [], [] 76 | if label_tag: 77 | q, a, l = gen_feature_help(line, label_tag=label_tag) 78 | else: 79 | q, a = gen_feature_help(line, label_tag=label_tag) 80 | l = 0 81 | batch_q.append(q) 82 | batch_a.append(a) 83 | if label_tag: 84 | batch_label.append(l) 85 | 86 | batch_q = np.array(batch_q) 87 | batch_a = np.array(batch_a) 88 | 89 | if label_tag: 90 | batch_label = np.array(batch_label) 91 | idx = 
list(range(len(batch_q))) 92 | if shuffle: 93 | np.random.shuffle(idx) 94 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 95 | if label_tag: 96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 97 | else: 98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 99 | fin.close() 100 | 101 | 102 | def get_embedding_matrix(): 103 | m = np.zeros(shape=(len(index2word)+1, 300)) 104 | for i, w in index2word.items(): 105 | m[i, :] = w2v[w] 106 | return m 107 | 108 | 109 | embed_matrix = get_embedding_matrix() 110 | maxlen_query = 8 111 | maxlen_answer = 20 112 | 113 | 114 | class AdamW(Optimizer): 115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 116 | epsilon=1e-8, decay=0., **kwargs): 117 | super(AdamW, self).__init__(**kwargs) 118 | with K.name_scope(self.__class__.__name__): 119 | self.iterations = K.variable(0, dtype='int64', name='iterations') 120 | self.lr = K.variable(lr, name='lr') 121 | self.beta_1 = K.variable(beta_1, name='beta_1') 122 | self.beta_2 = K.variable(beta_2, name='beta_2') 123 | self.decay = K.variable(decay, name='decay') 124 | # decoupled weight decay (2/4) 125 | self.wd = K.variable(weight_decay, name='weight_decay') 126 | self.epsilon = epsilon 127 | self.initial_decay = decay 128 | 129 | @interfaces.legacy_get_updates_support 130 | def get_updates(self, loss, params): 131 | grads = self.get_gradients(loss, params) 132 | self.updates = [K.update_add(self.iterations, 1)] 133 | wd = self.wd # decoupled weight decay (3/4) 134 | 135 | lr = self.lr 136 | if self.initial_decay > 0: 137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 138 | K.dtype(self.decay)))) 139 | 140 | t = K.cast(self.iterations, K.floatx()) + 1 141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 142 | (1. - K.pow(self.beta_1, t))) 143 | 144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 146 | self.weights = [self.iterations] + ms + vs 147 | 148 | for p, g, m, v in zip(params, grads, ms, vs): 149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 151 | # decoupled weight decay (4/4) 152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 153 | 154 | self.updates.append(K.update(m, m_t)) 155 | self.updates.append(K.update(v, v_t)) 156 | new_p = p_t 157 | 158 | # Apply constraints. 
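# (If a Keras weight constraint such as constraints.max_norm is attached to the parameter, it is re-applied to the freshly updated value below before the update is committed.)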
159 | if getattr(p, 'constraint', None) is not None: 160 | new_p = p.constraint(new_p) 161 | 162 | self.updates.append(K.update(p, new_p)) 163 | return self.updates 164 | 165 | def get_config(self): 166 | config = {'lr': float(K.get_value(self.lr)), 167 | 'beta_1': float(K.get_value(self.beta_1)), 168 | 'beta_2': float(K.get_value(self.beta_2)), 169 | 'decay': float(K.get_value(self.decay)), 170 | 'weight_decay': float(K.get_value(self.wd)), 171 | 'epsilon': self.epsilon} 172 | base_config = super(AdamW, self).get_config() 173 | return dict(list(base_config.items()) + list(config.items())) 174 | 175 | 176 | class Attention(Layer): 177 | def __init__(self, step_dim, 178 | W_regularizer=None, b_regularizer=None, 179 | W_constraint=None, b_constraint=None, 180 | bias=True, **kwargs): 181 | self.supports_masking = True 182 | self.init = initializers.get('glorot_uniform') 183 | 184 | self.W_regularizer = regularizers.get(W_regularizer) 185 | self.b_regularizer = regularizers.get(b_regularizer) 186 | 187 | self.W_constraint = constraints.get(W_constraint) 188 | self.b_constraint = constraints.get(b_constraint) 189 | 190 | self.bias = bias 191 | self.step_dim = step_dim 192 | self.features_dim = 0 193 | super(Attention, self).__init__(**kwargs) 194 | 195 | def build(self, input_shape): 196 | assert len(input_shape) == 3 197 | 198 | self.W = self.add_weight((input_shape[-1],), 199 | initializer=self.init, 200 | name='{}_W'.format(self.name), 201 | regularizer=self.W_regularizer, 202 | constraint=self.W_constraint) 203 | self.features_dim = input_shape[-1] 204 | 205 | if self.bias: 206 | self.b = self.add_weight((input_shape[1],), 207 | initializer='zero', 208 | name='{}_b'.format(self.name), 209 | regularizer=self.b_regularizer, 210 | constraint=self.b_constraint) 211 | else: 212 | self.b = None 213 | 214 | self.built = True 215 | 216 | def compute_mask(self, input, input_mask=None): 217 | return None 218 | 219 | def call(self, x, mask=None): 220 | features_dim = self.features_dim 221 | step_dim = self.step_dim 222 | 223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 225 | 226 | if self.bias: 227 | eij += self.b 228 | 229 | eij = K.tanh(eij) 230 | 231 | a = K.exp(eij) 232 | 233 | if mask is not None: 234 | a *= K.cast(mask, K.floatx()) 235 | 236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 237 | 238 | a = K.expand_dims(a) 239 | weighted_input = x * a 240 | return K.sum(weighted_input, axis=1) 241 | 242 | def compute_output_shape(self, input_shape): 243 | return input_shape[0], self.features_dim 244 | 245 | # AUC for a binary classifier 246 | 247 | 248 | def auc(y_true, y_pred): 249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 250 | for k in np.linspace(0, 1, 1000)], axis=0) 251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 252 | for k in np.linspace(0, 1, 1000)], axis=0) 253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 254 | binSizes = -(pfas[1:]-pfas[:-1]) 255 | s = ptas*binSizes 256 | return K.sum(s, axis=0) 257 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 258 | # PFA, prob false alert for binary classifier 259 | 260 | 261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 262 | y_pred = K.cast(y_pred >= threshold, 'float32') 263 | # N = total number of negative labels 264 | N = K.sum(1 - y_true) 265 | # FP = total number of false alerts, alerts from the negative 
class labels 266 | FP = K.sum(y_pred - y_pred * y_true) 267 | return FP/N 268 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 269 | # P_TA prob true alerts for binary classifier 270 | 271 | 272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 273 | y_pred = K.cast(y_pred >= threshold, 'float32') 274 | # P = total number of positive labels 275 | P = K.sum(y_true) 276 | # TP = total number of correct alerts, alerts from the positive class labels 277 | TP = K.sum(y_pred * y_true) 278 | return TP/P 279 | 280 | 281 | def get_model(embedding_matrix): 282 | 283 | K.clear_session() 284 | #The embedding layer containing the word vectors 285 | emb_layer = Embedding( 286 | input_dim=embedding_matrix.shape[0], 287 | output_dim=embedding_matrix.shape[1], 288 | weights=[embedding_matrix], 289 | trainable=False 290 | ) 291 | sdrop = SpatialDropout1D(rate=0.2) 292 | lstm_layer = Bidirectional(CuDNNLSTM(40, return_sequences=True, 293 | kernel_initializer=glorot_uniform(seed=123))) 294 | gru_layer = Bidirectional(CuDNNGRU(40, return_sequences=True, 295 | kernel_initializer=glorot_uniform(seed=123))) 296 | 297 | cnn1d_layer = keras.layers.Conv1D( 298 | 40, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 299 | 300 | # Define inputs 301 | seq1 = Input(shape=(maxlen_query,)) 302 | x1 = emb_layer(seq1) 303 | x1 = sdrop(x1) 304 | lstm1 = lstm_layer(x1) 305 | gru1 = gru_layer(lstm1) 306 | att_1 = Attention(maxlen_query)(lstm1) 307 | att_3 = Attention(maxlen_query)(gru1) 308 | cnn1 = cnn1d_layer(lstm1) 309 | 310 | avg_pool = GlobalAveragePooling1D() 311 | max_pool = GlobalMaxPooling1D() 312 | 313 | seq2 = Input(shape=(maxlen_answer,)) 314 | x2 = emb_layer(seq2) 315 | x2 = sdrop(x2) 316 | lstm2 = lstm_layer(x2) 317 | gru2 = gru_layer(lstm2) 318 | att_2 = Attention(maxlen_answer)(lstm2) 319 | att_4 = Attention(maxlen_answer)(gru2) 320 | cnn2 = cnn1d_layer(lstm2) 321 | 322 | x1 = concatenate([att_1, att_3, avg_pool(cnn1), max_pool( 323 | cnn1), avg_pool(gru1), max_pool(gru1)]) 324 | x2 = concatenate([att_2, att_4, avg_pool(cnn2), max_pool( 325 | cnn2), avg_pool(gru2), max_pool(gru2)]) 326 | 327 | merge = Multiply()([x1, x2]) 328 | merge = Dropout(0.2)(merge) 329 | 330 | # htime = Dense(col_len,activation='relu')(hin) 331 | # The MLP that determines the outcome 332 | x = Dense(40, kernel_initializer=he_uniform( 333 | seed=123), activation='relu',)(merge) 334 | # x = Dropout(0.2)(x) 335 | # x = BatchNormalization()(x) 336 | 337 | pred = Dense(1, kernel_initializer=he_uniform( 338 | seed=123), activation='sigmoid')(x) 339 | 340 | model = Model(inputs=[seq1, seq2], outputs=pred) 341 | 342 | model.compile(loss='binary_crossentropy', 343 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 344 | metrics=["accuracy"]) 345 | # model.summary() 346 | return model 347 | 348 | #### Model training 349 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 350 | batch_size=4096, label_tag=True, chunk_size=1000) 351 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 352 | batch_size=4096, label_tag=True, chunk_size=1000) 353 | print("train...") 354 | print("###"*30) 355 | gc.collect() 356 | K.clear_session() 357 | model = get_model(embed_matrix) 358 | model.summary() 359 | early_stopping = EarlyStopping( 360 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 361 | reduce_lr = ReduceLROnPlateau( 362 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 363 |
bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 364 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 365 | save_best_only=False, 366 | verbose=1, save_weights_only=True, period=1) 367 | callbacks = [checkpoint, reduce_lr, early_stopping] 368 | # print("load weight....") 369 | 370 | 371 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 372 | epochs=10, verbose=1, callbacks=callbacks, 373 | validation_data=val_gen, validation_steps=int( 374 | np.ceil(1000000/2048)), 375 | max_queue_size=10, workers=1, use_multiprocessing=False) 376 | 377 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 378 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 379 | val_prob = model.predict_generator( 380 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 381 | 382 | f = open('/home/kesci/zhifeng/val.csv', 'r') 383 | q, a, l = [], [], [] 384 | for line in f: 385 | qid, _, aid, _, label = line.strip().split(',') 386 | q.append(qid) 387 | a.append(aid) 388 | l.append(int(label)) 389 | 390 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 391 | val_df['prob'] = val_prob.flatten() 392 | 393 | roc_auc_score(val_df['label'], val_df['prob']) 394 | 395 | 396 | def perauc(df): 397 | temp = pd.Series() 398 | try: 399 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 400 | except: 401 | temp['auc'] = 0.5 402 | return temp 403 | 404 | 405 | eval_df = val_df.groupby("qid").apply(perauc) 406 | eval_df.index = range(len(eval_df)) 407 | print("qauc:", eval_df['auc'].mean()) 408 | 409 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 410 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 411 | prob = model.predict_generator( 412 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 413 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 414 | names=['qid', 'aid', 'prob']) 415 | sub['prob'] = prob.flatten() 416 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv', 417 | index=False, header=False) 418 | 419 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 420 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 421 | prob = model.predict_generator( 422 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 423 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 424 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 425 | final['prob'] = prob.flatten() 426 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 427 | index=False, header=False) 428 | -------------------------------------------------------------------------------- /chizhu_rnn.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 
19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | tqdm.pandas() 28 | np.random.seed(1017) 29 | rn.seed(1017) 30 | tf.set_random_seed(1017) 31 | path = "/home/kesci/input/bytedance/" 32 | out = '/home/kesci/work/chizhu/' 33 | print(os.listdir(path)) 34 | 35 | f1 = pd.read_csv(out + 'f1.csv') 36 | f2 = pd.read_csv(out + 'f2.csv') 37 | f3 = pd.read_csv(out + 'f3.csv') 38 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 39 | del f1, f2, f3 40 | gc.collect() 41 | 42 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 43 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 44 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 45 | testb_w2v = pd.read_pickle( 46 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 47 | feature['w2v_cos'] = list(train_w2v)+list(testa_w2v)+list(testb_w2v) 48 | 49 | train_w2v = pd.read_pickle( 50 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 51 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 52 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 53 | testb_w2v = pd.read_pickle( 54 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 55 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 56 | list(testa_w2v)+list(testb_w2v) 57 | del train_w2v, val_w2v, testa_w2v, testb_w2v 58 | gc.collect() 59 | feature.shape 60 | 61 | len_train = 99000000 62 | len_val = 1000000 63 | len_testa = 20000000 64 | len_testb = 100000000 65 | sc = StandardScaler() 66 | feature = sc.fit_transform(feature) 67 | train_feature = feature[:len_train] 68 | val_feature = feature[len_train:len_train+len_val] 69 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 70 | testb_feature = feature[-len_testb:] 71 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape) 72 | 73 | del feature 74 | gc.collect() 75 | 76 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 77 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 78 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 79 | 80 | 81 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 82 | maxlen_query=8): 83 | if label_tag: 84 | _, _q, _, _a, _label = line.strip().split(',') 85 | else: 86 | _, _q, _, _a = line.strip().split(',') 87 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 88 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 89 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 90 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 91 | if label_tag: 92 | return q_pad, a_pad, int(_label) 93 | return q_pad, a_pad 94 | 95 | 96 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 97 | while True: 98 | fin = open(path, 'r') 99 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 100 | for i, line in enumerate(fin): 101 | if len(batch_q) == chunk_size*batch_size: 102 | batch_q = np.array(batch_q) 103 | batch_a = np.array(batch_a) 104 | batch_f = np.array(batch_f) 105 | if label_tag: 106 | batch_label = np.array(batch_label) 107 | idx = list(range(chunk_size*batch_size)) 108 | if shuffle: 109 | 
np.random.shuffle(idx) 110 | for i in range(chunk_size): 111 | if label_tag: 112 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 113 | np.array( 114 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 115 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 116 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 117 | else: 118 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 119 | np.array( 120 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 121 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 122 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 123 | if label_tag: 124 | q, a, l = gen_feature_help(line, label_tag=label_tag) 125 | else: 126 | q, a = gen_feature_help(line, label_tag=label_tag) 127 | l = 0 128 | batch_q.append(q) 129 | batch_a.append(a) 130 | batch_f.append(feature[i]) 131 | if label_tag: 132 | batch_label.append(l) 133 | 134 | batch_q = np.array(batch_q) 135 | batch_a = np.array(batch_a) 136 | batch_f = np.array(batch_f) 137 | 138 | if label_tag: 139 | batch_label = np.array(batch_label) 140 | idx = list(range(len(batch_q))) 141 | if shuffle: 142 | np.random.shuffle(idx) 143 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 144 | if label_tag: 145 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 146 | np.array( 147 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 148 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 149 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 150 | else: 151 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 152 | np.array( 153 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 154 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 155 | fin.close() 156 | 157 | 158 | def get_embedding_matrix(): 159 | m = np.zeros(shape=(len(index2word)+1, 300)) 160 | for i, w in index2word.items(): 161 | m[i, :] = w2v[w] 162 | return m 163 | 164 | 165 | embed_matrix = get_embedding_matrix() 166 | maxlen_query = 8 167 | maxlen_answer = 20 168 | 169 | 170 | class AdamW(Optimizer): 171 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 172 | epsilon=1e-8, decay=0., **kwargs): 173 | super(AdamW, self).__init__(**kwargs) 174 | with K.name_scope(self.__class__.__name__): 175 | self.iterations = K.variable(0, dtype='int64', name='iterations') 176 | self.lr = K.variable(lr, name='lr') 177 | self.beta_1 = K.variable(beta_1, name='beta_1') 178 | self.beta_2 = K.variable(beta_2, name='beta_2') 179 | self.decay = K.variable(decay, name='decay') 180 | # decoupled weight decay (2/4) 181 | self.wd = K.variable(weight_decay, name='weight_decay') 182 | self.epsilon = epsilon 183 | self.initial_decay = decay 184 | 185 | @interfaces.legacy_get_updates_support 186 | def get_updates(self, loss, params): 187 | grads = self.get_gradients(loss, params) 188 | self.updates = [K.update_add(self.iterations, 1)] 189 | wd = self.wd # decoupled weight decay (3/4) 190 | 191 | lr = self.lr 192 | if self.initial_decay > 0: 193 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 194 | K.dtype(self.decay)))) 195 | 196 | t = K.cast(self.iterations, K.floatx()) + 1 197 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 198 | (1. 
- K.pow(self.beta_1, t))) 199 | 200 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 201 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 202 | self.weights = [self.iterations] + ms + vs 203 | 204 | for p, g, m, v in zip(params, grads, ms, vs): 205 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 206 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 207 | # decoupled weight decay (4/4) 208 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 209 | 210 | self.updates.append(K.update(m, m_t)) 211 | self.updates.append(K.update(v, v_t)) 212 | new_p = p_t 213 | 214 | # Apply constraints. 215 | if getattr(p, 'constraint', None) is not None: 216 | new_p = p.constraint(new_p) 217 | 218 | self.updates.append(K.update(p, new_p)) 219 | return self.updates 220 | 221 | def get_config(self): 222 | config = {'lr': float(K.get_value(self.lr)), 223 | 'beta_1': float(K.get_value(self.beta_1)), 224 | 'beta_2': float(K.get_value(self.beta_2)), 225 | 'decay': float(K.get_value(self.decay)), 226 | 'weight_decay': float(K.get_value(self.wd)), 227 | 'epsilon': self.epsilon} 228 | base_config = super(AdamW, self).get_config() 229 | return dict(list(base_config.items()) + list(config.items())) 230 | 231 | 232 | class Attention(Layer): 233 | def __init__(self, step_dim, 234 | W_regularizer=None, b_regularizer=None, 235 | W_constraint=None, b_constraint=None, 236 | bias=True, **kwargs): 237 | self.supports_masking = True 238 | self.init = initializers.get('glorot_uniform') 239 | 240 | self.W_regularizer = regularizers.get(W_regularizer) 241 | self.b_regularizer = regularizers.get(b_regularizer) 242 | 243 | self.W_constraint = constraints.get(W_constraint) 244 | self.b_constraint = constraints.get(b_constraint) 245 | 246 | self.bias = bias 247 | self.step_dim = step_dim 248 | self.features_dim = 0 249 | super(Attention, self).__init__(**kwargs) 250 | 251 | def build(self, input_shape): 252 | assert len(input_shape) == 3 253 | 254 | self.W = self.add_weight((input_shape[-1],), 255 | initializer=self.init, 256 | name='{}_W'.format(self.name), 257 | regularizer=self.W_regularizer, 258 | constraint=self.W_constraint) 259 | self.features_dim = input_shape[-1] 260 | 261 | if self.bias: 262 | self.b = self.add_weight((input_shape[1],), 263 | initializer='zero', 264 | name='{}_b'.format(self.name), 265 | regularizer=self.b_regularizer, 266 | constraint=self.b_constraint) 267 | else: 268 | self.b = None 269 | 270 | self.built = True 271 | 272 | def compute_mask(self, input, input_mask=None): 273 | return None 274 | 275 | def call(self, x, mask=None): 276 | features_dim = self.features_dim 277 | step_dim = self.step_dim 278 | 279 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 280 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 281 | 282 | if self.bias: 283 | eij += self.b 284 | 285 | eij = K.tanh(eij) 286 | 287 | a = K.exp(eij) 288 | 289 | if mask is not None: 290 | a *= K.cast(mask, K.floatx()) 291 | 292 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 293 | 294 | a = K.expand_dims(a) 295 | weighted_input = x * a 296 | return K.sum(weighted_input, axis=1) 297 | 298 | def compute_output_shape(self, input_shape): 299 | return input_shape[0], self.features_dim 300 | # AUC for a binary classifier 301 | 302 | 303 | def auc(y_true, y_pred): 304 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 305 | for k in np.linspace(0, 1, 1000)], axis=0) 306 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 307 | for k in np.linspace(0, 1, 
1000)], axis=0) 308 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 309 | binSizes = -(pfas[1:]-pfas[:-1]) 310 | s = ptas*binSizes 311 | return K.sum(s, axis=0) 312 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 313 | # PFA, prob false alert for binary classifier 314 | 315 | 316 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 317 | y_pred = K.cast(y_pred >= threshold, 'float32') 318 | # N = total number of negative labels 319 | N = K.sum(1 - y_true) 320 | # FP = total number of false alerts, alerts from the negative class labels 321 | FP = K.sum(y_pred - y_pred * y_true) 322 | return FP/N 323 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 324 | # P_TA prob true alerts for binary classifier 325 | 326 | 327 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 328 | y_pred = K.cast(y_pred >= threshold, 'float32') 329 | # P = total number of positive labels 330 | P = K.sum(y_true) 331 | # TP = total number of correct alerts, alerts from the positive class labels 332 | TP = K.sum(y_pred * y_true) 333 | return TP/P 334 | 335 | 336 | class Lookahead(object): 337 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 338 | """ 339 | 340 | def __init__(self, k=5, alpha=0.5): 341 | self.k = k 342 | self.alpha = alpha 343 | self.count = 0 344 | 345 | def inject(self, model): 346 | """Inject the Lookahead algorithm for the given model. 347 | The following code is modified from keras's _make_train_function method. 348 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 349 | """ 350 | if not hasattr(model, 'train_function'): 351 | raise RuntimeError('You must compile your model before using it.') 352 | 353 | model._check_trainable_weights_consistency() 354 | 355 | if model.train_function is None: 356 | inputs = (model._feed_inputs + 357 | model._feed_targets + 358 | model._feed_sample_weights) 359 | if model._uses_dynamic_learning_phase(): 360 | inputs += [K.learning_phase()] 361 | fast_params = model._collected_trainable_weights 362 | 363 | with K.name_scope('training'): 364 | with K.name_scope(model.optimizer.__class__.__name__): 365 | training_updates = model.optimizer.get_updates( 366 | params=fast_params, 367 | loss=model.total_loss) 368 | slow_params = [K.variable(p) for p in fast_params] 369 | fast_updates = (model.updates + 370 | training_updates + 371 | model.metrics_updates) 372 | 373 | slow_updates, copy_updates = [], [] 374 | for p, q in zip(fast_params, slow_params): 375 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 376 | copy_updates.append(K.update(p, q)) 377 | 378 | # Gets loss and metrics. Updates weights at each call. 
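# (Lookahead bookkeeping: fast_train_function below performs the ordinary optimizer step; the F wrapper counts calls and, every k-th step, moves the slow weights alpha of the way toward the fast weights and then copies the result back into the fast weights.)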
379 | fast_train_function = K.function( 380 | inputs, 381 | [model.total_loss] + model.metrics_tensors, 382 | updates=fast_updates, 383 | name='fast_train_function', 384 | **model._function_kwargs) 385 | 386 | def F(inputs): 387 | self.count += 1 388 | R = fast_train_function(inputs) 389 | if self.count % self.k == 0: 390 | K.batch_get_value(slow_updates) 391 | K.batch_get_value(copy_updates) 392 | return R 393 | 394 | model.train_function = F 395 | def get_model(embedding_matrix): 396 | 397 | K.clear_session() 398 | #The embedding layer containing the word vectors 399 | emb_layer = Embedding( 400 | input_dim=embedding_matrix.shape[0], 401 | output_dim=embedding_matrix.shape[1], 402 | weights=[embedding_matrix], 403 | trainable=False 404 | ) 405 | sdrop=SpatialDropout1D(rate=0.2) 406 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 407 | kernel_initializer=glorot_uniform(seed = 123))) 408 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 409 | kernel_initializer=glorot_uniform(seed = 123))) 410 | 411 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 412 | 413 | # Define inputs 414 | seq1 = Input(shape=(maxlen_query,)) 415 | x1 = emb_layer(seq1) 416 | x1 = sdrop(x1) 417 | lstm1 = lstm_layer(x1) 418 | gru1 = gru_layer(lstm1) 419 | att_1 = Attention(maxlen_query)(lstm1) 420 | att_3 = Attention(maxlen_query)(gru1) 421 | cnn1 = cnn1d_layer(lstm1) 422 | 423 | avg_pool = GlobalAveragePooling1D() 424 | max_pool = GlobalMaxPooling1D() 425 | 426 | seq2 = Input(shape=(maxlen_answer,)) 427 | x2 = emb_layer(seq2) 428 | x2 = sdrop(x2) 429 | lstm2 = lstm_layer(x2) 430 | gru2 = gru_layer(lstm2) 431 | att_2 = Attention(maxlen_answer)(lstm2) 432 | att_4 = Attention(maxlen_answer)(gru2) 433 | cnn2 = cnn1d_layer(lstm2) 434 | 435 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 436 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 437 | 438 | merge = Multiply()([x1, x2]) 439 | merge = Dropout(0.2)(merge) 440 | 441 | hin = Input(shape=(19,)) 442 | # htime = Dense(col_len,activation='relu')(hin) 443 | x = Concatenate()([merge,hin]) 444 | # The MLP that determines the outcome 445 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 446 | # x = Dropout(0.2)(x) 447 | # x = BatchNormalization()(x) 448 | 449 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 450 | 451 | 452 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 453 | 454 | model.compile(loss='binary_crossentropy', 455 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 456 | metrics=["accuracy",auc]) 457 | # model.summary() 458 | return model 459 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv',feature=train_feature,batch_size=2048, 460 | label_tag=True,chunk_size=5000) 461 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',feature=val_feature,batch_size=2048, 462 | label_tag=True,chunk_size=5000) 463 | print("train...") 464 | print("###"*30) 465 | gc.collect() 466 | K.clear_session() 467 | model = get_model(embed_matrix) 468 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 469 | lookahead.inject(model) # add into model 470 | model.summary() 471 | early_stopping = EarlyStopping(monitor='val_loss',min_delta=0.0001, patience=2, mode='min', verbose=1) 472 | reduce_lr = ReduceLROnPlateau( 473 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 474 | bst_model_path = 
out+'chizhurnn_chizhu_weight.h5' 475 | checkpoint = ModelCheckpoint(bst_model_path , monitor='val_loss', mode='min', 476 | save_best_only=True, verbose=1,save_weights_only=True ) 477 | callbacks = [checkpoint,reduce_lr,early_stopping] 478 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 479 | epochs=10, verbose=1, callbacks=callbacks, 480 | validation_data=val_gen, validation_steps = int(np.ceil(1000000/2048)), 481 | max_queue_size=10, workers=1, use_multiprocessing=False) 482 | 483 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 484 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 485 | val_prob = model.predict_generator( 486 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 487 | 488 | f = open('/home/kesci/zhifeng/val.csv', 'r') 489 | q, a, l = [], [], [] 490 | for line in f: 491 | qid, _, aid, _, label = line.strip().split(',') 492 | q.append(qid) 493 | a.append(aid) 494 | l.append(int(label)) 495 | 496 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 497 | val_df['prob'] = val_prob.flatten() 498 | 499 | 500 | def perauc(df): 501 | temp = pd.Series() 502 | try: 503 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 504 | except: 505 | temp['auc'] = 0.5 506 | return temp 507 | 508 | 509 | eval_df = val_df.groupby("qid").apply(perauc) 510 | eval_df.index = range(len(eval_df)) 511 | print("qauc:", eval_df['auc'].mean()) 512 | 513 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 514 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 515 | prob = model.predict_generator( 516 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 517 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 518 | names=['qid', 'aid', 'prob']) 519 | sub['prob'] = prob.flatten() 520 | sub.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testa.csv', 521 | index=False, header=False) 522 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 523 | feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 524 | prob = model.predict_generator( 525 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 526 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 527 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 528 | final['prob'] = prob.flatten() 529 | final.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testb.csv', 530 | index=False, header=False) 531 | -------------------------------------------------------------------------------- /fasttext_cos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | import random as rn 5 | from tqdm import tqdm, tqdm_notebook 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 8 | import gc 9 | import time 10 | from gensim.models import Word2Vec 11 | import fasttext 12 | from gensim.models import Word2Vec 13 | import scipy.spatial.distance as ssd 14 | tqdm.pandas() 15 | input_path = "/home/kesci/input/bytedance/" 16 | out_work_path = '/home/kesci/work/zhifeng/' 17 | out_path = '/home/kesci/zhifeng/' 18 | 19 | w2v = fasttext.load_model(out_work_path+'corpus.fasttext.model') 20 | train_cosine_list = [] 21 | with open(out_path+'train.smaller.csv', 'r') as fin: 22 | for line in tqdm(fin): 23 | _, q, _, a, _ = line.strip().split(',') 24 | v1 = 
w2v.get_sentence_vector(q) 25 | v2 = w2v.get_sentence_vector(a) 26 | train_cosine_list.append(ssd.cosine(v1, v2)) 27 | pd.to_pickle(np.array(train_cosine_list), 28 | out_work_path+'train.cosine.fasttext.pkl') 29 | val_cosine_list = [] 30 | with open(out_path+'val.csv', 'r') as fin: 31 | for line in tqdm(fin): 32 | _, q, _, a, _ = line.strip().split(',') 33 | v1 = w2v.get_sentence_vector(q) 34 | v2 = w2v.get_sentence_vector(a) 35 | val_cosine_list.append(ssd.cosine(v1, v2)) 36 | pd.to_pickle(np.array(val_cosine_list), 37 | out_work_path+'val.cosine.fasttext.pkl') 38 | test_cosine_list = [] 39 | with open(input_path+'test_final_part1.csv', 'r') as fin: 40 | for line in tqdm(fin): 41 | _, q, _, a = line.strip().split(',') 42 | v1 = w2v.get_sentence_vector(q) 43 | v2 = w2v.get_sentence_vector(a) 44 | test_cosine_list.append(ssd.cosine(v1, v2)) 45 | pd.to_pickle(np.array(test_cosine_list), 46 | out_work_path+'test.cosine.fasttext.pkl') 47 | -------------------------------------------------------------------------------- /finetuning_fasttext_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | 
gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape) 74 | 75 | del feature 76 | gc.collect() 77 | 78 | w2v = fasttext.load_model(out+'corpus.fasttext.model') 79 | word2index = {word: index+1 for index, word in enumerate(w2v.words)} 80 | index2word = {index+1: word for index, word in enumerate(w2v.words)} 81 | 82 | 83 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 84 | maxlen_query=8): 85 | if label_tag: 86 | _, _q, _, _a, _label = line.strip().split(',') 87 | else: 88 | _, _q, _, _a = line.strip().split(',') 89 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 90 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 91 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 92 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 93 | if label_tag: 94 | return q_pad, a_pad, int(_label) 95 | return q_pad, a_pad 96 | 97 | 98 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 99 | while True: 100 | fin = open(path, 'r') 101 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 102 | for i, line in enumerate(fin): 103 | if len(batch_q) == chunk_size*batch_size: 104 | batch_q = np.array(batch_q) 105 | batch_a = np.array(batch_a) 106 | batch_f = np.array(batch_f) 107 | if label_tag: 108 | batch_label = np.array(batch_label) 109 | idx = list(range(chunk_size*batch_size)) 110 | if shuffle: 111 | np.random.shuffle(idx) 112 | for i in range(chunk_size): 113 | if label_tag: 114 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 115 | np.array( 116 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 117 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 118 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 119 | else: 120 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 121 | np.array( 122 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 123 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 124 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 125 | if label_tag: 126 | q, a, l = gen_feature_help(line, label_tag=label_tag) 127 | else: 128 | q, a = gen_feature_help(line, label_tag=label_tag) 129 | l = 0 130 | batch_q.append(q) 131 | batch_a.append(a) 132 | batch_f.append(feature[i]) 133 | if label_tag: 134 | batch_label.append(l) 135 | 136 | batch_q = np.array(batch_q) 137 | batch_a = np.array(batch_a) 138 | batch_f = np.array(batch_f) 139 | 140 | if label_tag: 141 | batch_label = np.array(batch_label) 142 | idx = list(range(len(batch_q))) 143 | if shuffle: 144 | np.random.shuffle(idx) 145 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 146 | if label_tag: 147 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 148 | np.array( 149 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 150 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 151 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 152 | else: 153 | yield 
[np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 154 | np.array( 155 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 156 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 157 | fin.close() 158 | 159 | 160 | def get_embedding_matrix(): 161 | m = np.zeros(shape=(len(index2word)+1, 100)) 162 | for i, w in index2word.items(): 163 | m[i, :] = w2v[w] 164 | return m 165 | 166 | 167 | embed_matrix = get_embedding_matrix() 168 | maxlen_query = 8 169 | maxlen_answer = 20 170 | 171 | 172 | class AdamW(Optimizer): 173 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 174 | epsilon=1e-8, decay=0., **kwargs): 175 | super(AdamW, self).__init__(**kwargs) 176 | with K.name_scope(self.__class__.__name__): 177 | self.iterations = K.variable(0, dtype='int64', name='iterations') 178 | self.lr = K.variable(lr, name='lr') 179 | self.beta_1 = K.variable(beta_1, name='beta_1') 180 | self.beta_2 = K.variable(beta_2, name='beta_2') 181 | self.decay = K.variable(decay, name='decay') 182 | # decoupled weight decay (2/4) 183 | self.wd = K.variable(weight_decay, name='weight_decay') 184 | self.epsilon = epsilon 185 | self.initial_decay = decay 186 | 187 | @interfaces.legacy_get_updates_support 188 | def get_updates(self, loss, params): 189 | grads = self.get_gradients(loss, params) 190 | self.updates = [K.update_add(self.iterations, 1)] 191 | wd = self.wd # decoupled weight decay (3/4) 192 | 193 | lr = self.lr 194 | if self.initial_decay > 0: 195 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 196 | K.dtype(self.decay)))) 197 | 198 | t = K.cast(self.iterations, K.floatx()) + 1 199 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 200 | (1. - K.pow(self.beta_1, t))) 201 | 202 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 203 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 204 | self.weights = [self.iterations] + ms + vs 205 | 206 | for p, g, m, v in zip(params, grads, ms, vs): 207 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 208 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 209 | # decoupled weight decay (4/4) 210 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 211 | 212 | self.updates.append(K.update(m, m_t)) 213 | self.updates.append(K.update(v, v_t)) 214 | new_p = p_t 215 | 216 | # Apply constraints. 
217 | if getattr(p, 'constraint', None) is not None: 218 | new_p = p.constraint(new_p) 219 | 220 | self.updates.append(K.update(p, new_p)) 221 | return self.updates 222 | 223 | def get_config(self): 224 | config = {'lr': float(K.get_value(self.lr)), 225 | 'beta_1': float(K.get_value(self.beta_1)), 226 | 'beta_2': float(K.get_value(self.beta_2)), 227 | 'decay': float(K.get_value(self.decay)), 228 | 'weight_decay': float(K.get_value(self.wd)), 229 | 'epsilon': self.epsilon} 230 | base_config = super(AdamW, self).get_config() 231 | return dict(list(base_config.items()) + list(config.items())) 232 | 233 | 234 | class Attention(Layer): 235 | def __init__(self, step_dim, 236 | W_regularizer=None, b_regularizer=None, 237 | W_constraint=None, b_constraint=None, 238 | bias=True, **kwargs): 239 | self.supports_masking = True 240 | self.init = initializers.get('glorot_uniform') 241 | 242 | self.W_regularizer = regularizers.get(W_regularizer) 243 | self.b_regularizer = regularizers.get(b_regularizer) 244 | 245 | self.W_constraint = constraints.get(W_constraint) 246 | self.b_constraint = constraints.get(b_constraint) 247 | 248 | self.bias = bias 249 | self.step_dim = step_dim 250 | self.features_dim = 0 251 | super(Attention, self).__init__(**kwargs) 252 | 253 | def build(self, input_shape): 254 | assert len(input_shape) == 3 255 | 256 | self.W = self.add_weight((input_shape[-1],), 257 | initializer=self.init, 258 | name='{}_W'.format(self.name), 259 | regularizer=self.W_regularizer, 260 | constraint=self.W_constraint) 261 | self.features_dim = input_shape[-1] 262 | 263 | if self.bias: 264 | self.b = self.add_weight((input_shape[1],), 265 | initializer='zero', 266 | name='{}_b'.format(self.name), 267 | regularizer=self.b_regularizer, 268 | constraint=self.b_constraint) 269 | else: 270 | self.b = None 271 | 272 | self.built = True 273 | 274 | def compute_mask(self, input, input_mask=None): 275 | return None 276 | 277 | def call(self, x, mask=None): 278 | features_dim = self.features_dim 279 | step_dim = self.step_dim 280 | 281 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 282 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 283 | 284 | if self.bias: 285 | eij += self.b 286 | 287 | eij = K.tanh(eij) 288 | 289 | a = K.exp(eij) 290 | 291 | if mask is not None: 292 | a *= K.cast(mask, K.floatx()) 293 | 294 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 295 | 296 | a = K.expand_dims(a) 297 | weighted_input = x * a 298 | return K.sum(weighted_input, axis=1) 299 | 300 | def compute_output_shape(self, input_shape): 301 | return input_shape[0], self.features_dim 302 | 303 | # AUC for a binary classifier 304 | 305 | 306 | def auc(y_true, y_pred): 307 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 308 | for k in np.linspace(0, 1, 1000)], axis=0) 309 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 310 | for k in np.linspace(0, 1, 1000)], axis=0) 311 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 312 | binSizes = -(pfas[1:]-pfas[:-1]) 313 | s = ptas*binSizes 314 | return K.sum(s, axis=0) 315 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 316 | # PFA, prob false alert for binary classifier 317 | 318 | 319 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 320 | y_pred = K.cast(y_pred >= threshold, 'float32') 321 | # N = total number of negative labels 322 | N = K.sum(1 - y_true) 323 | # FP = total number of false alerts, alerts from the negative 
class labels 324 | FP = K.sum(y_pred - y_pred * y_true) 325 | return FP/N 326 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 327 | # P_TA prob true alerts for binary classifier 328 | 329 | 330 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 331 | y_pred = K.cast(y_pred >= threshold, 'float32') 332 | # P = total number of positive labels 333 | P = K.sum(y_true) 334 | # TP = total number of correct alerts, alerts from the positive class labels 335 | TP = K.sum(y_pred * y_true) 336 | return TP/P 337 | 338 | 339 | class Lookahead(object): 340 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 341 | """ 342 | 343 | def __init__(self, k=5, alpha=0.5): 344 | self.k = k 345 | self.alpha = alpha 346 | self.count = 0 347 | 348 | def inject(self, model): 349 | """Inject the Lookahead algorithm for the given model. 350 | The following code is modified from keras's _make_train_function method. 351 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 352 | """ 353 | if not hasattr(model, 'train_function'): 354 | raise RuntimeError('You must compile your model before using it.') 355 | 356 | model._check_trainable_weights_consistency() 357 | 358 | if model.train_function is None: 359 | inputs = (model._feed_inputs + 360 | model._feed_targets + 361 | model._feed_sample_weights) 362 | if model._uses_dynamic_learning_phase(): 363 | inputs += [K.learning_phase()] 364 | fast_params = model._collected_trainable_weights 365 | 366 | with K.name_scope('training'): 367 | with K.name_scope(model.optimizer.__class__.__name__): 368 | training_updates = model.optimizer.get_updates( 369 | params=fast_params, 370 | loss=model.total_loss) 371 | slow_params = [K.variable(p) for p in fast_params] 372 | fast_updates = (model.updates + 373 | training_updates + 374 | model.metrics_updates) 375 | 376 | slow_updates, copy_updates = [], [] 377 | for p, q in zip(fast_params, slow_params): 378 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 379 | copy_updates.append(K.update(p, q)) 380 | 381 | # Gets loss and metrics. Updates weights at each call. 
382 | fast_train_function = K.function( 383 | inputs, 384 | [model.total_loss] + model.metrics_tensors, 385 | updates=fast_updates, 386 | name='fast_train_function', 387 | **model._function_kwargs) 388 | 389 | def F(inputs): 390 | self.count += 1 391 | R = fast_train_function(inputs) 392 | if self.count % self.k == 0: 393 | K.batch_get_value(slow_updates) 394 | K.batch_get_value(copy_updates) 395 | return R 396 | 397 | model.train_function = F 398 | 399 | 400 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 401 | "Create embedding layer from a pretrained weights array" 402 | in_dim, out_dim = pretrained_weights.shape 403 | embedding = Embedding(in_dim, out_dim, weights=[ 404 | pretrained_weights], trainable=False, **kwargs) 405 | return embedding 406 | 407 | 408 | def unchanged_shape(input_shape): 409 | "Function for Lambda layer" 410 | return input_shape 411 | 412 | 413 | def substract(input_1, input_2): 414 | "Substract element-wise" 415 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 416 | out_ = Add()([input_1, neg_input_2]) 417 | return out_ 418 | 419 | 420 | def submult(input_1, input_2): 421 | "Get multiplication and subtraction then concatenate results" 422 | mult = Multiply()([input_1, input_2]) 423 | sub = substract(input_1, input_2) 424 | out_ = Concatenate()([sub, mult]) 425 | return out_ 426 | 427 | 428 | def apply_multiple(input_, layers): 429 | "Apply layers to input then concatenate result" 430 | if not len(layers) > 1: 431 | raise ValueError('Layers list should contain more than 1 layer') 432 | else: 433 | agg_ = [] 434 | for layer in layers: 435 | agg_.append(layer(input_)) 436 | out_ = Concatenate()(agg_) 437 | return out_ 438 | 439 | 440 | def time_distributed(input_, layers): 441 | "Apply a list of layers in TimeDistributed mode" 442 | out_ = [] 443 | node_ = input_ 444 | for layer_ in layers: 445 | node_ = TimeDistributed(layer_)(node_) 446 | out_ = node_ 447 | return out_ 448 | 449 | 450 | def soft_attention_alignment(input_1, input_2): 451 | "Align text representation with neural soft attention" 452 | attention = Dot(axes=-1)([input_1, input_2]) 453 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 454 | output_shape=unchanged_shape)(attention) 455 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 456 | output_shape=unchanged_shape)(attention)) 457 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 458 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 459 | return in1_aligned, in2_aligned 460 | 461 | 462 | def decomposable_attention(pretrained_weights, 463 | num_shape, 464 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 465 | compare_dim=500, compare_dropout=0.2, 466 | dense_dim=300, dense_dropout=0.2, 467 | lr=1e-3, activation='elu', maxlen=20): 468 | # Based on: https://arxiv.org/abs/1606.01933 469 | 470 | q1 = Input(name='q1', shape=(maxlen,)) 471 | q2 = Input(name='q2', shape=(maxlen,)) 472 | 473 | # Embedding 474 | embedding = create_pretrained_embedding(pretrained_weights, 475 | mask_zero=False) 476 | q1_embed = embedding(q1) 477 | q2_embed = embedding(q2) 478 | 479 | # Projection 480 | projection_layers = [] 481 | if projection_hidden > 0: 482 | projection_layers.extend([ 483 | Dense(projection_hidden, activation=activation), 484 | Dropout(rate=projection_dropout), 485 | ]) 486 | projection_layers.extend([ 487 | Dense(projection_dim, activation=None), 488 | Dropout(rate=projection_dropout), 489 | ]) 490 | q1_encoded = time_distributed(q1_embed, projection_layers) 491 | 
q2_encoded = time_distributed(q2_embed, projection_layers) 492 | 493 | # Attention 494 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 495 | 496 | # Compare 497 | q1_combined = Concatenate()( 498 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 499 | q2_combined = Concatenate()( 500 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 501 | compare_layers = [ 502 | Dense(compare_dim, activation=activation), 503 | Dropout(compare_dropout), 504 | Dense(compare_dim, activation=activation), 505 | Dropout(compare_dropout), 506 | ] 507 | q1_compare = time_distributed(q1_combined, compare_layers) 508 | q2_compare = time_distributed(q2_combined, compare_layers) 509 | 510 | # Aggregate 511 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 512 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 513 | 514 | # Classifier 515 | merged = Concatenate()([q1_rep, q2_rep]) 516 | dense = BatchNormalization()(merged) 517 | dense = Dense(dense_dim, activation=activation)(dense) 518 | dense = Dropout(dense_dropout)(dense) 519 | dense = BatchNormalization()(dense) 520 | dense = Dense(dense_dim, activation=activation)(dense) 521 | dense = Dropout(dense_dropout)(dense) 522 | out_ = Dense(1, activation='sigmoid')(dense) 523 | 524 | model = Model(inputs=[q1, q2], outputs=out_) 525 | model.compile(loss='binary_crossentropy', 526 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 527 | metrics=["accuracy", auc]) 528 | return model 529 | 530 | 531 | def esim(embedding_matrix, 532 | maxlen=20, 533 | lstm_dim=64, 534 | dense_dim=128, 535 | dense_dropout=0.5): 536 | # Based on arXiv:1609.06038 537 | q1 = Input(name='q1', shape=(8,)) 538 | q2 = Input(name='q2', shape=(20,)) 539 | 540 | # Embedding 541 | embedding = create_pretrained_embedding( 542 | embedding_matrix, mask_zero=False) 543 | bn = BatchNormalization(axis=2) 544 | q1_embed = bn(embedding(q1)) 545 | q2_embed = bn(embedding(q2)) 546 | 547 | # Encode 548 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 549 | q1_encoded = encode(q1_embed) 550 | q2_encoded = encode(q2_embed) 551 | 552 | # Attention 553 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 554 | 555 | # Compose 556 | q1_combined = Concatenate()( 557 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 558 | q2_combined = Concatenate()( 559 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 560 | 561 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 562 | q1_compare = compose(q1_combined) 563 | q2_compare = compose(q2_combined) 564 | 565 | # Aggregate 566 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 567 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 568 | 569 | # leaks_input = Input(shape=(num_shape,)) 570 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 571 | 572 | # Classifier 573 | merged = Concatenate()([q1_rep, q2_rep]) 574 | 575 | dense = BatchNormalization()(merged) 576 | dense = Dense(dense_dim, activation='elu')(dense) 577 | dense = BatchNormalization()(dense) 578 | dense = Dropout(dense_dropout)(dense) 579 | dense = Dense(dense_dim, activation='elu')(dense) 580 | dense = BatchNormalization()(dense) 581 | dense = Dropout(dense_dropout)(dense) 582 | out_ = Dense(1, activation='sigmoid')(dense) 583 | 584 | model = Model(inputs=[q1, q2], outputs=out_) 585 | model.compile(loss='binary_crossentropy', 586 | optimizer=AdamW(lr=0.0003, 
weight_decay=0.02,), 587 | metrics=["accuracy", auc]) 588 | return model 589 | 590 | 591 | def aux_esim_model(embed_matrix, model_weight_path): 592 | base_model = esim(embed_matrix) 593 | base_model.load_weights(model_weight_path) 594 | input_q, input_a = base_model.inputs 595 | input_f = Input((19,)) 596 | hidden_esim = base_model.get_layer(index=28).output 597 | merged = Concatenate()([hidden_esim, input_f]) 598 | #dense = BatchNormalization()(merged) 599 | dense = Dense(512, activation='relu')(merged) 600 | #dense = BatchNormalization()(dense) 601 | dense = Dropout(0.5)(dense) 602 | dense = Dense(256, activation='relu')(dense) 603 | #dense = BatchNormalization()(dense) 604 | dense = Dropout(0.5)(dense) 605 | out_ = Dense(1, activation='sigmoid')(dense) 606 | 607 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 608 | model.compile(loss='binary_crossentropy', 609 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 610 | metrics=["accuracy"]) 611 | return model 612 | 613 | 614 | ####模型训练 615 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048, 616 | label_tag=True, chunk_size=5000) 617 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048, 618 | label_tag=True, chunk_size=5000) 619 | print("train...") 620 | print("###"*30) 621 | gc.collect() 622 | K.clear_session() 623 | weight_path = '/home/kesci/work/zhifeng/zhifeng_esim_weight_1_0.6924413924179077.h5' 624 | model = aux_esim_model(embed_matrix, weight_path) 625 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 626 | lookahead.inject(model) # add into model 627 | model.summary() 628 | early_stopping = EarlyStopping( 629 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 630 | reduce_lr = ReduceLROnPlateau( 631 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 632 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5' 633 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 634 | save_best_only=False, 635 | verbose=1, save_weights_only=True, period=1) 636 | callbacks = [checkpoint, reduce_lr, early_stopping] 637 | # print("load weight....") 638 | 639 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 640 | epochs=10, verbose=1, callbacks=callbacks, 641 | validation_data=val_gen, validation_steps=int( 642 | np.ceil(1000000/2048)), 643 | max_queue_size=10, workers=1, use_multiprocessing=False) 644 | 645 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 646 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 647 | val_prob = model.predict_generator( 648 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 649 | 650 | f = open('/home/kesci/zhifeng/val.csv','r') 651 | q,a,l=[],[],[] 652 | for line in f: 653 | qid,_,aid,_,label = line.strip().split(',') 654 | q.append(qid) 655 | a.append(aid) 656 | l.append(int(label)) 657 | 658 | val_df = pd.DataFrame({'qid':q,'aid':a,'label':l}) 659 | val_df['prob'] = val_prob.flatten() 660 | 661 | roc_auc_score(val_df['label'], val_df['prob']) 662 | 663 | 664 | def perauc(df): 665 | temp = pd.Series() 666 | try: 667 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 668 | except: 669 | temp['auc'] = 0.5 670 | return temp 671 | 672 | 673 | eval_df = val_df.groupby("qid").apply(perauc) 674 | eval_df.index = range(len(eval_df)) 675 | print("qauc:", eval_df['auc'].mean()) 676 | 677 | 
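# A minimal sanity check of the QAUC metric on toy data (assumption: "qauc" above means the
# ROC-AUC computed within each qid group and then averaged over qids; the qid/label/prob values
# below are purely illustrative).
_toy = pd.DataFrame({'qid': ['q1', 'q1', 'q1', 'q2', 'q2', 'q2'],
                     'label': [1, 0, 0, 0, 1, 1],
                     'prob': [0.9, 0.2, 0.4, 0.5, 0.8, 0.3]})
# q1 ranks its positive first (per-qid AUC = 1.0); q2 mis-ranks one pair (AUC = 0.5); mean = 0.75.
print("toy qauc (expected 0.75):", _toy.groupby('qid').apply(perauc)['auc'].mean())
del _toy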
test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 678 | feature=testa_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 679 | prob = model.predict_generator(test_gen,steps=int(np.ceil(20000000/4096)),verbose=1) 680 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',names=['qid','aid','prob']) 681 | sub['prob'] = prob.flatten() 682 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv',index=False,header=False) 683 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 684 | feature=testb_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 685 | prob = model.predict_generator(test_gen,steps=int(np.ceil(100000000/4096)),verbose=1) 686 | final = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])[['query_id','query_title_id']] 687 | final['prob'] = prob.flatten() 688 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv',index=False,header=False) -------------------------------------------------------------------------------- /finetuning_w2v_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v)
+ \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape, 74 | testa_feature.shape, testb_feature.shape) 75 | 76 | del feature 77 | gc.collect() 78 | 79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 80 | 81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 83 | 84 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 85 | maxlen_query=8): 86 | if label_tag: 87 | _, _q, _, _a, _label = line.strip().split(',') 88 | else: 89 | _, _q, _, _a = line.strip().split(',') 90 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 91 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 92 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 93 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 94 | if label_tag: 95 | return q_pad, a_pad, int(_label) 96 | return q_pad, a_pad 97 | 98 | 99 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 100 | while True: 101 | fin = open(path, 'r') 102 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 103 | for i, line in enumerate(fin): 104 | if len(batch_q) == chunk_size*batch_size: 105 | batch_q = np.array(batch_q) 106 | batch_a = np.array(batch_a) 107 | batch_f = np.array(batch_f) 108 | if label_tag: 109 | batch_label = np.array(batch_label) 110 | idx = list(range(chunk_size*batch_size)) 111 | if shuffle: 112 | np.random.shuffle(idx) 113 | for i in range(chunk_size): 114 | if label_tag: 115 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 116 | np.array( 117 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 118 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 119 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 120 | else: 121 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 122 | np.array( 123 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 124 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 125 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 126 | if label_tag: 127 | q, a, l = gen_feature_help(line, label_tag=label_tag) 128 | else: 129 | q, a = gen_feature_help(line, label_tag=label_tag) 130 | l = 0 131 | batch_q.append(q) 132 | batch_a.append(a) 133 | batch_f.append(feature[i]) 134 | if label_tag: 135 | batch_label.append(l) 136 | 137 | batch_q = np.array(batch_q) 138 | batch_a = np.array(batch_a) 139 | batch_f = np.array(batch_f) 140 | 141 | if label_tag: 142 | batch_label = np.array(batch_label) 143 | idx = list(range(len(batch_q))) 144 | if shuffle: 145 | np.random.shuffle(idx) 146 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 147 | if label_tag: 148 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 149 | np.array( 150 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 151 | 
np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 152 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 153 | else: 154 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 155 | np.array( 156 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 157 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 158 | fin.close() 159 | 160 | 161 | def get_embedding_matrix(): 162 | m = np.zeros(shape=(len(index2word)+1, 300)) 163 | for i, w in index2word.items(): 164 | m[i, :] = w2v[w] 165 | return m 166 | 167 | 168 | embed_matrix = get_embedding_matrix() 169 | maxlen_query = 8 170 | maxlen_answer = 20 171 | 172 | 173 | class AdamW(Optimizer): 174 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 175 | epsilon=1e-8, decay=0., **kwargs): 176 | super(AdamW, self).__init__(**kwargs) 177 | with K.name_scope(self.__class__.__name__): 178 | self.iterations = K.variable(0, dtype='int64', name='iterations') 179 | self.lr = K.variable(lr, name='lr') 180 | self.beta_1 = K.variable(beta_1, name='beta_1') 181 | self.beta_2 = K.variable(beta_2, name='beta_2') 182 | self.decay = K.variable(decay, name='decay') 183 | # decoupled weight decay (2/4) 184 | self.wd = K.variable(weight_decay, name='weight_decay') 185 | self.epsilon = epsilon 186 | self.initial_decay = decay 187 | 188 | @interfaces.legacy_get_updates_support 189 | def get_updates(self, loss, params): 190 | grads = self.get_gradients(loss, params) 191 | self.updates = [K.update_add(self.iterations, 1)] 192 | wd = self.wd # decoupled weight decay (3/4) 193 | 194 | lr = self.lr 195 | if self.initial_decay > 0: 196 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 197 | K.dtype(self.decay)))) 198 | 199 | t = K.cast(self.iterations, K.floatx()) + 1 200 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 201 | (1. - K.pow(self.beta_1, t))) 202 | 203 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 204 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 205 | self.weights = [self.iterations] + ms + vs 206 | 207 | for p, g, m, v in zip(params, grads, ms, vs): 208 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 209 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 210 | # decoupled weight decay (4/4) 211 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 212 | 213 | self.updates.append(K.update(m, m_t)) 214 | self.updates.append(K.update(v, v_t)) 215 | new_p = p_t 216 | 217 | # Apply constraints. 
218 | if getattr(p, 'constraint', None) is not None: 219 | new_p = p.constraint(new_p) 220 | 221 | self.updates.append(K.update(p, new_p)) 222 | return self.updates 223 | 224 | def get_config(self): 225 | config = {'lr': float(K.get_value(self.lr)), 226 | 'beta_1': float(K.get_value(self.beta_1)), 227 | 'beta_2': float(K.get_value(self.beta_2)), 228 | 'decay': float(K.get_value(self.decay)), 229 | 'weight_decay': float(K.get_value(self.wd)), 230 | 'epsilon': self.epsilon} 231 | base_config = super(AdamW, self).get_config() 232 | return dict(list(base_config.items()) + list(config.items())) 233 | 234 | 235 | class Attention(Layer): 236 | def __init__(self, step_dim, 237 | W_regularizer=None, b_regularizer=None, 238 | W_constraint=None, b_constraint=None, 239 | bias=True, **kwargs): 240 | self.supports_masking = True 241 | self.init = initializers.get('glorot_uniform') 242 | 243 | self.W_regularizer = regularizers.get(W_regularizer) 244 | self.b_regularizer = regularizers.get(b_regularizer) 245 | 246 | self.W_constraint = constraints.get(W_constraint) 247 | self.b_constraint = constraints.get(b_constraint) 248 | 249 | self.bias = bias 250 | self.step_dim = step_dim 251 | self.features_dim = 0 252 | super(Attention, self).__init__(**kwargs) 253 | 254 | def build(self, input_shape): 255 | assert len(input_shape) == 3 256 | 257 | self.W = self.add_weight((input_shape[-1],), 258 | initializer=self.init, 259 | name='{}_W'.format(self.name), 260 | regularizer=self.W_regularizer, 261 | constraint=self.W_constraint) 262 | self.features_dim = input_shape[-1] 263 | 264 | if self.bias: 265 | self.b = self.add_weight((input_shape[1],), 266 | initializer='zero', 267 | name='{}_b'.format(self.name), 268 | regularizer=self.b_regularizer, 269 | constraint=self.b_constraint) 270 | else: 271 | self.b = None 272 | 273 | self.built = True 274 | 275 | def compute_mask(self, input, input_mask=None): 276 | return None 277 | 278 | def call(self, x, mask=None): 279 | features_dim = self.features_dim 280 | step_dim = self.step_dim 281 | 282 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 283 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 284 | 285 | if self.bias: 286 | eij += self.b 287 | 288 | eij = K.tanh(eij) 289 | 290 | a = K.exp(eij) 291 | 292 | if mask is not None: 293 | a *= K.cast(mask, K.floatx()) 294 | 295 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 296 | 297 | a = K.expand_dims(a) 298 | weighted_input = x * a 299 | return K.sum(weighted_input, axis=1) 300 | 301 | def compute_output_shape(self, input_shape): 302 | return input_shape[0], self.features_dim 303 | 304 | # AUC for a binary classifier 305 | 306 | 307 | def auc(y_true, y_pred): 308 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 309 | for k in np.linspace(0, 1, 1000)], axis=0) 310 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 311 | for k in np.linspace(0, 1, 1000)], axis=0) 312 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 313 | binSizes = -(pfas[1:]-pfas[:-1]) 314 | s = ptas*binSizes 315 | return K.sum(s, axis=0) 316 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 317 | # PFA, prob false alert for binary classifier 318 | 319 | 320 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 321 | y_pred = K.cast(y_pred >= threshold, 'float32') 322 | # N = total number of negative labels 323 | N = K.sum(1 - y_true) 324 | # FP = total number of false alerts, alerts from the negative 
class labels 325 | FP = K.sum(y_pred - y_pred * y_true) 326 | return FP/N 327 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 328 | # P_TA prob true alerts for binary classifier 329 | 330 | 331 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 332 | y_pred = K.cast(y_pred >= threshold, 'float32') 333 | # P = total number of positive labels 334 | P = K.sum(y_true) 335 | # TP = total number of correct alerts, alerts from the positive class labels 336 | TP = K.sum(y_pred * y_true) 337 | return TP/P 338 | 339 | 340 | class Lookahead(object): 341 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 342 | """ 343 | 344 | def __init__(self, k=5, alpha=0.5): 345 | self.k = k 346 | self.alpha = alpha 347 | self.count = 0 348 | 349 | def inject(self, model): 350 | """Inject the Lookahead algorithm for the given model. 351 | The following code is modified from keras's _make_train_function method. 352 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 353 | """ 354 | if not hasattr(model, 'train_function'): 355 | raise RuntimeError('You must compile your model before using it.') 356 | 357 | model._check_trainable_weights_consistency() 358 | 359 | if model.train_function is None: 360 | inputs = (model._feed_inputs + 361 | model._feed_targets + 362 | model._feed_sample_weights) 363 | if model._uses_dynamic_learning_phase(): 364 | inputs += [K.learning_phase()] 365 | fast_params = model._collected_trainable_weights 366 | 367 | with K.name_scope('training'): 368 | with K.name_scope(model.optimizer.__class__.__name__): 369 | training_updates = model.optimizer.get_updates( 370 | params=fast_params, 371 | loss=model.total_loss) 372 | slow_params = [K.variable(p) for p in fast_params] 373 | fast_updates = (model.updates + 374 | training_updates + 375 | model.metrics_updates) 376 | 377 | slow_updates, copy_updates = [], [] 378 | for p, q in zip(fast_params, slow_params): 379 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 380 | copy_updates.append(K.update(p, q)) 381 | 382 | # Gets loss and metrics. Updates weights at each call. 
383 | fast_train_function = K.function( 384 | inputs, 385 | [model.total_loss] + model.metrics_tensors, 386 | updates=fast_updates, 387 | name='fast_train_function', 388 | **model._function_kwargs) 389 | 390 | def F(inputs): 391 | self.count += 1 392 | R = fast_train_function(inputs) 393 | if self.count % self.k == 0: 394 | K.batch_get_value(slow_updates) 395 | K.batch_get_value(copy_updates) 396 | return R 397 | 398 | model.train_function = F 399 | 400 | 401 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 402 | "Create embedding layer from a pretrained weights array" 403 | in_dim, out_dim = pretrained_weights.shape 404 | embedding = Embedding(in_dim, out_dim, weights=[ 405 | pretrained_weights], trainable=False, **kwargs) 406 | return embedding 407 | 408 | 409 | def unchanged_shape(input_shape): 410 | "Function for Lambda layer" 411 | return input_shape 412 | 413 | 414 | def substract(input_1, input_2): 415 | "Substract element-wise" 416 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 417 | out_ = Add()([input_1, neg_input_2]) 418 | return out_ 419 | 420 | 421 | def submult(input_1, input_2): 422 | "Get multiplication and subtraction then concatenate results" 423 | mult = Multiply()([input_1, input_2]) 424 | sub = substract(input_1, input_2) 425 | out_ = Concatenate()([sub, mult]) 426 | return out_ 427 | 428 | 429 | def apply_multiple(input_, layers): 430 | "Apply layers to input then concatenate result" 431 | if not len(layers) > 1: 432 | raise ValueError('Layers list should contain more than 1 layer') 433 | else: 434 | agg_ = [] 435 | for layer in layers: 436 | agg_.append(layer(input_)) 437 | out_ = Concatenate()(agg_) 438 | return out_ 439 | 440 | 441 | def time_distributed(input_, layers): 442 | "Apply a list of layers in TimeDistributed mode" 443 | out_ = [] 444 | node_ = input_ 445 | for layer_ in layers: 446 | node_ = TimeDistributed(layer_)(node_) 447 | out_ = node_ 448 | return out_ 449 | 450 | 451 | def soft_attention_alignment(input_1, input_2): 452 | "Align text representation with neural soft attention" 453 | attention = Dot(axes=-1)([input_1, input_2]) 454 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 455 | output_shape=unchanged_shape)(attention) 456 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 457 | output_shape=unchanged_shape)(attention)) 458 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 459 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 460 | return in1_aligned, in2_aligned 461 | 462 | 463 | def decomposable_attention(pretrained_weights, 464 | num_shape, 465 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 466 | compare_dim=500, compare_dropout=0.2, 467 | dense_dim=300, dense_dropout=0.2, 468 | lr=1e-3, activation='elu', maxlen=20): 469 | # Based on: https://arxiv.org/abs/1606.01933 470 | 471 | q1 = Input(name='q1', shape=(maxlen,)) 472 | q2 = Input(name='q2', shape=(maxlen,)) 473 | 474 | # Embedding 475 | embedding = create_pretrained_embedding(pretrained_weights, 476 | mask_zero=False) 477 | q1_embed = embedding(q1) 478 | q2_embed = embedding(q2) 479 | 480 | # Projection 481 | projection_layers = [] 482 | if projection_hidden > 0: 483 | projection_layers.extend([ 484 | Dense(projection_hidden, activation=activation), 485 | Dropout(rate=projection_dropout), 486 | ]) 487 | projection_layers.extend([ 488 | Dense(projection_dim, activation=None), 489 | Dropout(rate=projection_dropout), 490 | ]) 491 | q1_encoded = time_distributed(q1_embed, projection_layers) 492 | 
q2_encoded = time_distributed(q2_embed, projection_layers) 493 | 494 | # Attention 495 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 496 | 497 | # Compare 498 | q1_combined = Concatenate()( 499 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 500 | q2_combined = Concatenate()( 501 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 502 | compare_layers = [ 503 | Dense(compare_dim, activation=activation), 504 | Dropout(compare_dropout), 505 | Dense(compare_dim, activation=activation), 506 | Dropout(compare_dropout), 507 | ] 508 | q1_compare = time_distributed(q1_combined, compare_layers) 509 | q2_compare = time_distributed(q2_combined, compare_layers) 510 | 511 | # Aggregate 512 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 513 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 514 | 515 | # Classifier 516 | merged = Concatenate()([q1_rep, q2_rep]) 517 | dense = BatchNormalization()(merged) 518 | dense = Dense(dense_dim, activation=activation)(dense) 519 | dense = Dropout(dense_dropout)(dense) 520 | dense = BatchNormalization()(dense) 521 | dense = Dense(dense_dim, activation=activation)(dense) 522 | dense = Dropout(dense_dropout)(dense) 523 | out_ = Dense(1, activation='sigmoid')(dense) 524 | 525 | model = Model(inputs=[q1, q2], outputs=out_) 526 | model.compile(loss='binary_crossentropy', 527 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 528 | metrics=["accuracy", auc]) 529 | return model 530 | 531 | 532 | def esim(embedding_matrix, 533 | maxlen=20, 534 | lstm_dim=64, 535 | dense_dim=128, 536 | dense_dropout=0.5): 537 | # Based on arXiv:1609.06038 538 | q1 = Input(name='q1', shape=(8,)) 539 | q2 = Input(name='q2', shape=(20,)) 540 | 541 | # Embedding 542 | embedding = create_pretrained_embedding( 543 | embedding_matrix, mask_zero=False) 544 | bn = BatchNormalization(axis=2) 545 | q1_embed = bn(embedding(q1)) 546 | q2_embed = bn(embedding(q2)) 547 | 548 | # Encode 549 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 550 | q1_encoded = encode(q1_embed) 551 | q2_encoded = encode(q2_embed) 552 | 553 | # Attention 554 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 555 | 556 | # Compose 557 | q1_combined = Concatenate()( 558 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 559 | q2_combined = Concatenate()( 560 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 561 | 562 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 563 | q1_compare = compose(q1_combined) 564 | q2_compare = compose(q2_combined) 565 | 566 | # Aggregate 567 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 568 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 569 | 570 | # leaks_input = Input(shape=(num_shape,)) 571 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 572 | 573 | # Classifier 574 | merged = Concatenate()([q1_rep, q2_rep]) 575 | 576 | dense = BatchNormalization()(merged) 577 | dense = Dense(dense_dim, activation='elu')(dense) 578 | dense = BatchNormalization()(dense) 579 | dense = Dropout(dense_dropout)(dense) 580 | dense = Dense(dense_dim, activation='elu')(dense) 581 | dense = BatchNormalization()(dense) 582 | dense = Dropout(dense_dropout)(dense) 583 | out_ = Dense(1, activation='sigmoid')(dense) 584 | 585 | model = Model(inputs=[q1, q2], outputs=out_) 586 | model.compile(loss='binary_crossentropy', 587 | optimizer=AdamW(lr=0.0003, 
weight_decay=0.02,), 588 | metrics=["accuracy", auc]) 589 | return model 590 | 591 | 592 | def aux_esim_model(embed_matrix, model_weight_path): 593 | base_model = esim(embed_matrix) 594 | base_model.load_weights(model_weight_path) 595 | input_q, input_a = base_model.inputs 596 | input_f = Input((19,)) 597 | hidden_esim = base_model.get_layer(index=28).output 598 | merged = Concatenate()([hidden_esim, input_f]) 599 | #dense = BatchNormalization()(merged) 600 | dense = Dense(512, activation='relu')(merged) 601 | #dense = BatchNormalization()(dense) 602 | dense = Dropout(0.5)(dense) 603 | dense = Dense(256, activation='relu')(dense) 604 | #dense = BatchNormalization()(dense) 605 | dense = Dropout(0.5)(dense) 606 | out_ = Dense(1, activation='sigmoid')(dense) 607 | 608 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 609 | model.compile(loss='binary_crossentropy', 610 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 611 | metrics=["accuracy"]) 612 | return model 613 | 614 | 615 | ####模型训练 616 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048, 617 | label_tag=True, chunk_size=5000) 618 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048, 619 | label_tag=True, chunk_size=5000) 620 | print("train...") 621 | print("###"*30) 622 | gc.collect() 623 | K.clear_session() 624 | weight_path = '/home/kesci/work/chizhu/chizhu_w2v_esim_weight_1_0.44060374074871167.h5' 625 | model = aux_esim_model(embed_matrix, weight_path) 626 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 627 | lookahead.inject(model) # add into model 628 | model.summary() 629 | early_stopping = EarlyStopping( 630 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 631 | reduce_lr = ReduceLROnPlateau( 632 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 633 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5' 634 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 635 | save_best_only=False, 636 | verbose=1, save_weights_only=True, period=1) 637 | callbacks = [checkpoint, reduce_lr, early_stopping] 638 | # print("load weight....") 639 | 640 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 641 | epochs=10, verbose=1, callbacks=callbacks, 642 | validation_data=val_gen, validation_steps=int( 643 | np.ceil(1000000/2048)), 644 | max_queue_size=10, workers=1, use_multiprocessing=False) 645 | 646 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 647 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 648 | val_prob = model.predict_generator( 649 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 650 | 651 | f = open('/home/kesci/zhifeng/val.csv', 'r') 652 | q, a, l = [], [], [] 653 | for line in f: 654 | qid, _, aid, _, label = line.strip().split(',') 655 | q.append(qid) 656 | a.append(aid) 657 | l.append(int(label)) 658 | 659 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 660 | val_df['prob'] = val_prob.flatten() 661 | 662 | roc_auc_score(val_df['label'], val_df['prob']) 663 | 664 | 665 | def perauc(df): 666 | temp = pd.Series() 667 | try: 668 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 669 | except: 670 | temp['auc'] = 0.5 671 | return temp 672 | 673 | 674 | eval_df = val_df.groupby("qid").apply(perauc) 675 | eval_df.index = range(len(eval_df)) 676 | print("qauc:", 
eval_df['auc'].mean()) 677 | 678 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 679 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 680 | prob = model.predict_generator( 681 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 682 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 683 | names=['qid', 'aid', 'prob']) 684 | sub['prob'] = prob.flatten() 685 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False) 686 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 687 | feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 688 | prob = model.predict_generator(test_gen, steps=int( 689 | np.ceil(100000000/4096)), verbose=1) 690 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 691 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 692 | final['prob'] = prob.flatten() 693 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False) 694 | -------------------------------------------------------------------------------- /finetuning_w2v_rnn.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 |
"/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape, 74 | testa_feature.shape, testb_feature.shape) 75 | 76 | del feature 77 | gc.collect() 78 | 79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 80 | 81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 83 | 84 | 85 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 86 | maxlen_query=8): 87 | if label_tag: 88 | _, _q, _, _a, _label = line.strip().split(',') 89 | else: 90 | _, _q, _, _a = line.strip().split(',') 91 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 92 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 93 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 94 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 95 | if label_tag: 96 | return q_pad, a_pad, int(_label) 97 | return q_pad, a_pad 98 | 99 | 100 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 101 | while True: 102 | fin = open(path, 'r') 103 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 104 | for i, line in enumerate(fin): 105 | if len(batch_q) == chunk_size*batch_size: 106 | batch_q = np.array(batch_q) 107 | batch_a = np.array(batch_a) 108 | batch_f = np.array(batch_f) 109 | if label_tag: 110 | batch_label = np.array(batch_label) 111 | idx = list(range(chunk_size*batch_size)) 112 | if shuffle: 113 | np.random.shuffle(idx) 114 | for i in range(chunk_size): 115 | if label_tag: 116 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 117 | np.array( 118 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 119 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 120 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 121 | else: 122 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 123 | np.array( 124 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 125 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 126 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 127 | if label_tag: 128 | q, a, l = gen_feature_help(line, label_tag=label_tag) 129 | else: 130 | q, a = gen_feature_help(line, label_tag=label_tag) 131 | l = 0 132 | batch_q.append(q) 133 | batch_a.append(a) 134 | batch_f.append(feature[i]) 135 | if label_tag: 136 | batch_label.append(l) 137 | 138 | batch_q = np.array(batch_q) 139 | batch_a = np.array(batch_a) 140 | batch_f = np.array(batch_f) 141 | 142 | if label_tag: 143 | batch_label = np.array(batch_label) 144 | idx = list(range(len(batch_q))) 145 | if shuffle: 146 | np.random.shuffle(idx) 147 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 148 | if label_tag: 149 | yield 
([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 150 | np.array( 151 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 152 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 153 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 154 | else: 155 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 156 | np.array( 157 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 158 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 159 | fin.close() 160 | 161 | 162 | def get_embedding_matrix(): 163 | m = np.zeros(shape=(len(index2word)+1, 300)) 164 | for i, w in index2word.items(): 165 | m[i, :] = w2v[w] 166 | return m 167 | 168 | 169 | embed_matrix = get_embedding_matrix() 170 | maxlen_query = 8 171 | maxlen_answer = 20 172 | 173 | 174 | class AdamW(Optimizer): 175 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 176 | epsilon=1e-8, decay=0., **kwargs): 177 | super(AdamW, self).__init__(**kwargs) 178 | with K.name_scope(self.__class__.__name__): 179 | self.iterations = K.variable(0, dtype='int64', name='iterations') 180 | self.lr = K.variable(lr, name='lr') 181 | self.beta_1 = K.variable(beta_1, name='beta_1') 182 | self.beta_2 = K.variable(beta_2, name='beta_2') 183 | self.decay = K.variable(decay, name='decay') 184 | # decoupled weight decay (2/4) 185 | self.wd = K.variable(weight_decay, name='weight_decay') 186 | self.epsilon = epsilon 187 | self.initial_decay = decay 188 | 189 | @interfaces.legacy_get_updates_support 190 | def get_updates(self, loss, params): 191 | grads = self.get_gradients(loss, params) 192 | self.updates = [K.update_add(self.iterations, 1)] 193 | wd = self.wd # decoupled weight decay (3/4) 194 | 195 | lr = self.lr 196 | if self.initial_decay > 0: 197 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 198 | K.dtype(self.decay)))) 199 | 200 | t = K.cast(self.iterations, K.floatx()) + 1 201 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 202 | (1. - K.pow(self.beta_1, t))) 203 | 204 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 205 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 206 | self.weights = [self.iterations] + ms + vs 207 | 208 | for p, g, m, v in zip(params, grads, ms, vs): 209 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 210 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 211 | # decoupled weight decay (4/4) 212 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 213 | 214 | self.updates.append(K.update(m, m_t)) 215 | self.updates.append(K.update(v, v_t)) 216 | new_p = p_t 217 | 218 | # Apply constraints. 
219 | if getattr(p, 'constraint', None) is not None: 220 | new_p = p.constraint(new_p) 221 | 222 | self.updates.append(K.update(p, new_p)) 223 | return self.updates 224 | 225 | def get_config(self): 226 | config = {'lr': float(K.get_value(self.lr)), 227 | 'beta_1': float(K.get_value(self.beta_1)), 228 | 'beta_2': float(K.get_value(self.beta_2)), 229 | 'decay': float(K.get_value(self.decay)), 230 | 'weight_decay': float(K.get_value(self.wd)), 231 | 'epsilon': self.epsilon} 232 | base_config = super(AdamW, self).get_config() 233 | return dict(list(base_config.items()) + list(config.items())) 234 | 235 | 236 | class Attention(Layer): 237 | def __init__(self, step_dim, 238 | W_regularizer=None, b_regularizer=None, 239 | W_constraint=None, b_constraint=None, 240 | bias=True, **kwargs): 241 | self.supports_masking = True 242 | self.init = initializers.get('glorot_uniform') 243 | 244 | self.W_regularizer = regularizers.get(W_regularizer) 245 | self.b_regularizer = regularizers.get(b_regularizer) 246 | 247 | self.W_constraint = constraints.get(W_constraint) 248 | self.b_constraint = constraints.get(b_constraint) 249 | 250 | self.bias = bias 251 | self.step_dim = step_dim 252 | self.features_dim = 0 253 | super(Attention, self).__init__(**kwargs) 254 | 255 | def build(self, input_shape): 256 | assert len(input_shape) == 3 257 | 258 | self.W = self.add_weight((input_shape[-1],), 259 | initializer=self.init, 260 | name='{}_W'.format(self.name), 261 | regularizer=self.W_regularizer, 262 | constraint=self.W_constraint) 263 | self.features_dim = input_shape[-1] 264 | 265 | if self.bias: 266 | self.b = self.add_weight((input_shape[1],), 267 | initializer='zero', 268 | name='{}_b'.format(self.name), 269 | regularizer=self.b_regularizer, 270 | constraint=self.b_constraint) 271 | else: 272 | self.b = None 273 | 274 | self.built = True 275 | 276 | def compute_mask(self, input, input_mask=None): 277 | return None 278 | 279 | def call(self, x, mask=None): 280 | features_dim = self.features_dim 281 | step_dim = self.step_dim 282 | 283 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 284 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 285 | 286 | if self.bias: 287 | eij += self.b 288 | 289 | eij = K.tanh(eij) 290 | 291 | a = K.exp(eij) 292 | 293 | if mask is not None: 294 | a *= K.cast(mask, K.floatx()) 295 | 296 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 297 | 298 | a = K.expand_dims(a) 299 | weighted_input = x * a 300 | return K.sum(weighted_input, axis=1) 301 | 302 | def compute_output_shape(self, input_shape): 303 | return input_shape[0], self.features_dim 304 | 305 | # AUC for a binary classifier 306 | 307 | 308 | def auc(y_true, y_pred): 309 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 310 | for k in np.linspace(0, 1, 1000)], axis=0) 311 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 312 | for k in np.linspace(0, 1, 1000)], axis=0) 313 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 314 | binSizes = -(pfas[1:]-pfas[:-1]) 315 | s = ptas*binSizes 316 | return K.sum(s, axis=0) 317 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 318 | # PFA, prob false alert for binary classifier 319 | 320 | 321 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 322 | y_pred = K.cast(y_pred >= threshold, 'float32') 323 | # N = total number of negative labels 324 | N = K.sum(1 - y_true) 325 | # FP = total number of false alerts, alerts from the negative 
class labels 326 | FP = K.sum(y_pred - y_pred * y_true) 327 | return FP/N 328 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 329 | # P_TA prob true alerts for binary classifier 330 | 331 | 332 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 333 | y_pred = K.cast(y_pred >= threshold, 'float32') 334 | # P = total number of positive labels 335 | P = K.sum(y_true) 336 | # TP = total number of correct alerts, alerts from the positive class labels 337 | TP = K.sum(y_pred * y_true) 338 | return TP/P 339 | 340 | 341 | class Lookahead(object): 342 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 343 | """ 344 | 345 | def __init__(self, k=5, alpha=0.5): 346 | self.k = k 347 | self.alpha = alpha 348 | self.count = 0 349 | 350 | def inject(self, model): 351 | """Inject the Lookahead algorithm for the given model. 352 | The following code is modified from keras's _make_train_function method. 353 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 354 | """ 355 | if not hasattr(model, 'train_function'): 356 | raise RuntimeError('You must compile your model before using it.') 357 | 358 | model._check_trainable_weights_consistency() 359 | 360 | if model.train_function is None: 361 | inputs = (model._feed_inputs + 362 | model._feed_targets + 363 | model._feed_sample_weights) 364 | if model._uses_dynamic_learning_phase(): 365 | inputs += [K.learning_phase()] 366 | fast_params = model._collected_trainable_weights 367 | 368 | with K.name_scope('training'): 369 | with K.name_scope(model.optimizer.__class__.__name__): 370 | training_updates = model.optimizer.get_updates( 371 | params=fast_params, 372 | loss=model.total_loss) 373 | slow_params = [K.variable(p) for p in fast_params] 374 | fast_updates = (model.updates + 375 | training_updates + 376 | model.metrics_updates) 377 | 378 | slow_updates, copy_updates = [], [] 379 | for p, q in zip(fast_params, slow_params): 380 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 381 | copy_updates.append(K.update(p, q)) 382 | 383 | # Gets loss and metrics. Updates weights at each call. 
384 | fast_train_function = K.function( 385 | inputs, 386 | [model.total_loss] + model.metrics_tensors, 387 | updates=fast_updates, 388 | name='fast_train_function', 389 | **model._function_kwargs) 390 | 391 | def F(inputs): 392 | self.count += 1 393 | R = fast_train_function(inputs) 394 | if self.count % self.k == 0: 395 | K.batch_get_value(slow_updates) 396 | K.batch_get_value(copy_updates) 397 | return R 398 | 399 | model.train_function = F 400 | 401 | 402 | def get_model(embedding_matrix): 403 | 404 | K.clear_session() 405 | #The embedding layer containing the word vectors 406 | emb_layer = Embedding( 407 | input_dim=embedding_matrix.shape[0], 408 | output_dim=embedding_matrix.shape[1], 409 | weights=[embedding_matrix], 410 | trainable=False 411 | ) 412 | sdrop=SpatialDropout1D(rate=0.2) 413 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 414 | kernel_initializer=glorot_uniform(seed = 123))) 415 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 416 | kernel_initializer=glorot_uniform(seed = 123))) 417 | 418 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 419 | 420 | # Define inputs 421 | seq1 = Input(shape=(maxlen_query,)) 422 | x1 = emb_layer(seq1) 423 | x1 = sdrop(x1) 424 | lstm1 = lstm_layer(x1) 425 | gru1 = gru_layer(lstm1) 426 | att_1 = Attention(maxlen_query)(lstm1) 427 | att_3 = Attention(maxlen_query)(gru1) 428 | cnn1 = cnn1d_layer(lstm1) 429 | 430 | avg_pool = GlobalAveragePooling1D() 431 | max_pool = GlobalMaxPooling1D() 432 | 433 | seq2 = Input(shape=(maxlen_answer,)) 434 | x2 = emb_layer(seq2) 435 | x2 = sdrop(x2) 436 | lstm2 = lstm_layer(x2) 437 | gru2 = gru_layer(lstm2) 438 | att_2 = Attention(maxlen_answer)(lstm2) 439 | att_4 = Attention(maxlen_answer)(gru2) 440 | cnn2 = cnn1d_layer(lstm2) 441 | 442 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 443 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 444 | 445 | merge = Multiply()([x1, x2]) 446 | merge = Dropout(0.5)(merge) 447 | # The MLP that determines the outcome 448 | x = Dense(128,kernel_initializer=he_uniform(seed=123), activation='relu',)(merge) 449 | # x = Dropout(0.2)(x) 450 | # x = BatchNormalization()(x) 451 | 452 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 453 | 454 | 455 | model = Model(inputs=[seq1,seq2], outputs=pred) 456 | 457 | model.compile(loss='binary_crossentropy', 458 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,), 459 | metrics=["accuracy"]) 460 | # model.summary() 461 | return model 462 | 463 | 464 | def aux_esim_model(embed_matrix, model_weight_path): 465 | base_model = get_model(embed_matrix) 466 | base_model.load_weights(model_weight_path) 467 | input_q, input_a = base_model.inputs 468 | input_f = Input((19,)) 469 | hidden_esim = base_model.get_layer(index=15).output 470 | merged = Concatenate()([hidden_esim, input_f]) 471 | #dense = BatchNormalization()(merged) 472 | dense = Dense(512, activation='relu')(merged) 473 | #dense = BatchNormalization()(dense) 474 | dense = Dropout(0.5)(dense) 475 | dense = Dense(256, activation='relu')(dense) 476 | #dense = BatchNormalization()(dense) 477 | dense = Dropout(0.5)(dense) 478 | out_ = Dense(1, activation='sigmoid')(dense) 479 | 480 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 481 | model.compile(loss='binary_crossentropy', 482 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 483 | metrics=["accuracy"]) 484 | return model 
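# --- Illustrative sketch (not the original implementation) -------------------
# `gen_train`, used in the training section below, is defined earlier in this
# script. The function here only sketches the kind of chunked-CSV generator the
# fit_generator/predict_generator calls expect. It assumes a fitted Keras
# `tokenizer` and the `maxlen_query`/`maxlen_answer` globals used by get_model(),
# and yields ([query_seqs, title_seqs, features], labels).
def gen_train_sketch(path, feature, batch_size=2048, label_tag=True,
                     chunk_size=5000, shuffle=True):
    names = ['query_id', 'query', 'query_title_id', 'title']
    if label_tag:
        names = names + ['label']
    feat = np.asarray(feature)
    while True:                       # Keras generators are expected to loop forever
        offset = 0
        for chunk in pd.read_csv(path, names=names, chunksize=chunk_size):
            q = pad_sequences(tokenizer.texts_to_sequences(chunk['query'].astype(str)),
                              maxlen=maxlen_query)
            t = pad_sequences(tokenizer.texts_to_sequences(chunk['title'].astype(str)),
                              maxlen=maxlen_answer)
            f = feat[offset:offset + len(chunk)]
            offset += len(chunk)
            idx = np.arange(len(chunk))
            if shuffle:
                np.random.shuffle(idx)
            for start in range(0, len(chunk), batch_size):
                batch = idx[start:start + batch_size]
                inputs = [q[batch], t[batch], f[batch]]
                if label_tag:
                    yield inputs, chunk['label'].values[batch]
                else:
                    yield inputs
# ------------------------------------------------------------------------------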
485 |
486 |
487 | #### Model training
488 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048,
489 |                       label_tag=True, chunk_size=5000)
490 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048,
491 |                     label_tag=True, chunk_size=5000)
492 | print("train...")
493 | print("###"*30)
494 | gc.collect()
495 | K.clear_session()
496 | weight_path = '/home/kesci/work/zhifeng/zhifeng_rnn_weight_1_0.668621638244629.h5'
497 | model = aux_esim_model(embed_matrix, weight_path)
498 | lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
499 | lookahead.inject(model)  # add into model
500 | model.summary()
501 | early_stopping = EarlyStopping(
502 |     monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
503 | reduce_lr = ReduceLROnPlateau(
504 |     monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
505 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5'
506 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
507 |                              save_best_only=False,
508 |                              verbose=1, save_weights_only=True, period=1)
509 | callbacks = [checkpoint, reduce_lr, early_stopping]
510 | # print("load weight....")
511 |
512 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
513 |                            epochs=10, verbose=1, callbacks=callbacks,
514 |                            validation_data=val_gen, validation_steps=int(
515 |                                np.ceil(1000000/2048)),
516 |                            max_queue_size=10, workers=1, use_multiprocessing=False)
517 |
518 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
519 |                     batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
520 | val_prob = model.predict_generator(
521 |     val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
522 |
523 | f = open('/home/kesci/zhifeng/val.csv', 'r')
524 | q, a, l = [], [], []
525 | for line in f:
526 |     qid, _, aid, _, label = line.strip().split(',')
527 |     q.append(qid)
528 |     a.append(aid)
529 |     l.append(int(label))
530 |
531 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
532 | val_df['prob'] = val_prob.flatten()
533 |
534 | print("overall auc:", roc_auc_score(val_df['label'], val_df['prob']))
535 |
536 |
537 | def perauc(df):
538 |     temp = pd.Series()
539 |     try:
540 |         temp['auc'] = roc_auc_score(df['label'], df['prob'])
541 |     except Exception:
542 |         temp['auc'] = 0.5
543 |     return temp
544 |
545 |
546 | eval_df = val_df.groupby("qid").apply(perauc)
547 | eval_df.index = range(len(eval_df))
548 | print("qauc:", eval_df['auc'].mean())
549 |
550 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
551 |                      feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
552 | prob = model.predict_generator(
553 |     test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
554 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
555 |                   names=['qid', 'aid', 'prob'])
556 | sub['prob'] = prob.flatten()
557 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False)
558 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
559 |                      feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
560 | prob = model.predict_generator(test_gen, steps=int(
561 |     np.ceil(100000000/4096)), verbose=1)
562 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
563 |     'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
564 | final['prob'] = prob.flatten()
565 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False)
566 |
--------------------------------------------------------------------------------
/gen_feature.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm, tqdm_notebook
2 | from sklearn.model_selection import StratifiedKFold, GroupKFold
3 | import numpy as np
4 | import os
5 | import Levenshtein
6 | import logging
7 | from gensim.models import Word2Vec
8 | import time
9 | import gc
10 | import keras
11 | from keras.initializers import *
12 | from keras import initializers, regularizers, constraints, optimizers, layers
13 | from keras.engine.topology import Layer
14 | from keras.models import *
15 | from keras.layers import *
16 | from keras.callbacks import *
17 | from keras.optimizers import *
18 | from keras import backend as K
19 | from keras.optimizers import Adam
20 | from keras.preprocessing.sequence import pad_sequences
21 | from keras.preprocessing.text import Tokenizer
22 | from sklearn.metrics import roc_auc_score
23 | import tensorflow as tf
24 | import random as rn
25 | import pandas as pd
26 | tqdm.pandas()
27 | np.random.seed(1017)
28 | rn.seed(1017)
29 | tf.set_random_seed(1017)
30 | path = "/home/kesci/input/bytedance/"
31 | out = '/home/kesci/work/chizhu/'
32 | print(os.listdir(path))
33 |
34 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label'])
35 |
36 | testa = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title'])
37 | testb = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])
38 |
39 | testa['label']=-1
40 | testb['label']=-2
41 | test=pd.concat([testa,testb],ignore_index=True)
42 | del testa,testb
43 | gc.collect()
44 |
45 | train['title']=train['title'].apply(lambda x:str(x).replace("\t",""),1)
46 | test['title']=test['title'].apply(lambda x:str(x).replace("\t",""),1)
47 | data_all=pd.concat([train,test],ignore_index=True)
48 | del train,test
49 | gc.collect()
50 |
51 | # Build feature set f1
52 | def get_union_data(row):
53 |     title_list = row['title'].split(' ')
54 |     query_list = row['query'].split(' ')
55 |     return len(list(set(title_list).intersection(set(query_list))))
56 |
57 | def same_1(row):
58 |     title_list = row['title'].split(' ')
59 |     query_list = row['query'].split(' ')
60 |     if title_list[0] == query_list[0]:
61 |         return 1
62 |     else:
63 |         return 0
64 |
65 | def same_2(row):
66 |     title_list = row['title'].split(' ')
67 |     query_list = row['query'].split(' ')
68 |     if ' '.join(title_list[:2]) == ' '.join(query_list[:2]):
69 |         return 1
70 |     else:
71 |         return 0
72 |
73 | def same_3(row):
74 |     title_list = row['title'].split(' ')
75 |     query_list = row['query'].split(' ')
76 |     if ' '.join(title_list[:3]) == ' '.join(query_list[:3]):
77 |         return 1
78 |     else:
79 |         return 0
80 |
81 | def is_all_in(row):
82 |     if row['query'] in row['title']:
83 |         return 1
84 |     else:
85 |         return 0
86 |
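# Quick sanity check added for illustration (not part of the original script):
# the f1 helpers applied to a toy row, with values that are easy to verify by hand.
_row = {'query': '1 2 3', 'title': '1 2 4 5'}
assert get_union_data(_row) == 2   # shared tokens: {'1', '2'}
assert same_1(_row) == 1           # first token matches
assert same_2(_row) == 1           # first two tokens match
assert same_3(_row) == 0           # '1 2 3' != '1 2 4'
assert is_all_in(_row) == 0        # substring test on the raw strings, not a set test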
87 | feature = pd.DataFrame()
88 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' ')))  # query length
89 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' ')))  # title length
90 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度']  # title length - query length
91 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1)  # query fully contained in title
92 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1)  # number of shared tokens
93 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8)  # shared tokens / query length
94 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8)  # shared tokens / title length
95 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1)  # edit distance
96 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1)  # first token identical
97 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1)  # first two tokens identical
98 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1)  # first three tokens identical
99 | feature.to_csv(out + 'f1.csv', index=False)
100 |
101 | # Build feature set f2
102 | def pos_1(row):
103 |     title_list = row['title'].split(' ')
104 |     query_list = row['query'].split(' ')
105 |     value = -1
106 |     try:
107 |         value = title_list.index(query_list[0])
108 |     except Exception:
109 |         value = -1
110 |     return value
111 |
112 | def pos_2(row):
113 |     title_list = row['title'].split(' ')
114 |     query_list = row['query'].split(' ')
115 |     if len(query_list) <= 1:
116 |         return -1
117 |     try:
118 |         value = title_list.index(query_list[1])
119 |     except Exception:
120 |         value = -1
121 |     return value
122 |
123 | def pos_3(row):
124 |     title_list = row['title'].split(' ')
125 |     query_list = row['query'].split(' ')
126 |     if len(query_list) <= 2:
127 |         return -1
128 |     try:
129 |         value = title_list.index(query_list[2])
130 |     except Exception:
131 |         value = -1
132 |     return value
133 |
134 | feature = pd.DataFrame()
135 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1)  # position of 1st query token in title
136 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1)  # position of 2nd query token in title
137 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1)  # position of 3rd query token in title
138 | feature.to_csv(out + 'f2.csv', index=False)
139 |
140 | feature = pd.DataFrame()
141 | feature['标题求组合后词语'] = data_all.groupby('title').query.transform('nunique')  # unique queries per title
142 | feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique')  # unique titles per query
143 | feature.to_csv(out + 'f3.csv', index=False)
144 |
145 |
146 | # data_all = data_all.fillna(-1)
147 | # data_all.to_pickle(out+"data.pickle")
148 |
149 | # data_all = pd.read_pickle(out+"data.pickle")
150 | # f5: raw word2vec similarity between query and title
151 | from gensim.models import Word2Vec
152 | import gensim
153 | import logging
154 | feature = pd.DataFrame()
155 | w2v = Word2Vec.load(out + 'new_skip_w2v_all_300.model')
156 | def get_new_w2v(seq1, seq2):
157 |     seq1 = seq1.split(' ')
158 |     seq2 = seq2.split(' ')
159 |     try:
160 |         return w2v.n_similarity(seq1, seq2)
161 |     except Exception:
162 |         return -1
163 |
164 | f3 = pd.read_csv(out + 'f3.csv')
165 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1)  # raw word2vec similarity
166 | f3.to_csv(out + 'f3.csv', index=False)
167 |
168 |
169 |
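# Added illustration (not part of the original script): gensim's n_similarity is,
# roughly, the cosine similarity of the two sentences' mean word vectors; the
# sketch below mirrors that computation so the meaning of the f5 feature is
# explicit. n_similarity raises for out-of-vocabulary tokens, which is why
# get_new_w2v falls back to -1.
def n_similarity_sketch(words1, words2):
    v1 = np.mean([w2v.wv[w] for w in words1], axis=0)
    v2 = np.mean([w2v.wv[w] for w in words2], axis=0)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))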
--------------------------------------------------------------------------------
/get_corpus.py:
--------------------------------------------------------------------------------
from tqdm import tqdm   # needed for the progress bars below
# NOTE: `path` (input dir) and `out` (output dir) are assumed to be defined as
# in the other scripts of this repo.
1 | fout = open(out + "corpus.csv",'w')
2 | with open(path+"train_final.csv",'r') as fin:
3 |     q_last = ''
4 |     for line in tqdm(fin):
5 |         _,q,_,t,_ = line.strip().split(',')
6 |         if q!=q_last:
7 |             q_last = q
8 |             fout.write(q + '\n')
9 |         fout.write(t + '\n')
10 | with open(path+"test_final_part1.csv",'r') as fin:
11 |     q_last = ''
12 |     for line in tqdm(fin):
13 |         _,q,_,t = line.strip().split(',')
14 |         if q!=q_last:
15 |             q_last = q
16 |             fout.write(q + '\n')
17 |         fout.write(t + '\n')
18 | fout.close()
19 | """
20 | Format of corpus.csv
21 | // one utterance per line, tokens separated by spaces, e.g.:
22 | 我 鄂温克 三打底裤 是是
23 | 说的
24 | 是对的是
25 | 时代大厦 是对的
26 | 是赛事方 说的
27 |
28 | """
29 |
--------------------------------------------------------------------------------
/train_fasttext.py:
--------------------------------------------------------------------------------
1 | import fasttext
import numpy as np   # needed by get_embedding_matrix below
2 | w2v = fasttext.train_unsupervised(input=out+"corpus.csv")
3 | w2v.save_model(out+'corpus.fasttext.model')
4 | w2v = fasttext.load_model(out+'corpus.fasttext.model')
5 | word2index = {word: index+1 for index, word in enumerate(w2v.words)}
6 | index2word = {index+1: word for index, word in enumerate(w2v.words)}
7 |
8 |
9 | def get_embedding_matrix():
10 |     m = np.zeros(shape=(len(index2word)+1, 100))  # 100 = fastText's default vector size
11 |     for i, w in index2word.items():
12 |         m[i, :] = w2v[w]
13 |     return m
14 |
--------------------------------------------------------------------------------
/train_w2v.py:
--------------------------------------------------------------------------------
1 | from gensim.models import Word2Vec
2 | import logging
3 | from gensim.models import word2vec
from tqdm import tqdm   # needed for the corpus-building loop below
# NOTE: `path` and `out` are assumed to be defined as in the other scripts of this repo.
4 | logging.basicConfig(
5 |     format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
6 | sent=word2vec.Text8Corpus("/home/kesci/work/zhifeng/corpus.csv")
7 | word2vecModel = word2vec.Word2Vec(sent, size=300, window=5, min_count=1,iter=5,
8 |                                   sg=1,workers=8)
9 | word2vecModel.save(out+"skip_w2v_all_300.model")
10 |
11 | # ##### further train
12 | from gensim.models import word2vec
13 | model = word2vec.Word2Vec.load(out+"skip_w2v_all_300.model")
14 | fout = open(out + "new_corpus.csv",'w')
15 | with open(path+"bytedance_contest.final_2.csv",'r') as fin:
16 |     q_last = ''
17 |     for line in tqdm(fin):
18 |         _,q,_,t = line.strip().split(',')
19 |         if q!=q_last:
20 |             q_last = q
21 |             fout.write(q + '\n')
22 |         fout.write(t + '\n')
23 | fout.close()
24 | logging.basicConfig(
25 |     format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
26 | sent=word2vec.Text8Corpus(out + "new_corpus.csv")
27 | model.build_vocab(sent, update=True)
28 | model.train(sent,total_examples=model.corpus_count, epochs=5)
29 | model.save(out+"new_skip_w2v_all_300.model")
30 |
--------------------------------------------------------------------------------
/w2v_cos.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random as rn
5 | from tqdm import tqdm, tqdm_notebook
6 | from sklearn.metrics import roc_auc_score
7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
8 | import gc
9 | import time
10 | from gensim.models import Word2Vec
11 | import fasttext
12 | from gensim.models import Word2Vec
13 | import scipy.spatial.distance as ssd
14 | tqdm.pandas()
15 | input_path = "/home/kesci/input/bytedance/"
16 | out_work_path = '/home/kesci/work/zhifeng/'
17 | out_path = '/home/kesci/zhifeng/'
18 |
19 | w2v = Word2Vec.load('/home/kesci/work/chizhu/skip_w2v_all_300.model')
20 |
21 |
22 | def get_sentence_embeddings(text, sep=' ', dim=300):
23 |     v = np.zeros(dim)
24 |     words = text.strip().split(sep)
25 |     cnt = 0
26 |     for word in words:
27 |         if word in w2v:
28 |             v += w2v[word]
29 |             cnt += 1
30 |     return v/cnt if cnt != 0 else v
31 |
32 |
33 | train_cosine_list = []
34 | with open(out_path+'train.smaller.csv', 'r') as fin:
35 |     for line in tqdm(fin):
36 |         _, q, _, a, _ = line.strip().split(',')
37 |         v1 = get_sentence_embeddings(q)
38 |         v2 = get_sentence_embeddings(a)
39 |         train_cosine_list.append(ssd.cosine(v1, v2))
40 | pd.to_pickle(np.array(train_cosine_list), out_work_path+'train.cosine.w2v.pkl')
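# Added note: scipy's ssd.cosine returns the cosine *distance*
# (1 - cosine similarity), so smaller values mean more similar query/title
# pairs; use `1.0 - d` if a similarity is needed downstream.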
41 | val_cosine_list = []
42 | with open(out_path+'val.csv', 'r') as fin:
43 |     for line in tqdm(fin):
44 |         _, q, _, a, _ = line.strip().split(',')
45 |         v1 = get_sentence_embeddings(q)
46 |         v2 = get_sentence_embeddings(a)
47 |         val_cosine_list.append(ssd.cosine(v1, v2))
48 | pd.to_pickle(np.array(val_cosine_list), out_work_path+'val.cosine.w2v.pkl')
49 | test_cosine_list = []
50 | with open(input_path+'test_final_part1.csv', 'r') as fin:
51 |     for line in tqdm(fin):
52 |         _, q, _, a = line.strip().split(',')
53 |         v1 = get_sentence_embeddings(q)
54 |         v2 = get_sentence_embeddings(a)
55 |         test_cosine_list.append(ssd.cosine(v1, v2))
56 | pd.to_pickle(np.array(test_cosine_list), out_path+'test.cosine.w2v.pkl')
57 |
--------------------------------------------------------------------------------