├── NN_pipeline.py
├── README.md
├── bigtrain_fasttext_esim.py
├── bigtrain_w2v_esim.py
├── bigtrain_w2v_rnn.py
├── chizhu_rnn.py
├── fasttext_cos.py
├── finetuning_fasttext_esim.py
├── finetuning_w2v_esim.py
├── finetuning_w2v_rnn.py
├── gen_feature.py
├── get_corpus.py
├── train_fasttext.py
├── train_w2v.py
└── w2v_cos.py
/NN_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import random as rn
6 | from tqdm import tqdm, tqdm_notebook
7 | import tensorflow as tf
8 | from sklearn.metrics import roc_auc_score
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 | from keras.optimizers import Adam
12 | from keras import backend as K
13 | from keras.optimizers import *
14 | from keras.callbacks import *
15 | from keras.layers import *
16 | from keras.models import *
17 | from keras.engine.topology import Layer
18 | from keras import initializers, regularizers, constraints, optimizers, layers
19 | from keras.initializers import *
20 | import keras
21 | from sklearn.model_selection import StratifiedKFold, GroupKFold
22 | import gc
23 | import time
24 | from gensim.models import Word2Vec
25 | import logging
26 | import Levenshtein
27 | tqdm.pandas()
28 | np.random.seed(1017)
29 | rn.seed(1017)
30 | tf.set_random_seed(1017)
31 | path = "/home/kesci/input/bytedance/"
32 | out = '/home/kesci/work/chizhu/'
33 | print(os.listdir(path))
34 |
35 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label'])
36 | test = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title'])
37 |
38 | train['title'] = train['title'].apply(lambda x: str(x).replace("\t", ""))
39 | test['title'] = test['title'].apply(lambda x: str(x).replace("\t", ""))
40 | data_all=pd.concat([train,test],ignore_index=True)
41 | del train,test
42 | gc.collect()
43 |
44 | # Build feature set f1
45 | def get_union_data(row):
46 | title_list = row['title'].split(' ')
47 | query_list = row['query'].split(' ')
48 | return len(list(set(title_list).intersection(set(query_list))))
49 |
50 | def same_1(row):
51 | title_list = row['title'].split(' ')
52 | query_list = row['query'].split(' ')
53 | if title_list[0] == query_list[0]:
54 | return 1
55 | else:
56 | return 0
57 |
58 | def same_2(row):
59 | title_list = row['title'].split(' ')
60 | query_list = row['query'].split(' ')
61 | if ' '.join(title_list[:2]) == ' '.join(query_list[:2]):
62 | return 1
63 | else:
64 | return 0
65 |
66 | def same_3(row):
67 | title_list = row['title'].split(' ')
68 | query_list = row['query'].split(' ')
69 | if ' '.join(title_list[:3]) == ' '.join(query_list[:3]):
70 | return 1
71 | else:
72 | return 0
73 |
74 | def is_all_in(row):
75 | if row['query'] in row['title']:
76 | return 1
77 | else:
78 | return 0
79 |
80 | feature = pd.DataFrame()
81 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' ')))
82 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' ')))
83 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度']
84 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1)
85 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1)
86 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8)
87 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8)
88 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1)
89 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1)
90 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1)
91 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1)
92 | feature.to_csv(out + 'f1.csv', index=False)
93 |
94 |
95 | # Build feature set f2
96 | def pos_1(row):
97 | title_list = row['title'].split(' ')
98 | query_list = row['query'].split(' ')
99 | value = -1
100 | try:
101 | value = title_list.index(query_list[0])
102 | except Exception:
103 | value = -1
104 | return value
105 |
106 | def pos_2(row):
107 | title_list = row['title'].split(' ')
108 | query_list = row['query'].split(' ')
109 | if len(query_list) <=1 :
110 | return -1
111 | try:
112 | value = title_list.index(query_list[1])
113 | except Exception:
114 | value = -1
115 | return value
116 |
117 | def pos_3(row):
118 | title_list = row['title'].split(' ')
119 | query_list = row['query'].split(' ')
120 | if len(query_list) <=2 :
121 | return -1
122 | try:
123 | value = title_list.index(query_list[2])
124 | except Exception:
125 | value = -1
126 | return value
127 |
128 | feature = pd.DataFrame()
129 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1)
130 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1)
131 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1)
132 | feature.to_csv(out + 'f2.csv', index=False)
133 |
134 | feature = pd.DataFrame()
135 | feature['标题求组合后词语'] = data_all.groupby('title')['query'].transform('nunique')
136 | # feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique')
137 | feature.to_csv(out + 'f3.csv', index=False)
138 |
139 | # data_all = data_all.fillna(-1)
140 | # data_all.to_csv(out+"data.csv", index=False)
141 |
142 | # data_all = pd.read_csv(out+"data.csv")
143 |
144 | # f5: word2vec n_similarity between query and title (requires w2v.model to already exist, e.g. from train_w2v.py or a previous run of the Word2Vec training step below)
145 | from gensim.models import Word2Vec
146 | import gensim
147 | import logging
148 | feature = pd.DataFrame()
149 | w2v = Word2Vec.load(out + 'w2v.model')
150 | def get_new_w2v(seq1, seq2):
151 | seq1 = seq1.split(' ')
152 | seq2 = seq2.split(' ')
153 | try:
154 | return w2v.n_similarity(seq1, seq2)
155 | except:
156 | return -1
157 |
158 | f3 = pd.read_csv(out + 'f3.csv')
159 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1)
160 | f3.to_csv(out + 'f3.csv', index=False)
161 |
162 | f1 = pd.read_csv(out + 'f1.csv')
163 | f2 = pd.read_csv(out + 'f2.csv')
164 | f3 = pd.read_csv(out + 'f3.csv')
165 | feature = pd.concat([f1, f2, f3], sort=False, axis=1)
166 | del f1, f2, f3
167 | gc.collect()
168 |
169 | train = data_all[data_all['label'].notnull()]  # test rows have NaN labels after the concat, so split on NaN rather than -1
170 | test = data_all[data_all['label'].isnull()]
171 | del data_all
172 | gc.collect()
173 | train_feature = feature[:len(train)]
174 | test_feature = feature[len(train):]
175 | train.index = range(len(train))
176 | test.index = range(len(test))
177 | train_feature.index = range(len(train_feature))
178 | test_feature.index = range(len(test_feature))
179 | del feature
180 | gc.collect()
181 |
182 | embed_size = 300  # dimensionality of each word vector
183 | # how many unique words to use (i.e. num rows in the embedding matrix); None keeps all
184 | max_features = None
185 | maxlen1 = 8   # max number of words to keep from a query
186 | maxlen2 = 20  # max number of words to keep from a title
187 |
188 | train_X1 = train["query"].fillna("0").values
189 | test_X1 = test["query"].fillna("0").values
190 |
191 | train_X2 = train["title"].fillna("0").values
192 | test_X2 = test["title"].fillna("0").values
193 | print("token...")
194 | tokenizer = Tokenizer(num_words=max_features)
195 | tokenizer.fit_on_texts(list(train_X1)+list(test_X1) +
196 | list(train_X2)+list(test_X2))
197 | train_X1 = tokenizer.texts_to_sequences(train_X1)
198 | test_X1 = tokenizer.texts_to_sequences(test_X1)
199 | ## Pad the sentences
200 | print("padding")
201 | train_X1 = pad_sequences(train_X1, maxlen=maxlen1)
202 | test_X1 = pad_sequences(test_X1, maxlen=maxlen1)
203 |
204 | train_X2 = tokenizer.texts_to_sequences(train_X2)
205 | test_X2 = tokenizer.texts_to_sequences(test_X2)
206 | ## Pad the sentences
207 | train_X2 = pad_sequences(train_X2, maxlen=maxlen2)
208 | test_X2 = pad_sequences(test_X2, maxlen=maxlen2)
209 | ## Get the target values
210 |
211 | train_y = train['label'].values
212 |
213 | word_index = tokenizer.word_index
214 | gc.collect()
215 |
216 | text_list = train['query'].values.tolist()
217 | text_list.extend(test['query'].values.tolist())
218 | text_list.extend(train['title'].values.tolist())
219 | text_list.extend(test['title'].values.tolist())
220 | # train and test are reused below (validation split and submission), so they are kept in memory here
221 | gc.collect()
222 | import time
223 | time.sleep(10)
224 | text_list = [[word for word in str(document).split(' ') ] for document in text_list]
225 | logging.basicConfig(
226 | format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
227 | w2v = Word2Vec(text_list, size=300, window=7, iter=30, seed=10, workers=4, min_count=3)
228 | w2v.save(out+"w2v.model")
229 | w2v.wv.save_word2vec_format(out+'new_w2v_300.txt')
230 | print("w2v model done")
231 | del w2v, text_list
232 | gc.collect()
233 |
234 |
235 | def get_embedding_matrix(word_index, embed_size=embed_size, Emed_path=out+"new_w2v_300.txt"):
236 | embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
237 | Emed_path, binary=False)
238 | nb_words = len(word_index)+1
239 | embedding_matrix = np.zeros((nb_words, embed_size))
240 | count = 0
241 | for word, i in tqdm(word_index.items()):
242 | if i >= nb_words:
243 | continue
244 | try:
245 | embedding_vector = embeddings_index[word]
246 | except:
247 | embedding_vector = np.zeros(embed_size)
248 | count += 1
249 | if embedding_vector is not None:
250 | embedding_matrix[i] = embedding_vector
251 |
252 | print("null cnt", count)
253 | return embedding_matrix
254 |
255 |
256 | embedding_matrix = get_embedding_matrix(word_index)
257 |
258 |
259 | class AdamW(Optimizer):
260 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
261 | epsilon=1e-8, decay=0., **kwargs):
262 | super(AdamW, self).__init__(**kwargs)
263 | with K.name_scope(self.__class__.__name__):
264 | self.iterations = K.variable(0, dtype='int64', name='iterations')
265 | self.lr = K.variable(lr, name='lr')
266 | self.beta_1 = K.variable(beta_1, name='beta_1')
267 | self.beta_2 = K.variable(beta_2, name='beta_2')
268 | self.decay = K.variable(decay, name='decay')
269 | # decoupled weight decay (2/4)
270 | self.wd = K.variable(weight_decay, name='weight_decay')
271 | self.epsilon = epsilon
272 | self.initial_decay = decay
273 |
274 | @interfaces.legacy_get_updates_support
275 | def get_updates(self, loss, params):
276 | grads = self.get_gradients(loss, params)
277 | self.updates = [K.update_add(self.iterations, 1)]
278 | wd = self.wd # decoupled weight decay (3/4)
279 |
280 | lr = self.lr
281 | if self.initial_decay > 0:
282 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
283 | K.dtype(self.decay))))
284 |
285 | t = K.cast(self.iterations, K.floatx()) + 1
286 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
287 | (1. - K.pow(self.beta_1, t)))
288 |
289 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
290 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
291 | self.weights = [self.iterations] + ms + vs
292 |
293 | for p, g, m, v in zip(params, grads, ms, vs):
294 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
295 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
296 | # decoupled weight decay (4/4)
297 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
298 |
299 | self.updates.append(K.update(m, m_t))
300 | self.updates.append(K.update(v, v_t))
301 | new_p = p_t
302 |
303 | # Apply constraints.
304 | if getattr(p, 'constraint', None) is not None:
305 | new_p = p.constraint(new_p)
306 |
307 | self.updates.append(K.update(p, new_p))
308 | return self.updates
309 |
310 | def get_config(self):
311 | config = {'lr': float(K.get_value(self.lr)),
312 | 'beta_1': float(K.get_value(self.beta_1)),
313 | 'beta_2': float(K.get_value(self.beta_2)),
314 | 'decay': float(K.get_value(self.decay)),
315 | 'weight_decay': float(K.get_value(self.wd)),
316 | 'epsilon': self.epsilon}
317 | base_config = super(AdamW, self).get_config()
318 | return dict(list(base_config.items()) + list(config.items()))
319 |
320 |
321 | class Attention(Layer):
322 | def __init__(self, step_dim,
323 | W_regularizer=None, b_regularizer=None,
324 | W_constraint=None, b_constraint=None,
325 | bias=True, **kwargs):
326 | self.supports_masking = True
327 | self.init = initializers.get('glorot_uniform')
328 |
329 | self.W_regularizer = regularizers.get(W_regularizer)
330 | self.b_regularizer = regularizers.get(b_regularizer)
331 |
332 | self.W_constraint = constraints.get(W_constraint)
333 | self.b_constraint = constraints.get(b_constraint)
334 |
335 | self.bias = bias
336 | self.step_dim = step_dim
337 | self.features_dim = 0
338 | super(Attention, self).__init__(**kwargs)
339 |
340 | def build(self, input_shape):
341 | assert len(input_shape) == 3
342 |
343 | self.W = self.add_weight((input_shape[-1],),
344 | initializer=self.init,
345 | name='{}_W'.format(self.name),
346 | regularizer=self.W_regularizer,
347 | constraint=self.W_constraint)
348 | self.features_dim = input_shape[-1]
349 |
350 | if self.bias:
351 | self.b = self.add_weight((input_shape[1],),
352 | initializer='zero',
353 | name='{}_b'.format(self.name),
354 | regularizer=self.b_regularizer,
355 | constraint=self.b_constraint)
356 | else:
357 | self.b = None
358 |
359 | self.built = True
360 |
361 | def compute_mask(self, input, input_mask=None):
362 | return None
363 |
364 | def call(self, x, mask=None):
365 | features_dim = self.features_dim
366 | step_dim = self.step_dim
367 |
368 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
369 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
370 |
371 | if self.bias:
372 | eij += self.b
373 |
374 | eij = K.tanh(eij)
375 |
376 | a = K.exp(eij)
377 |
378 | if mask is not None:
379 | a *= K.cast(mask, K.floatx())
380 |
381 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
382 |
383 | a = K.expand_dims(a)
384 | weighted_input = x * a
385 | return K.sum(weighted_input, axis=1)
386 |
387 | def compute_output_shape(self, input_shape):
388 | return input_shape[0], self.features_dim
389 |
390 | # AUC for a binary classifier
391 | def auc(y_true, y_pred):
392 | ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
393 | pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
394 | pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0)
395 | binSizes = -(pfas[1:]-pfas[:-1])
396 | s = ptas*binSizes
397 | return K.sum(s, axis=0)
398 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
399 | # PFA, prob false alert for binary classifier
400 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
401 | y_pred = K.cast(y_pred >= threshold, 'float32')
402 | # N = total number of negative labels
403 | N = K.sum(1 - y_true)
404 | # FP = total number of false alerts, alerts from the negative class labels
405 | FP = K.sum(y_pred - y_pred * y_true)
406 | return FP/N
407 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
408 | # P_TA prob true alerts for binary classifier
409 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
410 | y_pred = K.cast(y_pred >= threshold, 'float32')
411 | # P = total number of positive labels
412 | P = K.sum(y_true)
413 | # TP = total number of correct alerts, alerts from the positive class labels
414 | TP = K.sum(y_pred * y_true)
415 | return TP/P
416 |
417 |
418 | val = train[99000000:]
419 | train = train[:99000000]
420 | val_X1 = train_X1[99000000:]
421 | val_X2 = train_X2[99000000:]
422 | train_X1 = train_X1[:99000000]
423 | train_X2 = train_X2[:99000000]
424 | val_feature = train_feature[99000000:]
425 | train_feature = train_feature[:99000000]
426 |
427 | class ManDist(keras.layers.Layer):  # Manhattan-distance similarity wrapped as a Keras layer (defined but not used in this script)
428 |
429 | # Initialize the ManDist layer; no extra arguments are needed
430 | def __init__(self, **kwargs):
431 | self.result = None
432 | super(ManDist, self).__init__(**kwargs)
433 |
434 | # Build the ManDist layer
435 | def build(self, input_shape):
436 | super(ManDist, self).build(input_shape)
437 |
438 | # Compute exp(-L1 distance) between the two inputs (Manhattan-distance similarity)
439 | def call(self, x, **kwargs):
440 | self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
441 | return self.result
442 |
443 | # Return the output shape
444 | def compute_output_shape(self, input_shape):
445 | return K.int_shape(self.result)
446 |
447 |
448 | sc = StandardScaler()
449 | col_len = len(train_feature.columns)
450 | sc.fit(pd.concat([train_feature, val_feature, test_feature]))
451 | train_feature = sc.transform(train_feature)
452 | val_feature = sc.transform(val_feature)
453 | test_feature = sc.transform(test_feature)
454 |
455 | def get_model(embedding_matrix):
456 |
457 | K.clear_session()
458 | #The embedding layer containing the word vectors
459 | emb_layer = Embedding(
460 | input_dim=embedding_matrix.shape[0],
461 | output_dim=embedding_matrix.shape[1],
462 | weights=[embedding_matrix],
463 | trainable=False
464 | )
465 | sdrop=SpatialDropout1D(rate=0.2)
466 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True,
467 | kernel_initializer=glorot_uniform(seed = 123)))
468 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True,
469 | kernel_initializer=glorot_uniform(seed = 123)))
470 |
471 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
472 |
473 | # Define inputs
474 | seq1 = Input(shape=(maxlen1,))
475 | x1 = emb_layer(seq1)
476 | x1 = sdrop(x1)
477 | lstm1 = lstm_layer(x1)
478 | gru1 = gru_layer(lstm1)
479 | att_1 = Attention(maxlen1)(lstm1)
480 | att_3 = Attention(maxlen1)(gru1)
481 | cnn1 = cnn1d_layer(lstm1)
482 |
483 | avg_pool = GlobalAveragePooling1D()
484 | max_pool = GlobalMaxPooling1D()
485 |
486 | seq2 = Input(shape=(maxlen2,))
487 | x2 = emb_layer(seq2)
488 | x2 = sdrop(x2)
489 | lstm2 = lstm_layer(x2)
490 | gru2 = gru_layer(lstm2)
491 | att_2 = Attention(maxlen2)(lstm2)
492 | att_4 = Attention(maxlen2)(gru2)
493 | cnn2 = cnn1d_layer(lstm2)
494 |
495 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)])
496 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)])
497 |
498 | merge = Multiply()([x1, x2])
499 | merge = Dropout(0.2)(merge)
500 |
501 | hin = Input(shape=(col_len,))
502 | # htime = Dense(col_len,activation='relu')(hin)
503 | x = Concatenate()([merge,hin])
504 | # The MLP that determines the outcome
505 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x)
506 | # x = Dropout(0.2)(x)
507 | # x = BatchNormalization()(x)
508 |
509 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x)
510 |
511 |
512 | model = Model(inputs=[seq1,seq2,hin], outputs=pred)
513 |
514 | model.compile(loss='binary_crossentropy',
515 | optimizer=AdamW(lr=0.001,weight_decay=0.02,),
516 | metrics=["accuracy",auc])
517 | # model.summary()
518 | return model
519 |
520 |
521 | #### Model training
522 |
523 | print("train...")
524 | print("###"*30)
525 | gc.collect()
526 | K.clear_session()
527 | model = get_model(embedding_matrix)
528 | # model = esim()
529 | model.summary()
530 | early_stopping = EarlyStopping(
531 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
532 | reduce_lr = ReduceLROnPlateau(
533 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
534 | bst_model_path = out+'chizhurnn_chizhu_weight.h5'
535 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
536 | save_best_only=True, verbose=1, save_weights_only=True)
537 | callbacks = [checkpoint, reduce_lr, early_stopping]
538 | print("load weight....")
539 | # model.load_weights(bst_model_path)
540 |
541 | hist = model.fit([train_X1,train_X2,train_feature],train['label'],
542 | validation_data=([val_X1,val_X2,val_feature], val['label']),
543 | epochs=30, batch_size=2048,
544 | # class_weight="auto",
545 | callbacks=callbacks,verbose=1
546 |
547 | )
548 |
549 | model.load_weights(bst_model_path)
550 |
551 | res = np.squeeze(model.predict(
552 | [val_X1, val_X2, val_feature], batch_size=2048, verbose=1))
553 |
554 | print("val auc:{}".format(roc_auc_score(val['label'], res)))
555 | val['prob'] = res
556 |
557 |
558 | def perauc(df):
559 | temp = pd.DataFrame(index=range(1))
560 | temp['query_id'] = df['query_id'].values[0]
561 | try:
562 | temp['auc'] = roc_auc_score(df['label'].values.astype(int), df['prob'])
563 | except:
564 | temp['auc'] = 0.5
565 | return temp
566 |
567 |
568 | eval_df = val.groupby("query_id", as_index=False).apply(lambda x: perauc(x))
569 | eval_df.index = range(len(eval_df))
570 | print("qauc:", eval_df['auc'].mean())
571 |
572 | test_prob = np.squeeze(model.predict(
573 | [test_X1, test_X2, test_feature], batch_size=2048, verbose=1))
574 |
575 |
576 | sub = test[['query_id', 'query_title_id']]
577 | sub['prediction'] = test_prob
578 | sub.to_csv(out+"/submit_rnn.csv", index=False, header=False)
579 |
580 |
581 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### University Competition Solution
2 | #### Competition Overview
3 | * **Data**
4 |
5 | Roughly one billion rows are provided; the task is to predict, from the query and the title, the click-through rate of each doc under its query. The data is anonymized (hashed) and already tokenized.
6 |
7 | | Column | Type | Example |
8 | | ------ | ------ | ------ |
9 | | query_id | int | 3 |
10 | | query | hash string, space-separated terms | 1 9 117 |
11 | | query_title_id | unique id of the title under its query | 2 |
12 | | title | hash string, space-separated terms | 3 9 120 |
13 | | label | int, value in {0, 1} | 0 |
14 | * **Task analysis**
15 | A binary classification problem: text similarity combined with CTR (click-through rate) prediction.
16 | * **Challenges**
17 |
18 |     * Very large data volume
19 |     * Anonymized (hashed) data
20 |
21 | #### Solution
22 | ##### Feature engineering (FE)
23 | * Query length
24 | * Title length
25 | * Title length - query length
26 | * Whether the entire query appears in the title
27 | * Number of terms shared by title and query
28 | * Shared term count / query length
29 | * Shared term count / title length
30 | * Edit (Levenshtein) distance
31 | * Whether the first term is the same
32 | * Whether the first two terms are the same
33 | * Whether the first three terms are the same
34 | * Position in the title of the query's first term
35 | * Position in the title of the query's second term
36 | * Position in the title of the query's third term
37 | * Number of distinct queries per title
38 | * Number of distinct titles per query
39 | * w2v_n_similarity
40 | * fastText cosine similarity
41 | * word2vec cosine similarity
42 |
43 | (19 features in total; feeding them into an LGB model gives lb 0.597, see the sketch below)
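The LGB training script itself is not included in this repo; the following is a minimal sketch, assuming the 19 features above are already materialized as a DataFrame (file names and parameters are illustrative, not from the original code).

```python
# Hypothetical LightGBM baseline over the 19 hand-crafted features.
import lightgbm as lgb
import pandas as pd

feature = pd.read_csv('f_all.csv')             # assumed: the 19 features, one row per (query, title) pair
label = pd.read_csv('label.csv')['label']      # assumed: matching 0/1 labels

dtrain = lgb.Dataset(feature, label=label)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 63,
}
model = lgb.train(params, dtrain, num_boost_round=500)
pred = model.predict(feature)                  # probabilities used as the LGB submission
```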
44 | ##### NN models
45 | * Siamese RNN
46 |     * Dual input (query + title) plus the FE features
47 |     * Uses the last 100 million rows (first 99 million for training + last 1 million for validation)
48 |     * Network structure
49 | ```python
50 | def get_model(embedding_matrix):
51 | K.clear_session()
52 | #The embedding layer containing the word vectors
53 | emb_layer = Embedding(
54 | input_dim=embedding_matrix.shape[0],
55 | output_dim=embedding_matrix.shape[1],
56 | weights=[embedding_matrix],
57 | trainable=False
58 | )
59 | sdrop=SpatialDropout1D(rate=0.2)
60 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123)))
61 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123)))
62 |
63 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
64 |
65 | # Define inputs
66 | seq1 = Input(shape=(maxlen_query,))
67 | x1 = emb_layer(seq1)
68 | x1 = sdrop(x1)
69 | lstm1 = lstm_layer(x1)
70 | gru1 = gru_layer(lstm1)
71 | att_1 = Attention(maxlen_query)(lstm1)
72 | att_3 = Attention(maxlen_query)(gru1)
73 | cnn1 = cnn1d_layer(lstm1)
74 |
75 | avg_pool = GlobalAveragePooling1D()
76 | max_pool = GlobalMaxPooling1D()
77 |
78 | seq2 = Input(shape=(maxlen_answer,))
79 | x2 = emb_layer(seq2)
80 | x2 = sdrop(x2)
81 | lstm2 = lstm_layer(x2)
82 | gru2 = gru_layer(lstm2)
83 | att_2 = Attention(maxlen_answer)(lstm2)
84 | att_4 = Attention(maxlen_answer)(gru2)
85 | cnn2 = cnn1d_layer(lstm2)
86 |
87 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)])
88 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)])
89 |
90 | merge = Multiply()([x1, x2])
91 | merge = Dropout(0.2)(merge)
92 |
93 | hin = Input(shape=(19,))
94 | # htime = Dense(col_len,activation='relu')(hin)
95 | x = Concatenate()([merge,hin])
96 | # The MLP that determines the outcome
97 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x)
98 | # x = Dropout(0.2)(x)
99 | # x = BatchNormalization()(x)
100 |
101 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x)
102 | model = Model(inputs=[seq1,seq2,hin], outputs=pred)
103 | model.compile(loss='binary_crossentropy',
104 | optimizer=AdamW(lr=0.001,weight_decay=0.02,),
105 | metrics=["accuracy",auc])
106 | # model.summary()
107 | return model
108 | ```
109 |
110 | * AdamW optimizer to speed up training
111 | * The recently released Lookahead optimizer (reference: Lookahead Optimizer: k steps forward, 1 step back, https://arxiv.org/abs/1907.08610)
112 | Lookahead clearly outperforms SGD and Adam by iteratively updating two sets of weights. Intuitively, it picks its search direction by looking ahead at the sequence of "fast weights" generated by an inner optimizer. The paper finds that Lookahead improves training stability, cuts down the tuning effort, and improves both convergence speed and final quality (see the sketch after this list).
113 | * Online result
114 | **lb 0.6214**
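The Lookahead implementation is not part of this repo; below is a minimal NumPy sketch of the update rule from the paper, with plain SGD as the inner ("fast") optimizer for illustration (in the solution it would wrap AdamW; all names here are illustrative).

```python
# Lookahead (arXiv:1907.08610): run the inner optimizer for k fast steps,
# then interpolate the slow weights toward the fast weights and restart.
import numpy as np

def lookahead_sgd(w0, grad_fn, lr=0.01, k=5, alpha=0.5, outer_steps=200):
    slow = np.array(w0, dtype=float)      # slow weights (phi)
    fast = slow.copy()                    # fast weights (theta)
    for _ in range(outer_steps):
        for _ in range(k):                # k steps of the inner optimizer
            fast -= lr * grad_fn(fast)
        slow += alpha * (fast - slow)     # look ahead: phi <- phi + alpha * (theta - phi)
        fast = slow.copy()                # restart the fast weights from the slow ones
    return slow

# Toy usage: minimize ||w||^2 (gradient is 2w); converges toward the zero vector.
print(lookahead_sgd(np.array([3.0, -2.0]), lambda w: 2.0 * w))
```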
115 | * **Fine-tuning (the highlight)**
116 |     * Motivation: the organizers provide one billion rows. Prior knowledge says more data means better results, so how can we make full use of it?
117 |     * Approach
118 |         * First train a bare NN (no hand-crafted features) on the full one billion rows and save its weights. (How can one billion rows be trained at all?)
119 |         > Stream the data from file and train in batches; peak memory while training on one billion rows is only about 10 GB (a condensed sketch of the generator is shown below)
120 |         * Load the bare NN, take the feature map from its second-to-last layer as an output, add the new FE features as an extra input, concatenate the base model's feature map with the FE features, and feed the result into fully connected layers. Fine-tune the whole network on the last 100 million rows.
121 |         (Once again showing the pivotal, unshakable role of pre-training in NLP.)
122 |
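A condensed sketch of the streaming trick referenced above, based on the `gen_train` generator in `bigtrain_*.py` (the function name and defaults here are simplified for illustration): the CSV is read line by line, tokenized and padded on the fly, and fixed-size batches are yielded to `fit_generator`, so memory stays bounded regardless of file size.

```python
# Condensed from gen_train in bigtrain_*.py: stream a labeled CSV and yield padded batches forever.
import numpy as np

def stream_batches(path, token, batch_size=4096, maxlen_query=8, maxlen_answer=20):
    while True:                                   # fit_generator expects an endless generator
        with open(path) as fin:
            batch_q, batch_a, batch_y = [], [], []
            for line in fin:
                _, q, _, a, label = line.strip().split(',')
                q_seq = [token.get(t, 0) for t in q.split()]
                a_seq = [token.get(t, 0) for t in a.split()]
                # left-pad with zeros / keep only the last maxlen tokens
                batch_q.append([0] * (maxlen_query - len(q_seq)) + q_seq[-maxlen_query:])
                batch_a.append([0] * (maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:])
                batch_y.append(int(label))
                if len(batch_q) == batch_size:
                    yield [np.array(batch_q), np.array(batch_a)], np.array(batch_y)
                    batch_q, batch_a, batch_y = [], [], []
```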
123 | * Models used for fine-tuning (all hyperparameters were scaled down because we only had a single-GPU machine; with multi-GPU training and larger settings a single model could probably reach 0.64+)
124 |     * 300-dim word2vec + Siamese RNN (small parameters) **lb 0.6248**
125 |     * 300-dim word2vec + ESIM (tiny parameters, for fear of not finishing in time) **lb 0.626**
126 |     * 100-dim fastText + ESIM (small parameters) **lb 0.6336, this single model alone would rank third on the A leaderboard**
127 | * Fine-tuning network structure
128 | ```python
129 | def aux_esim_model(embed_matrix,model_weight_path):
130 | base_model = esim(embed_matrix)
131 | base_model.load_weights(model_weight_path)
132 | input_q, input_a = base_model.inputs
133 | input_f = Input((19,))
134 | hidden_esim = base_model.get_layer(index=28).output
135 | merged = Concatenate()([hidden_esim, input_f])
136 | #dense = BatchNormalization()(merged)
137 | dense = Dense(512, activation='relu')(merged)
138 | #dense = BatchNormalization()(dense)
139 | dense = Dropout(0.5)(dense)
140 | dense = Dense(256, activation='relu')(dense)
141 | #dense = BatchNormalization()(dense)
142 | dense = Dropout(0.5)(dense)
143 | out_ = Dense(1, activation='sigmoid')(dense)
144 |
145 | model = Model(inputs=[input_q,input_a,input_f], outputs=out_)
146 | model.compile(loss='binary_crossentropy',
147 | optimizer=AdamW(lr=0.0003,weight_decay=0.02),
148 | metrics=["accuracy"])
149 | return model
150 | ```
151 | * ESIM network structure
152 | ```python
153 | def esim(embedding_matrix,
154 | maxlen=20,
155 | lstm_dim=64,
156 | dense_dim=128,
157 | dense_dropout=0.5):
158 | # Based on arXiv:1609.06038
159 | q1 = Input(name='q1', shape=(8,))
160 | q2 = Input(name='q2', shape=(20,))
161 |
162 | # Embedding
163 | embedding = create_pretrained_embedding(
164 | embedding_matrix, mask_zero=False)
165 | bn = BatchNormalization(axis=2)
166 | q1_embed = bn(embedding(q1))
167 | q2_embed = bn(embedding(q2))
168 |
169 | # Encode
170 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
171 | q1_encoded = encode(q1_embed)
172 | q2_encoded = encode(q2_embed)
173 |
174 | # Attention
175 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
176 |
177 | # Compose
178 | q1_combined = Concatenate()(
179 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
180 | q2_combined = Concatenate()(
181 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
182 |
183 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
184 | q1_compare = compose(q1_combined)
185 | q2_compare = compose(q2_combined)
186 |
187 | # Aggregate
188 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
189 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
190 |
191 |
192 | merged = Concatenate()([q1_rep, q2_rep])
193 |
194 | dense = BatchNormalization()(merged)
195 | dense = Dense(dense_dim, activation='elu')(dense)
196 | dense = BatchNormalization()(dense)
197 | dense = Dropout(dense_dropout) (dense)
198 | dense = Dense(dense_dim, activation='elu')(dense)
199 | dense = BatchNormalization()(dense)
200 | dense = Dropout(dense_dropout)(dense)
201 | out_ = Dense(1, activation='sigmoid')(dense)
202 |
203 | model = Model(inputs=[q1, q2], outputs=out_)
204 | model.compile(loss='binary_crossentropy',
205 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,),
206 | metrics=["accuracy",auc])
207 | return model
208 | ```
209 |
210 |
211 | #### Online submissions
212 | * finetuning_fasttext_esim(**0.6336**)*0.6+\
213 | finetuning_w2v_esim(**0.626**)*0.2+\
214 | finetuning_w2v_rnn(**0.6248**)*0.2=**lb 0.6366**
215 |
216 |
217 | * finetuning_fasttext_esim(**0.6336**)*0.5+\
218 | finetuning_w2v_esim(**0.626**)*0.2+\
219 | finetuning_w2v_rnn(**0.6248**)*0.2+\
220 | Siamese RNN(**0.6214**)*0.1=ensemble_NN
221 |
222 | lgb(**0.597**)*0.1+ensemble_NN*0.9= **lb 0.6371**
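For reference, a minimal sketch of the weighted blend above (file names are assumptions; each prediction file is assumed to hold query_id, query_title_id, prob in submission order):

```python
# Hypothetical blending script reproducing the weights described above.
import pandas as pd

cols = ['query_id', 'query_title_id', 'prob']
ft_esim  = pd.read_csv('finetuning_fasttext_esim.csv', names=cols)   # lb 0.6336
w2v_esim = pd.read_csv('finetuning_w2v_esim.csv', names=cols)        # lb 0.626
w2v_rnn  = pd.read_csv('finetuning_w2v_rnn.csv', names=cols)         # lb 0.6248
rnn      = pd.read_csv('submit_rnn.csv', names=cols)                 # lb 0.6214
lgb_sub  = pd.read_csv('submit_lgb.csv', names=cols)                 # lb 0.597

ensemble_nn = (0.5 * ft_esim['prob'] + 0.2 * w2v_esim['prob']
               + 0.2 * w2v_rnn['prob'] + 0.1 * rnn['prob'])
final = ft_esim[['query_id', 'query_title_id']].copy()
final['prediction'] = 0.1 * lgb_sub['prob'] + 0.9 * ensemble_nn      # lb 0.6371
final.to_csv('submit_final.csv', index=False, header=False)
```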
223 |
224 |
225 |
226 |
227 | #### Our advantages
228 | * Industrially deployable
229 | > Real production systems also face huge data volumes, and making full use of the data is hard. Our approach scales to large data (streaming training over the full data with a small memory footprint, plus effective transfer learning via fine-tuning).
230 |
231 | * Simple and practical
232 | > We use only 19 features in total and do not need to extract a large number of hand-crafted features, so the solution does not really depend on the LGB model. An LGB model needs the full feature matrix: either features are extracted from only a small subset of the data, or they simply cannot be extracted at full scale, which makes iteration hard. Our streaming pipeline is easy to iterate and update.
233 |
234 |
235 |
236 |
237 |
238 |
239 |
--------------------------------------------------------------------------------
/bigtrain_fasttext_esim.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import random as rn
6 | from tqdm import tqdm, tqdm_notebook
7 | import tensorflow as tf
8 | from sklearn.metrics import roc_auc_score
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 | from keras.optimizers import Adam
12 | from keras import backend as K
13 | from keras.optimizers import *
14 | from keras.callbacks import *
15 | from keras.layers import *
16 | from keras.models import *
17 | from keras.engine.topology import Layer
18 | from keras import initializers, regularizers, constraints, optimizers, layers
19 | from keras.initializers import *
20 | import keras
21 | from sklearn.model_selection import StratifiedKFold, GroupKFold
22 | import gc
23 | import time
24 | from gensim.models import Word2Vec
25 | import logging
26 | import Levenshtein
27 | import fasttext
28 | tqdm.pandas()
29 | np.random.seed(1017)
30 | rn.seed(1017)
31 | tf.set_random_seed(1017)
32 | path = "/home/kesci/input/bytedance/"
33 | out = '/home/kesci/work/zhifeng/'
34 | print(os.listdir(path))
35 |
36 | w2v = fasttext.load_model(out+'corpus.fasttext.model')
37 | word2index = {word: index+1 for index, word in enumerate(w2v.words)}
38 | index2word = {index+1: word for index, word in enumerate(w2v.words)}
39 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
40 | maxlen_query=8):
41 | if label_tag:
42 | _, _q, _, _a, _label = line.strip().split(',')
43 | else:
44 | _, _q, _, _a = line.strip().split(',')
45 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
46 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
47 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
48 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
49 | if label_tag:
50 | return q_pad, a_pad, int(_label)
51 | return q_pad, a_pad
52 |
53 |
54 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
55 | while True:
56 | fin = open(path, 'r')
57 | batch_q, batch_a, batch_label = [], [], []
58 | for line in fin:
59 | if len(batch_q) == chunk_size*batch_size:
60 | batch_q = np.array(batch_q)
61 | batch_a = np.array(batch_a)
62 | if label_tag:
63 | batch_label = np.array(batch_label)
64 | idx = list(range(chunk_size*batch_size))
65 | if shuffle:
66 | np.random.shuffle(idx)
67 | for i in range(chunk_size):
68 | if label_tag:
69 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
70 | else:
71 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
72 | batch_q, batch_a, batch_label = [], [], []
73 | if label_tag:
74 | q, a, l = gen_feature_help(line, label_tag=label_tag)
75 | else:
76 | q, a = gen_feature_help(line, label_tag=label_tag)
77 | l = 0
78 | batch_q.append(q)
79 | batch_a.append(a)
80 | if label_tag:
81 | batch_label.append(l)
82 |
83 | batch_q = np.array(batch_q)
84 | batch_a = np.array(batch_a)
85 |
86 | if label_tag:
87 | batch_label = np.array(batch_label)
88 | idx = list(range(len(batch_q)))
89 | if shuffle:
90 | np.random.shuffle(idx)
91 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
92 | if label_tag:
93 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
94 | else:
95 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
96 | fin.close()
97 |
98 |
99 | def get_embedding_matrix():
100 | m = np.zeros(shape=(len(index2word)+1, 100))
101 | for i, w in index2word.items():
102 | m[i, :] = w2v[w]
103 | return m
104 |
105 |
106 | embed_matrix = get_embedding_matrix()
107 | maxlen_query = 8
108 | maxlen_answer = 20
109 |
110 |
111 | class AdamW(Optimizer):
112 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
113 | epsilon=1e-8, decay=0., **kwargs):
114 | super(AdamW, self).__init__(**kwargs)
115 | with K.name_scope(self.__class__.__name__):
116 | self.iterations = K.variable(0, dtype='int64', name='iterations')
117 | self.lr = K.variable(lr, name='lr')
118 | self.beta_1 = K.variable(beta_1, name='beta_1')
119 | self.beta_2 = K.variable(beta_2, name='beta_2')
120 | self.decay = K.variable(decay, name='decay')
121 | # decoupled weight decay (2/4)
122 | self.wd = K.variable(weight_decay, name='weight_decay')
123 | self.epsilon = epsilon
124 | self.initial_decay = decay
125 |
126 | @interfaces.legacy_get_updates_support
127 | def get_updates(self, loss, params):
128 | grads = self.get_gradients(loss, params)
129 | self.updates = [K.update_add(self.iterations, 1)]
130 | wd = self.wd # decoupled weight decay (3/4)
131 |
132 | lr = self.lr
133 | if self.initial_decay > 0:
134 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
135 | K.dtype(self.decay))))
136 |
137 | t = K.cast(self.iterations, K.floatx()) + 1
138 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
139 | (1. - K.pow(self.beta_1, t)))
140 |
141 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
142 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
143 | self.weights = [self.iterations] + ms + vs
144 |
145 | for p, g, m, v in zip(params, grads, ms, vs):
146 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
147 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
148 | # decoupled weight decay (4/4)
149 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
150 |
151 | self.updates.append(K.update(m, m_t))
152 | self.updates.append(K.update(v, v_t))
153 | new_p = p_t
154 |
155 | # Apply constraints.
156 | if getattr(p, 'constraint', None) is not None:
157 | new_p = p.constraint(new_p)
158 |
159 | self.updates.append(K.update(p, new_p))
160 | return self.updates
161 |
162 | def get_config(self):
163 | config = {'lr': float(K.get_value(self.lr)),
164 | 'beta_1': float(K.get_value(self.beta_1)),
165 | 'beta_2': float(K.get_value(self.beta_2)),
166 | 'decay': float(K.get_value(self.decay)),
167 | 'weight_decay': float(K.get_value(self.wd)),
168 | 'epsilon': self.epsilon}
169 | base_config = super(AdamW, self).get_config()
170 | return dict(list(base_config.items()) + list(config.items()))
171 |
172 |
173 | class Attention(Layer):
174 | def __init__(self, step_dim,
175 | W_regularizer=None, b_regularizer=None,
176 | W_constraint=None, b_constraint=None,
177 | bias=True, **kwargs):
178 | self.supports_masking = True
179 | self.init = initializers.get('glorot_uniform')
180 |
181 | self.W_regularizer = regularizers.get(W_regularizer)
182 | self.b_regularizer = regularizers.get(b_regularizer)
183 |
184 | self.W_constraint = constraints.get(W_constraint)
185 | self.b_constraint = constraints.get(b_constraint)
186 |
187 | self.bias = bias
188 | self.step_dim = step_dim
189 | self.features_dim = 0
190 | super(Attention, self).__init__(**kwargs)
191 |
192 | def build(self, input_shape):
193 | assert len(input_shape) == 3
194 |
195 | self.W = self.add_weight((input_shape[-1],),
196 | initializer=self.init,
197 | name='{}_W'.format(self.name),
198 | regularizer=self.W_regularizer,
199 | constraint=self.W_constraint)
200 | self.features_dim = input_shape[-1]
201 |
202 | if self.bias:
203 | self.b = self.add_weight((input_shape[1],),
204 | initializer='zero',
205 | name='{}_b'.format(self.name),
206 | regularizer=self.b_regularizer,
207 | constraint=self.b_constraint)
208 | else:
209 | self.b = None
210 |
211 | self.built = True
212 |
213 | def compute_mask(self, input, input_mask=None):
214 | return None
215 |
216 | def call(self, x, mask=None):
217 | features_dim = self.features_dim
218 | step_dim = self.step_dim
219 |
220 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
221 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
222 |
223 | if self.bias:
224 | eij += self.b
225 |
226 | eij = K.tanh(eij)
227 |
228 | a = K.exp(eij)
229 |
230 | if mask is not None:
231 | a *= K.cast(mask, K.floatx())
232 |
233 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
234 |
235 | a = K.expand_dims(a)
236 | weighted_input = x * a
237 | return K.sum(weighted_input, axis=1)
238 |
239 | def compute_output_shape(self, input_shape):
240 | return input_shape[0], self.features_dim
241 |
242 | # AUC for a binary classifier
243 |
244 |
245 | def auc(y_true, y_pred):
246 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
247 | for k in np.linspace(0, 1, 1000)], axis=0)
248 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
249 | for k in np.linspace(0, 1, 1000)], axis=0)
250 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
251 | binSizes = -(pfas[1:]-pfas[:-1])
252 | s = ptas*binSizes
253 | return K.sum(s, axis=0)
254 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
255 | # PFA, prob false alert for binary classifier
256 |
257 |
258 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
259 | y_pred = K.cast(y_pred >= threshold, 'float32')
260 | # N = total number of negative labels
261 | N = K.sum(1 - y_true)
262 | # FP = total number of false alerts, alerts from the negative class labels
263 | FP = K.sum(y_pred - y_pred * y_true)
264 | return FP/N
265 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
266 | # P_TA prob true alerts for binary classifier
267 |
268 |
269 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
270 | y_pred = K.cast(y_pred >= threshold, 'float32')
271 | # P = total number of positive labels
272 | P = K.sum(y_true)
273 | # TP = total number of correct alerts, alerts from the positive class labels
274 | TP = K.sum(y_pred * y_true)
275 | return TP/P
276 |
277 |
278 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs):
279 | "Create embedding layer from a pretrained weights array"
280 | in_dim, out_dim = pretrained_weights.shape
281 | embedding = Embedding(in_dim, out_dim, weights=[
282 | pretrained_weights], trainable=False, **kwargs)
283 | return embedding
284 |
285 |
286 | def unchanged_shape(input_shape):
287 | "Function for Lambda layer"
288 | return input_shape
289 |
290 |
291 | def substract(input_1, input_2):
292 | "Substract element-wise"
293 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2)
294 | out_ = Add()([input_1, neg_input_2])
295 | return out_
296 |
297 |
298 | def submult(input_1, input_2):
299 | "Get multiplication and subtraction then concatenate results"
300 | mult = Multiply()([input_1, input_2])
301 | sub = substract(input_1, input_2)
302 | out_ = Concatenate()([sub, mult])
303 | return out_
304 |
305 |
306 | def apply_multiple(input_, layers):
307 | "Apply layers to input then concatenate result"
308 | if not len(layers) > 1:
309 | raise ValueError('Layers list should contain more than 1 layer')
310 | else:
311 | agg_ = []
312 | for layer in layers:
313 | agg_.append(layer(input_))
314 | out_ = Concatenate()(agg_)
315 | return out_
316 |
317 |
318 | def time_distributed(input_, layers):
319 | "Apply a list of layers in TimeDistributed mode"
320 | out_ = []
321 | node_ = input_
322 | for layer_ in layers:
323 | node_ = TimeDistributed(layer_)(node_)
324 | out_ = node_
325 | return out_
326 |
327 |
328 | def soft_attention_alignment(input_1, input_2):
329 | "Align text representation with neural soft attention"
330 | attention = Dot(axes=-1)([input_1, input_2])
331 | w_att_1 = Lambda(lambda x: softmax(x, axis=1),
332 | output_shape=unchanged_shape)(attention)
333 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
334 | output_shape=unchanged_shape)(attention))
335 | in1_aligned = Dot(axes=1)([w_att_1, input_1])
336 | in2_aligned = Dot(axes=1)([w_att_2, input_2])
337 | return in1_aligned, in2_aligned
338 |
339 |
340 | def decomposable_attention(pretrained_weights,
341 | num_shape,
342 | projection_dim=300, projection_hidden=0, projection_dropout=0.2,
343 | compare_dim=500, compare_dropout=0.2,
344 | dense_dim=300, dense_dropout=0.2,
345 | lr=1e-3, activation='elu', maxlen=20):
346 | # Based on: https://arxiv.org/abs/1606.01933
347 |
348 | q1 = Input(name='q1', shape=(maxlen,))
349 | q2 = Input(name='q2', shape=(maxlen,))
350 |
351 | # Embedding
352 | embedding = create_pretrained_embedding(pretrained_weights,
353 | mask_zero=False)
354 | q1_embed = embedding(q1)
355 | q2_embed = embedding(q2)
356 |
357 | # Projection
358 | projection_layers = []
359 | if projection_hidden > 0:
360 | projection_layers.extend([
361 | Dense(projection_hidden, activation=activation),
362 | Dropout(rate=projection_dropout),
363 | ])
364 | projection_layers.extend([
365 | Dense(projection_dim, activation=None),
366 | Dropout(rate=projection_dropout),
367 | ])
368 | q1_encoded = time_distributed(q1_embed, projection_layers)
369 | q2_encoded = time_distributed(q2_embed, projection_layers)
370 |
371 | # Attention
372 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
373 |
374 | # Compare
375 | q1_combined = Concatenate()(
376 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
377 | q2_combined = Concatenate()(
378 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
379 | compare_layers = [
380 | Dense(compare_dim, activation=activation),
381 | Dropout(compare_dropout),
382 | Dense(compare_dim, activation=activation),
383 | Dropout(compare_dropout),
384 | ]
385 | q1_compare = time_distributed(q1_combined, compare_layers)
386 | q2_compare = time_distributed(q2_combined, compare_layers)
387 |
388 | # Aggregate
389 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
390 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
391 |
392 | # Classifier
393 | merged = Concatenate()([q1_rep, q2_rep])
394 | dense = BatchNormalization()(merged)
395 | dense = Dense(dense_dim, activation=activation)(dense)
396 | dense = Dropout(dense_dropout)(dense)
397 | dense = BatchNormalization()(dense)
398 | dense = Dense(dense_dim, activation=activation)(dense)
399 | dense = Dropout(dense_dropout)(dense)
400 | out_ = Dense(1, activation='sigmoid')(dense)
401 |
402 | model = Model(inputs=[q1, q2], outputs=out_)
403 | model.compile(loss='binary_crossentropy',
404 | optimizer=AdamW(lr=0.001, weight_decay=0.02,),
405 | metrics=["accuracy", auc])
406 | return model
407 |
408 |
409 | def esim(embedding_matrix,
410 | maxlen=20,
411 | lstm_dim=30,
412 | dense_dim=30,
413 | dense_dropout=0.5):
414 | # Based on arXiv:1609.06038
415 | q1 = Input(name='q1', shape=(8,))
416 | q2 = Input(name='q2', shape=(20,))
417 |
418 | # Embedding
419 | embedding = create_pretrained_embedding(
420 | embedding_matrix, mask_zero=False)
421 | bn = BatchNormalization(axis=2)
422 | q1_embed = bn(embedding(q1))
423 | q2_embed = bn(embedding(q2))
424 |
425 | # Encode
426 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
427 | q1_encoded = encode(q1_embed)
428 | q2_encoded = encode(q2_embed)
429 |
430 | # Attention
431 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
432 |
433 | # Compose
434 | q1_combined = Concatenate()(
435 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
436 | q2_combined = Concatenate()(
437 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
438 |
439 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
440 | q1_compare = compose(q1_combined)
441 | q2_compare = compose(q2_combined)
442 |
443 | # Aggregate
444 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
445 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
446 |
447 | # leaks_input = Input(shape=(num_shape,))
448 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input)
449 |
450 | # Classifier
451 | merged = Concatenate()([q1_rep, q2_rep])
452 |
453 | dense = BatchNormalization()(merged)
454 | dense = Dense(dense_dim, activation='elu')(dense)
455 | dense = BatchNormalization()(dense)
456 | dense = Dropout(dense_dropout)(dense)
457 | dense = Dense(dense_dim, activation='elu')(dense)
458 | dense = BatchNormalization()(dense)
459 | dense = Dropout(dense_dropout)(dense)
460 | out_ = Dense(1, activation='sigmoid')(dense)
461 |
462 | model = Model(inputs=[q1, q2], outputs=out_)
463 | model.compile(loss='binary_crossentropy',
464 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,),
465 | metrics=["accuracy"])
466 | return model
467 |
468 |
469 | #### Model training
470 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv',
471 | batch_size=4096, label_tag=True, chunk_size=1000)
472 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
473 | batch_size=4096, label_tag=True, chunk_size=1000)
474 | print("train...")
475 | print("###"*30)
476 | gc.collect()
477 | K.clear_session()
478 | model = esim(embed_matrix)
479 | model.summary()
480 | early_stopping = EarlyStopping(
481 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
482 | reduce_lr = ReduceLROnPlateau(
483 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
484 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5'
485 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
486 | save_best_only=False,
487 | verbose=1, save_weights_only=True, period=1)
488 | callbacks = [checkpoint, reduce_lr, early_stopping]
489 | # print("load weight....")
490 |
491 |
492 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/4096)),  # steps match the generator batch_size of 4096
493 |                            epochs=10, verbose=1, callbacks=callbacks,
494 |                            validation_data=val_gen, validation_steps=int(
495 |                                np.ceil(1000000/4096)),
496 |                            max_queue_size=10, workers=1, use_multiprocessing=False)
497 |
498 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
499 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
500 | val_prob = model.predict_generator(
501 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
502 |
503 | f = open('/home/kesci/zhifeng/val.csv', 'r')
504 | q, a, l = [], [], []
505 | for line in f:
506 | qid, _, aid, _, label = line.strip().split(',')
507 | q.append(qid)
508 | a.append(aid)
509 | l.append(int(label))
510 |
511 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
512 | val_df['prob'] = val_prob.flatten()
513 |
514 | roc_auc_score(val_df['label'], val_df['prob'])
515 |
516 |
517 | def perauc(df):
518 | temp = pd.Series()
519 | try:
520 | temp['auc'] = roc_auc_score(df['label'], df['prob'])
521 | except:
522 | temp['auc'] = 0.5
523 | return temp
524 |
525 |
526 | eval_df = val_df.groupby("qid").apply(perauc)
527 | eval_df.index = range(len(eval_df))
528 | print("qauc:", eval_df['auc'].mean())
529 |
530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
531 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
532 | prob = model.predict_generator(
533 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
534 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
535 | names=['qid', 'aid', 'prob'])
536 | sub['prob'] = prob.flatten()
537 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv',
538 | index=False, header=False)
539 |
540 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
541 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
542 | prob = model.predict_generator(
543 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1)
544 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
545 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
546 | final['prob'] = prob.flatten()
547 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv',
548 | index=False, header=False)
549 |
--------------------------------------------------------------------------------
/bigtrain_w2v_esim.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import random as rn
6 | from tqdm import tqdm, tqdm_notebook
7 | import tensorflow as tf
8 | from sklearn.metrics import roc_auc_score
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 | from keras.optimizers import Adam
12 | from keras import backend as K
13 | from keras.optimizers import *
14 | from keras.callbacks import *
15 | from keras.layers import *
16 | from keras.models import *
17 | from keras.engine.topology import Layer
18 | from keras import initializers, regularizers, constraints, optimizers, layers
19 | from keras.initializers import *
20 | import keras
21 | from sklearn.model_selection import StratifiedKFold, GroupKFold
22 | import gc
23 | import time
24 | from gensim.models import Word2Vec
25 | import logging
26 | import Levenshtein
27 | import fasttext
28 | tqdm.pandas()
29 | np.random.seed(1017)
30 | rn.seed(1017)
31 | tf.set_random_seed(1017)
32 | path = "/home/kesci/input/bytedance/"
33 | out = '/home/kesci/work/zhifeng/'
34 | print(os.listdir(path))
35 |
36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model')
37 |
38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)}
39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)}
40 |
41 |
42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
43 | maxlen_query=8):
44 | if label_tag:
45 | _, _q, _, _a, _label = line.strip().split(',')
46 | else:
47 | _, _q, _, _a = line.strip().split(',')
48 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
49 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
52 | if label_tag:
53 | return q_pad, a_pad, int(_label)
54 | return q_pad, a_pad
55 |
56 |
57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
58 | while True:
59 | fin = open(path, 'r')
60 | batch_q, batch_a, batch_label = [], [], []
61 | for line in fin:
62 | if len(batch_q) == chunk_size*batch_size:
63 | batch_q = np.array(batch_q)
64 | batch_a = np.array(batch_a)
65 | if label_tag:
66 | batch_label = np.array(batch_label)
67 | idx = list(range(chunk_size*batch_size))
68 | if shuffle:
69 | np.random.shuffle(idx)
70 | for i in range(chunk_size):
71 | if label_tag:
72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
73 | else:
74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
75 | batch_q, batch_a, batch_label = [], [], []
76 | if label_tag:
77 | q, a, l = gen_feature_help(line, label_tag=label_tag)
78 | else:
79 | q, a = gen_feature_help(line, label_tag=label_tag)
80 | l = 0
81 | batch_q.append(q)
82 | batch_a.append(a)
83 | if label_tag:
84 | batch_label.append(l)
85 |
86 | batch_q = np.array(batch_q)
87 | batch_a = np.array(batch_a)
88 |
89 | if label_tag:
90 | batch_label = np.array(batch_label)
91 | idx = list(range(len(batch_q)))
92 | if shuffle:
93 | np.random.shuffle(idx)
94 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
95 | if label_tag:
96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
97 | else:
98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
99 | fin.close()
100 |
101 |
102 | def get_embedding_matrix():
103 | m = np.zeros(shape=(len(index2word)+1, 300))
104 | for i, w in index2word.items():
105 | m[i, :] = w2v[w]
106 | return m
107 |
108 |
109 | embed_matrix = get_embedding_matrix()
110 | maxlen_query = 8
111 | maxlen_answer = 20
112 |
113 |
114 | class AdamW(Optimizer):
115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
116 | epsilon=1e-8, decay=0., **kwargs):
117 | super(AdamW, self).__init__(**kwargs)
118 | with K.name_scope(self.__class__.__name__):
119 | self.iterations = K.variable(0, dtype='int64', name='iterations')
120 | self.lr = K.variable(lr, name='lr')
121 | self.beta_1 = K.variable(beta_1, name='beta_1')
122 | self.beta_2 = K.variable(beta_2, name='beta_2')
123 | self.decay = K.variable(decay, name='decay')
124 | # decoupled weight decay (2/4)
125 | self.wd = K.variable(weight_decay, name='weight_decay')
126 | self.epsilon = epsilon
127 | self.initial_decay = decay
128 |
129 | @interfaces.legacy_get_updates_support
130 | def get_updates(self, loss, params):
131 | grads = self.get_gradients(loss, params)
132 | self.updates = [K.update_add(self.iterations, 1)]
133 | wd = self.wd # decoupled weight decay (3/4)
134 |
135 | lr = self.lr
136 | if self.initial_decay > 0:
137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
138 | K.dtype(self.decay))))
139 |
140 | t = K.cast(self.iterations, K.floatx()) + 1
141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
142 | (1. - K.pow(self.beta_1, t)))
143 |
144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
146 | self.weights = [self.iterations] + ms + vs
147 |
148 | for p, g, m, v in zip(params, grads, ms, vs):
149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
151 | # decoupled weight decay (4/4)
152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
153 |
154 | self.updates.append(K.update(m, m_t))
155 | self.updates.append(K.update(v, v_t))
156 | new_p = p_t
157 |
158 | # Apply constraints.
159 | if getattr(p, 'constraint', None) is not None:
160 | new_p = p.constraint(new_p)
161 |
162 | self.updates.append(K.update(p, new_p))
163 | return self.updates
164 |
165 | def get_config(self):
166 | config = {'lr': float(K.get_value(self.lr)),
167 | 'beta_1': float(K.get_value(self.beta_1)),
168 | 'beta_2': float(K.get_value(self.beta_2)),
169 | 'decay': float(K.get_value(self.decay)),
170 | 'weight_decay': float(K.get_value(self.wd)),
171 | 'epsilon': self.epsilon}
172 | base_config = super(AdamW, self).get_config()
173 | return dict(list(base_config.items()) + list(config.items()))
174 |
175 |
176 | class Attention(Layer):
177 | def __init__(self, step_dim,
178 | W_regularizer=None, b_regularizer=None,
179 | W_constraint=None, b_constraint=None,
180 | bias=True, **kwargs):
181 | self.supports_masking = True
182 | self.init = initializers.get('glorot_uniform')
183 |
184 | self.W_regularizer = regularizers.get(W_regularizer)
185 | self.b_regularizer = regularizers.get(b_regularizer)
186 |
187 | self.W_constraint = constraints.get(W_constraint)
188 | self.b_constraint = constraints.get(b_constraint)
189 |
190 | self.bias = bias
191 | self.step_dim = step_dim
192 | self.features_dim = 0
193 | super(Attention, self).__init__(**kwargs)
194 |
195 | def build(self, input_shape):
196 | assert len(input_shape) == 3
197 |
198 | self.W = self.add_weight((input_shape[-1],),
199 | initializer=self.init,
200 | name='{}_W'.format(self.name),
201 | regularizer=self.W_regularizer,
202 | constraint=self.W_constraint)
203 | self.features_dim = input_shape[-1]
204 |
205 | if self.bias:
206 | self.b = self.add_weight((input_shape[1],),
207 | initializer='zero',
208 | name='{}_b'.format(self.name),
209 | regularizer=self.b_regularizer,
210 | constraint=self.b_constraint)
211 | else:
212 | self.b = None
213 |
214 | self.built = True
215 |
216 | def compute_mask(self, input, input_mask=None):
217 | return None
218 |
219 | def call(self, x, mask=None):
220 | features_dim = self.features_dim
221 | step_dim = self.step_dim
222 |
223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
225 |
226 | if self.bias:
227 | eij += self.b
228 |
229 | eij = K.tanh(eij)
230 |
231 | a = K.exp(eij)
232 |
233 | if mask is not None:
234 | a *= K.cast(mask, K.floatx())
235 |
236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
237 |
238 | a = K.expand_dims(a)
239 | weighted_input = x * a
240 | return K.sum(weighted_input, axis=1)
241 |
242 | def compute_output_shape(self, input_shape):
243 | return input_shape[0], self.features_dim
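# The Attention layer above scores each timestep with tanh(x_t . W + b_t),
# turns the scores into weights with a masked softmax, and returns the
# weighted sum over timesteps, collapsing (batch, steps, features) down to
# (batch, features).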
244 |
245 | # AUC for a binary classifier
246 |
247 |
248 | def auc(y_true, y_pred):
249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
250 | for k in np.linspace(0, 1, 1000)], axis=0)
251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
252 | for k in np.linspace(0, 1, 1000)], axis=0)
253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
254 | binSizes = -(pfas[1:]-pfas[:-1])
255 | s = ptas*binSizes
256 | return K.sum(s, axis=0)
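# auc() approximates ROC-AUC inside the Keras graph: it evaluates TPR and FPR
# at 1000 fixed thresholds and integrates TPR over the FPR bins (a Riemann
# sum). It is only a training-time proxy; the exact score is still computed
# offline with sklearn's roc_auc_score further below.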
257 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
258 | # PFA, prob false alert for binary classifier
259 |
260 |
261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
262 | y_pred = K.cast(y_pred >= threshold, 'float32')
263 | # N = total number of negative labels
264 | N = K.sum(1 - y_true)
265 | # FP = total number of false alerts, alerts from the negative class labels
266 | FP = K.sum(y_pred - y_pred * y_true)
267 | return FP/N
268 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
269 | # P_TA prob true alerts for binary classifier
270 |
271 |
272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
273 | y_pred = K.cast(y_pred >= threshold, 'float32')
274 | # P = total number of positive labels
275 | P = K.sum(y_true)
276 | # TP = total number of correct alerts, alerts from the positive class labels
277 | TP = K.sum(y_pred * y_true)
278 | return TP/P
279 |
280 |
281 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs):
282 | "Create embedding layer from a pretrained weights array"
283 | in_dim, out_dim = pretrained_weights.shape
284 | embedding = Embedding(in_dim, out_dim, weights=[
285 |                           pretrained_weights], trainable=trainable, **kwargs)
286 | return embedding
287 |
288 |
289 | def unchanged_shape(input_shape):
290 | "Function for Lambda layer"
291 | return input_shape
292 |
293 |
294 | def substract(input_1, input_2):
295 |     "Subtract element-wise"
296 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2)
297 | out_ = Add()([input_1, neg_input_2])
298 | return out_
299 |
300 |
301 | def submult(input_1, input_2):
302 | "Get multiplication and subtraction then concatenate results"
303 | mult = Multiply()([input_1, input_2])
304 | sub = substract(input_1, input_2)
305 | out_ = Concatenate()([sub, mult])
306 | return out_
307 |
308 |
309 | def apply_multiple(input_, layers):
310 | "Apply layers to input then concatenate result"
311 | if not len(layers) > 1:
312 | raise ValueError('Layers list should contain more than 1 layer')
313 | else:
314 | agg_ = []
315 | for layer in layers:
316 | agg_.append(layer(input_))
317 | out_ = Concatenate()(agg_)
318 | return out_
319 |
320 |
321 | def time_distributed(input_, layers):
322 | "Apply a list of layers in TimeDistributed mode"
323 | out_ = []
324 | node_ = input_
325 | for layer_ in layers:
326 | node_ = TimeDistributed(layer_)(node_)
327 | out_ = node_
328 | return out_
329 |
330 |
331 | def soft_attention_alignment(input_1, input_2):
332 | "Align text representation with neural soft attention"
333 | attention = Dot(axes=-1)([input_1, input_2])
334 | w_att_1 = Lambda(lambda x: softmax(x, axis=1),
335 | output_shape=unchanged_shape)(attention)
336 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
337 | output_shape=unchanged_shape)(attention))
338 | in1_aligned = Dot(axes=1)([w_att_1, input_1])
339 | in2_aligned = Dot(axes=1)([w_att_2, input_2])
340 | return in1_aligned, in2_aligned
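# soft_attention_alignment computes a (len_1 x len_2) dot-product attention
# matrix between the two encoded sequences, normalizes it with a softmax along
# each axis, and returns each sequence re-expressed as an attention-weighted
# summary of the other, ready to be concatenated with the original encodings.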
341 |
342 |
343 | def decomposable_attention(pretrained_weights,
344 | num_shape,
345 | projection_dim=300, projection_hidden=0, projection_dropout=0.2,
346 | compare_dim=500, compare_dropout=0.2,
347 | dense_dim=300, dense_dropout=0.2,
348 | lr=1e-3, activation='elu', maxlen=20):
349 | # Based on: https://arxiv.org/abs/1606.01933
350 |
351 | q1 = Input(name='q1', shape=(maxlen,))
352 | q2 = Input(name='q2', shape=(maxlen,))
353 |
354 | # Embedding
355 | embedding = create_pretrained_embedding(pretrained_weights,
356 | mask_zero=False)
357 | q1_embed = embedding(q1)
358 | q2_embed = embedding(q2)
359 |
360 | # Projection
361 | projection_layers = []
362 | if projection_hidden > 0:
363 | projection_layers.extend([
364 | Dense(projection_hidden, activation=activation),
365 | Dropout(rate=projection_dropout),
366 | ])
367 | projection_layers.extend([
368 | Dense(projection_dim, activation=None),
369 | Dropout(rate=projection_dropout),
370 | ])
371 | q1_encoded = time_distributed(q1_embed, projection_layers)
372 | q2_encoded = time_distributed(q2_embed, projection_layers)
373 |
374 | # Attention
375 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
376 |
377 | # Compare
378 | q1_combined = Concatenate()(
379 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
380 | q2_combined = Concatenate()(
381 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
382 | compare_layers = [
383 | Dense(compare_dim, activation=activation),
384 | Dropout(compare_dropout),
385 | Dense(compare_dim, activation=activation),
386 | Dropout(compare_dropout),
387 | ]
388 | q1_compare = time_distributed(q1_combined, compare_layers)
389 | q2_compare = time_distributed(q2_combined, compare_layers)
390 |
391 | # Aggregate
392 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
393 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
394 |
395 | # Classifier
396 | merged = Concatenate()([q1_rep, q2_rep])
397 | dense = BatchNormalization()(merged)
398 | dense = Dense(dense_dim, activation=activation)(dense)
399 | dense = Dropout(dense_dropout)(dense)
400 | dense = BatchNormalization()(dense)
401 | dense = Dense(dense_dim, activation=activation)(dense)
402 | dense = Dropout(dense_dropout)(dense)
403 | out_ = Dense(1, activation='sigmoid')(dense)
404 |
405 | model = Model(inputs=[q1, q2], outputs=out_)
406 | model.compile(loss='binary_crossentropy',
407 | optimizer=AdamW(lr=0.001, weight_decay=0.02,),
408 | metrics=["accuracy", auc])
409 | return model
410 |
411 |
412 | def esim(embedding_matrix,
413 | maxlen=20,
414 | lstm_dim=30,
415 | dense_dim=30,
416 | dense_dropout=0.5):
417 | # Based on arXiv:1609.06038
418 | q1 = Input(name='q1', shape=(8,))
419 | q2 = Input(name='q2', shape=(20,))
420 |
421 | # Embedding
422 | embedding = create_pretrained_embedding(
423 | embedding_matrix, mask_zero=False)
424 | bn = BatchNormalization(axis=2)
425 | q1_embed = bn(embedding(q1))
426 | q2_embed = bn(embedding(q2))
427 |
428 | # Encode
429 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
430 | q1_encoded = encode(q1_embed)
431 | q2_encoded = encode(q2_embed)
432 |
433 | # Attention
434 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
435 |
436 | # Compose
437 | q1_combined = Concatenate()(
438 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
439 | q2_combined = Concatenate()(
440 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
441 |
442 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
443 | q1_compare = compose(q1_combined)
444 | q2_compare = compose(q2_combined)
445 |
446 | # Aggregate
447 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
448 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
449 |
450 | # leaks_input = Input(shape=(num_shape,))
451 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input)
452 |
453 | # Classifier
454 | merged = Concatenate()([q1_rep, q2_rep])
455 |
456 | dense = BatchNormalization()(merged)
457 | dense = Dense(dense_dim, activation='elu')(dense)
458 | dense = BatchNormalization()(dense)
459 | dense = Dropout(dense_dropout)(dense)
460 | dense = Dense(dense_dim, activation='elu')(dense)
461 | dense = BatchNormalization()(dense)
462 | dense = Dropout(dense_dropout)(dense)
463 | out_ = Dense(1, activation='sigmoid')(dense)
464 |
465 | model = Model(inputs=[q1, q2], outputs=out_)
466 | model.compile(loss='binary_crossentropy',
467 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,),
468 | metrics=["accuracy"])
469 | return model
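# ESIM pipeline as wired above: shared pretrained embedding -> BatchNorm ->
# shared BiLSTM encoder -> soft attention alignment between query and title ->
# concatenate [encoded, aligned, sub/mul] features -> a second BiLSTM to
# compose them -> global avg/max pooling on both sides -> small MLP with a
# sigmoid output, trained with binary cross-entropy and AdamW.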
470 |
471 |
472 | #### Model training
473 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv',
474 | batch_size=4096, label_tag=True, chunk_size=1000)
475 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
476 | batch_size=4096, label_tag=True, chunk_size=1000)
477 | print("train...")
478 | print("###"*30)
479 | gc.collect()
480 | K.clear_session()
481 | model = esim(embed_matrix)
482 | model.summary()
483 | early_stopping = EarlyStopping(
484 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
485 | reduce_lr = ReduceLROnPlateau(
486 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
487 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5'
488 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
489 | save_best_only=False,
490 | verbose=1, save_weights_only=True, period=1)
491 | callbacks = [checkpoint, reduce_lr, early_stopping]
492 | # print("load weight....")
493 |
494 |
495 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)),
496 | epochs=10, verbose=1, callbacks=callbacks,
497 | validation_data=val_gen, validation_steps=int(
498 | np.ceil(1000000/2048)),
499 | max_queue_size=10, workers=1, use_multiprocessing=False)
500 |
501 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
502 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
503 | val_prob = model.predict_generator(
504 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
505 |
506 | f = open('/home/kesci/zhifeng/val.csv', 'r')
507 | q, a, l = [], [], []
508 | for line in f:
509 |     qid, _, aid, _, label = line.strip().split(',')
510 |     q.append(qid)
511 |     a.append(aid)
512 |     l.append(int(label))
513 |
514 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
515 | val_df['prob'] = val_prob.flatten()
516 |
517 | roc_auc_score(val_df['label'], val_df['prob'])
518 |
519 | def perauc(df):
520 |     temp = pd.Series()
521 |     try:
522 |         temp['auc'] = roc_auc_score(df['label'], df['prob'])
523 |     except:
524 |         temp['auc'] = 0.5
525 |     return temp
526 | eval_df = val_df.groupby("qid").apply(perauc)
527 | eval_df.index = range(len(eval_df))
528 | print("qauc:", eval_df['auc'].mean())
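# qauc is the per-query metric: predictions are grouped by query id and the
# ROC-AUC of each group is averaged. Queries whose validation labels are all
# one class cannot be scored and fall back to 0.5 in perauc above.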
529 |
530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
531 |                      batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
532 | prob = model.predict_generator(test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
533 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', names=['qid', 'aid', 'prob'])
534 | sub['prob'] = prob.flatten()
535 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv', index=False, header=False)
536 |
537 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
538 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
539 | prob = model.predict_generator(
540 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1)
541 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
542 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
543 | final['prob'] = prob.flatten()
544 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv',
545 | index=False, header=False)
546 |
--------------------------------------------------------------------------------
/bigtrain_w2v_rnn.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import random as rn
6 | from tqdm import tqdm, tqdm_notebook
7 | import tensorflow as tf
8 | from sklearn.metrics import roc_auc_score
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 | from keras.optimizers import Adam
12 | from keras import backend as K
13 | from keras.optimizers import *
14 | from keras.callbacks import *
15 | from keras.layers import *
16 | from keras.models import *
17 | from keras.engine.topology import Layer
18 | from keras import initializers, regularizers, constraints, optimizers, layers
19 | from keras.initializers import *
20 | import keras
21 | from sklearn.model_selection import StratifiedKFold, GroupKFold
22 | import gc
23 | import time
24 | from gensim.models import Word2Vec
25 | import logging
26 | import Levenshtein
27 | import fasttext
28 | tqdm.pandas()
29 | np.random.seed(1017)
30 | rn.seed(1017)
31 | tf.set_random_seed(1017)
32 | path = "/home/kesci/input/bytedance/"
33 | out = '/home/kesci/work/zhifeng/'
34 | print(os.listdir(path))
35 |
36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model')
37 |
38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)}
39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)}
40 |
41 |
42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
43 | maxlen_query=8):
44 | if label_tag:
45 | _, _q, _, _a, _label = line.strip().split(',')
46 | else:
47 | _, _q, _, _a = line.strip().split(',')
48 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
49 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
52 | if label_tag:
53 | return q_pad, a_pad, int(_label)
54 | return q_pad, a_pad
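# Padding is done on the left and long sequences keep only their last tokens.
# Hypothetical example: a 3-token query and a 5-token title become
#   q_pad = [0]*5  + [i1, i2, i3]            (maxlen_query  = 8)
#   a_pad = [0]*15 + [j1, j2, j3, j4, j5]    (maxlen_answer = 20)
# where i*/j* are word2index ids and tokens missing from the vocabulary map to 0.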
55 |
56 |
57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
58 | while True:
59 | fin = open(path, 'r')
60 | batch_q, batch_a, batch_label = [], [], []
61 | for line in fin:
62 | if len(batch_q) == chunk_size*batch_size:
63 | batch_q = np.array(batch_q)
64 | batch_a = np.array(batch_a)
65 | if label_tag:
66 | batch_label = np.array(batch_label)
67 | idx = list(range(chunk_size*batch_size))
68 | if shuffle:
69 | np.random.shuffle(idx)
70 | for i in range(chunk_size):
71 | if label_tag:
72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
73 | else:
74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
75 | batch_q, batch_a, batch_label = [], [], []
76 | if label_tag:
77 | q, a, l = gen_feature_help(line, label_tag=label_tag)
78 | else:
79 | q, a = gen_feature_help(line, label_tag=label_tag)
80 | l = 0
81 | batch_q.append(q)
82 | batch_a.append(a)
83 | if label_tag:
84 | batch_label.append(l)
85 |
86 | batch_q = np.array(batch_q)
87 | batch_a = np.array(batch_a)
88 |
89 | if label_tag:
90 | batch_label = np.array(batch_label)
91 | idx = list(range(len(batch_q)))
92 | if shuffle:
93 | np.random.shuffle(idx)
94 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
95 | if label_tag:
96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])
97 | else:
98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])]
99 | fin.close()
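# gen_train streams the CSV instead of loading it into memory: it buffers
# chunk_size*batch_size rows, shuffles indices inside that buffer (unless
# shuffle=False, as used for prediction with chunk_size=1), yields them as
# batch_size mini-batches, then refills. The trailing partial buffer is yielded
# the same way, and the outer `while True` restarts from the top of the file so
# fit_generator / predict_generator never exhaust the generator.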
100 |
101 |
102 | def get_embedding_matrix():
103 | m = np.zeros(shape=(len(index2word)+1, 300))
104 | for i, w in index2word.items():
105 | m[i, :] = w2v[w]
106 | return m
107 |
108 |
109 | embed_matrix = get_embedding_matrix()
110 | maxlen_query = 8
111 | maxlen_answer = 20
112 |
113 |
114 | class AdamW(Optimizer):
115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
116 | epsilon=1e-8, decay=0., **kwargs):
117 | super(AdamW, self).__init__(**kwargs)
118 | with K.name_scope(self.__class__.__name__):
119 | self.iterations = K.variable(0, dtype='int64', name='iterations')
120 | self.lr = K.variable(lr, name='lr')
121 | self.beta_1 = K.variable(beta_1, name='beta_1')
122 | self.beta_2 = K.variable(beta_2, name='beta_2')
123 | self.decay = K.variable(decay, name='decay')
124 | # decoupled weight decay (2/4)
125 | self.wd = K.variable(weight_decay, name='weight_decay')
126 | self.epsilon = epsilon
127 | self.initial_decay = decay
128 |
129 | @interfaces.legacy_get_updates_support
130 | def get_updates(self, loss, params):
131 | grads = self.get_gradients(loss, params)
132 | self.updates = [K.update_add(self.iterations, 1)]
133 | wd = self.wd # decoupled weight decay (3/4)
134 |
135 | lr = self.lr
136 | if self.initial_decay > 0:
137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
138 | K.dtype(self.decay))))
139 |
140 | t = K.cast(self.iterations, K.floatx()) + 1
141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
142 | (1. - K.pow(self.beta_1, t)))
143 |
144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
146 | self.weights = [self.iterations] + ms + vs
147 |
148 | for p, g, m, v in zip(params, grads, ms, vs):
149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
151 | # decoupled weight decay (4/4)
152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
153 |
154 | self.updates.append(K.update(m, m_t))
155 | self.updates.append(K.update(v, v_t))
156 | new_p = p_t
157 |
158 | # Apply constraints.
159 | if getattr(p, 'constraint', None) is not None:
160 | new_p = p.constraint(new_p)
161 |
162 | self.updates.append(K.update(p, new_p))
163 | return self.updates
164 |
165 | def get_config(self):
166 | config = {'lr': float(K.get_value(self.lr)),
167 | 'beta_1': float(K.get_value(self.beta_1)),
168 | 'beta_2': float(K.get_value(self.beta_2)),
169 | 'decay': float(K.get_value(self.decay)),
170 | 'weight_decay': float(K.get_value(self.wd)),
171 | 'epsilon': self.epsilon}
172 | base_config = super(AdamW, self).get_config()
173 | return dict(list(base_config.items()) + list(config.items()))
174 |
175 |
176 | class Attention(Layer):
177 | def __init__(self, step_dim,
178 | W_regularizer=None, b_regularizer=None,
179 | W_constraint=None, b_constraint=None,
180 | bias=True, **kwargs):
181 | self.supports_masking = True
182 | self.init = initializers.get('glorot_uniform')
183 |
184 | self.W_regularizer = regularizers.get(W_regularizer)
185 | self.b_regularizer = regularizers.get(b_regularizer)
186 |
187 | self.W_constraint = constraints.get(W_constraint)
188 | self.b_constraint = constraints.get(b_constraint)
189 |
190 | self.bias = bias
191 | self.step_dim = step_dim
192 | self.features_dim = 0
193 | super(Attention, self).__init__(**kwargs)
194 |
195 | def build(self, input_shape):
196 | assert len(input_shape) == 3
197 |
198 | self.W = self.add_weight((input_shape[-1],),
199 | initializer=self.init,
200 | name='{}_W'.format(self.name),
201 | regularizer=self.W_regularizer,
202 | constraint=self.W_constraint)
203 | self.features_dim = input_shape[-1]
204 |
205 | if self.bias:
206 | self.b = self.add_weight((input_shape[1],),
207 | initializer='zero',
208 | name='{}_b'.format(self.name),
209 | regularizer=self.b_regularizer,
210 | constraint=self.b_constraint)
211 | else:
212 | self.b = None
213 |
214 | self.built = True
215 |
216 | def compute_mask(self, input, input_mask=None):
217 | return None
218 |
219 | def call(self, x, mask=None):
220 | features_dim = self.features_dim
221 | step_dim = self.step_dim
222 |
223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
225 |
226 | if self.bias:
227 | eij += self.b
228 |
229 | eij = K.tanh(eij)
230 |
231 | a = K.exp(eij)
232 |
233 | if mask is not None:
234 | a *= K.cast(mask, K.floatx())
235 |
236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
237 |
238 | a = K.expand_dims(a)
239 | weighted_input = x * a
240 | return K.sum(weighted_input, axis=1)
241 |
242 | def compute_output_shape(self, input_shape):
243 | return input_shape[0], self.features_dim
244 |
245 | # AUC for a binary classifier
246 |
247 |
248 | def auc(y_true, y_pred):
249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
250 | for k in np.linspace(0, 1, 1000)], axis=0)
251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
252 | for k in np.linspace(0, 1, 1000)], axis=0)
253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
254 | binSizes = -(pfas[1:]-pfas[:-1])
255 | s = ptas*binSizes
256 | return K.sum(s, axis=0)
257 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
258 | # PFA, prob false alert for binary classifier
259 |
260 |
261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
262 | y_pred = K.cast(y_pred >= threshold, 'float32')
263 | # N = total number of negative labels
264 | N = K.sum(1 - y_true)
265 | # FP = total number of false alerts, alerts from the negative class labels
266 | FP = K.sum(y_pred - y_pred * y_true)
267 | return FP/N
268 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
269 | # P_TA prob true alerts for binary classifier
270 |
271 |
272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
273 | y_pred = K.cast(y_pred >= threshold, 'float32')
274 | # P = total number of positive labels
275 | P = K.sum(y_true)
276 | # TP = total number of correct alerts, alerts from the positive class labels
277 | TP = K.sum(y_pred * y_true)
278 | return TP/P
279 |
280 |
281 | def get_model(embedding_matrix):
282 |
283 | K.clear_session()
284 | #The embedding layer containing the word vectors
285 | emb_layer = Embedding(
286 | input_dim=embedding_matrix.shape[0],
287 | output_dim=embedding_matrix.shape[1],
288 | weights=[embedding_matrix],
289 | trainable=False
290 | )
291 | sdrop = SpatialDropout1D(rate=0.2)
292 | lstm_layer = Bidirectional(CuDNNLSTM(40, return_sequences=True,
293 | kernel_initializer=glorot_uniform(seed=123)))
294 | gru_layer = Bidirectional(CuDNNGRU(40, return_sequences=True,
295 | kernel_initializer=glorot_uniform(seed=123)))
296 |
297 | cnn1d_layer = keras.layers.Conv1D(
298 | 40, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
299 |
300 | # Define inputs
301 | seq1 = Input(shape=(maxlen_query,))
302 | x1 = emb_layer(seq1)
303 | x1 = sdrop(x1)
304 | lstm1 = lstm_layer(x1)
305 | gru1 = gru_layer(lstm1)
306 | att_1 = Attention(maxlen_query)(lstm1)
307 | att_3 = Attention(maxlen_query)(gru1)
308 | cnn1 = cnn1d_layer(lstm1)
309 |
310 | avg_pool = GlobalAveragePooling1D()
311 | max_pool = GlobalMaxPooling1D()
312 |
313 | seq2 = Input(shape=(maxlen_answer,))
314 | x2 = emb_layer(seq2)
315 | x2 = sdrop(x2)
316 | lstm2 = lstm_layer(x2)
317 | gru2 = gru_layer(lstm2)
318 | att_2 = Attention(maxlen_answer)(lstm2)
319 | att_4 = Attention(maxlen_answer)(gru2)
320 | cnn2 = cnn1d_layer(lstm2)
321 |
322 | x1 = concatenate([att_1, att_3, avg_pool(cnn1), max_pool(
323 | cnn1), avg_pool(gru1), max_pool(gru1)])
324 | x2 = concatenate([att_2, att_4, avg_pool(cnn2), max_pool(
325 | cnn2), avg_pool(gru2), max_pool(gru2)])
326 |
327 | merge = Multiply()([x1, x2])
328 | merge = Dropout(0.2)(merge)
329 |
330 | # htime = Dense(col_len,activation='relu')(hin)
331 | # The MLP that determines the outcome
332 | x = Dense(40, kernel_initializer=he_uniform(
333 | seed=123), activation='relu',)(merge)
334 | # x = Dropout(0.2)(x)
335 | # x = BatchNormalization()(x)
336 |
337 | pred = Dense(1, kernel_initializer=he_uniform(
338 | seed=123), activation='sigmoid')(x)
339 |
340 | model = Model(inputs=[seq1, seq2], outputs=pred)
341 |
342 | model.compile(loss='binary_crossentropy',
343 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,),
344 | metrics=["accuracy"])
345 | # model.summary()
346 | return model
347 |
348 | #### Model training
349 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv',
350 | batch_size=4096, label_tag=True, chunk_size=1000)
351 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
352 | batch_size=4096, label_tag=True, chunk_size=1000)
353 | print("train...")
354 | print("###"*30)
355 | gc.collect()
356 | K.clear_session()
357 | model = get_model(embed_matrix)
358 | model.summary()
359 | early_stopping = EarlyStopping(
360 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
361 | reduce_lr = ReduceLROnPlateau(
362 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
363 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5'
364 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
365 | save_best_only=False,
366 | verbose=1, save_weights_only=True, period=1)
367 | callbacks = [checkpoint, reduce_lr, early_stopping]
368 | # print("load weight....")
369 |
370 |
371 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)),
372 | epochs=10, verbose=1, callbacks=callbacks,
373 | validation_data=val_gen, validation_steps=int(
374 | np.ceil(1000000/2048)),
375 | max_queue_size=10, workers=1, use_multiprocessing=False)
376 |
377 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',
378 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
379 | val_prob = model.predict_generator(
380 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
381 |
382 | f = open('/home/kesci/zhifeng/val.csv', 'r')
383 | q, a, l = [], [], []
384 | for line in f:
385 | qid, _, aid, _, label = line.strip().split(',')
386 | q.append(qid)
387 | a.append(aid)
388 | l.append(int(label))
389 |
390 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
391 | val_df['prob'] = val_prob.flatten()
392 |
393 | roc_auc_score(val_df['label'], val_df['prob'])
394 |
395 |
396 | def perauc(df):
397 | temp = pd.Series()
398 | try:
399 | temp['auc'] = roc_auc_score(df['label'], df['prob'])
400 | except:
401 | temp['auc'] = 0.5
402 | return temp
403 |
404 |
405 | eval_df = val_df.groupby("qid").apply(perauc)
406 | eval_df.index = range(len(eval_df))
407 | print("qauc:", eval_df['auc'].mean())
408 |
409 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
410 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
411 | prob = model.predict_generator(
412 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
413 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
414 | names=['qid', 'aid', 'prob'])
415 | sub['prob'] = prob.flatten()
416 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv',
417 | index=False, header=False)
418 |
419 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
420 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
421 | prob = model.predict_generator(
422 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1)
423 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
424 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
425 | final['prob'] = prob.flatten()
426 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv',
427 | index=False, header=False)
428 |
--------------------------------------------------------------------------------
/chizhu_rnn.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import random as rn
6 | from tqdm import tqdm, tqdm_notebook
7 | import tensorflow as tf
8 | from sklearn.metrics import roc_auc_score
9 | from keras.preprocessing.text import Tokenizer
10 | from keras.preprocessing.sequence import pad_sequences
11 | from keras.optimizers import Adam
12 | from keras import backend as K
13 | from keras.optimizers import *
14 | from keras.callbacks import *
15 | from keras.layers import *
16 | from keras.models import *
17 | from keras.engine.topology import Layer
18 | from keras import initializers, regularizers, constraints, optimizers, layers
19 | from keras.initializers import *
20 | import keras
21 | from sklearn.model_selection import StratifiedKFold, GroupKFold
22 | import gc
23 | import time
24 | from gensim.models import Word2Vec
25 | import logging
26 | import Levenshtein
27 | tqdm.pandas()
28 | np.random.seed(1017)
29 | rn.seed(1017)
30 | tf.set_random_seed(1017)
31 | path = "/home/kesci/input/bytedance/"
32 | out = '/home/kesci/work/chizhu/'
33 | print(os.listdir(path))
34 |
35 | f1 = pd.read_csv(out + 'f1.csv')
36 | f2 = pd.read_csv(out + 'f2.csv')
37 | f3 = pd.read_csv(out + 'f3.csv')
38 | feature = pd.concat([f1, f2, f3], sort=False, axis=1)
39 | del f1, f2, f3
40 | gc.collect()
41 |
42 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl")
43 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl")
44 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl")
45 | testb_w2v = pd.read_pickle(
46 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl")
47 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v)
48 |
49 | train_w2v = pd.read_pickle(
50 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl")
51 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl")
52 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl")
53 | testb_w2v = pd.read_pickle(
54 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl")
55 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \
56 | list(testa_w2v)+list(testb_w2v)
57 | del train_w2v, val_w2v, testa_w2v, testb_w2v
58 | gc.collect()
59 | feature.shape
60 |
61 | len_train = 99000000
62 | len_val = 1000000
63 | len_testa = 20000000
64 | len_testb = 100000000
65 | sc = StandardScaler()
66 | feature = sc.fit_transform(feature)
67 | train_feature = feature[:len_train]
68 | val_feature = feature[len_train:len_train+len_val]
69 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa]
70 | testb_feature = feature[-len_testb:]
71 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape)
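# The stacked feature matrix is ordered train -> val -> test A -> test B, so the
# fixed lengths above slice each split back out after the StandardScaler fit.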
72 |
73 | del feature
74 | gc.collect()
75 |
76 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model')
77 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)}
78 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)}
79 |
80 |
81 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
82 | maxlen_query=8):
83 | if label_tag:
84 | _, _q, _, _a, _label = line.strip().split(',')
85 | else:
86 | _, _q, _, _a = line.strip().split(',')
87 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
88 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
89 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
90 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
91 | if label_tag:
92 | return q_pad, a_pad, int(_label)
93 | return q_pad, a_pad
94 |
95 |
96 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
97 | while True:
98 | fin = open(path, 'r')
99 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
100 | for i, line in enumerate(fin):
101 | if len(batch_q) == chunk_size*batch_size:
102 | batch_q = np.array(batch_q)
103 | batch_a = np.array(batch_a)
104 | batch_f = np.array(batch_f)
105 | if label_tag:
106 | batch_label = np.array(batch_label)
107 | idx = list(range(chunk_size*batch_size))
108 | if shuffle:
109 | np.random.shuffle(idx)
110 | for i in range(chunk_size):
111 | if label_tag:
112 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
113 | np.array(
114 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
115 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
116 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
117 | else:
118 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
119 | np.array(
120 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
121 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
122 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
123 | if label_tag:
124 | q, a, l = gen_feature_help(line, label_tag=label_tag)
125 | else:
126 | q, a = gen_feature_help(line, label_tag=label_tag)
127 | l = 0
128 | batch_q.append(q)
129 | batch_a.append(a)
130 | batch_f.append(feature[i])
131 | if label_tag:
132 | batch_label.append(l)
133 |
134 | batch_q = np.array(batch_q)
135 | batch_a = np.array(batch_a)
136 | batch_f = np.array(batch_f)
137 |
138 | if label_tag:
139 | batch_label = np.array(batch_label)
140 | idx = list(range(len(batch_q)))
141 | if shuffle:
142 | np.random.shuffle(idx)
143 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
144 | if label_tag:
145 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
146 | np.array(
147 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
148 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
149 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
150 | else:
151 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
152 | np.array(
153 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
154 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
155 | fin.close()
156 |
157 |
158 | def get_embedding_matrix():
159 | m = np.zeros(shape=(len(index2word)+1, 300))
160 | for i, w in index2word.items():
161 | m[i, :] = w2v[w]
162 | return m
163 |
164 |
165 | embed_matrix = get_embedding_matrix()
166 | maxlen_query = 8
167 | maxlen_answer = 20
168 |
169 |
170 | class AdamW(Optimizer):
171 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
172 | epsilon=1e-8, decay=0., **kwargs):
173 | super(AdamW, self).__init__(**kwargs)
174 | with K.name_scope(self.__class__.__name__):
175 | self.iterations = K.variable(0, dtype='int64', name='iterations')
176 | self.lr = K.variable(lr, name='lr')
177 | self.beta_1 = K.variable(beta_1, name='beta_1')
178 | self.beta_2 = K.variable(beta_2, name='beta_2')
179 | self.decay = K.variable(decay, name='decay')
180 | # decoupled weight decay (2/4)
181 | self.wd = K.variable(weight_decay, name='weight_decay')
182 | self.epsilon = epsilon
183 | self.initial_decay = decay
184 |
185 | @interfaces.legacy_get_updates_support
186 | def get_updates(self, loss, params):
187 | grads = self.get_gradients(loss, params)
188 | self.updates = [K.update_add(self.iterations, 1)]
189 | wd = self.wd # decoupled weight decay (3/4)
190 |
191 | lr = self.lr
192 | if self.initial_decay > 0:
193 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
194 | K.dtype(self.decay))))
195 |
196 | t = K.cast(self.iterations, K.floatx()) + 1
197 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
198 | (1. - K.pow(self.beta_1, t)))
199 |
200 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
201 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
202 | self.weights = [self.iterations] + ms + vs
203 |
204 | for p, g, m, v in zip(params, grads, ms, vs):
205 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
206 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
207 | # decoupled weight decay (4/4)
208 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
209 |
210 | self.updates.append(K.update(m, m_t))
211 | self.updates.append(K.update(v, v_t))
212 | new_p = p_t
213 |
214 | # Apply constraints.
215 | if getattr(p, 'constraint', None) is not None:
216 | new_p = p.constraint(new_p)
217 |
218 | self.updates.append(K.update(p, new_p))
219 | return self.updates
220 |
221 | def get_config(self):
222 | config = {'lr': float(K.get_value(self.lr)),
223 | 'beta_1': float(K.get_value(self.beta_1)),
224 | 'beta_2': float(K.get_value(self.beta_2)),
225 | 'decay': float(K.get_value(self.decay)),
226 | 'weight_decay': float(K.get_value(self.wd)),
227 | 'epsilon': self.epsilon}
228 | base_config = super(AdamW, self).get_config()
229 | return dict(list(base_config.items()) + list(config.items()))
230 |
231 |
232 | class Attention(Layer):
233 | def __init__(self, step_dim,
234 | W_regularizer=None, b_regularizer=None,
235 | W_constraint=None, b_constraint=None,
236 | bias=True, **kwargs):
237 | self.supports_masking = True
238 | self.init = initializers.get('glorot_uniform')
239 |
240 | self.W_regularizer = regularizers.get(W_regularizer)
241 | self.b_regularizer = regularizers.get(b_regularizer)
242 |
243 | self.W_constraint = constraints.get(W_constraint)
244 | self.b_constraint = constraints.get(b_constraint)
245 |
246 | self.bias = bias
247 | self.step_dim = step_dim
248 | self.features_dim = 0
249 | super(Attention, self).__init__(**kwargs)
250 |
251 | def build(self, input_shape):
252 | assert len(input_shape) == 3
253 |
254 | self.W = self.add_weight((input_shape[-1],),
255 | initializer=self.init,
256 | name='{}_W'.format(self.name),
257 | regularizer=self.W_regularizer,
258 | constraint=self.W_constraint)
259 | self.features_dim = input_shape[-1]
260 |
261 | if self.bias:
262 | self.b = self.add_weight((input_shape[1],),
263 | initializer='zero',
264 | name='{}_b'.format(self.name),
265 | regularizer=self.b_regularizer,
266 | constraint=self.b_constraint)
267 | else:
268 | self.b = None
269 |
270 | self.built = True
271 |
272 | def compute_mask(self, input, input_mask=None):
273 | return None
274 |
275 | def call(self, x, mask=None):
276 | features_dim = self.features_dim
277 | step_dim = self.step_dim
278 |
279 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
280 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
281 |
282 | if self.bias:
283 | eij += self.b
284 |
285 | eij = K.tanh(eij)
286 |
287 | a = K.exp(eij)
288 |
289 | if mask is not None:
290 | a *= K.cast(mask, K.floatx())
291 |
292 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
293 |
294 | a = K.expand_dims(a)
295 | weighted_input = x * a
296 | return K.sum(weighted_input, axis=1)
297 |
298 | def compute_output_shape(self, input_shape):
299 | return input_shape[0], self.features_dim
300 | # AUC for a binary classifier
301 |
302 |
303 | def auc(y_true, y_pred):
304 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
305 | for k in np.linspace(0, 1, 1000)], axis=0)
306 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
307 | for k in np.linspace(0, 1, 1000)], axis=0)
308 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
309 | binSizes = -(pfas[1:]-pfas[:-1])
310 | s = ptas*binSizes
311 | return K.sum(s, axis=0)
312 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
313 | # PFA, prob false alert for binary classifier
314 |
315 |
316 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
317 | y_pred = K.cast(y_pred >= threshold, 'float32')
318 | # N = total number of negative labels
319 | N = K.sum(1 - y_true)
320 | # FP = total number of false alerts, alerts from the negative class labels
321 | FP = K.sum(y_pred - y_pred * y_true)
322 | return FP/N
323 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
324 | # P_TA prob true alerts for binary classifier
325 |
326 |
327 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
328 | y_pred = K.cast(y_pred >= threshold, 'float32')
329 | # P = total number of positive labels
330 | P = K.sum(y_true)
331 | # TP = total number of correct alerts, alerts from the positive class labels
332 | TP = K.sum(y_pred * y_true)
333 | return TP/P
334 |
335 |
336 | class Lookahead(object):
337 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
338 | """
339 |
340 | def __init__(self, k=5, alpha=0.5):
341 | self.k = k
342 | self.alpha = alpha
343 | self.count = 0
344 |
345 | def inject(self, model):
346 | """Inject the Lookahead algorithm for the given model.
347 | The following code is modified from keras's _make_train_function method.
348 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
349 | """
350 | if not hasattr(model, 'train_function'):
351 | raise RuntimeError('You must compile your model before using it.')
352 |
353 | model._check_trainable_weights_consistency()
354 |
355 | if model.train_function is None:
356 | inputs = (model._feed_inputs +
357 | model._feed_targets +
358 | model._feed_sample_weights)
359 | if model._uses_dynamic_learning_phase():
360 | inputs += [K.learning_phase()]
361 | fast_params = model._collected_trainable_weights
362 |
363 | with K.name_scope('training'):
364 | with K.name_scope(model.optimizer.__class__.__name__):
365 | training_updates = model.optimizer.get_updates(
366 | params=fast_params,
367 | loss=model.total_loss)
368 | slow_params = [K.variable(p) for p in fast_params]
369 | fast_updates = (model.updates +
370 | training_updates +
371 | model.metrics_updates)
372 |
373 | slow_updates, copy_updates = [], []
374 | for p, q in zip(fast_params, slow_params):
375 | slow_updates.append(K.update(q, q + self.alpha * (p - q)))
376 | copy_updates.append(K.update(p, q))
377 |
378 | # Gets loss and metrics. Updates weights at each call.
379 | fast_train_function = K.function(
380 | inputs,
381 | [model.total_loss] + model.metrics_tensors,
382 | updates=fast_updates,
383 | name='fast_train_function',
384 | **model._function_kwargs)
385 |
386 | def F(inputs):
387 | self.count += 1
388 | R = fast_train_function(inputs)
389 | if self.count % self.k == 0:
390 | K.batch_get_value(slow_updates)
391 | K.batch_get_value(copy_updates)
392 | return R
393 |
394 | model.train_function = F
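# Lookahead keeps a slow copy of every trainable weight; after each k fast
# optimizer steps it moves the slow weights toward the fast ones by a factor
# alpha and copies them back into the model, which is exactly what the
# slow_updates / copy_updates pair above does every self.k calls.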
395 | def get_model(embedding_matrix):
396 |
397 | K.clear_session()
398 | #The embedding layer containing the word vectors
399 | emb_layer = Embedding(
400 | input_dim=embedding_matrix.shape[0],
401 | output_dim=embedding_matrix.shape[1],
402 | weights=[embedding_matrix],
403 | trainable=False
404 | )
405 | sdrop=SpatialDropout1D(rate=0.2)
406 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True,
407 | kernel_initializer=glorot_uniform(seed = 123)))
408 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True,
409 | kernel_initializer=glorot_uniform(seed = 123)))
410 |
411 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
412 |
413 | # Define inputs
414 | seq1 = Input(shape=(maxlen_query,))
415 | x1 = emb_layer(seq1)
416 | x1 = sdrop(x1)
417 | lstm1 = lstm_layer(x1)
418 | gru1 = gru_layer(lstm1)
419 | att_1 = Attention(maxlen_query)(lstm1)
420 | att_3 = Attention(maxlen_query)(gru1)
421 | cnn1 = cnn1d_layer(lstm1)
422 |
423 | avg_pool = GlobalAveragePooling1D()
424 | max_pool = GlobalMaxPooling1D()
425 |
426 | seq2 = Input(shape=(maxlen_answer,))
427 | x2 = emb_layer(seq2)
428 | x2 = sdrop(x2)
429 | lstm2 = lstm_layer(x2)
430 | gru2 = gru_layer(lstm2)
431 | att_2 = Attention(maxlen_answer)(lstm2)
432 | att_4 = Attention(maxlen_answer)(gru2)
433 | cnn2 = cnn1d_layer(lstm2)
434 |
435 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)])
436 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)])
437 |
438 | merge = Multiply()([x1, x2])
439 | merge = Dropout(0.2)(merge)
440 |
441 | hin = Input(shape=(19,))
442 | # htime = Dense(col_len,activation='relu')(hin)
443 | x = Concatenate()([merge,hin])
444 | # The MLP that determines the outcome
445 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x)
446 | # x = Dropout(0.2)(x)
447 | # x = BatchNormalization()(x)
448 |
449 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x)
450 |
451 |
452 | model = Model(inputs=[seq1,seq2,hin], outputs=pred)
453 |
454 | model.compile(loss='binary_crossentropy',
455 | optimizer=AdamW(lr=0.001,weight_decay=0.02,),
456 | metrics=["accuracy",auc])
457 | # model.summary()
458 | return model
459 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv',feature=train_feature,batch_size=2048,
460 | label_tag=True,chunk_size=5000)
461 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',feature=val_feature,batch_size=2048,
462 | label_tag=True,chunk_size=5000)
463 | print("train...")
464 | print("###"*30)
465 | gc.collect()
466 | K.clear_session()
467 | model = get_model(embed_matrix)
468 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
469 | lookahead.inject(model) # add into model
470 | model.summary()
471 | early_stopping = EarlyStopping(monitor='val_loss',min_delta=0.0001, patience=2, mode='min', verbose=1)
472 | reduce_lr = ReduceLROnPlateau(
473 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
474 | bst_model_path = out+'chizhurnn_chizhu_weight.h5'
475 | checkpoint = ModelCheckpoint(bst_model_path , monitor='val_loss', mode='min',
476 | save_best_only=True, verbose=1,save_weights_only=True )
477 | callbacks = [checkpoint,reduce_lr,early_stopping]
478 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
479 | epochs=10, verbose=1, callbacks=callbacks,
480 | validation_data=val_gen, validation_steps = int(np.ceil(1000000/2048)),
481 | max_queue_size=10, workers=1, use_multiprocessing=False)
482 |
483 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
484 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
485 | val_prob = model.predict_generator(
486 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
487 |
488 | f = open('/home/kesci/zhifeng/val.csv', 'r')
489 | q, a, l = [], [], []
490 | for line in f:
491 | qid, _, aid, _, label = line.strip().split(',')
492 | q.append(qid)
493 | a.append(aid)
494 | l.append(int(label))
495 |
496 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
497 | val_df['prob'] = val_prob.flatten()
498 |
499 |
500 | def perauc(df):
501 | temp = pd.Series()
502 | try:
503 | temp['auc'] = roc_auc_score(df['label'], df['prob'])
504 | except:
505 | temp['auc'] = 0.5
506 | return temp
507 |
508 |
509 | eval_df = val_df.groupby("qid").apply(perauc)
510 | eval_df.index = range(len(eval_df))
511 | print("qauc:", eval_df['auc'].mean())
512 |
513 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
514 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
515 | prob = model.predict_generator(
516 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
517 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
518 | names=['qid', 'aid', 'prob'])
519 | sub['prob'] = prob.flatten()
520 | sub.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testa.csv',
521 | index=False, header=False)
522 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
523 | feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
524 | prob = model.predict_generator(
525 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1)
526 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
527 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
528 | final['prob'] = prob.flatten()
529 | final.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testb.csv',
530 | index=False, header=False)
531 |
--------------------------------------------------------------------------------
/fasttext_cos.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random as rn
5 | from tqdm import tqdm, tqdm_notebook
6 | from sklearn.metrics import roc_auc_score
7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
8 | import gc
9 | import time
10 | from gensim.models import Word2Vec
11 | import fasttext
12 | from gensim.models import Word2Vec
13 | import scipy.spatial.distance as ssd
14 | tqdm.pandas()
15 | input_path = "/home/kesci/input/bytedance/"
16 | out_work_path = '/home/kesci/work/zhifeng/'
17 | out_path = '/home/kesci/zhifeng/'
18 |
19 | w2v = fasttext.load_model(out_work_path+'corpus.fasttext.model')
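# Note: scipy's ssd.cosine returns the cosine *distance* (1 - cosine
# similarity), so larger values below mean a less similar query/title pair;
# get_sentence_vector is fastText's built-in sentence embedding.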
20 | train_cosine_list = []
21 | with open(out_path+'train.smaller.csv', 'r') as fin:
22 | for line in tqdm(fin):
23 | _, q, _, a, _ = line.strip().split(',')
24 | v1 = w2v.get_sentence_vector(q)
25 | v2 = w2v.get_sentence_vector(a)
26 | train_cosine_list.append(ssd.cosine(v1, v2))
27 | pd.to_pickle(np.array(train_cosine_list),
28 | out_work_path+'train.cosine.fasttext.pkl')
29 | val_cosine_list = []
30 | with open(out_path+'val.csv', 'r') as fin:
31 | for line in tqdm(fin):
32 | _, q, _, a, _ = line.strip().split(',')
33 | v1 = w2v.get_sentence_vector(q)
34 | v2 = w2v.get_sentence_vector(a)
35 | val_cosine_list.append(ssd.cosine(v1, v2))
36 | pd.to_pickle(np.array(val_cosine_list),
37 | out_work_path+'val.cosine.fasttext.pkl')
38 | test_cosine_list = []
39 | with open(input_path+'test_final_part1.csv', 'r') as fin:
40 | for line in tqdm(fin):
41 | _, q, _, a = line.strip().split(',')
42 | v1 = w2v.get_sentence_vector(q)
43 | v2 = w2v.get_sentence_vector(a)
44 | test_cosine_list.append(ssd.cosine(v1, v2))
45 | pd.to_pickle(np.array(test_cosine_list),
46 | out_work_path+'test.cosine.fasttext.pkl')
47 |
--------------------------------------------------------------------------------
/finetuning_fasttext_esim.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | from sklearn.preprocessing import StandardScaler
3 | import os
4 | import pandas as pd
5 | import numpy as np
6 | import random as rn
7 | from tqdm import tqdm, tqdm_notebook
8 | import tensorflow as tf
9 | from sklearn.metrics import roc_auc_score
10 | from keras.preprocessing.text import Tokenizer
11 | from keras.preprocessing.sequence import pad_sequences
12 | from keras.optimizers import Adam
13 | from keras import backend as K
14 | from keras.optimizers import *
15 | from keras.callbacks import *
16 | from keras.layers import *
17 | from keras.models import *
18 | from keras.engine.topology import Layer
19 | from keras import initializers, regularizers, constraints, optimizers, layers
20 | from keras.initializers import *
21 | import keras
22 | from sklearn.model_selection import StratifiedKFold, GroupKFold
23 | import gc
24 | import time
25 | from gensim.models import Word2Vec
26 | import logging
27 | import Levenshtein
28 | import fasttext
29 | tqdm.pandas()
30 | np.random.seed(1017)
31 | rn.seed(1017)
32 | tf.set_random_seed(1017)
33 | path = "/home/kesci/input/bytedance/"
34 | out = '/home/kesci/work/zhifeng/'
35 | out_chizhu = '/home/kesci/work/chizhu/'
36 | print(os.listdir(path))
37 |
38 | f1 = pd.read_csv(out_chizhu + 'f1.csv')
39 | f2 = pd.read_csv(out_chizhu + 'f2.csv')
40 | f3 = pd.read_csv(out_chizhu + 'f3.csv')
41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1)
42 | del f1, f2, f3
43 | gc.collect()
44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl")
45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl")
46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl")
47 | testb_w2v = pd.read_pickle(
48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl")
49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v)
50 |
51 | train_w2v = pd.read_pickle(
52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl")
53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl")
54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl")
55 | testb_w2v = pd.read_pickle(
56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl")
57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \
58 | list(testa_w2v)+list(testb_w2v)
59 | del train_w2v, val_w2v, testa_w2v, testb_w2v
60 | gc.collect()
61 | feature.shape
62 |
63 | len_train = 99000000
64 | len_val = 1000000
65 | len_testa = 20000000
66 | len_testb = 100000000
67 | sc = StandardScaler()
68 | feature = sc.fit_transform(feature)
69 | train_feature = feature[:len_train]
70 | val_feature = feature[len_train:len_train+len_val]
71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa]
72 | testb_feature = feature[-len_testb:]
73 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape)
74 |
75 | del feature
76 | gc.collect()
77 |
78 | w2v = fasttext.load_model(out+'corpus.fasttext.model')
79 | word2index = {word: index+1 for index, word in enumerate(w2v.words)}
80 | index2word = {index+1: word for index, word in enumerate(w2v.words)}
81 |
82 |
83 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
84 | maxlen_query=8):
85 | if label_tag:
86 | _, _q, _, _a, _label = line.strip().split(',')
87 | else:
88 | _, _q, _, _a = line.strip().split(',')
89 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
90 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
91 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
92 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
93 | if label_tag:
94 | return q_pad, a_pad, int(_label)
95 | return q_pad, a_pad
96 |
97 |
98 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
99 | while True:
100 | fin = open(path, 'r')
101 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
102 | for i, line in enumerate(fin):
103 | if len(batch_q) == chunk_size*batch_size:
104 | batch_q = np.array(batch_q)
105 | batch_a = np.array(batch_a)
106 | batch_f = np.array(batch_f)
107 | if label_tag:
108 | batch_label = np.array(batch_label)
109 | idx = list(range(chunk_size*batch_size))
110 | if shuffle:
111 | np.random.shuffle(idx)
112 | for i in range(chunk_size):
113 | if label_tag:
114 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
115 | np.array(
116 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
117 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
118 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
119 | else:
120 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
121 | np.array(
122 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
123 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
124 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
125 | if label_tag:
126 | q, a, l = gen_feature_help(line, label_tag=label_tag)
127 | else:
128 | q, a = gen_feature_help(line, label_tag=label_tag)
129 | l = 0
130 | batch_q.append(q)
131 | batch_a.append(a)
132 | batch_f.append(feature[i])
133 | if label_tag:
134 | batch_label.append(l)
135 |
136 | batch_q = np.array(batch_q)
137 | batch_a = np.array(batch_a)
138 | batch_f = np.array(batch_f)
139 |
140 | if label_tag:
141 | batch_label = np.array(batch_label)
142 | idx = list(range(len(batch_q)))
143 | if shuffle:
144 | np.random.shuffle(idx)
145 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
146 | if label_tag:
147 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
148 | np.array(
149 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
150 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
151 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
152 | else:
153 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
154 | np.array(
155 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
156 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
157 | fin.close()
158 |
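   | # Note on gen_train above: the CSV is streamed instead of loaded into memory.
   | # Rows accumulate until chunk_size*batch_size examples are buffered, indices are
   | # shuffled within that buffer only (local rather than global shuffling), and
   | # batch_size slices are yielded; the trailing partial buffer is flushed after the
   | # file loop, and the outer `while True` reopens the file for the next epoch.
   | # Row i of `feature` must line up with line i of the CSV.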
159 |
160 | def get_embedding_matrix():
161 | m = np.zeros(shape=(len(index2word)+1, 100))
162 | for i, w in index2word.items():
163 | m[i, :] = w2v[w]
164 | return m
165 |
166 |
167 | embed_matrix = get_embedding_matrix()
168 | maxlen_query = 8
169 | maxlen_answer = 20
170 |
171 |
172 | class AdamW(Optimizer):
173 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
174 | epsilon=1e-8, decay=0., **kwargs):
175 | super(AdamW, self).__init__(**kwargs)
176 | with K.name_scope(self.__class__.__name__):
177 | self.iterations = K.variable(0, dtype='int64', name='iterations')
178 | self.lr = K.variable(lr, name='lr')
179 | self.beta_1 = K.variable(beta_1, name='beta_1')
180 | self.beta_2 = K.variable(beta_2, name='beta_2')
181 | self.decay = K.variable(decay, name='decay')
182 | # decoupled weight decay (2/4)
183 | self.wd = K.variable(weight_decay, name='weight_decay')
184 | self.epsilon = epsilon
185 | self.initial_decay = decay
186 |
187 | @interfaces.legacy_get_updates_support
188 | def get_updates(self, loss, params):
189 | grads = self.get_gradients(loss, params)
190 | self.updates = [K.update_add(self.iterations, 1)]
191 | wd = self.wd # decoupled weight decay (3/4)
192 |
193 | lr = self.lr
194 | if self.initial_decay > 0:
195 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
196 | K.dtype(self.decay))))
197 |
198 | t = K.cast(self.iterations, K.floatx()) + 1
199 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
200 | (1. - K.pow(self.beta_1, t)))
201 |
202 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
203 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
204 | self.weights = [self.iterations] + ms + vs
205 |
206 | for p, g, m, v in zip(params, grads, ms, vs):
207 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
208 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
209 | # decoupled weight decay (4/4)
210 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
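   |             # The extra `- lr * wd * p` term shrinks weights directly rather than
   |             # adding an L2 penalty to the gradient: decoupled weight decay as in
   |             # Loshchilov & Hutter, "Decoupled Weight Decay Regularization" (AdamW).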
211 |
212 | self.updates.append(K.update(m, m_t))
213 | self.updates.append(K.update(v, v_t))
214 | new_p = p_t
215 |
216 | # Apply constraints.
217 | if getattr(p, 'constraint', None) is not None:
218 | new_p = p.constraint(new_p)
219 |
220 | self.updates.append(K.update(p, new_p))
221 | return self.updates
222 |
223 | def get_config(self):
224 | config = {'lr': float(K.get_value(self.lr)),
225 | 'beta_1': float(K.get_value(self.beta_1)),
226 | 'beta_2': float(K.get_value(self.beta_2)),
227 | 'decay': float(K.get_value(self.decay)),
228 | 'weight_decay': float(K.get_value(self.wd)),
229 | 'epsilon': self.epsilon}
230 | base_config = super(AdamW, self).get_config()
231 | return dict(list(base_config.items()) + list(config.items()))
232 |
233 |
234 | class Attention(Layer):
235 | def __init__(self, step_dim,
236 | W_regularizer=None, b_regularizer=None,
237 | W_constraint=None, b_constraint=None,
238 | bias=True, **kwargs):
239 | self.supports_masking = True
240 | self.init = initializers.get('glorot_uniform')
241 |
242 | self.W_regularizer = regularizers.get(W_regularizer)
243 | self.b_regularizer = regularizers.get(b_regularizer)
244 |
245 | self.W_constraint = constraints.get(W_constraint)
246 | self.b_constraint = constraints.get(b_constraint)
247 |
248 | self.bias = bias
249 | self.step_dim = step_dim
250 | self.features_dim = 0
251 | super(Attention, self).__init__(**kwargs)
252 |
253 | def build(self, input_shape):
254 | assert len(input_shape) == 3
255 |
256 | self.W = self.add_weight((input_shape[-1],),
257 | initializer=self.init,
258 | name='{}_W'.format(self.name),
259 | regularizer=self.W_regularizer,
260 | constraint=self.W_constraint)
261 | self.features_dim = input_shape[-1]
262 |
263 | if self.bias:
264 | self.b = self.add_weight((input_shape[1],),
265 | initializer='zero',
266 | name='{}_b'.format(self.name),
267 | regularizer=self.b_regularizer,
268 | constraint=self.b_constraint)
269 | else:
270 | self.b = None
271 |
272 | self.built = True
273 |
274 | def compute_mask(self, input, input_mask=None):
275 | return None
276 |
277 | def call(self, x, mask=None):
278 | features_dim = self.features_dim
279 | step_dim = self.step_dim
280 |
281 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
282 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
283 |
284 | if self.bias:
285 | eij += self.b
286 |
287 | eij = K.tanh(eij)
288 |
289 | a = K.exp(eij)
290 |
291 | if mask is not None:
292 | a *= K.cast(mask, K.floatx())
293 |
294 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
295 |
296 | a = K.expand_dims(a)
297 | weighted_input = x * a
298 | return K.sum(weighted_input, axis=1)
299 |
300 | def compute_output_shape(self, input_shape):
301 | return input_shape[0], self.features_dim
302 |
303 | # AUC for a binary classifier
304 |
305 |
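   | # The three functions below approximate ROC AUC inside the graph so it can be
   | # reported as a Keras metric: TPR (binary_PTA) and FPR (binary_PFA) are evaluated
   | # at 1000 thresholds and TPR is integrated over the FPR bins (a Riemann sum).
   | # It is for monitoring only; the training loss remains binary cross-entropy.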
306 | def auc(y_true, y_pred):
307 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
308 | for k in np.linspace(0, 1, 1000)], axis=0)
309 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
310 | for k in np.linspace(0, 1, 1000)], axis=0)
311 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
312 | binSizes = -(pfas[1:]-pfas[:-1])
313 | s = ptas*binSizes
314 | return K.sum(s, axis=0)
315 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
316 | # PFA, prob false alert for binary classifier
317 |
318 |
319 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
320 | y_pred = K.cast(y_pred >= threshold, 'float32')
321 | # N = total number of negative labels
322 | N = K.sum(1 - y_true)
323 | # FP = total number of false alerts, alerts from the negative class labels
324 | FP = K.sum(y_pred - y_pred * y_true)
325 | return FP/N
326 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
327 | # P_TA prob true alerts for binary classifier
328 |
329 |
330 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
331 | y_pred = K.cast(y_pred >= threshold, 'float32')
332 | # P = total number of positive labels
333 | P = K.sum(y_true)
334 | # TP = total number of correct alerts, alerts from the positive class labels
335 | TP = K.sum(y_pred * y_true)
336 | return TP/P
337 |
338 |
339 | class Lookahead(object):
340 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
341 | """
342 |
343 | def __init__(self, k=5, alpha=0.5):
344 | self.k = k
345 | self.alpha = alpha
346 | self.count = 0
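   |         # As in the Lookahead paper: after every k fast-optimizer steps, the slow
   |         # weights move a fraction alpha toward the fast weights and the fast
   |         # weights are reset to the slow copy (see F() inside inject below).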
347 |
348 | def inject(self, model):
349 | """Inject the Lookahead algorithm for the given model.
350 | The following code is modified from keras's _make_train_function method.
351 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
352 | """
353 | if not hasattr(model, 'train_function'):
354 | raise RuntimeError('You must compile your model before using it.')
355 |
356 | model._check_trainable_weights_consistency()
357 |
358 | if model.train_function is None:
359 | inputs = (model._feed_inputs +
360 | model._feed_targets +
361 | model._feed_sample_weights)
362 | if model._uses_dynamic_learning_phase():
363 | inputs += [K.learning_phase()]
364 | fast_params = model._collected_trainable_weights
365 |
366 | with K.name_scope('training'):
367 | with K.name_scope(model.optimizer.__class__.__name__):
368 | training_updates = model.optimizer.get_updates(
369 | params=fast_params,
370 | loss=model.total_loss)
371 | slow_params = [K.variable(p) for p in fast_params]
372 | fast_updates = (model.updates +
373 | training_updates +
374 | model.metrics_updates)
375 |
376 | slow_updates, copy_updates = [], []
377 | for p, q in zip(fast_params, slow_params):
378 | slow_updates.append(K.update(q, q + self.alpha * (p - q)))
379 | copy_updates.append(K.update(p, q))
380 |
381 | # Gets loss and metrics. Updates weights at each call.
382 | fast_train_function = K.function(
383 | inputs,
384 | [model.total_loss] + model.metrics_tensors,
385 | updates=fast_updates,
386 | name='fast_train_function',
387 | **model._function_kwargs)
388 |
389 | def F(inputs):
390 | self.count += 1
391 | R = fast_train_function(inputs)
392 | if self.count % self.k == 0:
393 | K.batch_get_value(slow_updates)
394 | K.batch_get_value(copy_updates)
395 | return R
396 |
397 | model.train_function = F
398 |
399 |
400 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs):
401 | "Create embedding layer from a pretrained weights array"
402 | in_dim, out_dim = pretrained_weights.shape
403 | embedding = Embedding(in_dim, out_dim, weights=[
404 |                               pretrained_weights], trainable=trainable, **kwargs)
405 | return embedding
406 |
407 |
408 | def unchanged_shape(input_shape):
409 | "Function for Lambda layer"
410 | return input_shape
411 |
412 |
413 | def substract(input_1, input_2):
414 | "Substract element-wise"
415 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2)
416 | out_ = Add()([input_1, neg_input_2])
417 | return out_
418 |
419 |
420 | def submult(input_1, input_2):
421 | "Get multiplication and subtraction then concatenate results"
422 | mult = Multiply()([input_1, input_2])
423 | sub = substract(input_1, input_2)
424 | out_ = Concatenate()([sub, mult])
425 | return out_
426 |
427 |
428 | def apply_multiple(input_, layers):
429 | "Apply layers to input then concatenate result"
430 | if not len(layers) > 1:
431 | raise ValueError('Layers list should contain more than 1 layer')
432 | else:
433 | agg_ = []
434 | for layer in layers:
435 | agg_.append(layer(input_))
436 | out_ = Concatenate()(agg_)
437 | return out_
438 |
439 |
440 | def time_distributed(input_, layers):
441 | "Apply a list of layers in TimeDistributed mode"
442 | out_ = []
443 | node_ = input_
444 | for layer_ in layers:
445 | node_ = TimeDistributed(layer_)(node_)
446 | out_ = node_
447 | return out_
448 |
449 |
450 | def soft_attention_alignment(input_1, input_2):
451 | "Align text representation with neural soft attention"
452 | attention = Dot(axes=-1)([input_1, input_2])
453 | w_att_1 = Lambda(lambda x: softmax(x, axis=1),
454 | output_shape=unchanged_shape)(attention)
455 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
456 | output_shape=unchanged_shape)(attention))
457 | in1_aligned = Dot(axes=1)([w_att_1, input_1])
458 | in2_aligned = Dot(axes=1)([w_att_2, input_2])
459 | return in1_aligned, in2_aligned
460 |
461 |
462 | def decomposable_attention(pretrained_weights,
463 | num_shape,
464 | projection_dim=300, projection_hidden=0, projection_dropout=0.2,
465 | compare_dim=500, compare_dropout=0.2,
466 | dense_dim=300, dense_dropout=0.2,
467 | lr=1e-3, activation='elu', maxlen=20):
468 | # Based on: https://arxiv.org/abs/1606.01933
469 |
470 | q1 = Input(name='q1', shape=(maxlen,))
471 | q2 = Input(name='q2', shape=(maxlen,))
472 |
473 | # Embedding
474 | embedding = create_pretrained_embedding(pretrained_weights,
475 | mask_zero=False)
476 | q1_embed = embedding(q1)
477 | q2_embed = embedding(q2)
478 |
479 | # Projection
480 | projection_layers = []
481 | if projection_hidden > 0:
482 | projection_layers.extend([
483 | Dense(projection_hidden, activation=activation),
484 | Dropout(rate=projection_dropout),
485 | ])
486 | projection_layers.extend([
487 | Dense(projection_dim, activation=None),
488 | Dropout(rate=projection_dropout),
489 | ])
490 | q1_encoded = time_distributed(q1_embed, projection_layers)
491 | q2_encoded = time_distributed(q2_embed, projection_layers)
492 |
493 | # Attention
494 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
495 |
496 | # Compare
497 | q1_combined = Concatenate()(
498 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
499 | q2_combined = Concatenate()(
500 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
501 | compare_layers = [
502 | Dense(compare_dim, activation=activation),
503 | Dropout(compare_dropout),
504 | Dense(compare_dim, activation=activation),
505 | Dropout(compare_dropout),
506 | ]
507 | q1_compare = time_distributed(q1_combined, compare_layers)
508 | q2_compare = time_distributed(q2_combined, compare_layers)
509 |
510 | # Aggregate
511 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
512 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
513 |
514 | # Classifier
515 | merged = Concatenate()([q1_rep, q2_rep])
516 | dense = BatchNormalization()(merged)
517 | dense = Dense(dense_dim, activation=activation)(dense)
518 | dense = Dropout(dense_dropout)(dense)
519 | dense = BatchNormalization()(dense)
520 | dense = Dense(dense_dim, activation=activation)(dense)
521 | dense = Dropout(dense_dropout)(dense)
522 | out_ = Dense(1, activation='sigmoid')(dense)
523 |
524 | model = Model(inputs=[q1, q2], outputs=out_)
525 | model.compile(loss='binary_crossentropy',
526 | optimizer=AdamW(lr=0.001, weight_decay=0.02,),
527 | metrics=["accuracy", auc])
528 | return model
529 |
530 |
531 | def esim(embedding_matrix,
532 | maxlen=20,
533 | lstm_dim=64,
534 | dense_dim=128,
535 | dense_dropout=0.5):
536 | # Based on arXiv:1609.06038
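   |     # ESIM as wired below: frozen pretrained embeddings + BatchNorm, a shared
   |     # BiLSTM encoder, soft alignment between query (length 8) and title (length
   |     # 20), local matching features from submult (difference and product), a second
   |     # BiLSTM to compose them, global average+max pooling, and a small dense head.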
537 | q1 = Input(name='q1', shape=(8,))
538 | q2 = Input(name='q2', shape=(20,))
539 |
540 | # Embedding
541 | embedding = create_pretrained_embedding(
542 | embedding_matrix, mask_zero=False)
543 | bn = BatchNormalization(axis=2)
544 | q1_embed = bn(embedding(q1))
545 | q2_embed = bn(embedding(q2))
546 |
547 | # Encode
548 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
549 | q1_encoded = encode(q1_embed)
550 | q2_encoded = encode(q2_embed)
551 |
552 | # Attention
553 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
554 |
555 | # Compose
556 | q1_combined = Concatenate()(
557 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
558 | q2_combined = Concatenate()(
559 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
560 |
561 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
562 | q1_compare = compose(q1_combined)
563 | q2_compare = compose(q2_combined)
564 |
565 | # Aggregate
566 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
567 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
568 |
569 | # leaks_input = Input(shape=(num_shape,))
570 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input)
571 |
572 | # Classifier
573 | merged = Concatenate()([q1_rep, q2_rep])
574 |
575 | dense = BatchNormalization()(merged)
576 | dense = Dense(dense_dim, activation='elu')(dense)
577 | dense = BatchNormalization()(dense)
578 | dense = Dropout(dense_dropout)(dense)
579 | dense = Dense(dense_dim, activation='elu')(dense)
580 | dense = BatchNormalization()(dense)
581 | dense = Dropout(dense_dropout)(dense)
582 | out_ = Dense(1, activation='sigmoid')(dense)
583 |
584 | model = Model(inputs=[q1, q2], outputs=out_)
585 | model.compile(loss='binary_crossentropy',
586 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,),
587 | metrics=["accuracy", auc])
588 | return model
589 |
590 |
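   | # aux_esim_model below is the fine-tuning step: rebuild the ESIM, load the
   | # pretrained weights, take the output of an intermediate layer (index=28) as the
   | # sentence-pair representation, concatenate the 19 standardized handcrafted
   | # features, and train a new 512 -> 256 -> 1 dense head; everything except the
   | # frozen embedding is then updated end to end (AdamW, with Lookahead injected in
   | # the training script below).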
591 | def aux_esim_model(embed_matrix, model_weight_path):
592 | base_model = esim(embed_matrix)
593 | base_model.load_weights(model_weight_path)
594 | input_q, input_a = base_model.inputs
595 | input_f = Input((19,))
596 | hidden_esim = base_model.get_layer(index=28).output
597 | merged = Concatenate()([hidden_esim, input_f])
598 | #dense = BatchNormalization()(merged)
599 | dense = Dense(512, activation='relu')(merged)
600 | #dense = BatchNormalization()(dense)
601 | dense = Dropout(0.5)(dense)
602 | dense = Dense(256, activation='relu')(dense)
603 | #dense = BatchNormalization()(dense)
604 | dense = Dropout(0.5)(dense)
605 | out_ = Dense(1, activation='sigmoid')(dense)
606 |
607 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_)
608 | model.compile(loss='binary_crossentropy',
609 | optimizer=AdamW(lr=0.0003, weight_decay=0.02),
610 | metrics=["accuracy"])
611 | return model
612 |
613 |
614 | #### Model training
615 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048,
616 | label_tag=True, chunk_size=5000)
617 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048,
618 | label_tag=True, chunk_size=5000)
619 | print("train...")
620 | print("###"*30)
621 | gc.collect()
622 | K.clear_session()
623 | weight_path = '/home/kesci/work/zhifeng/zhifeng_esim_weight_1_0.6924413924179077.h5'
624 | model = aux_esim_model(embed_matrix, weight_path)
625 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
626 | lookahead.inject(model) # add into model
627 | model.summary()
628 | early_stopping = EarlyStopping(
629 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
630 | reduce_lr = ReduceLROnPlateau(
631 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
632 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5'
633 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
634 | save_best_only=False,
635 | verbose=1, save_weights_only=True, period=1)
636 | callbacks = [checkpoint, reduce_lr, early_stopping]
637 | # print("load weight....")
638 |
639 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
640 | epochs=10, verbose=1, callbacks=callbacks,
641 | validation_data=val_gen, validation_steps=int(
642 | np.ceil(1000000/2048)),
643 | max_queue_size=10, workers=1, use_multiprocessing=False)
644 |
645 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
646 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
647 | val_prob = model.predict_generator(
648 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
649 |
650 | f = open('/home/kesci/zhifeng/val.csv','r')
651 | q,a,l=[],[],[]
652 | for line in f:
653 | qid,_,aid,_,label = line.strip().split(',')
654 | q.append(qid)
655 | a.append(aid)
656 | l.append(int(label))
657 |
658 | val_df = pd.DataFrame({'qid':q,'aid':a,'label':l})
659 | val_df['prob'] = val_prob.flatten()
660 |
661 | roc_auc_score(val_df['label'], val_df['prob'])
662 |
663 |
664 | def perauc(df):
665 | temp = pd.Series()
666 | try:
667 | temp['auc'] = roc_auc_score(df['label'], df['prob'])
668 | except:
669 | temp['auc'] = 0.5
670 | return temp
671 |
672 |
673 | eval_df = val_df.groupby("qid").apply(perauc)
674 | eval_df.index = range(len(eval_df))
675 | print("qauc:", eval_df['auc'].mean())
676 |
677 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
678 | feature=testa_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False)
679 | prob = model.predict_generator(test_gen,steps=int(np.ceil(20000000/4096)),verbose=1)
680 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',names=['qid','aid','prob'])
681 | sub['prob'] = prob.flatten()
682 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv',index=False,header=False)
683 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
684 | feature=testb_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False)
685 | prob = model.predict_generator(test_gen,steps=int(np.ceil(100000000/4096)),verbose=1)
686 | final = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])[['query_id','query_title_id']]
687 | final['prob'] = prob.flatten()
688 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv',index=False,header=False)
--------------------------------------------------------------------------------
/finetuning_w2v_esim.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | from sklearn.preprocessing import StandardScaler
3 | import os
4 | import pandas as pd
5 | import numpy as np
6 | import random as rn
7 | from tqdm import tqdm, tqdm_notebook
8 | import tensorflow as tf
9 | from sklearn.metrics import roc_auc_score
10 | from keras.preprocessing.text import Tokenizer
11 | from keras.preprocessing.sequence import pad_sequences
12 | from keras.optimizers import Adam
13 | from keras import backend as K
14 | from keras.optimizers import *
15 | from keras.callbacks import *
16 | from keras.layers import *
17 | from keras.models import *
18 | from keras.engine.topology import Layer
19 | from keras import initializers, regularizers, constraints, optimizers, layers
20 | from keras.initializers import *
21 | import keras
22 | from sklearn.model_selection import StratifiedKFold, GroupKFold
23 | import gc
24 | import time
25 | from gensim.models import Word2Vec
26 | import logging
27 | import Levenshtein
28 | import fasttext
29 | tqdm.pandas()
30 | np.random.seed(1017)
31 | rn.seed(1017)
32 | tf.set_random_seed(1017)
33 | path = "/home/kesci/input/bytedance/"
34 | out = '/home/kesci/work/zhifeng/'
35 | out_chizhu = '/home/kesci/work/chizhu/'
36 | print(os.listdir(path))
37 |
38 | f1 = pd.read_csv(out_chizhu + 'f1.csv')
39 | f2 = pd.read_csv(out_chizhu + 'f2.csv')
40 | f3 = pd.read_csv(out_chizhu + 'f3.csv')
41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1)
42 | del f1, f2, f3
43 | gc.collect()
44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl")
45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl")
46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl")
47 | testb_w2v = pd.read_pickle(
48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl")
49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v)
50 |
51 | train_w2v = pd.read_pickle(
52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl")
53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl")
54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl")
55 | testb_w2v = pd.read_pickle(
56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl")
57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \
58 | list(testa_w2v)+list(testb_w2v)
59 | del train_w2v, val_w2v, testa_w2v, testb_w2v
60 | gc.collect()
61 | feature.shape
62 |
63 | len_train = 99000000
64 | len_val = 1000000
65 | len_testa = 20000000
66 | len_testb = 100000000
67 | sc = StandardScaler()
68 | feature = sc.fit_transform(feature)
69 | train_feature = feature[:len_train]
70 | val_feature = feature[len_train:len_train+len_val]
71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa]
72 | testb_feature = feature[-len_testb:]
73 | print(train_feature.shape, val_feature.shape,
74 | testa_feature.shape, testb_feature.shape)
75 |
76 | del feature
77 | gc.collect()
78 |
79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model')
80 |
81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)}
82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)}
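   | # Index 0 is reserved for padding and unknown tokens, hence the +1 offset; row 0
   | # of the embedding matrix built below stays all-zero while rows 1..V hold the
   | # 300-d word2vec vectors.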
83 |
84 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
85 | maxlen_query=8):
86 | if label_tag:
87 | _, _q, _, _a, _label = line.strip().split(',')
88 | else:
89 | _, _q, _, _a = line.strip().split(',')
90 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
91 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
92 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
93 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
94 | if label_tag:
95 | return q_pad, a_pad, int(_label)
96 | return q_pad, a_pad
97 |
98 |
99 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
100 | while True:
101 | fin = open(path, 'r')
102 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
103 |         for line_idx, line in enumerate(fin):  # keep the row index separate from the chunk loop's i below
104 | if len(batch_q) == chunk_size*batch_size:
105 | batch_q = np.array(batch_q)
106 | batch_a = np.array(batch_a)
107 | batch_f = np.array(batch_f)
108 | if label_tag:
109 | batch_label = np.array(batch_label)
110 | idx = list(range(chunk_size*batch_size))
111 | if shuffle:
112 | np.random.shuffle(idx)
113 | for i in range(chunk_size):
114 | if label_tag:
115 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
116 | np.array(
117 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
118 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
119 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
120 | else:
121 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
122 | np.array(
123 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
124 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
125 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
126 | if label_tag:
127 | q, a, l = gen_feature_help(line, label_tag=label_tag)
128 | else:
129 | q, a = gen_feature_help(line, label_tag=label_tag)
130 | l = 0
131 | batch_q.append(q)
132 | batch_a.append(a)
133 |             batch_f.append(feature[line_idx])
134 | if label_tag:
135 | batch_label.append(l)
136 |
137 | batch_q = np.array(batch_q)
138 | batch_a = np.array(batch_a)
139 | batch_f = np.array(batch_f)
140 |
141 | if label_tag:
142 | batch_label = np.array(batch_label)
143 | idx = list(range(len(batch_q)))
144 | if shuffle:
145 | np.random.shuffle(idx)
146 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
147 | if label_tag:
148 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
149 | np.array(
150 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
151 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
152 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
153 | else:
154 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
155 | np.array(
156 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
157 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
158 | fin.close()
159 |
160 |
161 | def get_embedding_matrix():
162 | m = np.zeros(shape=(len(index2word)+1, 300))
163 | for i, w in index2word.items():
164 | m[i, :] = w2v[w]
165 | return m
166 |
167 |
168 | embed_matrix = get_embedding_matrix()
169 | maxlen_query = 8
170 | maxlen_answer = 20
171 |
172 |
173 | class AdamW(Optimizer):
174 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
175 | epsilon=1e-8, decay=0., **kwargs):
176 | super(AdamW, self).__init__(**kwargs)
177 | with K.name_scope(self.__class__.__name__):
178 | self.iterations = K.variable(0, dtype='int64', name='iterations')
179 | self.lr = K.variable(lr, name='lr')
180 | self.beta_1 = K.variable(beta_1, name='beta_1')
181 | self.beta_2 = K.variable(beta_2, name='beta_2')
182 | self.decay = K.variable(decay, name='decay')
183 | # decoupled weight decay (2/4)
184 | self.wd = K.variable(weight_decay, name='weight_decay')
185 | self.epsilon = epsilon
186 | self.initial_decay = decay
187 |
188 | @interfaces.legacy_get_updates_support
189 | def get_updates(self, loss, params):
190 | grads = self.get_gradients(loss, params)
191 | self.updates = [K.update_add(self.iterations, 1)]
192 | wd = self.wd # decoupled weight decay (3/4)
193 |
194 | lr = self.lr
195 | if self.initial_decay > 0:
196 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
197 | K.dtype(self.decay))))
198 |
199 | t = K.cast(self.iterations, K.floatx()) + 1
200 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
201 | (1. - K.pow(self.beta_1, t)))
202 |
203 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
204 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
205 | self.weights = [self.iterations] + ms + vs
206 |
207 | for p, g, m, v in zip(params, grads, ms, vs):
208 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
209 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
210 | # decoupled weight decay (4/4)
211 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
212 |
213 | self.updates.append(K.update(m, m_t))
214 | self.updates.append(K.update(v, v_t))
215 | new_p = p_t
216 |
217 | # Apply constraints.
218 | if getattr(p, 'constraint', None) is not None:
219 | new_p = p.constraint(new_p)
220 |
221 | self.updates.append(K.update(p, new_p))
222 | return self.updates
223 |
224 | def get_config(self):
225 | config = {'lr': float(K.get_value(self.lr)),
226 | 'beta_1': float(K.get_value(self.beta_1)),
227 | 'beta_2': float(K.get_value(self.beta_2)),
228 | 'decay': float(K.get_value(self.decay)),
229 | 'weight_decay': float(K.get_value(self.wd)),
230 | 'epsilon': self.epsilon}
231 | base_config = super(AdamW, self).get_config()
232 | return dict(list(base_config.items()) + list(config.items()))
233 |
234 |
235 | class Attention(Layer):
236 | def __init__(self, step_dim,
237 | W_regularizer=None, b_regularizer=None,
238 | W_constraint=None, b_constraint=None,
239 | bias=True, **kwargs):
240 | self.supports_masking = True
241 | self.init = initializers.get('glorot_uniform')
242 |
243 | self.W_regularizer = regularizers.get(W_regularizer)
244 | self.b_regularizer = regularizers.get(b_regularizer)
245 |
246 | self.W_constraint = constraints.get(W_constraint)
247 | self.b_constraint = constraints.get(b_constraint)
248 |
249 | self.bias = bias
250 | self.step_dim = step_dim
251 | self.features_dim = 0
252 | super(Attention, self).__init__(**kwargs)
253 |
254 | def build(self, input_shape):
255 | assert len(input_shape) == 3
256 |
257 | self.W = self.add_weight((input_shape[-1],),
258 | initializer=self.init,
259 | name='{}_W'.format(self.name),
260 | regularizer=self.W_regularizer,
261 | constraint=self.W_constraint)
262 | self.features_dim = input_shape[-1]
263 |
264 | if self.bias:
265 | self.b = self.add_weight((input_shape[1],),
266 | initializer='zero',
267 | name='{}_b'.format(self.name),
268 | regularizer=self.b_regularizer,
269 | constraint=self.b_constraint)
270 | else:
271 | self.b = None
272 |
273 | self.built = True
274 |
275 | def compute_mask(self, input, input_mask=None):
276 | return None
277 |
278 | def call(self, x, mask=None):
279 | features_dim = self.features_dim
280 | step_dim = self.step_dim
281 |
282 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
283 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
284 |
285 | if self.bias:
286 | eij += self.b
287 |
288 | eij = K.tanh(eij)
289 |
290 | a = K.exp(eij)
291 |
292 | if mask is not None:
293 | a *= K.cast(mask, K.floatx())
294 |
295 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
296 |
297 | a = K.expand_dims(a)
298 | weighted_input = x * a
299 | return K.sum(weighted_input, axis=1)
300 |
301 | def compute_output_shape(self, input_shape):
302 | return input_shape[0], self.features_dim
303 |
304 | # AUC for a binary classifier
305 |
306 |
307 | def auc(y_true, y_pred):
308 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
309 | for k in np.linspace(0, 1, 1000)], axis=0)
310 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
311 | for k in np.linspace(0, 1, 1000)], axis=0)
312 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
313 | binSizes = -(pfas[1:]-pfas[:-1])
314 | s = ptas*binSizes
315 | return K.sum(s, axis=0)
316 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
317 | # PFA, prob false alert for binary classifier
318 |
319 |
320 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
321 | y_pred = K.cast(y_pred >= threshold, 'float32')
322 | # N = total number of negative labels
323 | N = K.sum(1 - y_true)
324 | # FP = total number of false alerts, alerts from the negative class labels
325 | FP = K.sum(y_pred - y_pred * y_true)
326 | return FP/N
327 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
328 | # P_TA prob true alerts for binary classifier
329 |
330 |
331 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
332 | y_pred = K.cast(y_pred >= threshold, 'float32')
333 | # P = total number of positive labels
334 | P = K.sum(y_true)
335 | # TP = total number of correct alerts, alerts from the positive class labels
336 | TP = K.sum(y_pred * y_true)
337 | return TP/P
338 |
339 |
340 | class Lookahead(object):
341 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
342 | """
343 |
344 | def __init__(self, k=5, alpha=0.5):
345 | self.k = k
346 | self.alpha = alpha
347 | self.count = 0
348 |
349 | def inject(self, model):
350 | """Inject the Lookahead algorithm for the given model.
351 | The following code is modified from keras's _make_train_function method.
352 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
353 | """
354 | if not hasattr(model, 'train_function'):
355 | raise RuntimeError('You must compile your model before using it.')
356 |
357 | model._check_trainable_weights_consistency()
358 |
359 | if model.train_function is None:
360 | inputs = (model._feed_inputs +
361 | model._feed_targets +
362 | model._feed_sample_weights)
363 | if model._uses_dynamic_learning_phase():
364 | inputs += [K.learning_phase()]
365 | fast_params = model._collected_trainable_weights
366 |
367 | with K.name_scope('training'):
368 | with K.name_scope(model.optimizer.__class__.__name__):
369 | training_updates = model.optimizer.get_updates(
370 | params=fast_params,
371 | loss=model.total_loss)
372 | slow_params = [K.variable(p) for p in fast_params]
373 | fast_updates = (model.updates +
374 | training_updates +
375 | model.metrics_updates)
376 |
377 | slow_updates, copy_updates = [], []
378 | for p, q in zip(fast_params, slow_params):
379 | slow_updates.append(K.update(q, q + self.alpha * (p - q)))
380 | copy_updates.append(K.update(p, q))
381 |
382 | # Gets loss and metrics. Updates weights at each call.
383 | fast_train_function = K.function(
384 | inputs,
385 | [model.total_loss] + model.metrics_tensors,
386 | updates=fast_updates,
387 | name='fast_train_function',
388 | **model._function_kwargs)
389 |
390 | def F(inputs):
391 | self.count += 1
392 | R = fast_train_function(inputs)
393 | if self.count % self.k == 0:
394 | K.batch_get_value(slow_updates)
395 | K.batch_get_value(copy_updates)
396 | return R
397 |
398 | model.train_function = F
399 |
400 |
401 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs):
402 | "Create embedding layer from a pretrained weights array"
403 | in_dim, out_dim = pretrained_weights.shape
404 | embedding = Embedding(in_dim, out_dim, weights=[
405 |                               pretrained_weights], trainable=trainable, **kwargs)
406 | return embedding
407 |
408 |
409 | def unchanged_shape(input_shape):
410 | "Function for Lambda layer"
411 | return input_shape
412 |
413 |
414 | def substract(input_1, input_2):
415 | "Substract element-wise"
416 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2)
417 | out_ = Add()([input_1, neg_input_2])
418 | return out_
419 |
420 |
421 | def submult(input_1, input_2):
422 | "Get multiplication and subtraction then concatenate results"
423 | mult = Multiply()([input_1, input_2])
424 | sub = substract(input_1, input_2)
425 | out_ = Concatenate()([sub, mult])
426 | return out_
427 |
428 |
429 | def apply_multiple(input_, layers):
430 | "Apply layers to input then concatenate result"
431 | if not len(layers) > 1:
432 | raise ValueError('Layers list should contain more than 1 layer')
433 | else:
434 | agg_ = []
435 | for layer in layers:
436 | agg_.append(layer(input_))
437 | out_ = Concatenate()(agg_)
438 | return out_
439 |
440 |
441 | def time_distributed(input_, layers):
442 | "Apply a list of layers in TimeDistributed mode"
443 | out_ = []
444 | node_ = input_
445 | for layer_ in layers:
446 | node_ = TimeDistributed(layer_)(node_)
447 | out_ = node_
448 | return out_
449 |
450 |
451 | def soft_attention_alignment(input_1, input_2):
452 | "Align text representation with neural soft attention"
453 | attention = Dot(axes=-1)([input_1, input_2])
454 | w_att_1 = Lambda(lambda x: softmax(x, axis=1),
455 | output_shape=unchanged_shape)(attention)
456 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
457 | output_shape=unchanged_shape)(attention))
458 | in1_aligned = Dot(axes=1)([w_att_1, input_1])
459 | in2_aligned = Dot(axes=1)([w_att_2, input_2])
460 | return in1_aligned, in2_aligned
461 |
462 |
463 | def decomposable_attention(pretrained_weights,
464 | num_shape,
465 | projection_dim=300, projection_hidden=0, projection_dropout=0.2,
466 | compare_dim=500, compare_dropout=0.2,
467 | dense_dim=300, dense_dropout=0.2,
468 | lr=1e-3, activation='elu', maxlen=20):
469 | # Based on: https://arxiv.org/abs/1606.01933
470 |
471 | q1 = Input(name='q1', shape=(maxlen,))
472 | q2 = Input(name='q2', shape=(maxlen,))
473 |
474 | # Embedding
475 | embedding = create_pretrained_embedding(pretrained_weights,
476 | mask_zero=False)
477 | q1_embed = embedding(q1)
478 | q2_embed = embedding(q2)
479 |
480 | # Projection
481 | projection_layers = []
482 | if projection_hidden > 0:
483 | projection_layers.extend([
484 | Dense(projection_hidden, activation=activation),
485 | Dropout(rate=projection_dropout),
486 | ])
487 | projection_layers.extend([
488 | Dense(projection_dim, activation=None),
489 | Dropout(rate=projection_dropout),
490 | ])
491 | q1_encoded = time_distributed(q1_embed, projection_layers)
492 | q2_encoded = time_distributed(q2_embed, projection_layers)
493 |
494 | # Attention
495 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
496 |
497 | # Compare
498 | q1_combined = Concatenate()(
499 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
500 | q2_combined = Concatenate()(
501 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
502 | compare_layers = [
503 | Dense(compare_dim, activation=activation),
504 | Dropout(compare_dropout),
505 | Dense(compare_dim, activation=activation),
506 | Dropout(compare_dropout),
507 | ]
508 | q1_compare = time_distributed(q1_combined, compare_layers)
509 | q2_compare = time_distributed(q2_combined, compare_layers)
510 |
511 | # Aggregate
512 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
513 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
514 |
515 | # Classifier
516 | merged = Concatenate()([q1_rep, q2_rep])
517 | dense = BatchNormalization()(merged)
518 | dense = Dense(dense_dim, activation=activation)(dense)
519 | dense = Dropout(dense_dropout)(dense)
520 | dense = BatchNormalization()(dense)
521 | dense = Dense(dense_dim, activation=activation)(dense)
522 | dense = Dropout(dense_dropout)(dense)
523 | out_ = Dense(1, activation='sigmoid')(dense)
524 |
525 | model = Model(inputs=[q1, q2], outputs=out_)
526 | model.compile(loss='binary_crossentropy',
527 | optimizer=AdamW(lr=0.001, weight_decay=0.02,),
528 | metrics=["accuracy", auc])
529 | return model
530 |
531 |
532 | def esim(embedding_matrix,
533 | maxlen=20,
534 | lstm_dim=64,
535 | dense_dim=128,
536 | dense_dropout=0.5):
537 | # Based on arXiv:1609.06038
538 | q1 = Input(name='q1', shape=(8,))
539 | q2 = Input(name='q2', shape=(20,))
540 |
541 | # Embedding
542 | embedding = create_pretrained_embedding(
543 | embedding_matrix, mask_zero=False)
544 | bn = BatchNormalization(axis=2)
545 | q1_embed = bn(embedding(q1))
546 | q2_embed = bn(embedding(q2))
547 |
548 | # Encode
549 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
550 | q1_encoded = encode(q1_embed)
551 | q2_encoded = encode(q2_embed)
552 |
553 | # Attention
554 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
555 |
556 | # Compose
557 | q1_combined = Concatenate()(
558 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
559 | q2_combined = Concatenate()(
560 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
561 |
562 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True))
563 | q1_compare = compose(q1_combined)
564 | q2_compare = compose(q2_combined)
565 |
566 | # Aggregate
567 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
568 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
569 |
570 | # leaks_input = Input(shape=(num_shape,))
571 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input)
572 |
573 | # Classifier
574 | merged = Concatenate()([q1_rep, q2_rep])
575 |
576 | dense = BatchNormalization()(merged)
577 | dense = Dense(dense_dim, activation='elu')(dense)
578 | dense = BatchNormalization()(dense)
579 | dense = Dropout(dense_dropout)(dense)
580 | dense = Dense(dense_dim, activation='elu')(dense)
581 | dense = BatchNormalization()(dense)
582 | dense = Dropout(dense_dropout)(dense)
583 | out_ = Dense(1, activation='sigmoid')(dense)
584 |
585 | model = Model(inputs=[q1, q2], outputs=out_)
586 | model.compile(loss='binary_crossentropy',
587 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,),
588 | metrics=["accuracy", auc])
589 | return model
590 |
591 |
592 | def aux_esim_model(embed_matrix, model_weight_path):
593 | base_model = esim(embed_matrix)
594 | base_model.load_weights(model_weight_path)
595 | input_q, input_a = base_model.inputs
596 | input_f = Input((19,))
597 | hidden_esim = base_model.get_layer(index=28).output
598 | merged = Concatenate()([hidden_esim, input_f])
599 | #dense = BatchNormalization()(merged)
600 | dense = Dense(512, activation='relu')(merged)
601 | #dense = BatchNormalization()(dense)
602 | dense = Dropout(0.5)(dense)
603 | dense = Dense(256, activation='relu')(dense)
604 | #dense = BatchNormalization()(dense)
605 | dense = Dropout(0.5)(dense)
606 | out_ = Dense(1, activation='sigmoid')(dense)
607 |
608 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_)
609 | model.compile(loss='binary_crossentropy',
610 | optimizer=AdamW(lr=0.0003, weight_decay=0.02),
611 | metrics=["accuracy"])
612 | return model
613 |
614 |
615 | #### Model training
616 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048,
617 | label_tag=True, chunk_size=5000)
618 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048,
619 | label_tag=True, chunk_size=5000)
620 | print("train...")
621 | print("###"*30)
622 | gc.collect()
623 | K.clear_session()
624 | weight_path = '/home/kesci/work/chizhu/chizhu_w2v_esim_weight_1_0.44060374074871167.h5'
625 | model = aux_esim_model(embed_matrix, weight_path)
626 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
627 | lookahead.inject(model) # add into model
628 | model.summary()
629 | early_stopping = EarlyStopping(
630 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
631 | reduce_lr = ReduceLROnPlateau(
632 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
633 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5'
634 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
635 | save_best_only=False,
636 | verbose=1, save_weights_only=True, period=1)
637 | callbacks = [checkpoint, reduce_lr, early_stopping]
638 | # print("load weight....")
639 |
640 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
641 | epochs=10, verbose=1, callbacks=callbacks,
642 | validation_data=val_gen, validation_steps=int(
643 | np.ceil(1000000/2048)),
644 | max_queue_size=10, workers=1, use_multiprocessing=False)
645 |
646 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
647 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
648 | val_prob = model.predict_generator(
649 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
650 |
651 | f = open('/home/kesci/zhifeng/val.csv', 'r')
652 | q, a, l = [], [], []
653 | for line in f:
654 | qid, _, aid, _, label = line.strip().split(',')
655 | q.append(qid)
656 | a.append(aid)
657 | l.append(int(label))
658 |
659 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
660 | val_df['prob'] = val_prob.flatten()
661 |
662 | roc_auc_score(val_df['label'], val_df['prob'])
663 |
664 |
665 | def perauc(df):
666 | temp = pd.Series()
667 | try:
668 | temp['auc'] = roc_auc_score(df['label'], df['prob'])
669 | except:
670 | temp['auc'] = 0.5
671 | return temp
672 |
673 |
674 | eval_df = val_df.groupby("qid").apply(perauc)
675 | eval_df.index = range(len(eval_df))
676 | print("qauc:", eval_df['auc'].mean())
677 |
678 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
679 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
680 | prob = model.predict_generator(
681 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
682 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
683 | names=['qid', 'aid', 'prob'])
684 | sub['prob'] = prob.flatten()
685 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False)
686 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
687 |                      feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
688 | prob = model.predict_generator(test_gen, steps=int(
689 |     np.ceil(100000000/4096)), verbose=1)
690 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
691 |     'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
692 | final['prob'] = prob.flatten()
693 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False)
694 |
--------------------------------------------------------------------------------
/finetuning_w2v_rnn.py:
--------------------------------------------------------------------------------
1 | from keras.activations import softmax
2 | from sklearn.preprocessing import StandardScaler
3 | import os
4 | import pandas as pd
5 | import numpy as np
6 | import random as rn
7 | from tqdm import tqdm, tqdm_notebook
8 | import tensorflow as tf
9 | from sklearn.metrics import roc_auc_score
10 | from keras.preprocessing.text import Tokenizer
11 | from keras.preprocessing.sequence import pad_sequences
12 | from keras.optimizers import Adam
13 | from keras import backend as K
14 | from keras.optimizers import *
15 | from keras.callbacks import *
16 | from keras.layers import *
17 | from keras.models import *
18 | from keras.engine.topology import Layer
19 | from keras import initializers, regularizers, constraints, optimizers, layers
20 | from keras.initializers import *
21 | import keras
22 | from sklearn.model_selection import StratifiedKFold, GroupKFold
23 | import gc
24 | import time
25 | from gensim.models import Word2Vec
26 | import logging
27 | import Levenshtein
28 | import fasttext
29 | tqdm.pandas()
30 | np.random.seed(1017)
31 | rn.seed(1017)
32 | tf.set_random_seed(1017)
33 | path = "/home/kesci/input/bytedance/"
34 | out = '/home/kesci/work/zhifeng/'
35 | out_chizhu = '/home/kesci/work/chizhu/'
36 | print(os.listdir(path))
37 |
38 | f1 = pd.read_csv(out_chizhu + 'f1.csv')
39 | f2 = pd.read_csv(out_chizhu + 'f2.csv')
40 | f3 = pd.read_csv(out_chizhu + 'f3.csv')
41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1)
42 | del f1, f2, f3
43 | gc.collect()
44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl")
45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl")
46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl")
47 | testb_w2v = pd.read_pickle(
48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl")
49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v)
50 |
51 | train_w2v = pd.read_pickle(
52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl")
53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl")
54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl")
55 | testb_w2v = pd.read_pickle(
56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl")
57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \
58 | list(testa_w2v)+list(testb_w2v)
59 | del train_w2v, val_w2v, testa_w2v, testb_w2v
60 | gc.collect()
61 | feature.shape
62 |
63 | len_train = 99000000
64 | len_val = 1000000
65 | len_testa = 20000000
66 | len_testb = 100000000
67 | sc = StandardScaler()
68 | feature = sc.fit_transform(feature)
69 | train_feature = feature[:len_train]
70 | val_feature = feature[len_train:len_train+len_val]
71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa]
72 | testb_feature = feature[-len_testb:]
73 | print(train_feature.shape, val_feature.shape,
74 | testa_feature.shape, testb_feature.shape)
75 |
76 | del feature
77 | gc.collect()
78 |
79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model')
80 |
81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)}
82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)}
83 |
84 |
85 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20,
86 | maxlen_query=8):
87 | if label_tag:
88 | _, _q, _, _a, _label = line.strip().split(',')
89 | else:
90 | _, _q, _, _a = line.strip().split(',')
91 | q_seq = [token.get(item, 0) for item in _q.strip().split()]
92 | a_seq = [token.get(item, 0) for item in _a.strip().split()]
93 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:]
94 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:]
95 | if label_tag:
96 | return q_pad, a_pad, int(_label)
97 | return q_pad, a_pad
98 |
99 |
100 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8):
101 | while True:
102 | fin = open(path, 'r')
103 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
104 |         for line_idx, line in enumerate(fin):  # keep the row index separate from the chunk loop's i below
105 | if len(batch_q) == chunk_size*batch_size:
106 | batch_q = np.array(batch_q)
107 | batch_a = np.array(batch_a)
108 | batch_f = np.array(batch_f)
109 | if label_tag:
110 | batch_label = np.array(batch_label)
111 | idx = list(range(chunk_size*batch_size))
112 | if shuffle:
113 | np.random.shuffle(idx)
114 | for i in range(chunk_size):
115 | if label_tag:
116 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
117 | np.array(
118 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
119 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
120 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
121 | else:
122 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
123 | np.array(
124 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
125 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
126 | batch_q, batch_a, batch_f, batch_label = [], [], [], []
127 | if label_tag:
128 | q, a, l = gen_feature_help(line, label_tag=label_tag)
129 | else:
130 | q, a = gen_feature_help(line, label_tag=label_tag)
131 | l = 0
132 | batch_q.append(q)
133 | batch_a.append(a)
134 |             batch_f.append(feature[line_idx])
135 | if label_tag:
136 | batch_label.append(l)
137 |
138 | batch_q = np.array(batch_q)
139 | batch_a = np.array(batch_a)
140 | batch_f = np.array(batch_f)
141 |
142 | if label_tag:
143 | batch_label = np.array(batch_label)
144 | idx = list(range(len(batch_q)))
145 | if shuffle:
146 | np.random.shuffle(idx)
147 | for i in range(int(np.ceil(len(batch_q)/batch_size))):
148 | if label_tag:
149 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
150 | np.array(
151 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
152 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])],
153 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]))
154 | else:
155 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]),
156 | np.array(
157 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]),
158 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])]
159 | fin.close()
160 |
161 |
162 | def get_embedding_matrix():
163 | m = np.zeros(shape=(len(index2word)+1, 300))
164 | for i, w in index2word.items():
165 | m[i, :] = w2v[w]
166 | return m
167 |
168 |
169 | embed_matrix = get_embedding_matrix()
170 | maxlen_query = 8
171 | maxlen_answer = 20
172 |
173 |
174 | class AdamW(Optimizer):
175 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
176 | epsilon=1e-8, decay=0., **kwargs):
177 | super(AdamW, self).__init__(**kwargs)
178 | with K.name_scope(self.__class__.__name__):
179 | self.iterations = K.variable(0, dtype='int64', name='iterations')
180 | self.lr = K.variable(lr, name='lr')
181 | self.beta_1 = K.variable(beta_1, name='beta_1')
182 | self.beta_2 = K.variable(beta_2, name='beta_2')
183 | self.decay = K.variable(decay, name='decay')
184 | # decoupled weight decay (2/4)
185 | self.wd = K.variable(weight_decay, name='weight_decay')
186 | self.epsilon = epsilon
187 | self.initial_decay = decay
188 |
189 | @interfaces.legacy_get_updates_support
190 | def get_updates(self, loss, params):
191 | grads = self.get_gradients(loss, params)
192 | self.updates = [K.update_add(self.iterations, 1)]
193 | wd = self.wd # decoupled weight decay (3/4)
194 |
195 | lr = self.lr
196 | if self.initial_decay > 0:
197 | lr *= (1. / (1. + self.decay * K.cast(self.iterations,
198 | K.dtype(self.decay))))
199 |
200 | t = K.cast(self.iterations, K.floatx()) + 1
201 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
202 | (1. - K.pow(self.beta_1, t)))
203 |
204 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
205 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
206 | self.weights = [self.iterations] + ms + vs
207 |
208 | for p, g, m, v in zip(params, grads, ms, vs):
209 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
210 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
211 | # decoupled weight decay (4/4)
212 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
213 |
214 | self.updates.append(K.update(m, m_t))
215 | self.updates.append(K.update(v, v_t))
216 | new_p = p_t
217 |
218 | # Apply constraints.
219 | if getattr(p, 'constraint', None) is not None:
220 | new_p = p.constraint(new_p)
221 |
222 | self.updates.append(K.update(p, new_p))
223 | return self.updates
224 |
225 | def get_config(self):
226 | config = {'lr': float(K.get_value(self.lr)),
227 | 'beta_1': float(K.get_value(self.beta_1)),
228 | 'beta_2': float(K.get_value(self.beta_2)),
229 | 'decay': float(K.get_value(self.decay)),
230 | 'weight_decay': float(K.get_value(self.wd)),
231 | 'epsilon': self.epsilon}
232 | base_config = super(AdamW, self).get_config()
233 | return dict(list(base_config.items()) + list(config.items()))
234 |
235 |
236 | class Attention(Layer):
237 | def __init__(self, step_dim,
238 | W_regularizer=None, b_regularizer=None,
239 | W_constraint=None, b_constraint=None,
240 | bias=True, **kwargs):
241 | self.supports_masking = True
242 | self.init = initializers.get('glorot_uniform')
243 |
244 | self.W_regularizer = regularizers.get(W_regularizer)
245 | self.b_regularizer = regularizers.get(b_regularizer)
246 |
247 | self.W_constraint = constraints.get(W_constraint)
248 | self.b_constraint = constraints.get(b_constraint)
249 |
250 | self.bias = bias
251 | self.step_dim = step_dim
252 | self.features_dim = 0
253 | super(Attention, self).__init__(**kwargs)
254 |
255 | def build(self, input_shape):
256 | assert len(input_shape) == 3
257 |
258 | self.W = self.add_weight((input_shape[-1],),
259 | initializer=self.init,
260 | name='{}_W'.format(self.name),
261 | regularizer=self.W_regularizer,
262 | constraint=self.W_constraint)
263 | self.features_dim = input_shape[-1]
264 |
265 | if self.bias:
266 | self.b = self.add_weight((input_shape[1],),
267 | initializer='zero',
268 | name='{}_b'.format(self.name),
269 | regularizer=self.b_regularizer,
270 | constraint=self.b_constraint)
271 | else:
272 | self.b = None
273 |
274 | self.built = True
275 |
276 | def compute_mask(self, input, input_mask=None):
277 | return None
278 |
279 | def call(self, x, mask=None):
280 | features_dim = self.features_dim
281 | step_dim = self.step_dim
282 |
283 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
284 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
285 |
286 | if self.bias:
287 | eij += self.b
288 |
289 | eij = K.tanh(eij)
290 |
291 | a = K.exp(eij)
292 |
293 | if mask is not None:
294 | a *= K.cast(mask, K.floatx())
295 |
296 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
297 |
298 | a = K.expand_dims(a)
299 | weighted_input = x * a
300 | return K.sum(weighted_input, axis=1)
301 |
302 | def compute_output_shape(self, input_shape):
303 | return input_shape[0], self.features_dim
304 |
305 | # AUC for a binary classifier
306 |
307 |
308 | def auc(y_true, y_pred):
309 | ptas = tf.stack([binary_PTA(y_true, y_pred, k)
310 | for k in np.linspace(0, 1, 1000)], axis=0)
311 | pfas = tf.stack([binary_PFA(y_true, y_pred, k)
312 | for k in np.linspace(0, 1, 1000)], axis=0)
313 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
314 | binSizes = -(pfas[1:]-pfas[:-1])
315 | s = ptas*binSizes
316 | return K.sum(s, axis=0)
317 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
318 | # PFA, prob false alert for binary classifier
319 |
320 |
321 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
322 | y_pred = K.cast(y_pred >= threshold, 'float32')
323 | # N = total number of negative labels
324 | N = K.sum(1 - y_true)
325 | # FP = total number of false alerts, alerts from the negative class labels
326 | FP = K.sum(y_pred - y_pred * y_true)
327 | return FP/N
328 | #-----------------------------------------------------------------------------------------------------------------------------------------------------
329 | # P_TA prob true alerts for binary classifier
330 |
331 |
332 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
333 | y_pred = K.cast(y_pred >= threshold, 'float32')
334 | # P = total number of positive labels
335 | P = K.sum(y_true)
336 | # TP = total number of correct alerts, alerts from the positive class labels
337 | TP = K.sum(y_pred * y_true)
338 | return TP/P
339 |
340 |
341 | class Lookahead(object):
342 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
343 | """
344 |
345 | def __init__(self, k=5, alpha=0.5):
346 | self.k = k
347 | self.alpha = alpha
348 | self.count = 0
349 |
350 | def inject(self, model):
351 | """Inject the Lookahead algorithm for the given model.
352 | The following code is modified from keras's _make_train_function method.
353 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
354 | """
355 | if not hasattr(model, 'train_function'):
356 | raise RuntimeError('You must compile your model before using it.')
357 |
358 | model._check_trainable_weights_consistency()
359 |
360 | if model.train_function is None:
361 | inputs = (model._feed_inputs +
362 | model._feed_targets +
363 | model._feed_sample_weights)
364 | if model._uses_dynamic_learning_phase():
365 | inputs += [K.learning_phase()]
366 | fast_params = model._collected_trainable_weights
367 |
368 | with K.name_scope('training'):
369 | with K.name_scope(model.optimizer.__class__.__name__):
370 | training_updates = model.optimizer.get_updates(
371 | params=fast_params,
372 | loss=model.total_loss)
373 | slow_params = [K.variable(p) for p in fast_params]
374 | fast_updates = (model.updates +
375 | training_updates +
376 | model.metrics_updates)
377 |
378 | slow_updates, copy_updates = [], []
379 | for p, q in zip(fast_params, slow_params):
380 | slow_updates.append(K.update(q, q + self.alpha * (p - q)))
381 | copy_updates.append(K.update(p, q))
382 |
383 | # Gets loss and metrics. Updates weights at each call.
384 | fast_train_function = K.function(
385 | inputs,
386 | [model.total_loss] + model.metrics_tensors,
387 | updates=fast_updates,
388 | name='fast_train_function',
389 | **model._function_kwargs)
390 |
391 | def F(inputs):
392 | self.count += 1
393 | R = fast_train_function(inputs)
394 | if self.count % self.k == 0:
395 | K.batch_get_value(slow_updates)
396 | K.batch_get_value(copy_updates)
397 | return R
398 |
399 | model.train_function = F
400 |
401 |
402 | def get_model(embedding_matrix):
403 |
404 | K.clear_session()
405 | #The embedding layer containing the word vectors
406 | emb_layer = Embedding(
407 | input_dim=embedding_matrix.shape[0],
408 | output_dim=embedding_matrix.shape[1],
409 | weights=[embedding_matrix],
410 | trainable=False
411 | )
412 | sdrop=SpatialDropout1D(rate=0.2)
413 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True,
414 | kernel_initializer=glorot_uniform(seed = 123)))
415 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True,
416 | kernel_initializer=glorot_uniform(seed = 123)))
417 |
418 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
419 |
420 | # Define inputs
421 | seq1 = Input(shape=(maxlen_query,))
422 | x1 = emb_layer(seq1)
423 | x1 = sdrop(x1)
424 | lstm1 = lstm_layer(x1)
425 | gru1 = gru_layer(lstm1)
426 | att_1 = Attention(maxlen_query)(lstm1)
427 | att_3 = Attention(maxlen_query)(gru1)
428 | cnn1 = cnn1d_layer(lstm1)
429 |
430 | avg_pool = GlobalAveragePooling1D()
431 | max_pool = GlobalMaxPooling1D()
432 |
433 | seq2 = Input(shape=(maxlen_answer,))
434 | x2 = emb_layer(seq2)
435 | x2 = sdrop(x2)
436 | lstm2 = lstm_layer(x2)
437 | gru2 = gru_layer(lstm2)
438 | att_2 = Attention(maxlen_answer)(lstm2)
439 | att_4 = Attention(maxlen_answer)(gru2)
440 | cnn2 = cnn1d_layer(lstm2)
441 |
442 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)])
443 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)])
444 |
445 | merge = Multiply()([x1, x2])
446 | merge = Dropout(0.5)(merge)
447 | # The MLP that determines the outcome
448 | x = Dense(128,kernel_initializer=he_uniform(seed=123), activation='relu',)(merge)
449 | # x = Dropout(0.2)(x)
450 | # x = BatchNormalization()(x)
451 |
452 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x)
453 |
454 |
455 | model = Model(inputs=[seq1,seq2], outputs=pred)
456 |
457 | model.compile(loss='binary_crossentropy',
458 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,),
459 | metrics=["accuracy"])
460 | # model.summary()
461 | return model
462 |
463 |
464 | def aux_esim_model(embed_matrix, model_weight_path):
465 | base_model = get_model(embed_matrix)
466 | base_model.load_weights(model_weight_path)
467 | input_q, input_a = base_model.inputs
468 | input_f = Input((19,))
469 | hidden_esim = base_model.get_layer(index=15).output
470 | merged = Concatenate()([hidden_esim, input_f])
471 | #dense = BatchNormalization()(merged)
472 | dense = Dense(512, activation='relu')(merged)
473 | #dense = BatchNormalization()(dense)
474 | dense = Dropout(0.5)(dense)
475 | dense = Dense(256, activation='relu')(dense)
476 | #dense = BatchNormalization()(dense)
477 | dense = Dropout(0.5)(dense)
478 | out_ = Dense(1, activation='sigmoid')(dense)
479 |
480 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_)
481 | model.compile(loss='binary_crossentropy',
482 | optimizer=AdamW(lr=0.0003, weight_decay=0.02),
483 | metrics=["accuracy"])
484 | return model
485 |
486 |
487 | #### Model training
488 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048,
489 | label_tag=True, chunk_size=5000)
490 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048,
491 | label_tag=True, chunk_size=5000)
492 | print("train...")
493 | print("###"*30)
494 | gc.collect()
495 | K.clear_session()
496 | weight_path = '/home/kesci/work/zhifeng/zhifeng_rnn_weight_1_0.668621638244629.h5'
497 | model = aux_esim_model(embed_matrix, weight_path)
498 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
499 | lookahead.inject(model) # add into model
500 | model.summary()
501 | early_stopping = EarlyStopping(
502 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
503 | reduce_lr = ReduceLROnPlateau(
504 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
505 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5'
506 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
507 | save_best_only=False,
508 | verbose=1, save_weights_only=True, period=1)
509 | callbacks = [checkpoint, reduce_lr, early_stopping]
510 | # print("load weight....")
511 |
512 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
513 | epochs=10, verbose=1, callbacks=callbacks,
514 | validation_data=val_gen, validation_steps=int(
515 | np.ceil(1000000/2048)),
516 | max_queue_size=10, workers=1, use_multiprocessing=False)
517 |
518 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
519 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
520 | val_prob = model.predict_generator(
521 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
522 |
523 | f = open('/home/kesci/zhifeng/val.csv', 'r')
524 | q, a, l = [], [], []
525 | for line in f:
526 | qid, _, aid, _, label = line.strip().split(',')
527 | q.append(qid)
528 | a.append(aid)
529 | l.append(int(label))
530 |
531 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
532 | val_df['prob'] = val_prob.flatten()
533 |
534 | roc_auc_score(val_df['label'], val_df['prob'])
535 |
536 |
537 | def perauc(df):
538 |     temp = pd.Series()  # per-query AUC; single-class queries fall back to 0.5
539 |     try:
540 |         temp['auc'] = roc_auc_score(df['label'], df['prob'])
541 |     except ValueError:
542 |         temp['auc'] = 0.5
543 |     return temp
544 |
545 |
546 | eval_df = val_df.groupby("qid").apply(perauc)
547 | eval_df.index = range(len(eval_df))
548 | print("qauc:", eval_df['auc'].mean())
549 |
550 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
551 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
552 | prob = model.predict_generator(
553 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
554 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
555 | names=['qid', 'aid', 'prob'])
556 | sub['prob'] = prob.flatten()
557 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False)
558 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
559 |                      feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
560 | prob = model.predict_generator(test_gen, steps=int(
561 |     np.ceil(100000000/4096)), verbose=1)
562 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
563 |     'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
564 | final['prob'] = prob.flatten()
565 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False)
566 |
--------------------------------------------------------------------------------
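The script above reports a per-query mean AUC ("qauc") on the validation set, which is the metric this pipeline tracks. As a standalone reference, here is a minimal sketch of that computation on toy data; the column names qid/label/prob mirror the ones used above, and queries whose labels are all one class fall back to 0.5 exactly as in perauc. The helper name is illustrative, not part of the original code.

```python
import pandas as pd
from sklearn.metrics import roc_auc_score

def qauc(df):
    """Mean per-query ROC-AUC; queries with only one label class count as 0.5."""
    def _one_query(g):
        try:
            return roc_auc_score(g['label'], g['prob'])
        except ValueError:  # all labels identical within this query
            return 0.5
    return df.groupby('qid').apply(_one_query).mean()

# Toy example: three queries, the last one has no positives.
toy = pd.DataFrame({
    'qid':   [1, 1, 1, 2, 2, 3, 3],
    'label': [1, 0, 0, 1, 0, 0, 0],
    'prob':  [0.9, 0.2, 0.4, 0.3, 0.8, 0.1, 0.2],
})
print(qauc(toy))  # (1.0 + 0.0 + 0.5) / 3 = 0.5
```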
/gen_feature.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm, tqdm_notebook
2 | from sklearn.model_selection import StratifiedKFold, GroupKFold
3 | import numpy as np
4 | import os
5 | import Levenshtein
6 | import logging
7 | from gensim.models import Word2Vec
8 | import time
9 | import gc
10 | import keras
11 | from keras.initializers import *
12 | from keras import initializers, regularizers, constraints, optimizers, layers
13 | from keras.engine.topology import Layer
14 | from keras.models import *
15 | from keras.layers import *
16 | from keras.callbacks import *
17 | from keras.optimizers import *
18 | from keras import backend as K
19 | from keras.optimizers import Adam
20 | from keras.preprocessing.sequence import pad_sequences
21 | from keras.preprocessing.text import Tokenizer
22 | from sklearn.metrics import roc_auc_score
23 | import tensorflow as tf
24 | import random as rn
25 | import pandas as pd
26 | tqdm.pandas()
27 | np.random.seed(1017)
28 | rn.seed(1017)
29 | tf.set_random_seed(1017)
30 | path = "/home/kesci/input/bytedance/"
31 | out = '/home/kesci/work/chizhu/'
32 | print(os.listdir(path))
33 |
34 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label'])
35 |
36 | testa = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title'])
37 | testb = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])
38 |
39 | testa['label']=-1
40 | testb['label']=-2
41 | test=pd.concat([testa,testb],ignore_index=True)
42 | del testa,testb
43 | gc.collect()
44 |
45 | train['title']=train['title'].apply(lambda x:str(x).replace("\t",""),1)
46 | test['title']=test['title'].apply(lambda x:str(x).replace("\t",""),1)
47 | data_all=pd.concat([train,test],ignore_index=True)
48 | del train,test
49 | gc.collect()
50 |
51 | # Build feature set f1
52 | def get_union_data(row):
53 | title_list = row['title'].split(' ')
54 | query_list = row['query'].split(' ')
55 | return len(list(set(title_list).intersection(set(query_list))))
56 |
57 | def same_1(row):
58 | title_list = row['title'].split(' ')
59 | query_list = row['query'].split(' ')
60 | if title_list[0] == query_list[0]:
61 | return 1
62 | else:
63 | return 0
64 |
65 | def same_2(row):
66 | title_list = row['title'].split(' ')
67 | query_list = row['query'].split(' ')
68 | if ' '.join(title_list[:2]) == ' '.join(query_list[:2]):
69 | return 1
70 | else:
71 | return 0
72 |
73 | def same_3(row):
74 | title_list = row['title'].split(' ')
75 | query_list = row['query'].split(' ')
76 | if ' '.join(title_list[:3]) == ' '.join(query_list[:3]):
77 | return 1
78 | else:
79 | return 0
80 |
81 | def is_all_in(row):
82 | if row['query'] in row['title']:
83 | return 1
84 | else:
85 | return 0
86 |
87 | feature = pd.DataFrame()
88 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' ')))
89 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' ')))
90 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度']
91 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1)
92 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1)
93 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8)
94 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8)
95 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1)
96 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1)
97 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1)
98 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1)
99 | feature.to_csv(out + 'f1.csv', index=False)
100 |
101 | # Build feature set f2
102 | def pos_1(row):
103 | title_list = row['title'].split(' ')
104 | query_list = row['query'].split(' ')
105 | value = -1
106 | try:
107 | value = title_list.index(query_list[0])
108 | except Exception:
109 | value = -1
110 | return value
111 |
112 | def pos_2(row):
113 | title_list = row['title'].split(' ')
114 | query_list = row['query'].split(' ')
115 | if len(query_list) <=1 :
116 | return -1
117 | try:
118 | value = title_list.index(query_list[1])
119 | except Exception:
120 | value = -1
121 | return value
122 |
123 | def pos_3(row):
124 | title_list = row['title'].split(' ')
125 | query_list = row['query'].split(' ')
126 | if len(query_list) <=2 :
127 | return -1
128 | try:
129 | value = title_list.index(query_list[2])
130 | except Exception:
131 | value = -1
132 | return value
133 |
134 | feature = pd.DataFrame()
135 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1)
136 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1)
137 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1)
138 | feature.to_csv(out + 'f2.csv', index=False)
139 |
140 | feature = pd.DataFrame()
141 | feature['标题求组合后词语'] = data_all.groupby('title').query.transform('nunique')
142 | feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique')
143 | feature.to_csv(out + 'f3.csv', index=False)
144 |
145 |
146 | # data_all = data_all.fillna(-1)
147 | # data_all.to_pickle(out+"data.pickle")
148 |
149 | # data_all = pd.read_pickle(out+"data.pickle")
150 | # f5: similarity taken directly from the word2vec model
151 | from gensim.models import Word2Vec
152 | import gensim
153 | import logging
154 | feature = pd.DataFrame()
155 | w2v = Word2Vec.load(out + 'new_skip_w2v_all_300.model')
156 | def get_new_w2v(seq1, seq2):
157 | seq1 = seq1.split(' ')
158 | seq2 = seq2.split(' ')
159 | try:
160 | return w2v.n_similarity(seq1, seq2)
161 |     except Exception:  # out-of-vocabulary tokens make n_similarity fail
162 | return -1
163 |
164 | f3 = pd.read_csv(out + 'f3.csv')
165 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1)
166 | f3.to_csv(out + 'f3.csv', index=False)
167 |
168 |
169 |
--------------------------------------------------------------------------------
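The "w2v本身相似度" feature above comes from gensim's n_similarity, which (in the gensim versions this code targets) is the cosine similarity between the mean word vectors of the two token lists and fails with KeyError when any token is out of vocabulary, hence the -1 fallback. A rough plain-numpy equivalent, shown only to make the feature's meaning explicit — the model filename is the one saved by train_w2v.py, `out` is the output directory used in gen_feature.py, and the helper name is illustrative:

```python
import numpy as np
from gensim.models import Word2Vec

w2v = Word2Vec.load(out + 'new_skip_w2v_all_300.model')  # `out` as in gen_feature.py

def mean_vector_cosine(tokens_a, tokens_b):
    """Cosine similarity between mean word vectors; -1 when a token is OOV."""
    try:
        va = np.mean([w2v.wv[t] for t in tokens_a], axis=0)
        vb = np.mean([w2v.wv[t] for t in tokens_b], axis=0)
    except KeyError:
        return -1.0
    denom = np.linalg.norm(va) * np.linalg.norm(vb)
    return float(np.dot(va, vb) / denom) if denom else -1.0

# e.g. mean_vector_cosine(row['query'].split(' '), row['title'].split(' '))
```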
/get_corpus.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | 
3 | # `path` (input dir) and `out` (output dir) are expected from the surrounding
4 | # notebook session, as in gen_feature.py.
5 | fout = open(out + "corpus.csv", 'w')
6 | with open(path+"train_final.csv", 'r') as fin:
7 |     q_last = ''
8 |     for line in tqdm(fin):
9 |         _, q, _, t, _ = line.strip().split(',')
10 |         if q != q_last:
11 |             q_last = q
12 |             fout.write(q + '\n')
13 |         fout.write(t + '\n')
14 | with open(path+"test_final_part1.csv", 'r') as fin:
15 |     q_last = ''
16 |     for line in tqdm(fin):
17 |         _, q, _, t = line.strip().split(',')
18 |         if q != q_last:
19 |             q_last = q
20 |             fout.write(q + '\n')
21 |         fout.write(t + '\n')
22 | fout.close()
23 | """
24 | Format of corpus.csv: one text per line, tokens separated by spaces.
25 | (The query is written only when it differs from the previous row's query, which
26 | deduplicates it because the input files are grouped by query.) Example lines:
27 | 我 鄂温克 三打底裤 是是
28 | 说的
29 | 是对的是
30 | 时代大厦 是对的
31 | 是赛事方 说的
32 | """
33 | 
--------------------------------------------------------------------------------
/train_fasttext.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import fasttext
3 | 
4 | # `out` is the shared output dir from the notebook session (as in gen_feature.py).
5 | w2v = fasttext.train_unsupervised(input=out+"corpus.csv")  # default dim = 100
6 | w2v.save_model(out+'corpus.fasttext.model')
7 | w2v = fasttext.load_model(out+'corpus.fasttext.model')
8 | word2index = {word: index+1 for index, word in enumerate(w2v.words)}
9 | index2word = {index+1: word for index, word in enumerate(w2v.words)}
10 | 
11 | 
12 | def get_embedding_matrix():
13 |     # Row 0 stays all zeros for padding; rows 1.. follow word2index / index2word.
14 |     m = np.zeros(shape=(len(index2word)+1, 100))
15 |     for i, w in index2word.items():
16 |         m[i, :] = w2v[w]
17 |     return m
18 | 
--------------------------------------------------------------------------------
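fasttext.train_unsupervised defaults to 100-dimensional vectors, which is why get_embedding_matrix allocates 100 columns, and row 0 is reserved for padding, which is why every word index is offset by +1. For context, here is a minimal sketch of how such a word2index mapping is typically turned into the fixed-length id sequences the Keras Embedding layer expects; pad_sequences and the maxlen values 8/20 appear elsewhere in this repo, while the helper name below is illustrative:

```python
from keras.preprocessing.sequence import pad_sequences

def texts_to_padded_ids(texts, word2index, maxlen):
    """Map space-separated texts to padded index sequences; unknown words -> 0."""
    seqs = [[word2index.get(tok, 0) for tok in t.split(' ')] for t in texts]
    return pad_sequences(seqs, maxlen=maxlen)

# e.g. queries use maxlen_query = 8 and titles use maxlen_answer = 20 in the model code:
# X_q = texts_to_padded_ids(data_all['query'].values, word2index, 8)
# X_t = texts_to_padded_ids(data_all['title'].values, word2index, 20)
```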
/train_w2v.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from tqdm import tqdm
3 | from gensim.models import word2vec
4 | 
5 | # Assumed paths, matching where gen_feature.py / w2v_cos.py read these models from.
6 | path = "/home/kesci/input/bytedance/"
7 | out = '/home/kesci/work/chizhu/'
8 | 
9 | logging.basicConfig(
10 |     format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
11 | sent = word2vec.Text8Corpus("/home/kesci/work/zhifeng/corpus.csv")
12 | word2vecModel = word2vec.Word2Vec(sent, size=300, window=5, min_count=1, iter=5,
13 |                                   sg=1, workers=8)
14 | word2vecModel.save(out+"skip_w2v_all_300.model")
15 | 
16 | # ##### further train on the final (test B) corpus
17 | model = word2vec.Word2Vec.load(out+"skip_w2v_all_300.model")
18 | fout = open(out + "new_corpus.csv", 'w')
19 | with open(path+"bytedance_contest.final_2.csv", 'r') as fin:
20 |     q_last = ''
21 |     for line in tqdm(fin):
22 |         _, q, _, t = line.strip().split(',')
23 |         if q != q_last:
24 |             q_last = q
25 |             fout.write(q + '\n')
26 |         fout.write(t + '\n')
27 | fout.close()
28 | sent = word2vec.Text8Corpus(out + "new_corpus.csv")
29 | model.build_vocab(sent, update=True)
30 | model.train(sent, total_examples=model.corpus_count, epochs=5)
31 | model.save(out+"new_skip_w2v_all_300.model")
32 | 
--------------------------------------------------------------------------------
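Since build_vocab(update=True) only appends vocabulary from the new test-B corpus before the extra five epochs, a quick sanity check of the further-training step is to compare vocabulary sizes of the two saved models. A minimal sketch, assuming the filenames and `out` defined above and the gensim 3.x API used by this repo:

```python
from gensim.models import word2vec

old = word2vec.Word2Vec.load(out + "skip_w2v_all_300.model")
new = word2vec.Word2Vec.load(out + "new_skip_w2v_all_300.model")
# The difference is the number of tokens first seen in the test-B data.
print(len(old.wv.vocab), "->", len(new.wv.vocab))
```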
/w2v_cos.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random as rn
5 | from tqdm import tqdm, tqdm_notebook
6 | from sklearn.metrics import roc_auc_score
7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
8 | import gc
9 | import time
10 | from gensim.models import Word2Vec
11 | import fasttext
12 | import scipy.spatial.distance as ssd
13 | 
14 | tqdm.pandas()
15 | input_path = "/home/kesci/input/bytedance/"
16 | out_work_path = '/home/kesci/work/zhifeng/'
17 | out_path = '/home/kesci/zhifeng/'
18 |
19 | w2v = Word2Vec.load('/home/kesci/work/chizhu/skip_w2v_all_300.model')
20 |
21 |
22 | def get_sentence_embeddings(text, sep=' ', dim=300):
23 | v = np.zeros(dim)
24 | words = text.strip().split(sep)
25 | cnt = 0
26 | for word in words:
27 | if word in w2v:
28 | v += w2v[word]
29 | cnt += 1
30 | return v/cnt if cnt != 0 else v
31 |
32 |
33 | train_cosine_list = []
34 | with open(out_path+'train.smaller.csv', 'r') as fin:
35 | for line in tqdm(fin):
36 | _, q, _, a, _ = line.strip().split(',')
37 | v1 = get_sentence_embeddings(q)
38 | v2 = get_sentence_embeddings(a)
39 | train_cosine_list.append(ssd.cosine(v1, v2))
40 | pd.to_pickle(np.array(train_cosine_list), out_work_path+'train.cosine.w2v.pkl')
41 | val_cosine_list = []
42 | with open(out_path+'val.csv', 'r') as fin:
43 | for line in tqdm(fin):
44 | _, q, _, a, _ = line.strip().split(',')
45 | v1 = get_sentence_embeddings(q)
46 | v2 = get_sentence_embeddings(a)
47 | val_cosine_list.append(ssd.cosine(v1, v2))
48 | pd.to_pickle(np.array(val_cosine_list), out_work_path+'val.cosine.w2v.pkl')
49 | test_cosine_list = []
50 | with open(input_path+'test_final_part1.csv', 'r') as fin:
51 | for line in tqdm(fin):
52 | _, q, _, a = line.strip().split(',')
53 | v1 = get_sentence_embeddings(q)
54 | v2 = get_sentence_embeddings(a)
55 | test_cosine_list.append(ssd.cosine(v1, v2))
56 | pd.to_pickle(np.array(test_cosine_list), out_path+'test.cosine.w2v.pkl')
57 |
--------------------------------------------------------------------------------
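Note that scipy's ssd.cosine returns a cosine distance (1 − cosine similarity), so lower values mean a query and title are closer, and it is undefined when either vector is all zeros, which get_sentence_embeddings can produce when every token is out of vocabulary. A guarded variant for reference; this is purely illustrative and not part of the original pipeline:

```python
import numpy as np
import scipy.spatial.distance as ssd

def safe_cosine_distance(v1, v2):
    """ssd.cosine with a neutral 1.0 fallback when either vector is all zeros."""
    if not np.any(v1) or not np.any(v2):
        return 1.0
    return float(ssd.cosine(v1, v2))
```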