├── NN_pipeline.py ├── README.md ├── bigtrain_fasttext_esim.py ├── bigtrain_w2v_esim.py ├── bigtrain_w2v_rnn.py ├── chizhu_rnn.py ├── fasttext_cos.py ├── finetuning_fasttext_esim.py ├── finetuning_w2v_esim.py ├── finetuning_w2v_rnn.py ├── gen_feature.py ├── get_corpus.py ├── train_fasttext.py ├── train_w2v.py └── w2v_cos.py /NN_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | tqdm.pandas() 28 | np.random.seed(1017) 29 | rn.seed(1017) 30 | tf.set_random_seed(1017) 31 | path = "/home/kesci/input/bytedance/" 32 | out = '/home/kesci/work/chizhu/' 33 | print(os.listdir(path)) 34 | 35 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label']) 36 | test = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title']) 37 | 38 | train['title']=train['title'].apply(lambda x:str(x).replace("\t",""),1) 39 | test['title']=test['title'].apply(lambda x:str(x).replace("\t",""),1) 40 | data_all=pd.concat([train,test],ignore_index=True) 41 | del train,test 42 | gc.collect() 43 | 44 | # 构造特征集 f1 45 | def get_union_data(row): 46 | title_list = row['title'].split(' ') 47 | query_list = row['query'].split(' ') 48 | return len(list(set(title_list).intersection(set(query_list)))) 49 | 50 | def same_1(row): 51 | title_list = row['title'].split(' ') 52 | query_list = row['query'].split(' ') 53 | if title_list[0] == query_list[0]: 54 | return 1 55 | else: 56 | return 0 57 | 58 | def same_2(row): 59 | title_list = row['title'].split(' ') 60 | query_list = row['query'].split(' ') 61 | if ' '.join(title_list[:2]) == ' '.join(query_list[:2]): 62 | return 1 63 | else: 64 | return 0 65 | 66 | def same_3(row): 67 | title_list = row['title'].split(' ') 68 | query_list = row['query'].split(' ') 69 | if ' '.join(title_list[:3]) == ' '.join(query_list[:3]): 70 | return 1 71 | else: 72 | return 0 73 | 74 | def is_all_in(row): 75 | if row['query'] in row['title']: 76 | return 1 77 | else: 78 | return 0 79 | 80 | feature = pd.DataFrame() 81 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' '))) 82 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' '))) 83 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度'] 84 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1) 85 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1) 86 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8) 
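# Quick sanity example (toy values, not from the dataset): for a pair like
#   row = {'query': '1 9 117', 'title': '3 9 117 120'}
# the helpers above give
#   get_union_data(row) -> 2   (shared terms '9' and '117')
#   same_1(row)         -> 0   (first terms '1' vs '3' differ)
#   is_all_in(row)      -> 0   ('1 9 117' is not a substring of the title)
# which is what the intersection / prefix / containment feature columns in this block record.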
87 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8) 88 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1) 89 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1) 90 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1) 91 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1) 92 | feature.to_csv(out + 'f1.csv', index=False) 93 | 94 | 95 | # 构造特征集 f2 96 | def pos_1(row): 97 | title_list = row['title'].split(' ') 98 | query_list = row['query'].split(' ') 99 | value = -1 100 | try: 101 | value = title_list.index(query_list[0]) 102 | except Exception: 103 | value = -1 104 | return value 105 | 106 | def pos_2(row): 107 | title_list = row['title'].split(' ') 108 | query_list = row['query'].split(' ') 109 | if len(query_list) <=1 : 110 | return -1 111 | try: 112 | value = title_list.index(query_list[1]) 113 | except Exception: 114 | value = -1 115 | return value 116 | 117 | def pos_3(row): 118 | title_list = row['title'].split(' ') 119 | query_list = row['query'].split(' ') 120 | if len(query_list) <=2 : 121 | return -1 122 | try: 123 | value = title_list.index(query_list[2]) 124 | except Exception: 125 | value = -1 126 | return value 127 | 128 | feature = pd.DataFrame() 129 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1) 130 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1) 131 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1) 132 | feature.to_csv(out + 'f2.csv', index=False) 133 | 134 | feature = pd.DataFrame() 135 | feature['标题求组合后词语'] = data_all.groupby('title').query.transform('nunique') 136 | # feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique') 137 | feature.to_csv(out + 'f3.csv', index=False) 138 | 139 | # data_all = data_all.fillna(-1) 140 | # data_all.to_csv(out+"data.csv", index=False) 141 | 142 | # data_all = pd.read_csv(out+"data.csv") 143 | 144 | # f5 word2vec本身相似度 145 | from gensim.models import Word2Vec 146 | import gensim 147 | import logging 148 | feature = pd.DataFrame() 149 | w2v = Word2Vec.load(out + 'w2v.model') 150 | def get_new_w2v(seq1, seq2): 151 | seq1 = seq1.split(' ') 152 | seq2 = seq2.split(' ') 153 | try: 154 | return w2v.n_similarity(seq1, seq2) 155 | except: 156 | return -1 157 | 158 | f3 = pd.read_csv(out + 'f3.csv') 159 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1) 160 | f3.to_csv(out + 'f3.csv', index=False) 161 | 162 | f1 = pd.read_csv(out + 'f1.csv') 163 | f2 = pd.read_csv(out + 'f2.csv') 164 | f3 = pd.read_csv(out + 'f3.csv') 165 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 166 | del f1, f2, f3 167 | gc.collect() 168 | 169 | train = data_all[data_all['label'] != -1] 170 | test = data_all[data_all['label'] == -1] 171 | del data_all 172 | gc.collect() 173 | train_feature = feature[:len(train)] 174 | test_feature = feature[len(train):] 175 | train.index = range(len(train)) 176 | test.index = range(len(test)) 177 | train_feature.index = range(len(train_feature)) 178 | test_feature.index = range(len(test_feature)) 179 | del feature 180 | gc.collect() 181 | 182 | embed_size = 300 # how big is each word vector 183 | # how many unique words to use (i.e num rows in embedding vector) 184 | max_features = None 185 | maxlen1 = 8 186 | maxlen2 = 20 # max number of words in a question to use 
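# Sketch of what the Tokenizer + pad_sequences calls below produce (the token ids here are
# made up for illustration): a query like "1 9 117" becomes e.g. [4, 2, 56] via
# texts_to_sequences, and pad_sequences(..., maxlen=8) left-pads it to
# [0, 0, 0, 0, 0, 4, 2, 56]; titles are handled the same way with maxlen=20.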
187 | 188 | train_X1 = train["query"].fillna("0").values 189 | test_X1 = test["query"].fillna("0").values 190 | 191 | train_X2 = train["title"].fillna("0").values 192 | test_X2 = test["title"].fillna("0").values 193 | print("token...") 194 | tokenizer = Tokenizer(num_words=max_features) 195 | tokenizer.fit_on_texts(list(train_X1)+list(test_X1) + 196 | list(train_X2)+list(test_X2)) 197 | train_X1 = tokenizer.texts_to_sequences(train_X1) 198 | test_X1 = tokenizer.texts_to_sequences(test_X1) 199 | ## Pad the sentences 200 | print("padding") 201 | train_X1 = pad_sequences(train_X1, maxlen=maxlen1) 202 | test_X1 = pad_sequences(test_X1, maxlen=maxlen1) 203 | 204 | train_X2 = tokenizer.texts_to_sequences(train_X2) 205 | test_X2 = tokenizer.texts_to_sequences(test_X2) 206 | ## Pad the sentences 207 | train_X2 = pad_sequences(train_X2, maxlen=maxlen2) 208 | test_X2 = pad_sequences(test_X2, maxlen=maxlen2) 209 | ## Get the target values 210 | 211 | train_y = train['label'].values 212 | 213 | word_index = tokenizer.word_index 214 | gc.collect() 215 | 216 | text_list = train['query'].values.tolist() 217 | text_list.extend(test['query'].values.tolist()) 218 | text_list.extend(train['title'].values.tolist()) 219 | text_list.extend(test['title'].values.tolist()) 220 | del train,test 221 | gc.collect() 222 | import time 223 | time.sleep(10) 224 | text_list = [[word for word in str(document).split(' ') ] for document in text_list] 225 | logging.basicConfig( 226 | format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO) 227 | w2v = Word2Vec(text_list, size=300, window=7, iter=30, seed=10, workers=4, min_count=3) 228 | w2v.save(out+"w2v.model") 229 | w2v.wv.save_word2vec_format(out+'new_w2v_300.txt') 230 | print("w2v model done") 231 | del w2v, text_list, texts 232 | gc.collect() 233 | 234 | 235 | def get_embedding_matrix(word_index, embed_size=embed_size, Emed_path=out+"new_w2v_300.txt"): 236 | embeddings_index = gensim.models.KeyedVectors.load_word2vec_format( 237 | Emed_path, binary=False) 238 | nb_words = len(word_index)+1 239 | embedding_matrix = np.zeros((nb_words, embed_size)) 240 | count = 0 241 | for word, i in tqdm(word_index.items()): 242 | if i >= nb_words: 243 | continue 244 | try: 245 | embedding_vector = embeddings_index[word] 246 | except: 247 | embedding_vector = np.zeros(embed_size) 248 | count += 1 249 | if embedding_vector is not None: 250 | embedding_matrix[i] = embedding_vector 251 | 252 | print("null cnt", count) 253 | return embedding_matrix 254 | 255 | 256 | embedding_matrix = get_embedding_matrix(word_index) 257 | 258 | 259 | class AdamW(Optimizer): 260 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 261 | epsilon=1e-8, decay=0., **kwargs): 262 | super(AdamW, self).__init__(**kwargs) 263 | with K.name_scope(self.__class__.__name__): 264 | self.iterations = K.variable(0, dtype='int64', name='iterations') 265 | self.lr = K.variable(lr, name='lr') 266 | self.beta_1 = K.variable(beta_1, name='beta_1') 267 | self.beta_2 = K.variable(beta_2, name='beta_2') 268 | self.decay = K.variable(decay, name='decay') 269 | # decoupled weight decay (2/4) 270 | self.wd = K.variable(weight_decay, name='weight_decay') 271 | self.epsilon = epsilon 272 | self.initial_decay = decay 273 | 274 | @interfaces.legacy_get_updates_support 275 | def get_updates(self, loss, params): 276 | grads = self.get_gradients(loss, params) 277 | self.updates = [K.update_add(self.iterations, 1)] 278 | wd = self.wd # decoupled weight decay (3/4) 279 | 280 | 
lr = self.lr 281 | if self.initial_decay > 0: 282 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 283 | K.dtype(self.decay)))) 284 | 285 | t = K.cast(self.iterations, K.floatx()) + 1 286 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 287 | (1. - K.pow(self.beta_1, t))) 288 | 289 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 290 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 291 | self.weights = [self.iterations] + ms + vs 292 | 293 | for p, g, m, v in zip(params, grads, ms, vs): 294 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 295 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 296 | # decoupled weight decay (4/4) 297 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 298 | 299 | self.updates.append(K.update(m, m_t)) 300 | self.updates.append(K.update(v, v_t)) 301 | new_p = p_t 302 | 303 | # Apply constraints. 304 | if getattr(p, 'constraint', None) is not None: 305 | new_p = p.constraint(new_p) 306 | 307 | self.updates.append(K.update(p, new_p)) 308 | return self.updates 309 | 310 | def get_config(self): 311 | config = {'lr': float(K.get_value(self.lr)), 312 | 'beta_1': float(K.get_value(self.beta_1)), 313 | 'beta_2': float(K.get_value(self.beta_2)), 314 | 'decay': float(K.get_value(self.decay)), 315 | 'weight_decay': float(K.get_value(self.wd)), 316 | 'epsilon': self.epsilon} 317 | base_config = super(AdamW, self).get_config() 318 | return dict(list(base_config.items()) + list(config.items())) 319 | 320 | 321 | class Attention(Layer): 322 | def __init__(self, step_dim, 323 | W_regularizer=None, b_regularizer=None, 324 | W_constraint=None, b_constraint=None, 325 | bias=True, **kwargs): 326 | self.supports_masking = True 327 | self.init = initializers.get('glorot_uniform') 328 | 329 | self.W_regularizer = regularizers.get(W_regularizer) 330 | self.b_regularizer = regularizers.get(b_regularizer) 331 | 332 | self.W_constraint = constraints.get(W_constraint) 333 | self.b_constraint = constraints.get(b_constraint) 334 | 335 | self.bias = bias 336 | self.step_dim = step_dim 337 | self.features_dim = 0 338 | super(Attention, self).__init__(**kwargs) 339 | 340 | def build(self, input_shape): 341 | assert len(input_shape) == 3 342 | 343 | self.W = self.add_weight((input_shape[-1],), 344 | initializer=self.init, 345 | name='{}_W'.format(self.name), 346 | regularizer=self.W_regularizer, 347 | constraint=self.W_constraint) 348 | self.features_dim = input_shape[-1] 349 | 350 | if self.bias: 351 | self.b = self.add_weight((input_shape[1],), 352 | initializer='zero', 353 | name='{}_b'.format(self.name), 354 | regularizer=self.b_regularizer, 355 | constraint=self.b_constraint) 356 | else: 357 | self.b = None 358 | 359 | self.built = True 360 | 361 | def compute_mask(self, input, input_mask=None): 362 | return None 363 | 364 | def call(self, x, mask=None): 365 | features_dim = self.features_dim 366 | step_dim = self.step_dim 367 | 368 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 369 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 370 | 371 | if self.bias: 372 | eij += self.b 373 | 374 | eij = K.tanh(eij) 375 | 376 | a = K.exp(eij) 377 | 378 | if mask is not None: 379 | a *= K.cast(mask, K.floatx()) 380 | 381 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 382 | 383 | a = K.expand_dims(a) 384 | weighted_input = x * a 385 | return K.sum(weighted_input, axis=1) 386 | 387 | def compute_output_shape(self, input_shape): 388 | return input_shape[0], self.features_dim 389 | 390 
| # AUC for a binary classifier 391 | def auc(y_true, y_pred): 392 | ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0) 393 | pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0) 394 | pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0) 395 | binSizes = -(pfas[1:]-pfas[:-1]) 396 | s = ptas*binSizes 397 | return K.sum(s, axis=0) 398 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 399 | # PFA, prob false alert for binary classifier 400 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 401 | y_pred = K.cast(y_pred >= threshold, 'float32') 402 | # N = total number of negative labels 403 | N = K.sum(1 - y_true) 404 | # FP = total number of false alerts, alerts from the negative class labels 405 | FP = K.sum(y_pred - y_pred * y_true) 406 | return FP/N 407 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 408 | # P_TA prob true alerts for binary classifier 409 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 410 | y_pred = K.cast(y_pred >= threshold, 'float32') 411 | # P = total number of positive labels 412 | P = K.sum(y_true) 413 | # TP = total number of correct alerts, alerts from the positive class labels 414 | TP = K.sum(y_pred * y_true) 415 | return TP/P 416 | 417 | 418 | val = train[99000000:] 419 | train = train[:99000000] 420 | val_X1 = train_X1[99000000:] 421 | val_X2 = train_X2[99000000:] 422 | train_X1 = train_X1[:99000000] 423 | train_X2 = train_X2[:99000000] 424 | val_feature = train_feature[99000000:] 425 | train_feature = train_feature[:99000000] 426 | 427 | class ManDist(keras.layers.Layer): # 封装成keras层的曼哈顿距离计算 428 | 429 | # 初始化ManDist层,此时不需要任何参数输入 430 | def __init__(self, **kwargs): 431 | self.result = None 432 | super(ManDist, self).__init__(**kwargs) 433 | 434 | # 自动建立ManDist层 435 | def build(self, input_shape): 436 | super(ManDist, self).build(input_shape) 437 | 438 | # 计算曼哈顿距离 439 | def call(self, x, **kwargs): 440 | self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)) 441 | return self.result 442 | 443 | # 返回结果 444 | def compute_output_shape(self, input_shape): 445 | return K.int_shape(self.result) 446 | 447 | 448 | sc = StandardScaler() 449 | col_len = len(train_feature.columns) 450 | sc.fit(pd.concat([train_feature, val_feature, test_feature])) 451 | train_feature = sc.transform(train_feature) 452 | val_feature = sc.transform(val_feature) 453 | test_feature = sc.transform(test_feature) 454 | 455 | def get_model(embedding_matrix): 456 | 457 | K.clear_session() 458 | #The embedding layer containing the word vectors 459 | emb_layer = Embedding( 460 | input_dim=embedding_matrix.shape[0], 461 | output_dim=embedding_matrix.shape[1], 462 | weights=[embedding_matrix], 463 | trainable=False 464 | ) 465 | sdrop=SpatialDropout1D(rate=0.2) 466 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 467 | kernel_initializer=glorot_uniform(seed = 123))) 468 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 469 | kernel_initializer=glorot_uniform(seed = 123))) 470 | 471 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 472 | 473 | # Define inputs 474 | seq1 = Input(shape=(maxlen1,)) 475 | x1 = emb_layer(seq1) 476 | x1 = sdrop(x1) 477 | lstm1 = lstm_layer(x1) 478 | gru1 = gru_layer(lstm1) 479 
| att_1 = Attention(maxlen1)(lstm1) 480 | att_3 = Attention(maxlen1)(gru1) 481 | cnn1 = cnn1d_layer(lstm1) 482 | 483 | avg_pool = GlobalAveragePooling1D() 484 | max_pool = GlobalMaxPooling1D() 485 | 486 | seq2 = Input(shape=(maxlen2,)) 487 | x2 = emb_layer(seq2) 488 | x2 = sdrop(x2) 489 | lstm2 = lstm_layer(x2) 490 | gru2 = gru_layer(lstm2) 491 | att_2 = Attention(maxlen2)(lstm2) 492 | att_4 = Attention(maxlen2)(gru2) 493 | cnn2 = cnn1d_layer(lstm2) 494 | 495 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 496 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 497 | 498 | merge = Multiply()([x1, x2]) 499 | merge = Dropout(0.2)(merge) 500 | 501 | hin = Input(shape=(col_len,)) 502 | # htime = Dense(col_len,activation='relu')(hin) 503 | x = Concatenate()([merge,hin]) 504 | # The MLP that determines the outcome 505 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 506 | # x = Dropout(0.2)(x) 507 | # x = BatchNormalization()(x) 508 | 509 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 510 | 511 | 512 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 513 | 514 | model.compile(loss='binary_crossentropy', 515 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 516 | metrics=["accuracy",auc]) 517 | # model.summary() 518 | return model 519 | 520 | 521 | ####模型训练 522 | 523 | print("train...") 524 | print("###"*30) 525 | gc.collect() 526 | K.clear_session() 527 | model = get_model(embedding_matrix) 528 | # model = esim() 529 | model.summary() 530 | early_stopping = EarlyStopping( 531 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 532 | reduce_lr = ReduceLROnPlateau( 533 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 534 | bst_model_path = out+'chizhurnn_chizhu_weight.h5' 535 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 536 | save_best_only=True, verbose=1, save_weights_only=True) 537 | callbacks = [checkpoint, reduce_lr, early_stopping] 538 | print("load weight....") 539 | # model.load_weights(bst_model_path) 540 | 541 | hist = model.fit([train_X1,train_X2,train_feature],train['label'], 542 | validation_data=([val_X1,val_X2,val_feature], val['label']), 543 | epochs=30, batch_size=2048, 544 | # class_weight="auto", 545 | callbacks=callbacks,verbose=1 546 | 547 | ) 548 | 549 | model.load_weights(bst_model_path) 550 | 551 | res = np.squeeze(model.predict( 552 | [val_X1, val_X2, val_feature], batch_size=2048, verbose=1)) 553 | 554 | print("val auc:{}".format(roc_auc_score(val['label'], res))) 555 | val['prob'] = res 556 | 557 | 558 | def perauc(df): 559 | temp = pd.DataFrame(index=range(1)) 560 | temp['query_id'] = df['query_id'].values[0] 561 | try: 562 | temp['auc'] = roc_auc_score(df['label'].values.astype(int), df['prob']) 563 | except: 564 | temp['auc'] = 0.5 565 | return temp 566 | 567 | 568 | eval_df = val.groupby("query_id", as_index=False).apply(lambda x: perauc(x)) 569 | eval_df.index = range(len(eval_df)) 570 | print("qauc:", eval_df['auc'].mean()) 571 | 572 | test_prob = np.squeeze(model.predict( 573 | [test_X1, test_X2, test_feature], batch_size=2048, verbose=1)) 574 | 575 | 576 | sub = test[['query_id', 'query_title_id']] 577 | sub['prediction'] = test_prob 578 | sub.to_csv(out+"/submit_rnn.csv", index=False, header=False) 579 | 580 | 581 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | ### 高校赛解决方案 2 | #### 赛题介绍 3 | * **数据** 4 | 5 | 提供10亿量级的数据,根据query和title预测query下doc点击率。数据已经脱敏并且分好词。 6 | 7 | | 列名 | 类型 | 示例 | 8 | | ------ | ------ | ------ | 9 | | query_id | int | 3 | 10 | | query | hash string,term空格分割 | 1 9 117 | 11 | | query_title_id | title在query下的唯一标识 | 2 | 12 | | title | hash string,term空格分割 | 3 9 120 | 13 | | label | int,取值{0, 1} | 0 | 14 | * **任务分析** 15 | 二分类问题。文本相似度+ctr点击预测 16 | * **难点** 17 | 18 | * 数据量大 19 | * 数据脱敏 20 | 21 | #### 解决方案 22 | ##### 特征工程(FE) 23 | * 问题长度 24 | * 标题长度 25 | * 标题长度-问题长度 26 | * 问题是否全部在标题里面 27 | * 标题和问题的共词个数 28 | * 标题问题词语的共词个数/问题长度 29 | * 标题问题词语的共词个数/标题长度 30 | * 编辑距离 31 | * 前一个词语是否相同 32 | * 前二个词语是否相同 33 | * 前三个词语是否相同 34 | * 第一个词语在标题里面出现位置 35 | * 第二个词语在标题里面出现位置 36 | * 第三个词语在标题里面出现位置 37 | * 标题求组合后词语 38 | * 词语求组合后标题 39 | * w2v_n_similarity 40 | * fasttext的余弦相似度 41 | * word2vec的余弦相似度 42 | 43 | (共19个特征,放入LGB模型lb是0.597) 44 | ##### NN模型 45 | * 孪生RNN 46 | * query+title双输入+FE特征 47 | * 使用最后一亿的数据(前9.9千万条数据训练+后1百万数据验证) 48 | * 网络结构 49 | ```python 50 | def get_model(embedding_matrix): 51 | K.clear_session() 52 | #The embedding layer containing the word vectors 53 | emb_layer = Embedding( 54 | input_dim=embedding_matrix.shape[0], 55 | output_dim=embedding_matrix.shape[1], 56 | weights=[embedding_matrix], 57 | trainable=False 58 | ) 59 | sdrop=SpatialDropout1D(rate=0.2) 60 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123))) 61 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, kernel_initializer=glorot_uniform(seed = 123))) 62 | 63 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 64 | 65 | # Define inputs 66 | seq1 = Input(shape=(maxlen_query,)) 67 | x1 = emb_layer(seq1) 68 | x1 = sdrop(x1) 69 | lstm1 = lstm_layer(x1) 70 | gru1 = gru_layer(lstm1) 71 | att_1 = Attention(maxlen_query)(lstm1) 72 | att_3 = Attention(maxlen_query)(gru1) 73 | cnn1 = cnn1d_layer(lstm1) 74 | 75 | avg_pool = GlobalAveragePooling1D() 76 | max_pool = GlobalMaxPooling1D() 77 | 78 | seq2 = Input(shape=(maxlen_answer,)) 79 | x2 = emb_layer(seq2) 80 | x2 = sdrop(x2) 81 | lstm2 = lstm_layer(x2) 82 | gru2 = gru_layer(lstm2) 83 | att_2 = Attention(maxlen_answer)(lstm2) 84 | att_4 = Attention(maxlen_answer)(gru2) 85 | cnn2 = cnn1d_layer(lstm2) 86 | 87 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 88 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 89 | 90 | merge = Multiply()([x1, x2]) 91 | merge = Dropout(0.2)(merge) 92 | 93 | hin = Input(shape=(19,)) 94 | # htime = Dense(col_len,activation='relu')(hin) 95 | x = Concatenate()([merge,hin]) 96 | # The MLP that determines the outcome 97 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 98 | # x = Dropout(0.2)(x) 99 | # x = BatchNormalization()(x) 100 | 101 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 102 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 103 | model.compile(loss='binary_crossentropy', 104 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 105 | metrics=["accuracy",auc]) 106 | # model.summary() 107 | return model 108 | ``` 109 | 110 | * 使用AdamW优化器加快训练过程 111 | * 使用最新刚出的lookahead 优化器(reference:Lookahead Optimizer: k steps forward, 1 step back(https://arxiv.org/abs/1907.08610)) 112 | Lookahead 算法的性能显著优于 SGD 和 Adam,它迭代地更新两组权重。直观来说,Lookahead 算法通过提前观察另一个优化器生成的「fast 
weights」序列,来选择搜索方向。该研究发现,Lookahead 算法能够提升学习稳定性,不仅降低了调参需要的功夫,同时还能提升收敛速度与效果。 113 | * 线上效果 114 | **lb 0.6214** 115 | * **fine-tuning(亮点)** 116 | * 思考:官方提供10亿的数据量?先验知识告诉我们,数据越多效果越好,那么如何充分利用数据? 117 | * 解决方法 118 | * 先用10亿数据训练一个不加任何特征的裸NN,保存权重(如何能训练10亿?) 119 | > 文件流处理数据+分批次训练(训练10亿数据最大占用内存才10G) 120 | * 加载裸NN模型,获得倒二层的feature map作为输出,加入新的FE特征输入,然后把基模型的feature map和FE特征拼接最后送入全连接层。用最后一亿的数据fine-tuning 整个网络。 121 | (再次展示预训练在NLP领域的举足轻重不可动摇的地位) 122 | 123 | * fine-tuning用到的模型(整体参数都是改小了的,因为只有单卡机器,如果可以多卡训练,放开参数估计单模可以0.64+) 124 | * word2vec300维+孪生RNN(小参数) **lb 0.6248** 125 | * word2vec300维+ESIM(极小参数,最后时刻怕跑不完) **lb 0.626** 126 | * fasttext100维+ESIM(小参数) **lb 0.6336 单模都可以在A榜排到第三** 127 | * fine-tuning 网络结构 128 | ```python 129 | def aux_esim_model(embed_matrix,model_weight_path): 130 | base_model = esim(embed_matrix) 131 | base_model.load_weights(model_weight_path) 132 | input_q, input_a = base_model.inputs 133 | input_f = Input((19,)) 134 | hidden_esim = base_model.get_layer(index=28).output 135 | merged = Concatenate()([hidden_esim, input_f]) 136 | #dense = BatchNormalization()(merged) 137 | dense = Dense(512, activation='relu')(merged) 138 | #dense = BatchNormalization()(dense) 139 | dense = Dropout(0.5)(dense) 140 | dense = Dense(256, activation='relu')(dense) 141 | #dense = BatchNormalization()(dense) 142 | dense = Dropout(0.5)(dense) 143 | out_ = Dense(1, activation='sigmoid')(dense) 144 | 145 | model = Model(inputs=[input_q,input_a,input_f], outputs=out_) 146 | model.compile(loss='binary_crossentropy', 147 | optimizer=AdamW(lr=0.0003,weight_decay=0.02), 148 | metrics=["accuracy"]) 149 | return model 150 | ``` 151 | * ESIM 网络结构 152 | ```python 153 | def esim(embedding_matrix, 154 | maxlen=20, 155 | lstm_dim=64, 156 | dense_dim=128, 157 | dense_dropout=0.5): 158 | # Based on arXiv:1609.06038 159 | q1 = Input(name='q1', shape=(8,)) 160 | q2 = Input(name='q2', shape=(20,)) 161 | 162 | # Embedding 163 | embedding = create_pretrained_embedding( 164 | embedding_matrix, mask_zero=False) 165 | bn = BatchNormalization(axis=2) 166 | q1_embed = bn(embedding(q1)) 167 | q2_embed = bn(embedding(q2)) 168 | 169 | # Encode 170 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 171 | q1_encoded = encode(q1_embed) 172 | q2_encoded = encode(q2_embed) 173 | 174 | # Attention 175 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 176 | 177 | # Compose 178 | q1_combined = Concatenate()( 179 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 180 | q2_combined = Concatenate()( 181 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 182 | 183 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 184 | q1_compare = compose(q1_combined) 185 | q2_compare = compose(q2_combined) 186 | 187 | # Aggregate 188 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 189 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 190 | 191 | 192 | merged = Concatenate()([q1_rep, q2_rep]) 193 | 194 | dense = BatchNormalization()(merged) 195 | dense = Dense(dense_dim, activation='elu')(dense) 196 | dense = BatchNormalization()(dense) 197 | dense = Dropout(dense_dropout) (dense) 198 | dense = Dense(dense_dim, activation='elu')(dense) 199 | dense = BatchNormalization()(dense) 200 | dense = Dropout(dense_dropout)(dense) 201 | out_ = Dense(1, activation='sigmoid')(dense) 202 | 203 | model = Model(inputs=[q1, q2], outputs=out_) 204 | model.compile(loss='binary_crossentropy', 205 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,), 206 | 
metrics=["accuracy",auc]) 207 | return model 208 | ``` 209 | 210 | 211 | #### 线上提交 212 | * finetuning_fasttext_esim(**0.6336**)*0.6+\ 213 | finetuning_w2v_esim(**0.626**)*0.2+\ 214 | finetuning_w2v_esim(**0.6248**)*0.2=**lb 0.6366** 215 |
216 | 217 | * finetuning_fasttext_esim(**0.6336**)*0.5+\ 218 | finetuning_w2v_esim(**0.626**)*0.2+\ 219 | finetuning_w2v_esim(**0.6248**)*0.2+\ 220 | 孪生RNN(**0.6214**)*0.1=ensemble_NN 221 | 222 | lgb(**0.597**)*0.1+ensemble_NN*0.9= **lb 0.6371** 223 | 224 | 225 | 226 | 227 | #### 我们的优势 228 | * 工业可部署 229 | > 真实的线上业务也是庞大的数据量,如何充分利用数据是个难题。我们的方案适用于大数据量(流式训练全量数据内存小+finetuing迁移学习效果佳) 230 | 231 | * 简单而实用 232 | > 我们总共才19个特征,不需要提取大量的手工特征,所以可以说不依赖于LGB模型,LGB模型是全量模型,要么只能选用小数据集提特征要么大数据量提取不了特征,不易迭代。我们的方案流式处理,易于迭代更新。 233 | 234 | 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /bigtrain_fasttext_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = fasttext.load_model(out+'corpus.fasttext.model') 37 | word2index = {word: index+1 for index, word in enumerate(w2v.words)} 38 | index2word = {index+1: word for index, word in enumerate(w2v.words)} 39 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 40 | maxlen_query=8): 41 | if label_tag: 42 | _, _q, _, _a, _label = line.strip().split(',') 43 | else: 44 | _, _q, _, _a = line.strip().split(',') 45 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 46 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 47 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 48 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 49 | if label_tag: 50 | return q_pad, a_pad, int(_label) 51 | return q_pad, a_pad 52 | 53 | 54 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 55 | while True: 56 | fin = open(path, 'r') 57 | batch_q, batch_a, batch_label = [], [], [] 58 | for line in fin: 59 | if len(batch_q) == chunk_size*batch_size: 60 | batch_q = np.array(batch_q) 61 | batch_a = np.array(batch_a) 62 | if label_tag: 63 | batch_label = np.array(batch_label) 64 | idx = list(range(chunk_size*batch_size)) 65 | if shuffle: 66 | np.random.shuffle(idx) 67 | for i in range(chunk_size): 68 | if label_tag: 69 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 70 | else: 71 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 
np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 72 | batch_q, batch_a, batch_label = [], [], [] 73 | if label_tag: 74 | q, a, l = gen_feature_help(line, label_tag=label_tag) 75 | else: 76 | q, a = gen_feature_help(line, label_tag=label_tag) 77 | l = 0 78 | batch_q.append(q) 79 | batch_a.append(a) 80 | if label_tag: 81 | batch_label.append(l) 82 | 83 | batch_q = np.array(batch_q) 84 | batch_a = np.array(batch_a) 85 | 86 | if label_tag: 87 | batch_label = np.array(batch_label) 88 | idx = list(range(len(batch_q))) 89 | if shuffle: 90 | np.random.shuffle(idx) 91 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 92 | if label_tag: 93 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 94 | else: 95 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 96 | fin.close() 97 | 98 | 99 | def get_embedding_matrix(): 100 | m = np.zeros(shape=(len(index2word)+1, 100)) 101 | for i, w in index2word.items(): 102 | m[i, :] = w2v[w] 103 | return m 104 | 105 | 106 | embed_matrix = get_embedding_matrix() 107 | maxlen_query = 8 108 | maxlen_answer = 20 109 | 110 | 111 | class AdamW(Optimizer): 112 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 113 | epsilon=1e-8, decay=0., **kwargs): 114 | super(AdamW, self).__init__(**kwargs) 115 | with K.name_scope(self.__class__.__name__): 116 | self.iterations = K.variable(0, dtype='int64', name='iterations') 117 | self.lr = K.variable(lr, name='lr') 118 | self.beta_1 = K.variable(beta_1, name='beta_1') 119 | self.beta_2 = K.variable(beta_2, name='beta_2') 120 | self.decay = K.variable(decay, name='decay') 121 | # decoupled weight decay (2/4) 122 | self.wd = K.variable(weight_decay, name='weight_decay') 123 | self.epsilon = epsilon 124 | self.initial_decay = decay 125 | 126 | @interfaces.legacy_get_updates_support 127 | def get_updates(self, loss, params): 128 | grads = self.get_gradients(loss, params) 129 | self.updates = [K.update_add(self.iterations, 1)] 130 | wd = self.wd # decoupled weight decay (3/4) 131 | 132 | lr = self.lr 133 | if self.initial_decay > 0: 134 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 135 | K.dtype(self.decay)))) 136 | 137 | t = K.cast(self.iterations, K.floatx()) + 1 138 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 139 | (1. - K.pow(self.beta_1, t))) 140 | 141 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 142 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 143 | self.weights = [self.iterations] + ms + vs 144 | 145 | for p, g, m, v in zip(params, grads, ms, vs): 146 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 147 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 148 | # decoupled weight decay (4/4) 149 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 150 | 151 | self.updates.append(K.update(m, m_t)) 152 | self.updates.append(K.update(v, v_t)) 153 | new_p = p_t 154 | 155 | # Apply constraints. 
156 | if getattr(p, 'constraint', None) is not None: 157 | new_p = p.constraint(new_p) 158 | 159 | self.updates.append(K.update(p, new_p)) 160 | return self.updates 161 | 162 | def get_config(self): 163 | config = {'lr': float(K.get_value(self.lr)), 164 | 'beta_1': float(K.get_value(self.beta_1)), 165 | 'beta_2': float(K.get_value(self.beta_2)), 166 | 'decay': float(K.get_value(self.decay)), 167 | 'weight_decay': float(K.get_value(self.wd)), 168 | 'epsilon': self.epsilon} 169 | base_config = super(AdamW, self).get_config() 170 | return dict(list(base_config.items()) + list(config.items())) 171 | 172 | 173 | class Attention(Layer): 174 | def __init__(self, step_dim, 175 | W_regularizer=None, b_regularizer=None, 176 | W_constraint=None, b_constraint=None, 177 | bias=True, **kwargs): 178 | self.supports_masking = True 179 | self.init = initializers.get('glorot_uniform') 180 | 181 | self.W_regularizer = regularizers.get(W_regularizer) 182 | self.b_regularizer = regularizers.get(b_regularizer) 183 | 184 | self.W_constraint = constraints.get(W_constraint) 185 | self.b_constraint = constraints.get(b_constraint) 186 | 187 | self.bias = bias 188 | self.step_dim = step_dim 189 | self.features_dim = 0 190 | super(Attention, self).__init__(**kwargs) 191 | 192 | def build(self, input_shape): 193 | assert len(input_shape) == 3 194 | 195 | self.W = self.add_weight((input_shape[-1],), 196 | initializer=self.init, 197 | name='{}_W'.format(self.name), 198 | regularizer=self.W_regularizer, 199 | constraint=self.W_constraint) 200 | self.features_dim = input_shape[-1] 201 | 202 | if self.bias: 203 | self.b = self.add_weight((input_shape[1],), 204 | initializer='zero', 205 | name='{}_b'.format(self.name), 206 | regularizer=self.b_regularizer, 207 | constraint=self.b_constraint) 208 | else: 209 | self.b = None 210 | 211 | self.built = True 212 | 213 | def compute_mask(self, input, input_mask=None): 214 | return None 215 | 216 | def call(self, x, mask=None): 217 | features_dim = self.features_dim 218 | step_dim = self.step_dim 219 | 220 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 221 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 222 | 223 | if self.bias: 224 | eij += self.b 225 | 226 | eij = K.tanh(eij) 227 | 228 | a = K.exp(eij) 229 | 230 | if mask is not None: 231 | a *= K.cast(mask, K.floatx()) 232 | 233 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 234 | 235 | a = K.expand_dims(a) 236 | weighted_input = x * a 237 | return K.sum(weighted_input, axis=1) 238 | 239 | def compute_output_shape(self, input_shape): 240 | return input_shape[0], self.features_dim 241 | 242 | # AUC for a binary classifier 243 | 244 | 245 | def auc(y_true, y_pred): 246 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 247 | for k in np.linspace(0, 1, 1000)], axis=0) 248 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 249 | for k in np.linspace(0, 1, 1000)], axis=0) 250 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 251 | binSizes = -(pfas[1:]-pfas[:-1]) 252 | s = ptas*binSizes 253 | return K.sum(s, axis=0) 254 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 255 | # PFA, prob false alert for binary classifier 256 | 257 | 258 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 259 | y_pred = K.cast(y_pred >= threshold, 'float32') 260 | # N = total number of negative labels 261 | N = K.sum(1 - y_true) 262 | # FP = total number of false alerts, alerts from the negative 
class labels 263 | FP = K.sum(y_pred - y_pred * y_true) 264 | return FP/N 265 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 266 | # P_TA prob true alerts for binary classifier 267 | 268 | 269 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 270 | y_pred = K.cast(y_pred >= threshold, 'float32') 271 | # P = total number of positive labels 272 | P = K.sum(y_true) 273 | # TP = total number of correct alerts, alerts from the positive class labels 274 | TP = K.sum(y_pred * y_true) 275 | return TP/P 276 | 277 | 278 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 279 | "Create embedding layer from a pretrained weights array" 280 | in_dim, out_dim = pretrained_weights.shape 281 | embedding = Embedding(in_dim, out_dim, weights=[ 282 | pretrained_weights], trainable=False, **kwargs) 283 | return embedding 284 | 285 | 286 | def unchanged_shape(input_shape): 287 | "Function for Lambda layer" 288 | return input_shape 289 | 290 | 291 | def substract(input_1, input_2): 292 | "Substract element-wise" 293 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 294 | out_ = Add()([input_1, neg_input_2]) 295 | return out_ 296 | 297 | 298 | def submult(input_1, input_2): 299 | "Get multiplication and subtraction then concatenate results" 300 | mult = Multiply()([input_1, input_2]) 301 | sub = substract(input_1, input_2) 302 | out_ = Concatenate()([sub, mult]) 303 | return out_ 304 | 305 | 306 | def apply_multiple(input_, layers): 307 | "Apply layers to input then concatenate result" 308 | if not len(layers) > 1: 309 | raise ValueError('Layers list should contain more than 1 layer') 310 | else: 311 | agg_ = [] 312 | for layer in layers: 313 | agg_.append(layer(input_)) 314 | out_ = Concatenate()(agg_) 315 | return out_ 316 | 317 | 318 | def time_distributed(input_, layers): 319 | "Apply a list of layers in TimeDistributed mode" 320 | out_ = [] 321 | node_ = input_ 322 | for layer_ in layers: 323 | node_ = TimeDistributed(layer_)(node_) 324 | out_ = node_ 325 | return out_ 326 | 327 | 328 | def soft_attention_alignment(input_1, input_2): 329 | "Align text representation with neural soft attention" 330 | attention = Dot(axes=-1)([input_1, input_2]) 331 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 332 | output_shape=unchanged_shape)(attention) 333 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 334 | output_shape=unchanged_shape)(attention)) 335 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 336 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 337 | return in1_aligned, in2_aligned 338 | 339 | 340 | def decomposable_attention(pretrained_weights, 341 | num_shape, 342 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 343 | compare_dim=500, compare_dropout=0.2, 344 | dense_dim=300, dense_dropout=0.2, 345 | lr=1e-3, activation='elu', maxlen=20): 346 | # Based on: https://arxiv.org/abs/1606.01933 347 | 348 | q1 = Input(name='q1', shape=(maxlen,)) 349 | q2 = Input(name='q2', shape=(maxlen,)) 350 | 351 | # Embedding 352 | embedding = create_pretrained_embedding(pretrained_weights, 353 | mask_zero=False) 354 | q1_embed = embedding(q1) 355 | q2_embed = embedding(q2) 356 | 357 | # Projection 358 | projection_layers = [] 359 | if projection_hidden > 0: 360 | projection_layers.extend([ 361 | Dense(projection_hidden, activation=activation), 362 | Dropout(rate=projection_dropout), 363 | ]) 364 | 
projection_layers.extend([ 365 | Dense(projection_dim, activation=None), 366 | Dropout(rate=projection_dropout), 367 | ]) 368 | q1_encoded = time_distributed(q1_embed, projection_layers) 369 | q2_encoded = time_distributed(q2_embed, projection_layers) 370 | 371 | # Attention 372 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 373 | 374 | # Compare 375 | q1_combined = Concatenate()( 376 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 377 | q2_combined = Concatenate()( 378 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 379 | compare_layers = [ 380 | Dense(compare_dim, activation=activation), 381 | Dropout(compare_dropout), 382 | Dense(compare_dim, activation=activation), 383 | Dropout(compare_dropout), 384 | ] 385 | q1_compare = time_distributed(q1_combined, compare_layers) 386 | q2_compare = time_distributed(q2_combined, compare_layers) 387 | 388 | # Aggregate 389 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 390 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 391 | 392 | # Classifier 393 | merged = Concatenate()([q1_rep, q2_rep]) 394 | dense = BatchNormalization()(merged) 395 | dense = Dense(dense_dim, activation=activation)(dense) 396 | dense = Dropout(dense_dropout)(dense) 397 | dense = BatchNormalization()(dense) 398 | dense = Dense(dense_dim, activation=activation)(dense) 399 | dense = Dropout(dense_dropout)(dense) 400 | out_ = Dense(1, activation='sigmoid')(dense) 401 | 402 | model = Model(inputs=[q1, q2], outputs=out_) 403 | model.compile(loss='binary_crossentropy', 404 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 405 | metrics=["accuracy", auc]) 406 | return model 407 | 408 | 409 | def esim(embedding_matrix, 410 | maxlen=20, 411 | lstm_dim=30, 412 | dense_dim=30, 413 | dense_dropout=0.5): 414 | # Based on arXiv:1609.06038 415 | q1 = Input(name='q1', shape=(8,)) 416 | q2 = Input(name='q2', shape=(20,)) 417 | 418 | # Embedding 419 | embedding = create_pretrained_embedding( 420 | embedding_matrix, mask_zero=False) 421 | bn = BatchNormalization(axis=2) 422 | q1_embed = bn(embedding(q1)) 423 | q2_embed = bn(embedding(q2)) 424 | 425 | # Encode 426 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 427 | q1_encoded = encode(q1_embed) 428 | q2_encoded = encode(q2_embed) 429 | 430 | # Attention 431 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 432 | 433 | # Compose 434 | q1_combined = Concatenate()( 435 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 436 | q2_combined = Concatenate()( 437 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 438 | 439 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 440 | q1_compare = compose(q1_combined) 441 | q2_compare = compose(q2_combined) 442 | 443 | # Aggregate 444 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 445 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 446 | 447 | # leaks_input = Input(shape=(num_shape,)) 448 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 449 | 450 | # Classifier 451 | merged = Concatenate()([q1_rep, q2_rep]) 452 | 453 | dense = BatchNormalization()(merged) 454 | dense = Dense(dense_dim, activation='elu')(dense) 455 | dense = BatchNormalization()(dense) 456 | dense = Dropout(dense_dropout)(dense) 457 | dense = Dense(dense_dim, activation='elu')(dense) 458 | dense = BatchNormalization()(dense) 459 | dense = Dropout(dense_dropout)(dense) 
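# Rough shape check for the head above (assuming this file's defaults lstm_dim=30,
# dense_dim=30): each compose BiLSTM emits 2*30 = 60 features per timestep,
# avg+max pooling turns that into 120 features per sentence, so `merged` is a
# 240-dim vector that the two Dense(30) blocks reduce before the sigmoid output below.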
460 | out_ = Dense(1, activation='sigmoid')(dense) 461 | 462 | model = Model(inputs=[q1, q2], outputs=out_) 463 | model.compile(loss='binary_crossentropy', 464 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 465 | metrics=["accuracy"]) 466 | return model 467 | 468 | 469 | ####模型训练 470 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 471 | batch_size=4096, label_tag=True, chunk_size=1000) 472 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 473 | batch_size=4096, label_tag=True, chunk_size=1000) 474 | print("train...") 475 | print("###"*30) 476 | gc.collect() 477 | K.clear_session() 478 | model = esim(embed_matrix) 479 | model.summary() 480 | early_stopping = EarlyStopping( 481 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 482 | reduce_lr = ReduceLROnPlateau( 483 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 484 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 485 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 486 | save_best_only=False, 487 | verbose=1, save_weights_only=True, period=1) 488 | callbacks = [checkpoint, reduce_lr, early_stopping] 489 | # print("load weight....") 490 | 491 | 492 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 493 | epochs=10, verbose=1, callbacks=callbacks, 494 | validation_data=val_gen, validation_steps=int( 495 | np.ceil(1000000/2048)), 496 | max_queue_size=10, workers=1, use_multiprocessing=False) 497 | 498 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 499 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 500 | val_prob = model.predict_generator( 501 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 502 | 503 | f = open('/home/kesci/zhifeng/val.csv', 'r') 504 | q, a, l = [], [], [] 505 | for line in f: 506 | qid, _, aid, _, label = line.strip().split(',') 507 | q.append(qid) 508 | a.append(aid) 509 | l.append(int(label)) 510 | 511 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 512 | val_df['prob'] = val_prob.flatten() 513 | 514 | roc_auc_score(val_df['label'], val_df['prob']) 515 | 516 | 517 | def perauc(df): 518 | temp = pd.Series() 519 | try: 520 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 521 | except: 522 | temp['auc'] = 0.5 523 | return temp 524 | 525 | 526 | eval_df = val_df.groupby("qid").apply(perauc) 527 | eval_df.index = range(len(eval_df)) 528 | print("qauc:", eval_df['auc'].mean()) 529 | 530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 531 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 532 | prob = model.predict_generator( 533 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 534 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 535 | names=['qid', 'aid', 'prob']) 536 | sub['prob'] = prob.flatten() 537 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv', 538 | index=False, header=False) 539 | 540 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 541 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 542 | prob = model.predict_generator( 543 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 544 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 545 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 546 | final['prob'] = prob.flatten() 547 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 548 | 
index=False, header=False) 549 | -------------------------------------------------------------------------------- /bigtrain_w2v_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 37 | 38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 40 | 41 | 42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 43 | maxlen_query=8): 44 | if label_tag: 45 | _, _q, _, _a, _label = line.strip().split(',') 46 | else: 47 | _, _q, _, _a = line.strip().split(',') 48 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 49 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 52 | if label_tag: 53 | return q_pad, a_pad, int(_label) 54 | return q_pad, a_pad 55 | 56 | 57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 58 | while True: 59 | fin = open(path, 'r') 60 | batch_q, batch_a, batch_label = [], [], [] 61 | for line in fin: 62 | if len(batch_q) == chunk_size*batch_size: 63 | batch_q = np.array(batch_q) 64 | batch_a = np.array(batch_a) 65 | if label_tag: 66 | batch_label = np.array(batch_label) 67 | idx = list(range(chunk_size*batch_size)) 68 | if shuffle: 69 | np.random.shuffle(idx) 70 | for i in range(chunk_size): 71 | if label_tag: 72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 73 | else: 74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 75 | batch_q, batch_a, batch_label = [], [], [] 76 | if label_tag: 77 | q, a, l = gen_feature_help(line, label_tag=label_tag) 78 | else: 79 | q, a = gen_feature_help(line, label_tag=label_tag) 80 | l = 0 81 | batch_q.append(q) 82 | batch_a.append(a) 83 | if label_tag: 84 | batch_label.append(l) 85 | 86 | batch_q = np.array(batch_q) 87 | batch_a = np.array(batch_a) 88 | 89 | if label_tag: 90 | batch_label = 
np.array(batch_label) 91 | idx = list(range(len(batch_q))) 92 | if shuffle: 93 | np.random.shuffle(idx) 94 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 95 | if label_tag: 96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 97 | else: 98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 99 | fin.close() 100 | 101 | 102 | def get_embedding_matrix(): 103 | m = np.zeros(shape=(len(index2word)+1, 300)) 104 | for i, w in index2word.items(): 105 | m[i, :] = w2v[w] 106 | return m 107 | 108 | 109 | embed_matrix = get_embedding_matrix() 110 | maxlen_query = 8 111 | maxlen_answer = 20 112 | 113 | 114 | class AdamW(Optimizer): 115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 116 | epsilon=1e-8, decay=0., **kwargs): 117 | super(AdamW, self).__init__(**kwargs) 118 | with K.name_scope(self.__class__.__name__): 119 | self.iterations = K.variable(0, dtype='int64', name='iterations') 120 | self.lr = K.variable(lr, name='lr') 121 | self.beta_1 = K.variable(beta_1, name='beta_1') 122 | self.beta_2 = K.variable(beta_2, name='beta_2') 123 | self.decay = K.variable(decay, name='decay') 124 | # decoupled weight decay (2/4) 125 | self.wd = K.variable(weight_decay, name='weight_decay') 126 | self.epsilon = epsilon 127 | self.initial_decay = decay 128 | 129 | @interfaces.legacy_get_updates_support 130 | def get_updates(self, loss, params): 131 | grads = self.get_gradients(loss, params) 132 | self.updates = [K.update_add(self.iterations, 1)] 133 | wd = self.wd # decoupled weight decay (3/4) 134 | 135 | lr = self.lr 136 | if self.initial_decay > 0: 137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 138 | K.dtype(self.decay)))) 139 | 140 | t = K.cast(self.iterations, K.floatx()) + 1 141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 142 | (1. - K.pow(self.beta_1, t))) 143 | 144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 146 | self.weights = [self.iterations] + ms + vs 147 | 148 | for p, g, m, v in zip(params, grads, ms, vs): 149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 151 | # decoupled weight decay (4/4) 152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 153 | 154 | self.updates.append(K.update(m, m_t)) 155 | self.updates.append(K.update(v, v_t)) 156 | new_p = p_t 157 | 158 | # Apply constraints. 
159 | if getattr(p, 'constraint', None) is not None: 160 | new_p = p.constraint(new_p) 161 | 162 | self.updates.append(K.update(p, new_p)) 163 | return self.updates 164 | 165 | def get_config(self): 166 | config = {'lr': float(K.get_value(self.lr)), 167 | 'beta_1': float(K.get_value(self.beta_1)), 168 | 'beta_2': float(K.get_value(self.beta_2)), 169 | 'decay': float(K.get_value(self.decay)), 170 | 'weight_decay': float(K.get_value(self.wd)), 171 | 'epsilon': self.epsilon} 172 | base_config = super(AdamW, self).get_config() 173 | return dict(list(base_config.items()) + list(config.items())) 174 | 175 | 176 | class Attention(Layer): 177 | def __init__(self, step_dim, 178 | W_regularizer=None, b_regularizer=None, 179 | W_constraint=None, b_constraint=None, 180 | bias=True, **kwargs): 181 | self.supports_masking = True 182 | self.init = initializers.get('glorot_uniform') 183 | 184 | self.W_regularizer = regularizers.get(W_regularizer) 185 | self.b_regularizer = regularizers.get(b_regularizer) 186 | 187 | self.W_constraint = constraints.get(W_constraint) 188 | self.b_constraint = constraints.get(b_constraint) 189 | 190 | self.bias = bias 191 | self.step_dim = step_dim 192 | self.features_dim = 0 193 | super(Attention, self).__init__(**kwargs) 194 | 195 | def build(self, input_shape): 196 | assert len(input_shape) == 3 197 | 198 | self.W = self.add_weight((input_shape[-1],), 199 | initializer=self.init, 200 | name='{}_W'.format(self.name), 201 | regularizer=self.W_regularizer, 202 | constraint=self.W_constraint) 203 | self.features_dim = input_shape[-1] 204 | 205 | if self.bias: 206 | self.b = self.add_weight((input_shape[1],), 207 | initializer='zero', 208 | name='{}_b'.format(self.name), 209 | regularizer=self.b_regularizer, 210 | constraint=self.b_constraint) 211 | else: 212 | self.b = None 213 | 214 | self.built = True 215 | 216 | def compute_mask(self, input, input_mask=None): 217 | return None 218 | 219 | def call(self, x, mask=None): 220 | features_dim = self.features_dim 221 | step_dim = self.step_dim 222 | 223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 225 | 226 | if self.bias: 227 | eij += self.b 228 | 229 | eij = K.tanh(eij) 230 | 231 | a = K.exp(eij) 232 | 233 | if mask is not None: 234 | a *= K.cast(mask, K.floatx()) 235 | 236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 237 | 238 | a = K.expand_dims(a) 239 | weighted_input = x * a 240 | return K.sum(weighted_input, axis=1) 241 | 242 | def compute_output_shape(self, input_shape): 243 | return input_shape[0], self.features_dim 244 | 245 | # AUC for a binary classifier 246 | 247 | 248 | def auc(y_true, y_pred): 249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 250 | for k in np.linspace(0, 1, 1000)], axis=0) 251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 252 | for k in np.linspace(0, 1, 1000)], axis=0) 253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 254 | binSizes = -(pfas[1:]-pfas[:-1]) 255 | s = ptas*binSizes 256 | return K.sum(s, axis=0) 257 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 258 | # PFA, prob false alert for binary classifier 259 | 260 | 261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 262 | y_pred = K.cast(y_pred >= threshold, 'float32') 263 | # N = total number of negative labels 264 | N = K.sum(1 - y_true) 265 | # FP = total number of false alerts, alerts from the negative 
class labels 266 | FP = K.sum(y_pred - y_pred * y_true) 267 | return FP/N 268 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 269 | # P_TA prob true alerts for binary classifier 270 | 271 | 272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 273 | y_pred = K.cast(y_pred >= threshold, 'float32') 274 | # P = total number of positive labels 275 | P = K.sum(y_true) 276 | # TP = total number of correct alerts, alerts from the positive class labels 277 | TP = K.sum(y_pred * y_true) 278 | return TP/P 279 | 280 | 281 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 282 | "Create embedding layer from a pretrained weights array" 283 | in_dim, out_dim = pretrained_weights.shape 284 | embedding = Embedding(in_dim, out_dim, weights=[ 285 | pretrained_weights], trainable=False, **kwargs) 286 | return embedding 287 | 288 | 289 | def unchanged_shape(input_shape): 290 | "Function for Lambda layer" 291 | return input_shape 292 | 293 | 294 | def substract(input_1, input_2): 295 | "Substract element-wise" 296 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 297 | out_ = Add()([input_1, neg_input_2]) 298 | return out_ 299 | 300 | 301 | def submult(input_1, input_2): 302 | "Get multiplication and subtraction then concatenate results" 303 | mult = Multiply()([input_1, input_2]) 304 | sub = substract(input_1, input_2) 305 | out_ = Concatenate()([sub, mult]) 306 | return out_ 307 | 308 | 309 | def apply_multiple(input_, layers): 310 | "Apply layers to input then concatenate result" 311 | if not len(layers) > 1: 312 | raise ValueError('Layers list should contain more than 1 layer') 313 | else: 314 | agg_ = [] 315 | for layer in layers: 316 | agg_.append(layer(input_)) 317 | out_ = Concatenate()(agg_) 318 | return out_ 319 | 320 | 321 | def time_distributed(input_, layers): 322 | "Apply a list of layers in TimeDistributed mode" 323 | out_ = [] 324 | node_ = input_ 325 | for layer_ in layers: 326 | node_ = TimeDistributed(layer_)(node_) 327 | out_ = node_ 328 | return out_ 329 | 330 | 331 | def soft_attention_alignment(input_1, input_2): 332 | "Align text representation with neural soft attention" 333 | attention = Dot(axes=-1)([input_1, input_2]) 334 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 335 | output_shape=unchanged_shape)(attention) 336 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 337 | output_shape=unchanged_shape)(attention)) 338 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 339 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 340 | return in1_aligned, in2_aligned 341 | 342 | 343 | def decomposable_attention(pretrained_weights, 344 | num_shape, 345 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 346 | compare_dim=500, compare_dropout=0.2, 347 | dense_dim=300, dense_dropout=0.2, 348 | lr=1e-3, activation='elu', maxlen=20): 349 | # Based on: https://arxiv.org/abs/1606.01933 350 | 351 | q1 = Input(name='q1', shape=(maxlen,)) 352 | q2 = Input(name='q2', shape=(maxlen,)) 353 | 354 | # Embedding 355 | embedding = create_pretrained_embedding(pretrained_weights, 356 | mask_zero=False) 357 | q1_embed = embedding(q1) 358 | q2_embed = embedding(q2) 359 | 360 | # Projection 361 | projection_layers = [] 362 | if projection_hidden > 0: 363 | projection_layers.extend([ 364 | Dense(projection_hidden, activation=activation), 365 | Dropout(rate=projection_dropout), 366 | ]) 367 | 
projection_layers.extend([ 368 | Dense(projection_dim, activation=None), 369 | Dropout(rate=projection_dropout), 370 | ]) 371 | q1_encoded = time_distributed(q1_embed, projection_layers) 372 | q2_encoded = time_distributed(q2_embed, projection_layers) 373 | 374 | # Attention 375 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 376 | 377 | # Compare 378 | q1_combined = Concatenate()( 379 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 380 | q2_combined = Concatenate()( 381 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 382 | compare_layers = [ 383 | Dense(compare_dim, activation=activation), 384 | Dropout(compare_dropout), 385 | Dense(compare_dim, activation=activation), 386 | Dropout(compare_dropout), 387 | ] 388 | q1_compare = time_distributed(q1_combined, compare_layers) 389 | q2_compare = time_distributed(q2_combined, compare_layers) 390 | 391 | # Aggregate 392 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 393 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 394 | 395 | # Classifier 396 | merged = Concatenate()([q1_rep, q2_rep]) 397 | dense = BatchNormalization()(merged) 398 | dense = Dense(dense_dim, activation=activation)(dense) 399 | dense = Dropout(dense_dropout)(dense) 400 | dense = BatchNormalization()(dense) 401 | dense = Dense(dense_dim, activation=activation)(dense) 402 | dense = Dropout(dense_dropout)(dense) 403 | out_ = Dense(1, activation='sigmoid')(dense) 404 | 405 | model = Model(inputs=[q1, q2], outputs=out_) 406 | model.compile(loss='binary_crossentropy', 407 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 408 | metrics=["accuracy", auc]) 409 | return model 410 | 411 | 412 | def esim(embedding_matrix, 413 | maxlen=20, 414 | lstm_dim=30, 415 | dense_dim=30, 416 | dense_dropout=0.5): 417 | # Based on arXiv:1609.06038 418 | q1 = Input(name='q1', shape=(8,)) 419 | q2 = Input(name='q2', shape=(20,)) 420 | 421 | # Embedding 422 | embedding = create_pretrained_embedding( 423 | embedding_matrix, mask_zero=False) 424 | bn = BatchNormalization(axis=2) 425 | q1_embed = bn(embedding(q1)) 426 | q2_embed = bn(embedding(q2)) 427 | 428 | # Encode 429 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 430 | q1_encoded = encode(q1_embed) 431 | q2_encoded = encode(q2_embed) 432 | 433 | # Attention 434 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 435 | 436 | # Compose 437 | q1_combined = Concatenate()( 438 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 439 | q2_combined = Concatenate()( 440 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 441 | 442 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 443 | q1_compare = compose(q1_combined) 444 | q2_compare = compose(q2_combined) 445 | 446 | # Aggregate 447 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 448 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 449 | 450 | # leaks_input = Input(shape=(num_shape,)) 451 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 452 | 453 | # Classifier 454 | merged = Concatenate()([q1_rep, q2_rep]) 455 | 456 | dense = BatchNormalization()(merged) 457 | dense = Dense(dense_dim, activation='elu')(dense) 458 | dense = BatchNormalization()(dense) 459 | dense = Dropout(dense_dropout)(dense) 460 | dense = Dense(dense_dim, activation='elu')(dense) 461 | dense = BatchNormalization()(dense) 462 | dense = Dropout(dense_dropout)(dense) 
463 | out_ = Dense(1, activation='sigmoid')(dense) 464 | 465 | model = Model(inputs=[q1, q2], outputs=out_) 466 | model.compile(loss='binary_crossentropy', 467 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 468 | metrics=["accuracy"]) 469 | return model 470 | 471 | 472 | #### Model training 473 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 474 | batch_size=4096, label_tag=True, chunk_size=1000) 475 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 476 | batch_size=4096, label_tag=True, chunk_size=1000) 477 | print("train...") 478 | print("###"*30) 479 | gc.collect() 480 | K.clear_session() 481 | model = esim(embed_matrix) 482 | model.summary() 483 | early_stopping = EarlyStopping( 484 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 485 | reduce_lr = ReduceLROnPlateau( 486 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 487 | bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 488 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 489 | save_best_only=False, 490 | verbose=1, save_weights_only=True, period=1) 491 | callbacks = [checkpoint, reduce_lr, early_stopping] 492 | # print("load weight....") 493 | 494 | 495 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 496 | epochs=10, verbose=1, callbacks=callbacks, 497 | validation_data=val_gen, validation_steps=int( 498 | np.ceil(1000000/2048)), 499 | max_queue_size=10, workers=1, use_multiprocessing=False) 500 | 501 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 502 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 503 | val_prob = model.predict_generator( 504 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 505 | 506 | f = open('/home/kesci/zhifeng/val.csv','r') 507 | q,a,l=[],[],[] 508 | for line in f: 509 | qid,_,aid,_,label = line.strip().split(',') 510 | q.append(qid) 511 | a.append(aid) 512 | l.append(int(label)) 513 | 514 | val_df = pd.DataFrame({'qid':q,'aid':a,'label':l}) 515 | val_df['prob'] = val_prob.flatten() 516 | 517 | roc_auc_score(val_df['label'], val_df['prob']) 518 | 519 | def perauc(df): 520 | temp=pd.Series() 521 | try: 522 | temp['auc']=roc_auc_score(df['label'],df['prob']) 523 | except: 524 | temp['auc']=0.5 525 | return temp 526 | eval_df=val_df.groupby("qid").apply(perauc) 527 | eval_df.index=range(len(eval_df)) 528 | print("qauc:",eval_df['auc'].mean()) 529 | 530 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 531 | batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 532 | prob = model.predict_generator(test_gen,steps=int(np.ceil(20000000/4096)),verbose=1) 533 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',names=['qid','aid','prob']) 534 | sub['prob'] = prob.flatten() 535 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv',index=False,header=False) 536 | 537 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 538 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 539 | prob = model.predict_generator( 540 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 541 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 542 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 543 | final['prob'] = prob.flatten() 544 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 545 | index=False, header=False) 546 |
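# ---------------------------------------------------------------
# Illustrative sketch of the per-query AUC ("qauc") metric reported above:
# ROC-AUC is computed within each query id and then averaged, falling back
# to 0.5 for queries whose labels are all positive or all negative. This is
# a minimal, self-contained version; the qid/label/prob column names mirror
# val_df above, and the toy values are assumptions made purely for illustration.
import pandas as pd
from sklearn.metrics import roc_auc_score

def qauc(df):
    def _one(group):
        try:
            return roc_auc_score(group['label'], group['prob'])
        except ValueError:  # only one class present for this query
            return 0.5
    return df.groupby('qid').apply(_one).mean()

toy = pd.DataFrame({'qid':   ['q1', 'q1', 'q1', 'q2', 'q2'],
                    'label': [1, 0, 0, 1, 0],
                    'prob':  [0.9, 0.2, 0.4, 0.3, 0.6]})
print(qauc(toy))  # q1 -> 1.0, q2 -> 0.0, qauc -> 0.5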
-------------------------------------------------------------------------------- /bigtrain_w2v_rnn.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | import fasttext 28 | tqdm.pandas() 29 | np.random.seed(1017) 30 | rn.seed(1017) 31 | tf.set_random_seed(1017) 32 | path = "/home/kesci/input/bytedance/" 33 | out = '/home/kesci/work/zhifeng/' 34 | print(os.listdir(path)) 35 | 36 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 37 | 38 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 39 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 40 | 41 | 42 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 43 | maxlen_query=8): 44 | if label_tag: 45 | _, _q, _, _a, _label = line.strip().split(',') 46 | else: 47 | _, _q, _, _a = line.strip().split(',') 48 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 49 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 50 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 51 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 52 | if label_tag: 53 | return q_pad, a_pad, int(_label) 54 | return q_pad, a_pad 55 | 56 | 57 | def gen_train(path, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 58 | while True: 59 | fin = open(path, 'r') 60 | batch_q, batch_a, batch_label = [], [], [] 61 | for line in fin: 62 | if len(batch_q) == chunk_size*batch_size: 63 | batch_q = np.array(batch_q) 64 | batch_a = np.array(batch_a) 65 | if label_tag: 66 | batch_label = np.array(batch_label) 67 | idx = list(range(chunk_size*batch_size)) 68 | if shuffle: 69 | np.random.shuffle(idx) 70 | for i in range(chunk_size): 71 | if label_tag: 72 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 73 | else: 74 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 75 | batch_q, batch_a, batch_label = [], [], [] 76 | if label_tag: 77 | q, a, l = gen_feature_help(line, label_tag=label_tag) 78 | else: 79 | q, a = gen_feature_help(line, label_tag=label_tag) 80 | l = 0 81 | batch_q.append(q) 82 | batch_a.append(a) 83 | if label_tag: 84 | batch_label.append(l) 85 | 86 | batch_q = np.array(batch_q) 87 | batch_a = np.array(batch_a) 88 | 89 | if label_tag: 90 | batch_label = np.array(batch_label) 91 | idx = 
list(range(len(batch_q))) 92 | if shuffle: 93 | np.random.shuffle(idx) 94 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 95 | if label_tag: 96 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])], np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]]) 97 | else: 98 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), np.array(batch_a[idx[i*batch_size:i*batch_size+batch_size]])] 99 | fin.close() 100 | 101 | 102 | def get_embedding_matrix(): 103 | m = np.zeros(shape=(len(index2word)+1, 300)) 104 | for i, w in index2word.items(): 105 | m[i, :] = w2v[w] 106 | return m 107 | 108 | 109 | embed_matrix = get_embedding_matrix() 110 | maxlen_query = 8 111 | maxlen_answer = 20 112 | 113 | 114 | class AdamW(Optimizer): 115 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 116 | epsilon=1e-8, decay=0., **kwargs): 117 | super(AdamW, self).__init__(**kwargs) 118 | with K.name_scope(self.__class__.__name__): 119 | self.iterations = K.variable(0, dtype='int64', name='iterations') 120 | self.lr = K.variable(lr, name='lr') 121 | self.beta_1 = K.variable(beta_1, name='beta_1') 122 | self.beta_2 = K.variable(beta_2, name='beta_2') 123 | self.decay = K.variable(decay, name='decay') 124 | # decoupled weight decay (2/4) 125 | self.wd = K.variable(weight_decay, name='weight_decay') 126 | self.epsilon = epsilon 127 | self.initial_decay = decay 128 | 129 | @interfaces.legacy_get_updates_support 130 | def get_updates(self, loss, params): 131 | grads = self.get_gradients(loss, params) 132 | self.updates = [K.update_add(self.iterations, 1)] 133 | wd = self.wd # decoupled weight decay (3/4) 134 | 135 | lr = self.lr 136 | if self.initial_decay > 0: 137 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 138 | K.dtype(self.decay)))) 139 | 140 | t = K.cast(self.iterations, K.floatx()) + 1 141 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 142 | (1. - K.pow(self.beta_1, t))) 143 | 144 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 145 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 146 | self.weights = [self.iterations] + ms + vs 147 | 148 | for p, g, m, v in zip(params, grads, ms, vs): 149 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 150 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 151 | # decoupled weight decay (4/4) 152 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 153 | 154 | self.updates.append(K.update(m, m_t)) 155 | self.updates.append(K.update(v, v_t)) 156 | new_p = p_t 157 | 158 | # Apply constraints. 
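# (If a Keras weight constraint such as constraints.max_norm is attached to the parameter, it is re-applied to the freshly updated value below before the update is committed.)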
159 | if getattr(p, 'constraint', None) is not None: 160 | new_p = p.constraint(new_p) 161 | 162 | self.updates.append(K.update(p, new_p)) 163 | return self.updates 164 | 165 | def get_config(self): 166 | config = {'lr': float(K.get_value(self.lr)), 167 | 'beta_1': float(K.get_value(self.beta_1)), 168 | 'beta_2': float(K.get_value(self.beta_2)), 169 | 'decay': float(K.get_value(self.decay)), 170 | 'weight_decay': float(K.get_value(self.wd)), 171 | 'epsilon': self.epsilon} 172 | base_config = super(AdamW, self).get_config() 173 | return dict(list(base_config.items()) + list(config.items())) 174 | 175 | 176 | class Attention(Layer): 177 | def __init__(self, step_dim, 178 | W_regularizer=None, b_regularizer=None, 179 | W_constraint=None, b_constraint=None, 180 | bias=True, **kwargs): 181 | self.supports_masking = True 182 | self.init = initializers.get('glorot_uniform') 183 | 184 | self.W_regularizer = regularizers.get(W_regularizer) 185 | self.b_regularizer = regularizers.get(b_regularizer) 186 | 187 | self.W_constraint = constraints.get(W_constraint) 188 | self.b_constraint = constraints.get(b_constraint) 189 | 190 | self.bias = bias 191 | self.step_dim = step_dim 192 | self.features_dim = 0 193 | super(Attention, self).__init__(**kwargs) 194 | 195 | def build(self, input_shape): 196 | assert len(input_shape) == 3 197 | 198 | self.W = self.add_weight((input_shape[-1],), 199 | initializer=self.init, 200 | name='{}_W'.format(self.name), 201 | regularizer=self.W_regularizer, 202 | constraint=self.W_constraint) 203 | self.features_dim = input_shape[-1] 204 | 205 | if self.bias: 206 | self.b = self.add_weight((input_shape[1],), 207 | initializer='zero', 208 | name='{}_b'.format(self.name), 209 | regularizer=self.b_regularizer, 210 | constraint=self.b_constraint) 211 | else: 212 | self.b = None 213 | 214 | self.built = True 215 | 216 | def compute_mask(self, input, input_mask=None): 217 | return None 218 | 219 | def call(self, x, mask=None): 220 | features_dim = self.features_dim 221 | step_dim = self.step_dim 222 | 223 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 224 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 225 | 226 | if self.bias: 227 | eij += self.b 228 | 229 | eij = K.tanh(eij) 230 | 231 | a = K.exp(eij) 232 | 233 | if mask is not None: 234 | a *= K.cast(mask, K.floatx()) 235 | 236 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 237 | 238 | a = K.expand_dims(a) 239 | weighted_input = x * a 240 | return K.sum(weighted_input, axis=1) 241 | 242 | def compute_output_shape(self, input_shape): 243 | return input_shape[0], self.features_dim 244 | 245 | # AUC for a binary classifier 246 | 247 | 248 | def auc(y_true, y_pred): 249 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 250 | for k in np.linspace(0, 1, 1000)], axis=0) 251 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 252 | for k in np.linspace(0, 1, 1000)], axis=0) 253 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 254 | binSizes = -(pfas[1:]-pfas[:-1]) 255 | s = ptas*binSizes 256 | return K.sum(s, axis=0) 257 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 258 | # PFA, prob false alert for binary classifier 259 | 260 | 261 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 262 | y_pred = K.cast(y_pred >= threshold, 'float32') 263 | # N = total number of negative labels 264 | N = K.sum(1 - y_true) 265 | # FP = total number of false alerts, alerts from the negative 
class labels 266 | FP = K.sum(y_pred - y_pred * y_true) 267 | return FP/N 268 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 269 | # P_TA prob true alerts for binary classifier 270 | 271 | 272 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 273 | y_pred = K.cast(y_pred >= threshold, 'float32') 274 | # P = total number of positive labels 275 | P = K.sum(y_true) 276 | # TP = total number of correct alerts, alerts from the positive class labels 277 | TP = K.sum(y_pred * y_true) 278 | return TP/P 279 | 280 | 281 | def get_model(embedding_matrix): 282 | 283 | K.clear_session() 284 | #The embedding layer containing the word vectors 285 | emb_layer = Embedding( 286 | input_dim=embedding_matrix.shape[0], 287 | output_dim=embedding_matrix.shape[1], 288 | weights=[embedding_matrix], 289 | trainable=False 290 | ) 291 | sdrop = SpatialDropout1D(rate=0.2) 292 | lstm_layer = Bidirectional(CuDNNLSTM(40, return_sequences=True, 293 | kernel_initializer=glorot_uniform(seed=123))) 294 | gru_layer = Bidirectional(CuDNNGRU(40, return_sequences=True, 295 | kernel_initializer=glorot_uniform(seed=123))) 296 | 297 | cnn1d_layer = keras.layers.Conv1D( 298 | 40, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 299 | 300 | # Define inputs 301 | seq1 = Input(shape=(maxlen_query,)) 302 | x1 = emb_layer(seq1) 303 | x1 = sdrop(x1) 304 | lstm1 = lstm_layer(x1) 305 | gru1 = gru_layer(lstm1) 306 | att_1 = Attention(maxlen_query)(lstm1) 307 | att_3 = Attention(maxlen_query)(gru1) 308 | cnn1 = cnn1d_layer(lstm1) 309 | 310 | avg_pool = GlobalAveragePooling1D() 311 | max_pool = GlobalMaxPooling1D() 312 | 313 | seq2 = Input(shape=(maxlen_answer,)) 314 | x2 = emb_layer(seq2) 315 | x2 = sdrop(x2) 316 | lstm2 = lstm_layer(x2) 317 | gru2 = gru_layer(lstm2) 318 | att_2 = Attention(maxlen_answer)(lstm2) 319 | att_4 = Attention(maxlen_answer)(gru2) 320 | cnn2 = cnn1d_layer(lstm2) 321 | 322 | x1 = concatenate([att_1, att_3, avg_pool(cnn1), max_pool( 323 | cnn1), avg_pool(gru1), max_pool(gru1)]) 324 | x2 = concatenate([att_2, att_4, avg_pool(cnn2), max_pool( 325 | cnn2), avg_pool(gru2), max_pool(gru2)]) 326 | 327 | merge = Multiply()([x1, x2]) 328 | merge = Dropout(0.2)(merge) 329 | 330 | # htime = Dense(col_len,activation='relu')(hin) 331 | # The MLP that determines the outcome 332 | x = Dense(40, kernel_initializer=he_uniform( 333 | seed=123), activation='relu',)(merge) 334 | # x = Dropout(0.2)(x) 335 | # x = BatchNormalization()(x) 336 | 337 | pred = Dense(1, kernel_initializer=he_uniform( 338 | seed=123), activation='sigmoid')(x) 339 | 340 | model = Model(inputs=[seq1, seq2], outputs=pred) 341 | 342 | model.compile(loss='binary_crossentropy', 343 | optimizer=AdamW(lr=0.0003, weight_decay=0.02,), 344 | metrics=["accuracy"]) 345 | # model.summary() 346 | return model 347 | 348 | #### Model training 349 | train_gen = gen_train(path='/home/kesci/zhifeng/train.csv', 350 | batch_size=4096, label_tag=True, chunk_size=1000) 351 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 352 | batch_size=4096, label_tag=True, chunk_size=1000) 353 | print("train...") 354 | print("###"*30) 355 | gc.collect() 356 | K.clear_session() 357 | model = get_model(embed_matrix) 358 | model.summary() 359 | early_stopping = EarlyStopping( 360 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 361 | reduce_lr = ReduceLROnPlateau( 362 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 363 |
bst_model_path = '/home/kesci/chizhu/chizhu_w2v_esim_weight_{epoch}_{val_loss}.h5' 364 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 365 | save_best_only=False, 366 | verbose=1, save_weights_only=True, period=1) 367 | callbacks = [checkpoint, reduce_lr, early_stopping] 368 | # print("load weight....") 369 | 370 | 371 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(999000000/2048)), 372 | epochs=10, verbose=1, callbacks=callbacks, 373 | validation_data=val_gen, validation_steps=int( 374 | np.ceil(1000000/2048)), 375 | max_queue_size=10, workers=1, use_multiprocessing=False) 376 | 377 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', 378 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 379 | val_prob = model.predict_generator( 380 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 381 | 382 | f = open('/home/kesci/zhifeng/val.csv', 'r') 383 | q, a, l = [], [], [] 384 | for line in f: 385 | qid, _, aid, _, label = line.strip().split(',') 386 | q.append(qid) 387 | a.append(aid) 388 | l.append(int(label)) 389 | 390 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 391 | val_df['prob'] = val_prob.flatten() 392 | 393 | roc_auc_score(val_df['label'], val_df['prob']) 394 | 395 | 396 | def perauc(df): 397 | temp = pd.Series() 398 | try: 399 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 400 | except: 401 | temp['auc'] = 0.5 402 | return temp 403 | 404 | 405 | eval_df = val_df.groupby("qid").apply(perauc) 406 | eval_df.index = range(len(eval_df)) 407 | print("qauc:", eval_df['auc'].mean()) 408 | 409 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 410 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 411 | prob = model.predict_generator( 412 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 413 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 414 | names=['qid', 'aid', 'prob']) 415 | sub['prob'] = prob.flatten() 416 | sub.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testa.csv', 417 | index=False, header=False) 418 | 419 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 420 | batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 421 | prob = model.predict_generator( 422 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 423 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 424 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 425 | final['prob'] = prob.flatten() 426 | final.to_csv('/home/kesci/work/chizhu/raw_w2v_esim_testb.csv', 427 | index=False, header=False) 428 | -------------------------------------------------------------------------------- /chizhu_rnn.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | import random as rn 6 | from tqdm import tqdm, tqdm_notebook 7 | import tensorflow as tf 8 | from sklearn.metrics import roc_auc_score 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras.optimizers import Adam 12 | from keras import backend as K 13 | from keras.optimizers import * 14 | from keras.callbacks import * 15 | from keras.layers import * 16 | from keras.models import * 17 | from keras.engine.topology import Layer 18 | from keras import initializers, regularizers, constraints, optimizers, layers 
19 | from keras.initializers import * 20 | import keras 21 | from sklearn.model_selection import StratifiedKFold, GroupKFold 22 | import gc 23 | import time 24 | from gensim.models import Word2Vec 25 | import logging 26 | import Levenshtein 27 | tqdm.pandas() 28 | np.random.seed(1017) 29 | rn.seed(1017) 30 | tf.set_random_seed(1017) 31 | path = "/home/kesci/input/bytedance/" 32 | out = '/home/kesci/work/chizhu/' 33 | print(os.listdir(path)) 34 | 35 | f1 = pd.read_csv(out + 'f1.csv') 36 | f2 = pd.read_csv(out + 'f2.csv') 37 | f3 = pd.read_csv(out + 'f3.csv') 38 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 39 | del f1, f2, f3 40 | gc.collect() 41 | 42 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 43 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 44 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 45 | testb_w2v = pd.read_pickle( 46 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 47 | feature['w2v_cos'] = list(train_w2v)+list(testa_w2v)+list(testb_w2v) 48 | 49 | train_w2v = pd.read_pickle( 50 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 51 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 52 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 53 | testb_w2v = pd.read_pickle( 54 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 55 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 56 | list(testa_w2v)+list(testb_w2v) 57 | del train_w2v, val_w2v, testa_w2v, testb_w2v 58 | gc.collect() 59 | feature.shape 60 | 61 | len_train = 99000000 62 | len_val = 1000000 63 | len_testa = 20000000 64 | len_testb = 100000000 65 | sc = StandardScaler() 66 | feature = sc.fit_transform(feature) 67 | train_feature = feature[:len_train] 68 | val_feature = feature[len_train:len_train+len_val] 69 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 70 | testb_feature = feature[-len_testb:] 71 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape) 72 | 73 | del feature 74 | gc.collect() 75 | 76 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 77 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 78 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 79 | 80 | 81 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 82 | maxlen_query=8): 83 | if label_tag: 84 | _, _q, _, _a, _label = line.strip().split(',') 85 | else: 86 | _, _q, _, _a = line.strip().split(',') 87 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 88 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 89 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 90 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 91 | if label_tag: 92 | return q_pad, a_pad, int(_label) 93 | return q_pad, a_pad 94 | 95 | 96 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 97 | while True: 98 | fin = open(path, 'r') 99 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 100 | for i, line in enumerate(fin): 101 | if len(batch_q) == chunk_size*batch_size: 102 | batch_q = np.array(batch_q) 103 | batch_a = np.array(batch_a) 104 | batch_f = np.array(batch_f) 105 | if label_tag: 106 | batch_label = np.array(batch_label) 107 | idx = list(range(chunk_size*batch_size)) 108 | if shuffle: 109 | 
np.random.shuffle(idx) 110 | for i in range(chunk_size): 111 | if label_tag: 112 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 113 | np.array( 114 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 115 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 116 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 117 | else: 118 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 119 | np.array( 120 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 121 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 122 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 123 | if label_tag: 124 | q, a, l = gen_feature_help(line, label_tag=label_tag) 125 | else: 126 | q, a = gen_feature_help(line, label_tag=label_tag) 127 | l = 0 128 | batch_q.append(q) 129 | batch_a.append(a) 130 | batch_f.append(feature[i]) 131 | if label_tag: 132 | batch_label.append(l) 133 | 134 | batch_q = np.array(batch_q) 135 | batch_a = np.array(batch_a) 136 | batch_f = np.array(batch_f) 137 | 138 | if label_tag: 139 | batch_label = np.array(batch_label) 140 | idx = list(range(len(batch_q))) 141 | if shuffle: 142 | np.random.shuffle(idx) 143 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 144 | if label_tag: 145 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 146 | np.array( 147 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 148 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 149 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 150 | else: 151 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 152 | np.array( 153 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 154 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 155 | fin.close() 156 | 157 | 158 | def get_embedding_matrix(): 159 | m = np.zeros(shape=(len(index2word)+1, 300)) 160 | for i, w in index2word.items(): 161 | m[i, :] = w2v[w] 162 | return m 163 | 164 | 165 | embed_matrix = get_embedding_matrix() 166 | maxlen_query = 8 167 | maxlen_answer = 20 168 | 169 | 170 | class AdamW(Optimizer): 171 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 172 | epsilon=1e-8, decay=0., **kwargs): 173 | super(AdamW, self).__init__(**kwargs) 174 | with K.name_scope(self.__class__.__name__): 175 | self.iterations = K.variable(0, dtype='int64', name='iterations') 176 | self.lr = K.variable(lr, name='lr') 177 | self.beta_1 = K.variable(beta_1, name='beta_1') 178 | self.beta_2 = K.variable(beta_2, name='beta_2') 179 | self.decay = K.variable(decay, name='decay') 180 | # decoupled weight decay (2/4) 181 | self.wd = K.variable(weight_decay, name='weight_decay') 182 | self.epsilon = epsilon 183 | self.initial_decay = decay 184 | 185 | @interfaces.legacy_get_updates_support 186 | def get_updates(self, loss, params): 187 | grads = self.get_gradients(loss, params) 188 | self.updates = [K.update_add(self.iterations, 1)] 189 | wd = self.wd # decoupled weight decay (3/4) 190 | 191 | lr = self.lr 192 | if self.initial_decay > 0: 193 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 194 | K.dtype(self.decay)))) 195 | 196 | t = K.cast(self.iterations, K.floatx()) + 1 197 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 198 | (1. 
- K.pow(self.beta_1, t))) 199 | 200 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 201 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 202 | self.weights = [self.iterations] + ms + vs 203 | 204 | for p, g, m, v in zip(params, grads, ms, vs): 205 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 206 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 207 | # decoupled weight decay (4/4) 208 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 209 | 210 | self.updates.append(K.update(m, m_t)) 211 | self.updates.append(K.update(v, v_t)) 212 | new_p = p_t 213 | 214 | # Apply constraints. 215 | if getattr(p, 'constraint', None) is not None: 216 | new_p = p.constraint(new_p) 217 | 218 | self.updates.append(K.update(p, new_p)) 219 | return self.updates 220 | 221 | def get_config(self): 222 | config = {'lr': float(K.get_value(self.lr)), 223 | 'beta_1': float(K.get_value(self.beta_1)), 224 | 'beta_2': float(K.get_value(self.beta_2)), 225 | 'decay': float(K.get_value(self.decay)), 226 | 'weight_decay': float(K.get_value(self.wd)), 227 | 'epsilon': self.epsilon} 228 | base_config = super(AdamW, self).get_config() 229 | return dict(list(base_config.items()) + list(config.items())) 230 | 231 | 232 | class Attention(Layer): 233 | def __init__(self, step_dim, 234 | W_regularizer=None, b_regularizer=None, 235 | W_constraint=None, b_constraint=None, 236 | bias=True, **kwargs): 237 | self.supports_masking = True 238 | self.init = initializers.get('glorot_uniform') 239 | 240 | self.W_regularizer = regularizers.get(W_regularizer) 241 | self.b_regularizer = regularizers.get(b_regularizer) 242 | 243 | self.W_constraint = constraints.get(W_constraint) 244 | self.b_constraint = constraints.get(b_constraint) 245 | 246 | self.bias = bias 247 | self.step_dim = step_dim 248 | self.features_dim = 0 249 | super(Attention, self).__init__(**kwargs) 250 | 251 | def build(self, input_shape): 252 | assert len(input_shape) == 3 253 | 254 | self.W = self.add_weight((input_shape[-1],), 255 | initializer=self.init, 256 | name='{}_W'.format(self.name), 257 | regularizer=self.W_regularizer, 258 | constraint=self.W_constraint) 259 | self.features_dim = input_shape[-1] 260 | 261 | if self.bias: 262 | self.b = self.add_weight((input_shape[1],), 263 | initializer='zero', 264 | name='{}_b'.format(self.name), 265 | regularizer=self.b_regularizer, 266 | constraint=self.b_constraint) 267 | else: 268 | self.b = None 269 | 270 | self.built = True 271 | 272 | def compute_mask(self, input, input_mask=None): 273 | return None 274 | 275 | def call(self, x, mask=None): 276 | features_dim = self.features_dim 277 | step_dim = self.step_dim 278 | 279 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 280 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 281 | 282 | if self.bias: 283 | eij += self.b 284 | 285 | eij = K.tanh(eij) 286 | 287 | a = K.exp(eij) 288 | 289 | if mask is not None: 290 | a *= K.cast(mask, K.floatx()) 291 | 292 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 293 | 294 | a = K.expand_dims(a) 295 | weighted_input = x * a 296 | return K.sum(weighted_input, axis=1) 297 | 298 | def compute_output_shape(self, input_shape): 299 | return input_shape[0], self.features_dim 300 | # AUC for a binary classifier 301 | 302 | 303 | def auc(y_true, y_pred): 304 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 305 | for k in np.linspace(0, 1, 1000)], axis=0) 306 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 307 | for k in np.linspace(0, 1, 
1000)], axis=0) 308 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 309 | binSizes = -(pfas[1:]-pfas[:-1]) 310 | s = ptas*binSizes 311 | return K.sum(s, axis=0) 312 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 313 | # PFA, prob false alert for binary classifier 314 | 315 | 316 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 317 | y_pred = K.cast(y_pred >= threshold, 'float32') 318 | # N = total number of negative labels 319 | N = K.sum(1 - y_true) 320 | # FP = total number of false alerts, alerts from the negative class labels 321 | FP = K.sum(y_pred - y_pred * y_true) 322 | return FP/N 323 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 324 | # P_TA prob true alerts for binary classifier 325 | 326 | 327 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 328 | y_pred = K.cast(y_pred >= threshold, 'float32') 329 | # P = total number of positive labels 330 | P = K.sum(y_true) 331 | # TP = total number of correct alerts, alerts from the positive class labels 332 | TP = K.sum(y_pred * y_true) 333 | return TP/P 334 | 335 | 336 | class Lookahead(object): 337 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 338 | """ 339 | 340 | def __init__(self, k=5, alpha=0.5): 341 | self.k = k 342 | self.alpha = alpha 343 | self.count = 0 344 | 345 | def inject(self, model): 346 | """Inject the Lookahead algorithm for the given model. 347 | The following code is modified from keras's _make_train_function method. 348 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 349 | """ 350 | if not hasattr(model, 'train_function'): 351 | raise RuntimeError('You must compile your model before using it.') 352 | 353 | model._check_trainable_weights_consistency() 354 | 355 | if model.train_function is None: 356 | inputs = (model._feed_inputs + 357 | model._feed_targets + 358 | model._feed_sample_weights) 359 | if model._uses_dynamic_learning_phase(): 360 | inputs += [K.learning_phase()] 361 | fast_params = model._collected_trainable_weights 362 | 363 | with K.name_scope('training'): 364 | with K.name_scope(model.optimizer.__class__.__name__): 365 | training_updates = model.optimizer.get_updates( 366 | params=fast_params, 367 | loss=model.total_loss) 368 | slow_params = [K.variable(p) for p in fast_params] 369 | fast_updates = (model.updates + 370 | training_updates + 371 | model.metrics_updates) 372 | 373 | slow_updates, copy_updates = [], [] 374 | for p, q in zip(fast_params, slow_params): 375 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 376 | copy_updates.append(K.update(p, q)) 377 | 378 | # Gets loss and metrics. Updates weights at each call. 
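# (Lookahead bookkeeping: fast_train_function below performs the ordinary optimizer step; the F wrapper counts calls and, every k-th step, moves the slow weights alpha of the way toward the fast weights and then copies the result back into the fast weights.)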
379 | fast_train_function = K.function( 380 | inputs, 381 | [model.total_loss] + model.metrics_tensors, 382 | updates=fast_updates, 383 | name='fast_train_function', 384 | **model._function_kwargs) 385 | 386 | def F(inputs): 387 | self.count += 1 388 | R = fast_train_function(inputs) 389 | if self.count % self.k == 0: 390 | K.batch_get_value(slow_updates) 391 | K.batch_get_value(copy_updates) 392 | return R 393 | 394 | model.train_function = F 395 | def get_model(embedding_matrix): 396 | 397 | K.clear_session() 398 | #The embedding layer containing the word vectors 399 | emb_layer = Embedding( 400 | input_dim=embedding_matrix.shape[0], 401 | output_dim=embedding_matrix.shape[1], 402 | weights=[embedding_matrix], 403 | trainable=False 404 | ) 405 | sdrop=SpatialDropout1D(rate=0.2) 406 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 407 | kernel_initializer=glorot_uniform(seed = 123))) 408 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 409 | kernel_initializer=glorot_uniform(seed = 123))) 410 | 411 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 412 | 413 | # Define inputs 414 | seq1 = Input(shape=(maxlen_query,)) 415 | x1 = emb_layer(seq1) 416 | x1 = sdrop(x1) 417 | lstm1 = lstm_layer(x1) 418 | gru1 = gru_layer(lstm1) 419 | att_1 = Attention(maxlen_query)(lstm1) 420 | att_3 = Attention(maxlen_query)(gru1) 421 | cnn1 = cnn1d_layer(lstm1) 422 | 423 | avg_pool = GlobalAveragePooling1D() 424 | max_pool = GlobalMaxPooling1D() 425 | 426 | seq2 = Input(shape=(maxlen_answer,)) 427 | x2 = emb_layer(seq2) 428 | x2 = sdrop(x2) 429 | lstm2 = lstm_layer(x2) 430 | gru2 = gru_layer(lstm2) 431 | att_2 = Attention(maxlen_answer)(lstm2) 432 | att_4 = Attention(maxlen_answer)(gru2) 433 | cnn2 = cnn1d_layer(lstm2) 434 | 435 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 436 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 437 | 438 | merge = Multiply()([x1, x2]) 439 | merge = Dropout(0.2)(merge) 440 | 441 | hin = Input(shape=(19,)) 442 | # htime = Dense(col_len,activation='relu')(hin) 443 | x = Concatenate()([merge,hin]) 444 | # The MLP that determines the outcome 445 | x = Dense(64,kernel_initializer=he_uniform(seed=123), activation='relu',)(x) 446 | # x = Dropout(0.2)(x) 447 | # x = BatchNormalization()(x) 448 | 449 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 450 | 451 | 452 | model = Model(inputs=[seq1,seq2,hin], outputs=pred) 453 | 454 | model.compile(loss='binary_crossentropy', 455 | optimizer=AdamW(lr=0.001,weight_decay=0.02,), 456 | metrics=["accuracy",auc]) 457 | # model.summary() 458 | return model 459 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv',feature=train_feature,batch_size=2048, 460 | label_tag=True,chunk_size=5000) 461 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv',feature=val_feature,batch_size=2048, 462 | label_tag=True,chunk_size=5000) 463 | print("train...") 464 | print("###"*30) 465 | gc.collect() 466 | K.clear_session() 467 | model = get_model(embed_matrix) 468 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 469 | lookahead.inject(model) # add into model 470 | model.summary() 471 | early_stopping = EarlyStopping(monitor='val_loss',min_delta=0.0001, patience=2, mode='min', verbose=1) 472 | reduce_lr = ReduceLROnPlateau( 473 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 474 | bst_model_path = 
out+'chizhurnn_chizhu_weight.h5' 475 | checkpoint = ModelCheckpoint(bst_model_path , monitor='val_loss', mode='min', 476 | save_best_only=True, verbose=1,save_weights_only=True ) 477 | callbacks = [checkpoint,reduce_lr,early_stopping] 478 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 479 | epochs=10, verbose=1, callbacks=callbacks, 480 | validation_data=val_gen, validation_steps = int(np.ceil(1000000/2048)), 481 | max_queue_size=10, workers=1, use_multiprocessing=False) 482 | 483 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 484 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 485 | val_prob = model.predict_generator( 486 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 487 | 488 | f = open('/home/kesci/zhifeng/val.csv', 'r') 489 | q, a, l = [], [], [] 490 | for line in f: 491 | qid, _, aid, _, label = line.strip().split(',') 492 | q.append(qid) 493 | a.append(aid) 494 | l.append(int(label)) 495 | 496 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 497 | val_df['prob'] = val_prob.flatten() 498 | 499 | 500 | def perauc(df): 501 | temp = pd.Series() 502 | try: 503 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 504 | except: 505 | temp['auc'] = 0.5 506 | return temp 507 | 508 | 509 | eval_df = val_df.groupby("qid").apply(perauc) 510 | eval_df.index = range(len(eval_df)) 511 | print("qauc:", eval_df['auc'].mean()) 512 | 513 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 514 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 515 | prob = model.predict_generator( 516 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 517 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 518 | names=['qid', 'aid', 'prob']) 519 | sub['prob'] = prob.flatten() 520 | sub.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testa.csv', 521 | index=False, header=False) 522 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 523 | feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 524 | prob = model.predict_generator( 525 | test_gen, steps=int(np.ceil(100000000/4096)), verbose=1) 526 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 527 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 528 | final['prob'] = prob.flatten() 529 | final.to_csv('/home/kesci/work/chizhu/chizhu_rnn_testb.csv', 530 | index=False, header=False) 531 | -------------------------------------------------------------------------------- /fasttext_cos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | import random as rn 5 | from tqdm import tqdm, tqdm_notebook 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 8 | import gc 9 | import time 10 | from gensim.models import Word2Vec 11 | import fasttext 12 | from gensim.models import Word2Vec 13 | import scipy.spatial.distance as ssd 14 | tqdm.pandas() 15 | input_path = "/home/kesci/input/bytedance/" 16 | out_work_path = '/home/kesci/work/zhifeng/' 17 | out_path = '/home/kesci/zhifeng/' 18 | 19 | w2v = fasttext.load_model(out_work_path+'corpus.fasttext.model') 20 | train_cosine_list = [] 21 | with open(out_path+'train.smaller.csv', 'r') as fin: 22 | for line in tqdm(fin): 23 | _, q, _, a, _ = line.strip().split(',') 24 | v1 = 
w2v.get_sentence_vector(q) 25 | v2 = w2v.get_sentence_vector(a) 26 | train_cosine_list.append(ssd.cosine(v1, v2)) 27 | pd.to_pickle(np.array(train_cosine_list), 28 | out_work_path+'train.cosine.fasttext.pkl') 29 | val_cosine_list = [] 30 | with open(out_path+'val.csv', 'r') as fin: 31 | for line in tqdm(fin): 32 | _, q, _, a, _ = line.strip().split(',') 33 | v1 = w2v.get_sentence_vector(q) 34 | v2 = w2v.get_sentence_vector(a) 35 | val_cosine_list.append(ssd.cosine(v1, v2)) 36 | pd.to_pickle(np.array(val_cosine_list), 37 | out_work_path+'val.cosine.fasttext.pkl') 38 | test_cosine_list = [] 39 | with open(input_path+'test_final_part1.csv', 'r') as fin: 40 | for line in tqdm(fin): 41 | _, q, _, a = line.strip().split(',') 42 | v1 = w2v.get_sentence_vector(q) 43 | v2 = w2v.get_sentence_vector(a) 44 | test_cosine_list.append(ssd.cosine(v1, v2)) 45 | pd.to_pickle(np.array(test_cosine_list), 46 | out_work_path+'test.cosine.fasttext.pkl') 47 | -------------------------------------------------------------------------------- /finetuning_fasttext_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | 
gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape,testa_feature.shape,testb_feature.shape) 74 | 75 | del feature 76 | gc.collect() 77 | 78 | w2v = fasttext.load_model(out+'corpus.fasttext.model') 79 | word2index = {word: index+1 for index, word in enumerate(w2v.words)} 80 | index2word = {index+1: word for index, word in enumerate(w2v.words)} 81 | 82 | 83 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 84 | maxlen_query=8): 85 | if label_tag: 86 | _, _q, _, _a, _label = line.strip().split(',') 87 | else: 88 | _, _q, _, _a = line.strip().split(',') 89 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 90 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 91 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 92 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 93 | if label_tag: 94 | return q_pad, a_pad, int(_label) 95 | return q_pad, a_pad 96 | 97 | 98 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 99 | while True: 100 | fin = open(path, 'r') 101 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 102 | for i, line in enumerate(fin): 103 | if len(batch_q) == chunk_size*batch_size: 104 | batch_q = np.array(batch_q) 105 | batch_a = np.array(batch_a) 106 | batch_f = np.array(batch_f) 107 | if label_tag: 108 | batch_label = np.array(batch_label) 109 | idx = list(range(chunk_size*batch_size)) 110 | if shuffle: 111 | np.random.shuffle(idx) 112 | for i in range(chunk_size): 113 | if label_tag: 114 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 115 | np.array( 116 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 117 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 118 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 119 | else: 120 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 121 | np.array( 122 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 123 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 124 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 125 | if label_tag: 126 | q, a, l = gen_feature_help(line, label_tag=label_tag) 127 | else: 128 | q, a = gen_feature_help(line, label_tag=label_tag) 129 | l = 0 130 | batch_q.append(q) 131 | batch_a.append(a) 132 | batch_f.append(feature[i]) 133 | if label_tag: 134 | batch_label.append(l) 135 | 136 | batch_q = np.array(batch_q) 137 | batch_a = np.array(batch_a) 138 | batch_f = np.array(batch_f) 139 | 140 | if label_tag: 141 | batch_label = np.array(batch_label) 142 | idx = list(range(len(batch_q))) 143 | if shuffle: 144 | np.random.shuffle(idx) 145 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 146 | if label_tag: 147 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 148 | np.array( 149 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 150 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 151 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 152 | else: 153 | yield 
[np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 154 | np.array( 155 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 156 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 157 | fin.close() 158 | 159 | 160 | def get_embedding_matrix(): 161 | m = np.zeros(shape=(len(index2word)+1, 100)) 162 | for i, w in index2word.items(): 163 | m[i, :] = w2v[w] 164 | return m 165 | 166 | 167 | embed_matrix = get_embedding_matrix() 168 | maxlen_query = 8 169 | maxlen_answer = 20 170 | 171 | 172 | class AdamW(Optimizer): 173 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 174 | epsilon=1e-8, decay=0., **kwargs): 175 | super(AdamW, self).__init__(**kwargs) 176 | with K.name_scope(self.__class__.__name__): 177 | self.iterations = K.variable(0, dtype='int64', name='iterations') 178 | self.lr = K.variable(lr, name='lr') 179 | self.beta_1 = K.variable(beta_1, name='beta_1') 180 | self.beta_2 = K.variable(beta_2, name='beta_2') 181 | self.decay = K.variable(decay, name='decay') 182 | # decoupled weight decay (2/4) 183 | self.wd = K.variable(weight_decay, name='weight_decay') 184 | self.epsilon = epsilon 185 | self.initial_decay = decay 186 | 187 | @interfaces.legacy_get_updates_support 188 | def get_updates(self, loss, params): 189 | grads = self.get_gradients(loss, params) 190 | self.updates = [K.update_add(self.iterations, 1)] 191 | wd = self.wd # decoupled weight decay (3/4) 192 | 193 | lr = self.lr 194 | if self.initial_decay > 0: 195 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 196 | K.dtype(self.decay)))) 197 | 198 | t = K.cast(self.iterations, K.floatx()) + 1 199 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 200 | (1. - K.pow(self.beta_1, t))) 201 | 202 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 203 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 204 | self.weights = [self.iterations] + ms + vs 205 | 206 | for p, g, m, v in zip(params, grads, ms, vs): 207 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 208 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 209 | # decoupled weight decay (4/4) 210 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 211 | 212 | self.updates.append(K.update(m, m_t)) 213 | self.updates.append(K.update(v, v_t)) 214 | new_p = p_t 215 | 216 | # Apply constraints. 
217 | if getattr(p, 'constraint', None) is not None: 218 | new_p = p.constraint(new_p) 219 | 220 | self.updates.append(K.update(p, new_p)) 221 | return self.updates 222 | 223 | def get_config(self): 224 | config = {'lr': float(K.get_value(self.lr)), 225 | 'beta_1': float(K.get_value(self.beta_1)), 226 | 'beta_2': float(K.get_value(self.beta_2)), 227 | 'decay': float(K.get_value(self.decay)), 228 | 'weight_decay': float(K.get_value(self.wd)), 229 | 'epsilon': self.epsilon} 230 | base_config = super(AdamW, self).get_config() 231 | return dict(list(base_config.items()) + list(config.items())) 232 | 233 | 234 | class Attention(Layer): 235 | def __init__(self, step_dim, 236 | W_regularizer=None, b_regularizer=None, 237 | W_constraint=None, b_constraint=None, 238 | bias=True, **kwargs): 239 | self.supports_masking = True 240 | self.init = initializers.get('glorot_uniform') 241 | 242 | self.W_regularizer = regularizers.get(W_regularizer) 243 | self.b_regularizer = regularizers.get(b_regularizer) 244 | 245 | self.W_constraint = constraints.get(W_constraint) 246 | self.b_constraint = constraints.get(b_constraint) 247 | 248 | self.bias = bias 249 | self.step_dim = step_dim 250 | self.features_dim = 0 251 | super(Attention, self).__init__(**kwargs) 252 | 253 | def build(self, input_shape): 254 | assert len(input_shape) == 3 255 | 256 | self.W = self.add_weight((input_shape[-1],), 257 | initializer=self.init, 258 | name='{}_W'.format(self.name), 259 | regularizer=self.W_regularizer, 260 | constraint=self.W_constraint) 261 | self.features_dim = input_shape[-1] 262 | 263 | if self.bias: 264 | self.b = self.add_weight((input_shape[1],), 265 | initializer='zero', 266 | name='{}_b'.format(self.name), 267 | regularizer=self.b_regularizer, 268 | constraint=self.b_constraint) 269 | else: 270 | self.b = None 271 | 272 | self.built = True 273 | 274 | def compute_mask(self, input, input_mask=None): 275 | return None 276 | 277 | def call(self, x, mask=None): 278 | features_dim = self.features_dim 279 | step_dim = self.step_dim 280 | 281 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 282 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 283 | 284 | if self.bias: 285 | eij += self.b 286 | 287 | eij = K.tanh(eij) 288 | 289 | a = K.exp(eij) 290 | 291 | if mask is not None: 292 | a *= K.cast(mask, K.floatx()) 293 | 294 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 295 | 296 | a = K.expand_dims(a) 297 | weighted_input = x * a 298 | return K.sum(weighted_input, axis=1) 299 | 300 | def compute_output_shape(self, input_shape): 301 | return input_shape[0], self.features_dim 302 | 303 | # AUC for a binary classifier 304 | 305 | 306 | def auc(y_true, y_pred): 307 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 308 | for k in np.linspace(0, 1, 1000)], axis=0) 309 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 310 | for k in np.linspace(0, 1, 1000)], axis=0) 311 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 312 | binSizes = -(pfas[1:]-pfas[:-1]) 313 | s = ptas*binSizes 314 | return K.sum(s, axis=0) 315 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 316 | # PFA, prob false alert for binary classifier 317 | 318 | 319 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 320 | y_pred = K.cast(y_pred >= threshold, 'float32') 321 | # N = total number of negative labels 322 | N = K.sum(1 - y_true) 323 | # FP = total number of false alerts, alerts from the negative 
class labels 324 | FP = K.sum(y_pred - y_pred * y_true) 325 | return FP/N 326 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 327 | # P_TA prob true alerts for binary classifier 328 | 329 | 330 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 331 | y_pred = K.cast(y_pred >= threshold, 'float32') 332 | # P = total number of positive labels 333 | P = K.sum(y_true) 334 | # TP = total number of correct alerts, alerts from the positive class labels 335 | TP = K.sum(y_pred * y_true) 336 | return TP/P 337 | 338 | 339 | class Lookahead(object): 340 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 341 | """ 342 | 343 | def __init__(self, k=5, alpha=0.5): 344 | self.k = k 345 | self.alpha = alpha 346 | self.count = 0 347 | 348 | def inject(self, model): 349 | """Inject the Lookahead algorithm for the given model. 350 | The following code is modified from keras's _make_train_function method. 351 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 352 | """ 353 | if not hasattr(model, 'train_function'): 354 | raise RuntimeError('You must compile your model before using it.') 355 | 356 | model._check_trainable_weights_consistency() 357 | 358 | if model.train_function is None: 359 | inputs = (model._feed_inputs + 360 | model._feed_targets + 361 | model._feed_sample_weights) 362 | if model._uses_dynamic_learning_phase(): 363 | inputs += [K.learning_phase()] 364 | fast_params = model._collected_trainable_weights 365 | 366 | with K.name_scope('training'): 367 | with K.name_scope(model.optimizer.__class__.__name__): 368 | training_updates = model.optimizer.get_updates( 369 | params=fast_params, 370 | loss=model.total_loss) 371 | slow_params = [K.variable(p) for p in fast_params] 372 | fast_updates = (model.updates + 373 | training_updates + 374 | model.metrics_updates) 375 | 376 | slow_updates, copy_updates = [], [] 377 | for p, q in zip(fast_params, slow_params): 378 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 379 | copy_updates.append(K.update(p, q)) 380 | 381 | # Gets loss and metrics. Updates weights at each call. 
382 | fast_train_function = K.function( 383 | inputs, 384 | [model.total_loss] + model.metrics_tensors, 385 | updates=fast_updates, 386 | name='fast_train_function', 387 | **model._function_kwargs) 388 | 389 | def F(inputs): 390 | self.count += 1 391 | R = fast_train_function(inputs) 392 | if self.count % self.k == 0: 393 | K.batch_get_value(slow_updates) 394 | K.batch_get_value(copy_updates) 395 | return R 396 | 397 | model.train_function = F 398 | 399 | 400 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 401 | "Create embedding layer from a pretrained weights array" 402 | in_dim, out_dim = pretrained_weights.shape 403 | embedding = Embedding(in_dim, out_dim, weights=[ 404 | pretrained_weights], trainable=False, **kwargs) 405 | return embedding 406 | 407 | 408 | def unchanged_shape(input_shape): 409 | "Function for Lambda layer" 410 | return input_shape 411 | 412 | 413 | def substract(input_1, input_2): 414 | "Substract element-wise" 415 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 416 | out_ = Add()([input_1, neg_input_2]) 417 | return out_ 418 | 419 | 420 | def submult(input_1, input_2): 421 | "Get multiplication and subtraction then concatenate results" 422 | mult = Multiply()([input_1, input_2]) 423 | sub = substract(input_1, input_2) 424 | out_ = Concatenate()([sub, mult]) 425 | return out_ 426 | 427 | 428 | def apply_multiple(input_, layers): 429 | "Apply layers to input then concatenate result" 430 | if not len(layers) > 1: 431 | raise ValueError('Layers list should contain more than 1 layer') 432 | else: 433 | agg_ = [] 434 | for layer in layers: 435 | agg_.append(layer(input_)) 436 | out_ = Concatenate()(agg_) 437 | return out_ 438 | 439 | 440 | def time_distributed(input_, layers): 441 | "Apply a list of layers in TimeDistributed mode" 442 | out_ = [] 443 | node_ = input_ 444 | for layer_ in layers: 445 | node_ = TimeDistributed(layer_)(node_) 446 | out_ = node_ 447 | return out_ 448 | 449 | 450 | def soft_attention_alignment(input_1, input_2): 451 | "Align text representation with neural soft attention" 452 | attention = Dot(axes=-1)([input_1, input_2]) 453 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 454 | output_shape=unchanged_shape)(attention) 455 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 456 | output_shape=unchanged_shape)(attention)) 457 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 458 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 459 | return in1_aligned, in2_aligned 460 | 461 | 462 | def decomposable_attention(pretrained_weights, 463 | num_shape, 464 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 465 | compare_dim=500, compare_dropout=0.2, 466 | dense_dim=300, dense_dropout=0.2, 467 | lr=1e-3, activation='elu', maxlen=20): 468 | # Based on: https://arxiv.org/abs/1606.01933 469 | 470 | q1 = Input(name='q1', shape=(maxlen,)) 471 | q2 = Input(name='q2', shape=(maxlen,)) 472 | 473 | # Embedding 474 | embedding = create_pretrained_embedding(pretrained_weights, 475 | mask_zero=False) 476 | q1_embed = embedding(q1) 477 | q2_embed = embedding(q2) 478 | 479 | # Projection 480 | projection_layers = [] 481 | if projection_hidden > 0: 482 | projection_layers.extend([ 483 | Dense(projection_hidden, activation=activation), 484 | Dropout(rate=projection_dropout), 485 | ]) 486 | projection_layers.extend([ 487 | Dense(projection_dim, activation=None), 488 | Dropout(rate=projection_dropout), 489 | ]) 490 | q1_encoded = time_distributed(q1_embed, projection_layers) 491 | 
q2_encoded = time_distributed(q2_embed, projection_layers) 492 | 493 | # Attention 494 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 495 | 496 | # Compare 497 | q1_combined = Concatenate()( 498 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 499 | q2_combined = Concatenate()( 500 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 501 | compare_layers = [ 502 | Dense(compare_dim, activation=activation), 503 | Dropout(compare_dropout), 504 | Dense(compare_dim, activation=activation), 505 | Dropout(compare_dropout), 506 | ] 507 | q1_compare = time_distributed(q1_combined, compare_layers) 508 | q2_compare = time_distributed(q2_combined, compare_layers) 509 | 510 | # Aggregate 511 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 512 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 513 | 514 | # Classifier 515 | merged = Concatenate()([q1_rep, q2_rep]) 516 | dense = BatchNormalization()(merged) 517 | dense = Dense(dense_dim, activation=activation)(dense) 518 | dense = Dropout(dense_dropout)(dense) 519 | dense = BatchNormalization()(dense) 520 | dense = Dense(dense_dim, activation=activation)(dense) 521 | dense = Dropout(dense_dropout)(dense) 522 | out_ = Dense(1, activation='sigmoid')(dense) 523 | 524 | model = Model(inputs=[q1, q2], outputs=out_) 525 | model.compile(loss='binary_crossentropy', 526 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 527 | metrics=["accuracy", auc]) 528 | return model 529 | 530 | 531 | def esim(embedding_matrix, 532 | maxlen=20, 533 | lstm_dim=64, 534 | dense_dim=128, 535 | dense_dropout=0.5): 536 | # Based on arXiv:1609.06038 537 | q1 = Input(name='q1', shape=(8,)) 538 | q2 = Input(name='q2', shape=(20,)) 539 | 540 | # Embedding 541 | embedding = create_pretrained_embedding( 542 | embedding_matrix, mask_zero=False) 543 | bn = BatchNormalization(axis=2) 544 | q1_embed = bn(embedding(q1)) 545 | q2_embed = bn(embedding(q2)) 546 | 547 | # Encode 548 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 549 | q1_encoded = encode(q1_embed) 550 | q2_encoded = encode(q2_embed) 551 | 552 | # Attention 553 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 554 | 555 | # Compose 556 | q1_combined = Concatenate()( 557 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 558 | q2_combined = Concatenate()( 559 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 560 | 561 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 562 | q1_compare = compose(q1_combined) 563 | q2_compare = compose(q2_combined) 564 | 565 | # Aggregate 566 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 567 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 568 | 569 | # leaks_input = Input(shape=(num_shape,)) 570 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 571 | 572 | # Classifier 573 | merged = Concatenate()([q1_rep, q2_rep]) 574 | 575 | dense = BatchNormalization()(merged) 576 | dense = Dense(dense_dim, activation='elu')(dense) 577 | dense = BatchNormalization()(dense) 578 | dense = Dropout(dense_dropout)(dense) 579 | dense = Dense(dense_dim, activation='elu')(dense) 580 | dense = BatchNormalization()(dense) 581 | dense = Dropout(dense_dropout)(dense) 582 | out_ = Dense(1, activation='sigmoid')(dense) 583 | 584 | model = Model(inputs=[q1, q2], outputs=out_) 585 | model.compile(loss='binary_crossentropy', 586 | optimizer=AdamW(lr=0.0003, 
weight_decay=0.02,), 587 | metrics=["accuracy", auc]) 588 | return model 589 | 590 | 591 | def aux_esim_model(embed_matrix, model_weight_path): 592 | base_model = esim(embed_matrix) 593 | base_model.load_weights(model_weight_path) 594 | input_q, input_a = base_model.inputs 595 | input_f = Input((19,)) 596 | hidden_esim = base_model.get_layer(index=28).output 597 | merged = Concatenate()([hidden_esim, input_f]) 598 | #dense = BatchNormalization()(merged) 599 | dense = Dense(512, activation='relu')(merged) 600 | #dense = BatchNormalization()(dense) 601 | dense = Dropout(0.5)(dense) 602 | dense = Dense(256, activation='relu')(dense) 603 | #dense = BatchNormalization()(dense) 604 | dense = Dropout(0.5)(dense) 605 | out_ = Dense(1, activation='sigmoid')(dense) 606 | 607 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 608 | model.compile(loss='binary_crossentropy', 609 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 610 | metrics=["accuracy"]) 611 | return model 612 | 613 | 614 | ####模型训练 615 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048, 616 | label_tag=True, chunk_size=5000) 617 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048, 618 | label_tag=True, chunk_size=5000) 619 | print("train...") 620 | print("###"*30) 621 | gc.collect() 622 | K.clear_session() 623 | weight_path = '/home/kesci/work/zhifeng/zhifeng_esim_weight_1_0.6924413924179077.h5' 624 | model = aux_esim_model(embed_matrix, weight_path) 625 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 626 | lookahead.inject(model) # add into model 627 | model.summary() 628 | early_stopping = EarlyStopping( 629 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 630 | reduce_lr = ReduceLROnPlateau( 631 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 632 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5' 633 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 634 | save_best_only=False, 635 | verbose=1, save_weights_only=True, period=1) 636 | callbacks = [checkpoint, reduce_lr, early_stopping] 637 | # print("load weight....") 638 | 639 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 640 | epochs=10, verbose=1, callbacks=callbacks, 641 | validation_data=val_gen, validation_steps=int( 642 | np.ceil(1000000/2048)), 643 | max_queue_size=10, workers=1, use_multiprocessing=False) 644 | 645 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 646 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 647 | val_prob = model.predict_generator( 648 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 649 | 650 | f = open('/home/kesci/zhifeng/val.csv','r') 651 | q,a,l=[],[],[] 652 | for line in f: 653 | qid,_,aid,_,label = line.strip().split(',') 654 | q.append(qid) 655 | a.append(aid) 656 | l.append(int(label)) 657 | 658 | val_df = pd.DataFrame({'qid':q,'aid':a,'label':l}) 659 | val_df['prob'] = val_prob.flatten() 660 | 661 | roc_auc_score(val_df['label'], val_df['prob']) 662 | 663 | 664 | def perauc(df): 665 | temp = pd.Series() 666 | try: 667 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 668 | except: 669 | temp['auc'] = 0.5 670 | return temp 671 | 672 | 673 | eval_df = val_df.groupby("qid").apply(perauc) 674 | eval_df.index = range(len(eval_df)) 675 | print("qauc:", eval_df['auc'].mean()) 676 | 677 | 
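# A minimal sanity check of the QAUC metric on toy data (assumption: "qauc" above means the
# ROC-AUC computed within each qid group and then averaged over qids; the qid/label/prob values
# below are purely illustrative).
_toy = pd.DataFrame({'qid': ['q1', 'q1', 'q1', 'q2', 'q2', 'q2'],
                     'label': [1, 0, 0, 0, 1, 1],
                     'prob': [0.9, 0.2, 0.4, 0.5, 0.8, 0.3]})
# q1 ranks its positive first (per-qid AUC = 1.0); q2 mis-ranks one pair (AUC = 0.5); mean = 0.75.
print("toy qauc (expected 0.75):", _toy.groupby('qid').apply(perauc)['auc'].mean())
del _toy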
test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 678 | feature=testa_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 679 | prob = model.predict_generator(test_gen,steps=int(np.ceil(20000000/4096)),verbose=1) 680 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',names=['qid','aid','prob']) 681 | sub['prob'] = prob.flatten() 682 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv',index=False,header=False) 683 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 684 | feature=testb_feature,batch_size=4096,label_tag=False,chunk_size=1,shuffle=False) 685 | prob = model.predict_generator(test_gen,steps=int(np.ceil(100000000/4096)),verbose=1) 686 | final = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])[['query_id','query_title_id']] 687 | final['prob'] = prob.flatten() 688 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv',index=False,header=False) -------------------------------------------------------------------------------- /finetuning_w2v_esim.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 | "/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v)
+ \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape, 74 | testa_feature.shape, testb_feature.shape) 75 | 76 | del feature 77 | gc.collect() 78 | 79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 80 | 81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 83 | 84 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 85 | maxlen_query=8): 86 | if label_tag: 87 | _, _q, _, _a, _label = line.strip().split(',') 88 | else: 89 | _, _q, _, _a = line.strip().split(',') 90 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 91 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 92 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 93 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 94 | if label_tag: 95 | return q_pad, a_pad, int(_label) 96 | return q_pad, a_pad 97 | 98 | 99 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 100 | while True: 101 | fin = open(path, 'r') 102 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 103 | for i, line in enumerate(fin): 104 | if len(batch_q) == chunk_size*batch_size: 105 | batch_q = np.array(batch_q) 106 | batch_a = np.array(batch_a) 107 | batch_f = np.array(batch_f) 108 | if label_tag: 109 | batch_label = np.array(batch_label) 110 | idx = list(range(chunk_size*batch_size)) 111 | if shuffle: 112 | np.random.shuffle(idx) 113 | for i in range(chunk_size): 114 | if label_tag: 115 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 116 | np.array( 117 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 118 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 119 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 120 | else: 121 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 122 | np.array( 123 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 124 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 125 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 126 | if label_tag: 127 | q, a, l = gen_feature_help(line, label_tag=label_tag) 128 | else: 129 | q, a = gen_feature_help(line, label_tag=label_tag) 130 | l = 0 131 | batch_q.append(q) 132 | batch_a.append(a) 133 | batch_f.append(feature[i]) 134 | if label_tag: 135 | batch_label.append(l) 136 | 137 | batch_q = np.array(batch_q) 138 | batch_a = np.array(batch_a) 139 | batch_f = np.array(batch_f) 140 | 141 | if label_tag: 142 | batch_label = np.array(batch_label) 143 | idx = list(range(len(batch_q))) 144 | if shuffle: 145 | np.random.shuffle(idx) 146 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 147 | if label_tag: 148 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 149 | np.array( 150 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 151 | 
np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 152 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 153 | else: 154 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 155 | np.array( 156 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 157 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 158 | fin.close() 159 | 160 | 161 | def get_embedding_matrix(): 162 | m = np.zeros(shape=(len(index2word)+1, 300)) 163 | for i, w in index2word.items(): 164 | m[i, :] = w2v[w] 165 | return m 166 | 167 | 168 | embed_matrix = get_embedding_matrix() 169 | maxlen_query = 8 170 | maxlen_answer = 20 171 | 172 | 173 | class AdamW(Optimizer): 174 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 175 | epsilon=1e-8, decay=0., **kwargs): 176 | super(AdamW, self).__init__(**kwargs) 177 | with K.name_scope(self.__class__.__name__): 178 | self.iterations = K.variable(0, dtype='int64', name='iterations') 179 | self.lr = K.variable(lr, name='lr') 180 | self.beta_1 = K.variable(beta_1, name='beta_1') 181 | self.beta_2 = K.variable(beta_2, name='beta_2') 182 | self.decay = K.variable(decay, name='decay') 183 | # decoupled weight decay (2/4) 184 | self.wd = K.variable(weight_decay, name='weight_decay') 185 | self.epsilon = epsilon 186 | self.initial_decay = decay 187 | 188 | @interfaces.legacy_get_updates_support 189 | def get_updates(self, loss, params): 190 | grads = self.get_gradients(loss, params) 191 | self.updates = [K.update_add(self.iterations, 1)] 192 | wd = self.wd # decoupled weight decay (3/4) 193 | 194 | lr = self.lr 195 | if self.initial_decay > 0: 196 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 197 | K.dtype(self.decay)))) 198 | 199 | t = K.cast(self.iterations, K.floatx()) + 1 200 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 201 | (1. - K.pow(self.beta_1, t))) 202 | 203 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 204 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 205 | self.weights = [self.iterations] + ms + vs 206 | 207 | for p, g, m, v in zip(params, grads, ms, vs): 208 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 209 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 210 | # decoupled weight decay (4/4) 211 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 212 | 213 | self.updates.append(K.update(m, m_t)) 214 | self.updates.append(K.update(v, v_t)) 215 | new_p = p_t 216 | 217 | # Apply constraints. 
218 | if getattr(p, 'constraint', None) is not None: 219 | new_p = p.constraint(new_p) 220 | 221 | self.updates.append(K.update(p, new_p)) 222 | return self.updates 223 | 224 | def get_config(self): 225 | config = {'lr': float(K.get_value(self.lr)), 226 | 'beta_1': float(K.get_value(self.beta_1)), 227 | 'beta_2': float(K.get_value(self.beta_2)), 228 | 'decay': float(K.get_value(self.decay)), 229 | 'weight_decay': float(K.get_value(self.wd)), 230 | 'epsilon': self.epsilon} 231 | base_config = super(AdamW, self).get_config() 232 | return dict(list(base_config.items()) + list(config.items())) 233 | 234 | 235 | class Attention(Layer): 236 | def __init__(self, step_dim, 237 | W_regularizer=None, b_regularizer=None, 238 | W_constraint=None, b_constraint=None, 239 | bias=True, **kwargs): 240 | self.supports_masking = True 241 | self.init = initializers.get('glorot_uniform') 242 | 243 | self.W_regularizer = regularizers.get(W_regularizer) 244 | self.b_regularizer = regularizers.get(b_regularizer) 245 | 246 | self.W_constraint = constraints.get(W_constraint) 247 | self.b_constraint = constraints.get(b_constraint) 248 | 249 | self.bias = bias 250 | self.step_dim = step_dim 251 | self.features_dim = 0 252 | super(Attention, self).__init__(**kwargs) 253 | 254 | def build(self, input_shape): 255 | assert len(input_shape) == 3 256 | 257 | self.W = self.add_weight((input_shape[-1],), 258 | initializer=self.init, 259 | name='{}_W'.format(self.name), 260 | regularizer=self.W_regularizer, 261 | constraint=self.W_constraint) 262 | self.features_dim = input_shape[-1] 263 | 264 | if self.bias: 265 | self.b = self.add_weight((input_shape[1],), 266 | initializer='zero', 267 | name='{}_b'.format(self.name), 268 | regularizer=self.b_regularizer, 269 | constraint=self.b_constraint) 270 | else: 271 | self.b = None 272 | 273 | self.built = True 274 | 275 | def compute_mask(self, input, input_mask=None): 276 | return None 277 | 278 | def call(self, x, mask=None): 279 | features_dim = self.features_dim 280 | step_dim = self.step_dim 281 | 282 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 283 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 284 | 285 | if self.bias: 286 | eij += self.b 287 | 288 | eij = K.tanh(eij) 289 | 290 | a = K.exp(eij) 291 | 292 | if mask is not None: 293 | a *= K.cast(mask, K.floatx()) 294 | 295 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 296 | 297 | a = K.expand_dims(a) 298 | weighted_input = x * a 299 | return K.sum(weighted_input, axis=1) 300 | 301 | def compute_output_shape(self, input_shape): 302 | return input_shape[0], self.features_dim 303 | 304 | # AUC for a binary classifier 305 | 306 | 307 | def auc(y_true, y_pred): 308 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 309 | for k in np.linspace(0, 1, 1000)], axis=0) 310 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 311 | for k in np.linspace(0, 1, 1000)], axis=0) 312 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 313 | binSizes = -(pfas[1:]-pfas[:-1]) 314 | s = ptas*binSizes 315 | return K.sum(s, axis=0) 316 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 317 | # PFA, prob false alert for binary classifier 318 | 319 | 320 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 321 | y_pred = K.cast(y_pred >= threshold, 'float32') 322 | # N = total number of negative labels 323 | N = K.sum(1 - y_true) 324 | # FP = total number of false alerts, alerts from the negative 
class labels 325 | FP = K.sum(y_pred - y_pred * y_true) 326 | return FP/N 327 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 328 | # P_TA prob true alerts for binary classifier 329 | 330 | 331 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 332 | y_pred = K.cast(y_pred >= threshold, 'float32') 333 | # P = total number of positive labels 334 | P = K.sum(y_true) 335 | # TP = total number of correct alerts, alerts from the positive class labels 336 | TP = K.sum(y_pred * y_true) 337 | return TP/P 338 | 339 | 340 | class Lookahead(object): 341 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 342 | """ 343 | 344 | def __init__(self, k=5, alpha=0.5): 345 | self.k = k 346 | self.alpha = alpha 347 | self.count = 0 348 | 349 | def inject(self, model): 350 | """Inject the Lookahead algorithm for the given model. 351 | The following code is modified from keras's _make_train_function method. 352 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 353 | """ 354 | if not hasattr(model, 'train_function'): 355 | raise RuntimeError('You must compile your model before using it.') 356 | 357 | model._check_trainable_weights_consistency() 358 | 359 | if model.train_function is None: 360 | inputs = (model._feed_inputs + 361 | model._feed_targets + 362 | model._feed_sample_weights) 363 | if model._uses_dynamic_learning_phase(): 364 | inputs += [K.learning_phase()] 365 | fast_params = model._collected_trainable_weights 366 | 367 | with K.name_scope('training'): 368 | with K.name_scope(model.optimizer.__class__.__name__): 369 | training_updates = model.optimizer.get_updates( 370 | params=fast_params, 371 | loss=model.total_loss) 372 | slow_params = [K.variable(p) for p in fast_params] 373 | fast_updates = (model.updates + 374 | training_updates + 375 | model.metrics_updates) 376 | 377 | slow_updates, copy_updates = [], [] 378 | for p, q in zip(fast_params, slow_params): 379 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 380 | copy_updates.append(K.update(p, q)) 381 | 382 | # Gets loss and metrics. Updates weights at each call. 
383 | fast_train_function = K.function( 384 | inputs, 385 | [model.total_loss] + model.metrics_tensors, 386 | updates=fast_updates, 387 | name='fast_train_function', 388 | **model._function_kwargs) 389 | 390 | def F(inputs): 391 | self.count += 1 392 | R = fast_train_function(inputs) 393 | if self.count % self.k == 0: 394 | K.batch_get_value(slow_updates) 395 | K.batch_get_value(copy_updates) 396 | return R 397 | 398 | model.train_function = F 399 | 400 | 401 | def create_pretrained_embedding(pretrained_weights, trainable=False, **kwargs): 402 | "Create embedding layer from a pretrained weights array" 403 | in_dim, out_dim = pretrained_weights.shape 404 | embedding = Embedding(in_dim, out_dim, weights=[ 405 | pretrained_weights], trainable=False, **kwargs) 406 | return embedding 407 | 408 | 409 | def unchanged_shape(input_shape): 410 | "Function for Lambda layer" 411 | return input_shape 412 | 413 | 414 | def substract(input_1, input_2): 415 | "Substract element-wise" 416 | neg_input_2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2) 417 | out_ = Add()([input_1, neg_input_2]) 418 | return out_ 419 | 420 | 421 | def submult(input_1, input_2): 422 | "Get multiplication and subtraction then concatenate results" 423 | mult = Multiply()([input_1, input_2]) 424 | sub = substract(input_1, input_2) 425 | out_ = Concatenate()([sub, mult]) 426 | return out_ 427 | 428 | 429 | def apply_multiple(input_, layers): 430 | "Apply layers to input then concatenate result" 431 | if not len(layers) > 1: 432 | raise ValueError('Layers list should contain more than 1 layer') 433 | else: 434 | agg_ = [] 435 | for layer in layers: 436 | agg_.append(layer(input_)) 437 | out_ = Concatenate()(agg_) 438 | return out_ 439 | 440 | 441 | def time_distributed(input_, layers): 442 | "Apply a list of layers in TimeDistributed mode" 443 | out_ = [] 444 | node_ = input_ 445 | for layer_ in layers: 446 | node_ = TimeDistributed(layer_)(node_) 447 | out_ = node_ 448 | return out_ 449 | 450 | 451 | def soft_attention_alignment(input_1, input_2): 452 | "Align text representation with neural soft attention" 453 | attention = Dot(axes=-1)([input_1, input_2]) 454 | w_att_1 = Lambda(lambda x: softmax(x, axis=1), 455 | output_shape=unchanged_shape)(attention) 456 | w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), 457 | output_shape=unchanged_shape)(attention)) 458 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 459 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 460 | return in1_aligned, in2_aligned 461 | 462 | 463 | def decomposable_attention(pretrained_weights, 464 | num_shape, 465 | projection_dim=300, projection_hidden=0, projection_dropout=0.2, 466 | compare_dim=500, compare_dropout=0.2, 467 | dense_dim=300, dense_dropout=0.2, 468 | lr=1e-3, activation='elu', maxlen=20): 469 | # Based on: https://arxiv.org/abs/1606.01933 470 | 471 | q1 = Input(name='q1', shape=(maxlen,)) 472 | q2 = Input(name='q2', shape=(maxlen,)) 473 | 474 | # Embedding 475 | embedding = create_pretrained_embedding(pretrained_weights, 476 | mask_zero=False) 477 | q1_embed = embedding(q1) 478 | q2_embed = embedding(q2) 479 | 480 | # Projection 481 | projection_layers = [] 482 | if projection_hidden > 0: 483 | projection_layers.extend([ 484 | Dense(projection_hidden, activation=activation), 485 | Dropout(rate=projection_dropout), 486 | ]) 487 | projection_layers.extend([ 488 | Dense(projection_dim, activation=None), 489 | Dropout(rate=projection_dropout), 490 | ]) 491 | q1_encoded = time_distributed(q1_embed, projection_layers) 492 | 
q2_encoded = time_distributed(q2_embed, projection_layers) 493 | 494 | # Attention 495 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 496 | 497 | # Compare 498 | q1_combined = Concatenate()( 499 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 500 | q2_combined = Concatenate()( 501 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 502 | compare_layers = [ 503 | Dense(compare_dim, activation=activation), 504 | Dropout(compare_dropout), 505 | Dense(compare_dim, activation=activation), 506 | Dropout(compare_dropout), 507 | ] 508 | q1_compare = time_distributed(q1_combined, compare_layers) 509 | q2_compare = time_distributed(q2_combined, compare_layers) 510 | 511 | # Aggregate 512 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 513 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 514 | 515 | # Classifier 516 | merged = Concatenate()([q1_rep, q2_rep]) 517 | dense = BatchNormalization()(merged) 518 | dense = Dense(dense_dim, activation=activation)(dense) 519 | dense = Dropout(dense_dropout)(dense) 520 | dense = BatchNormalization()(dense) 521 | dense = Dense(dense_dim, activation=activation)(dense) 522 | dense = Dropout(dense_dropout)(dense) 523 | out_ = Dense(1, activation='sigmoid')(dense) 524 | 525 | model = Model(inputs=[q1, q2], outputs=out_) 526 | model.compile(loss='binary_crossentropy', 527 | optimizer=AdamW(lr=0.001, weight_decay=0.02,), 528 | metrics=["accuracy", auc]) 529 | return model 530 | 531 | 532 | def esim(embedding_matrix, 533 | maxlen=20, 534 | lstm_dim=64, 535 | dense_dim=128, 536 | dense_dropout=0.5): 537 | # Based on arXiv:1609.06038 538 | q1 = Input(name='q1', shape=(8,)) 539 | q2 = Input(name='q2', shape=(20,)) 540 | 541 | # Embedding 542 | embedding = create_pretrained_embedding( 543 | embedding_matrix, mask_zero=False) 544 | bn = BatchNormalization(axis=2) 545 | q1_embed = bn(embedding(q1)) 546 | q2_embed = bn(embedding(q2)) 547 | 548 | # Encode 549 | encode = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 550 | q1_encoded = encode(q1_embed) 551 | q2_encoded = encode(q2_embed) 552 | 553 | # Attention 554 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 555 | 556 | # Compose 557 | q1_combined = Concatenate()( 558 | [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)]) 559 | q2_combined = Concatenate()( 560 | [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)]) 561 | 562 | compose = Bidirectional(CuDNNLSTM(lstm_dim, return_sequences=True)) 563 | q1_compare = compose(q1_combined) 564 | q2_compare = compose(q2_combined) 565 | 566 | # Aggregate 567 | q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 568 | q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()]) 569 | 570 | # leaks_input = Input(shape=(num_shape,)) 571 | # leaks_dense = Dense(dense_dim//2, activation='relu')(leaks_input) 572 | 573 | # Classifier 574 | merged = Concatenate()([q1_rep, q2_rep]) 575 | 576 | dense = BatchNormalization()(merged) 577 | dense = Dense(dense_dim, activation='elu')(dense) 578 | dense = BatchNormalization()(dense) 579 | dense = Dropout(dense_dropout)(dense) 580 | dense = Dense(dense_dim, activation='elu')(dense) 581 | dense = BatchNormalization()(dense) 582 | dense = Dropout(dense_dropout)(dense) 583 | out_ = Dense(1, activation='sigmoid')(dense) 584 | 585 | model = Model(inputs=[q1, q2], outputs=out_) 586 | model.compile(loss='binary_crossentropy', 587 | optimizer=AdamW(lr=0.0003, 
weight_decay=0.02,), 588 | metrics=["accuracy", auc]) 589 | return model 590 | 591 | 592 | def aux_esim_model(embed_matrix, model_weight_path): 593 | base_model = esim(embed_matrix) 594 | base_model.load_weights(model_weight_path) 595 | input_q, input_a = base_model.inputs 596 | input_f = Input((19,)) 597 | hidden_esim = base_model.get_layer(index=28).output 598 | merged = Concatenate()([hidden_esim, input_f]) 599 | #dense = BatchNormalization()(merged) 600 | dense = Dense(512, activation='relu')(merged) 601 | #dense = BatchNormalization()(dense) 602 | dense = Dropout(0.5)(dense) 603 | dense = Dense(256, activation='relu')(dense) 604 | #dense = BatchNormalization()(dense) 605 | dense = Dropout(0.5)(dense) 606 | out_ = Dense(1, activation='sigmoid')(dense) 607 | 608 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 609 | model.compile(loss='binary_crossentropy', 610 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 611 | metrics=["accuracy"]) 612 | return model 613 | 614 | 615 | ####模型训练 616 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048, 617 | label_tag=True, chunk_size=5000) 618 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048, 619 | label_tag=True, chunk_size=5000) 620 | print("train...") 621 | print("###"*30) 622 | gc.collect() 623 | K.clear_session() 624 | weight_path = '/home/kesci/work/chizhu/chizhu_w2v_esim_weight_1_0.44060374074871167.h5' 625 | model = aux_esim_model(embed_matrix, weight_path) 626 | lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 627 | lookahead.inject(model) # add into model 628 | model.summary() 629 | early_stopping = EarlyStopping( 630 | monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1) 631 | reduce_lr = ReduceLROnPlateau( 632 | monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2) 633 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5' 634 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 635 | save_best_only=False, 636 | verbose=1, save_weights_only=True, period=1) 637 | callbacks = [checkpoint, reduce_lr, early_stopping] 638 | # print("load weight....") 639 | 640 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)), 641 | epochs=10, verbose=1, callbacks=callbacks, 642 | validation_data=val_gen, validation_steps=int( 643 | np.ceil(1000000/2048)), 644 | max_queue_size=10, workers=1, use_multiprocessing=False) 645 | 646 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, 647 | batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False) 648 | val_prob = model.predict_generator( 649 | val_gen, steps=int(np.ceil(1000000/4096)), verbose=1) 650 | 651 | f = open('/home/kesci/zhifeng/val.csv', 'r') 652 | q, a, l = [], [], [] 653 | for line in f: 654 | qid, _, aid, _, label = line.strip().split(',') 655 | q.append(qid) 656 | a.append(aid) 657 | l.append(int(label)) 658 | 659 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l}) 660 | val_df['prob'] = val_prob.flatten() 661 | 662 | roc_auc_score(val_df['label'], val_df['prob']) 663 | 664 | 665 | def perauc(df): 666 | temp = pd.Series() 667 | try: 668 | temp['auc'] = roc_auc_score(df['label'], df['prob']) 669 | except: 670 | temp['auc'] = 0.5 671 | return temp 672 | 673 | 674 | eval_df = val_df.groupby("qid").apply(perauc) 675 | eval_df.index = range(len(eval_df)) 676 | print("qauc:", 
eval_df['auc'].mean()) 677 | 678 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv', 679 | feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 680 | prob = model.predict_generator( 681 | test_gen, steps=int(np.ceil(20000000/4096)), verbose=1) 682 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv', 683 | names=['qid', 'aid', 'prob']) 684 | sub['prob'] = prob.flatten() 685 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False) 686 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv', 687 | feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False) 688 | prob = model.predict_generator(test_gen, steps=int( 689 | np.ceil(100000000/4096)), verbose=1) 690 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[ 691 | 'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']] 692 | final['prob'] = prob.flatten() 693 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False) 694 | -------------------------------------------------------------------------------- /finetuning_w2v_rnn.py: -------------------------------------------------------------------------------- 1 | from keras.activations import softmax 2 | from sklearn.preprocessing import StandardScaler 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import random as rn 7 | from tqdm import tqdm, tqdm_notebook 8 | import tensorflow as tf 9 | from sklearn.metrics import roc_auc_score 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.optimizers import Adam 13 | from keras import backend as K 14 | from keras.optimizers import * 15 | from keras.callbacks import * 16 | from keras.layers import * 17 | from keras.models import * 18 | from keras.engine.topology import Layer 19 | from keras import initializers, regularizers, constraints, optimizers, layers 20 | from keras.initializers import * 21 | import keras 22 | from sklearn.model_selection import StratifiedKFold, GroupKFold 23 | import gc 24 | import time 25 | from gensim.models import Word2Vec 26 | import logging 27 | import Levenshtein 28 | import fasttext 29 | tqdm.pandas() 30 | np.random.seed(1017) 31 | rn.seed(1017) 32 | tf.set_random_seed(1017) 33 | path = "/home/kesci/input/bytedance/" 34 | out = '/home/kesci/work/zhifeng/' 35 | out_chizhu = '/home/kesci/work/chizhu/' 36 | print(os.listdir(path)) 37 | 38 | f1 = pd.read_csv(out_chizhu + 'f1.csv') 39 | f2 = pd.read_csv(out_chizhu + 'f2.csv') 40 | f3 = pd.read_csv(out_chizhu + 'f3.csv') 41 | feature = pd.concat([f1, f2, f3], sort=False, axis=1) 42 | del f1, f2, f3 43 | gc.collect() 44 | train_w2v = pd.read_pickle("/home/kesci/work/zhifeng/train.cosine.w2v.pkl") 45 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.w2v.pkl") 46 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.w2v.pkl") 47 | testb_w2v = pd.read_pickle( 48 | "/home/kesci/work/zhifeng/test_final.cosine.w2v.pkl") 49 | feature['w2v_cos'] = list(train_w2v)+list(val_w2v)+list(testa_w2v)+list(testb_w2v) 50 | 51 | train_w2v = pd.read_pickle( 52 | "/home/kesci/work/zhifeng/train.cosine.fasttext.pkl") 53 | val_w2v = pd.read_pickle("/home/kesci/work/zhifeng/val.cosine.fasttext.pkl") 54 | testa_w2v = pd.read_pickle("/home/kesci/work/zhifeng/test.cosine.fasttext.pkl") 55 | testb_w2v = pd.read_pickle( 56 |
"/home/kesci/work/zhifeng/test_final.cosine.fasttext.pkl") 57 | feature['fast_cos'] = list(train_w2v)+list(val_w2v) + \ 58 | list(testa_w2v)+list(testb_w2v) 59 | del train_w2v, val_w2v, testa_w2v, testb_w2v 60 | gc.collect() 61 | feature.shape 62 | 63 | len_train = 99000000 64 | len_val = 1000000 65 | len_testa = 20000000 66 | len_testb = 100000000 67 | sc = StandardScaler() 68 | feature = sc.fit_transform(feature) 69 | train_feature = feature[:len_train] 70 | val_feature = feature[len_train:len_train+len_val] 71 | testa_feature = feature[len_train+len_val:len_train+len_val+len_testa] 72 | testb_feature = feature[-len_testb:] 73 | print(train_feature.shape, val_feature.shape, 74 | testa_feature.shape, testb_feature.shape) 75 | 76 | del feature 77 | gc.collect() 78 | 79 | w2v = Word2Vec.load('/home/kesci/work/chizhu/new_skip_w2v_all_300.model') 80 | 81 | word2index = {word: index+1 for index, word in enumerate(w2v.wv.index2entity)} 82 | index2word = {index+1: word for index, word in enumerate(w2v.wv.index2entity)} 83 | 84 | 85 | def gen_feature_help(line, label_tag=True, token=word2index, maxlen_answer=20, 86 | maxlen_query=8): 87 | if label_tag: 88 | _, _q, _, _a, _label = line.strip().split(',') 89 | else: 90 | _, _q, _, _a = line.strip().split(',') 91 | q_seq = [token.get(item, 0) for item in _q.strip().split()] 92 | a_seq = [token.get(item, 0) for item in _a.strip().split()] 93 | q_pad = [0]*(maxlen_query - len(q_seq)) + q_seq[-maxlen_query:] 94 | a_pad = [0]*(maxlen_answer - len(a_seq)) + a_seq[-maxlen_answer:] 95 | if label_tag: 96 | return q_pad, a_pad, int(_label) 97 | return q_pad, a_pad 98 | 99 | 100 | def gen_train(path, feature, batch_size=256, label_tag=True, chunk_size=1000, shuffle=True, maxlen_answer=20, maxlen_query=8): 101 | while True: 102 | fin = open(path, 'r') 103 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 104 | for i, line in enumerate(fin): 105 | if len(batch_q) == chunk_size*batch_size: 106 | batch_q = np.array(batch_q) 107 | batch_a = np.array(batch_a) 108 | batch_f = np.array(batch_f) 109 | if label_tag: 110 | batch_label = np.array(batch_label) 111 | idx = list(range(chunk_size*batch_size)) 112 | if shuffle: 113 | np.random.shuffle(idx) 114 | for i in range(chunk_size): 115 | if label_tag: 116 | yield ([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 117 | np.array( 118 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 119 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 120 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 121 | else: 122 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 123 | np.array( 124 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 125 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 126 | batch_q, batch_a, batch_f, batch_label = [], [], [], [] 127 | if label_tag: 128 | q, a, l = gen_feature_help(line, label_tag=label_tag) 129 | else: 130 | q, a = gen_feature_help(line, label_tag=label_tag) 131 | l = 0 132 | batch_q.append(q) 133 | batch_a.append(a) 134 | batch_f.append(feature[i]) 135 | if label_tag: 136 | batch_label.append(l) 137 | 138 | batch_q = np.array(batch_q) 139 | batch_a = np.array(batch_a) 140 | batch_f = np.array(batch_f) 141 | 142 | if label_tag: 143 | batch_label = np.array(batch_label) 144 | idx = list(range(len(batch_q))) 145 | if shuffle: 146 | np.random.shuffle(idx) 147 | for i in range(int(np.ceil(len(batch_q)/batch_size))): 148 | if label_tag: 149 | yield 
([np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 150 | np.array( 151 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 152 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])], 153 | np.array(batch_label[idx[i*batch_size:i*batch_size+batch_size]])) 154 | else: 155 | yield [np.array(batch_q[idx[i*batch_size:i*batch_size+batch_size]]), 156 | np.array( 157 | batch_a[idx[i*batch_size:i*batch_size+batch_size]]), 158 | np.array(batch_f[idx[i*batch_size:i*batch_size+batch_size]])] 159 | fin.close() 160 | 161 | 162 | def get_embedding_matrix(): 163 | m = np.zeros(shape=(len(index2word)+1, 300)) 164 | for i, w in index2word.items(): 165 | m[i, :] = w2v[w] 166 | return m 167 | 168 | 169 | embed_matrix = get_embedding_matrix() 170 | maxlen_query = 8 171 | maxlen_answer = 20 172 | 173 | 174 | class AdamW(Optimizer): 175 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4) 176 | epsilon=1e-8, decay=0., **kwargs): 177 | super(AdamW, self).__init__(**kwargs) 178 | with K.name_scope(self.__class__.__name__): 179 | self.iterations = K.variable(0, dtype='int64', name='iterations') 180 | self.lr = K.variable(lr, name='lr') 181 | self.beta_1 = K.variable(beta_1, name='beta_1') 182 | self.beta_2 = K.variable(beta_2, name='beta_2') 183 | self.decay = K.variable(decay, name='decay') 184 | # decoupled weight decay (2/4) 185 | self.wd = K.variable(weight_decay, name='weight_decay') 186 | self.epsilon = epsilon 187 | self.initial_decay = decay 188 | 189 | @interfaces.legacy_get_updates_support 190 | def get_updates(self, loss, params): 191 | grads = self.get_gradients(loss, params) 192 | self.updates = [K.update_add(self.iterations, 1)] 193 | wd = self.wd # decoupled weight decay (3/4) 194 | 195 | lr = self.lr 196 | if self.initial_decay > 0: 197 | lr *= (1. / (1. + self.decay * K.cast(self.iterations, 198 | K.dtype(self.decay)))) 199 | 200 | t = K.cast(self.iterations, K.floatx()) + 1 201 | lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / 202 | (1. - K.pow(self.beta_1, t))) 203 | 204 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 205 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 206 | self.weights = [self.iterations] + ms + vs 207 | 208 | for p, g, m, v in zip(params, grads, ms, vs): 209 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 210 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 211 | # decoupled weight decay (4/4) 212 | p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p 213 | 214 | self.updates.append(K.update(m, m_t)) 215 | self.updates.append(K.update(v, v_t)) 216 | new_p = p_t 217 | 218 | # Apply constraints. 
219 | if getattr(p, 'constraint', None) is not None: 220 | new_p = p.constraint(new_p) 221 | 222 | self.updates.append(K.update(p, new_p)) 223 | return self.updates 224 | 225 | def get_config(self): 226 | config = {'lr': float(K.get_value(self.lr)), 227 | 'beta_1': float(K.get_value(self.beta_1)), 228 | 'beta_2': float(K.get_value(self.beta_2)), 229 | 'decay': float(K.get_value(self.decay)), 230 | 'weight_decay': float(K.get_value(self.wd)), 231 | 'epsilon': self.epsilon} 232 | base_config = super(AdamW, self).get_config() 233 | return dict(list(base_config.items()) + list(config.items())) 234 | 235 | 236 | class Attention(Layer): 237 | def __init__(self, step_dim, 238 | W_regularizer=None, b_regularizer=None, 239 | W_constraint=None, b_constraint=None, 240 | bias=True, **kwargs): 241 | self.supports_masking = True 242 | self.init = initializers.get('glorot_uniform') 243 | 244 | self.W_regularizer = regularizers.get(W_regularizer) 245 | self.b_regularizer = regularizers.get(b_regularizer) 246 | 247 | self.W_constraint = constraints.get(W_constraint) 248 | self.b_constraint = constraints.get(b_constraint) 249 | 250 | self.bias = bias 251 | self.step_dim = step_dim 252 | self.features_dim = 0 253 | super(Attention, self).__init__(**kwargs) 254 | 255 | def build(self, input_shape): 256 | assert len(input_shape) == 3 257 | 258 | self.W = self.add_weight((input_shape[-1],), 259 | initializer=self.init, 260 | name='{}_W'.format(self.name), 261 | regularizer=self.W_regularizer, 262 | constraint=self.W_constraint) 263 | self.features_dim = input_shape[-1] 264 | 265 | if self.bias: 266 | self.b = self.add_weight((input_shape[1],), 267 | initializer='zero', 268 | name='{}_b'.format(self.name), 269 | regularizer=self.b_regularizer, 270 | constraint=self.b_constraint) 271 | else: 272 | self.b = None 273 | 274 | self.built = True 275 | 276 | def compute_mask(self, input, input_mask=None): 277 | return None 278 | 279 | def call(self, x, mask=None): 280 | features_dim = self.features_dim 281 | step_dim = self.step_dim 282 | 283 | eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 284 | K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) 285 | 286 | if self.bias: 287 | eij += self.b 288 | 289 | eij = K.tanh(eij) 290 | 291 | a = K.exp(eij) 292 | 293 | if mask is not None: 294 | a *= K.cast(mask, K.floatx()) 295 | 296 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 297 | 298 | a = K.expand_dims(a) 299 | weighted_input = x * a 300 | return K.sum(weighted_input, axis=1) 301 | 302 | def compute_output_shape(self, input_shape): 303 | return input_shape[0], self.features_dim 304 | 305 | # AUC for a binary classifier 306 | 307 | 308 | def auc(y_true, y_pred): 309 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) 310 | for k in np.linspace(0, 1, 1000)], axis=0) 311 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) 312 | for k in np.linspace(0, 1, 1000)], axis=0) 313 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 314 | binSizes = -(pfas[1:]-pfas[:-1]) 315 | s = ptas*binSizes 316 | return K.sum(s, axis=0) 317 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 318 | # PFA, prob false alert for binary classifier 319 | 320 | 321 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 322 | y_pred = K.cast(y_pred >= threshold, 'float32') 323 | # N = total number of negative labels 324 | N = K.sum(1 - y_true) 325 | # FP = total number of false alerts, alerts from the negative 
class labels 326 | FP = K.sum(y_pred - y_pred * y_true) 327 | return FP/N 328 | #----------------------------------------------------------------------------------------------------------------------------------------------------- 329 | # P_TA prob true alerts for binary classifier 330 | 331 | 332 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 333 | y_pred = K.cast(y_pred >= threshold, 'float32') 334 | # P = total number of positive labels 335 | P = K.sum(y_true) 336 | # TP = total number of correct alerts, alerts from the positive class labels 337 | TP = K.sum(y_pred * y_true) 338 | return TP/P 339 | 340 | 341 | class Lookahead(object): 342 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 343 | """ 344 | 345 | def __init__(self, k=5, alpha=0.5): 346 | self.k = k 347 | self.alpha = alpha 348 | self.count = 0 349 | 350 | def inject(self, model): 351 | """Inject the Lookahead algorithm for the given model. 352 | The following code is modified from keras's _make_train_function method. 353 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 354 | """ 355 | if not hasattr(model, 'train_function'): 356 | raise RuntimeError('You must compile your model before using it.') 357 | 358 | model._check_trainable_weights_consistency() 359 | 360 | if model.train_function is None: 361 | inputs = (model._feed_inputs + 362 | model._feed_targets + 363 | model._feed_sample_weights) 364 | if model._uses_dynamic_learning_phase(): 365 | inputs += [K.learning_phase()] 366 | fast_params = model._collected_trainable_weights 367 | 368 | with K.name_scope('training'): 369 | with K.name_scope(model.optimizer.__class__.__name__): 370 | training_updates = model.optimizer.get_updates( 371 | params=fast_params, 372 | loss=model.total_loss) 373 | slow_params = [K.variable(p) for p in fast_params] 374 | fast_updates = (model.updates + 375 | training_updates + 376 | model.metrics_updates) 377 | 378 | slow_updates, copy_updates = [], [] 379 | for p, q in zip(fast_params, slow_params): 380 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 381 | copy_updates.append(K.update(p, q)) 382 | 383 | # Gets loss and metrics. Updates weights at each call. 
384 | fast_train_function = K.function( 385 | inputs, 386 | [model.total_loss] + model.metrics_tensors, 387 | updates=fast_updates, 388 | name='fast_train_function', 389 | **model._function_kwargs) 390 | 391 | def F(inputs): 392 | self.count += 1 393 | R = fast_train_function(inputs) 394 | if self.count % self.k == 0: 395 | K.batch_get_value(slow_updates) 396 | K.batch_get_value(copy_updates) 397 | return R 398 | 399 | model.train_function = F 400 | 401 | 402 | def get_model(embedding_matrix): 403 | 404 | K.clear_session() 405 | #The embedding layer containing the word vectors 406 | emb_layer = Embedding( 407 | input_dim=embedding_matrix.shape[0], 408 | output_dim=embedding_matrix.shape[1], 409 | weights=[embedding_matrix], 410 | trainable=False 411 | ) 412 | sdrop=SpatialDropout1D(rate=0.2) 413 | lstm_layer = Bidirectional(CuDNNLSTM(64, return_sequences=True, 414 | kernel_initializer=glorot_uniform(seed = 123))) 415 | gru_layer = Bidirectional(CuDNNGRU(64, return_sequences=True, 416 | kernel_initializer=glorot_uniform(seed = 123))) 417 | 418 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 419 | 420 | # Define inputs 421 | seq1 = Input(shape=(maxlen_query,)) 422 | x1 = emb_layer(seq1) 423 | x1 = sdrop(x1) 424 | lstm1 = lstm_layer(x1) 425 | gru1 = gru_layer(lstm1) 426 | att_1 = Attention(maxlen_query)(lstm1) 427 | att_3 = Attention(maxlen_query)(gru1) 428 | cnn1 = cnn1d_layer(lstm1) 429 | 430 | avg_pool = GlobalAveragePooling1D() 431 | max_pool = GlobalMaxPooling1D() 432 | 433 | seq2 = Input(shape=(maxlen_answer,)) 434 | x2 = emb_layer(seq2) 435 | x2 = sdrop(x2) 436 | lstm2 = lstm_layer(x2) 437 | gru2 = gru_layer(lstm2) 438 | att_2 = Attention(maxlen_answer)(lstm2) 439 | att_4 = Attention(maxlen_answer)(gru2) 440 | cnn2 = cnn1d_layer(lstm2) 441 | 442 | x1=concatenate([att_1,att_3,avg_pool(cnn1),max_pool(cnn1),avg_pool(gru1),max_pool(gru1)]) 443 | x2=concatenate([att_2,att_4,avg_pool(cnn2),max_pool(cnn2),avg_pool(gru2),max_pool(gru2)]) 444 | 445 | merge = Multiply()([x1, x2]) 446 | merge = Dropout(0.5)(merge) 447 | # The MLP that determines the outcome 448 | x = Dense(128,kernel_initializer=he_uniform(seed=123), activation='relu',)(merge) 449 | # x = Dropout(0.2)(x) 450 | # x = BatchNormalization()(x) 451 | 452 | pred = Dense(1,kernel_initializer=he_uniform(seed=123), activation='sigmoid')(x) 453 | 454 | 455 | model = Model(inputs=[seq1,seq2], outputs=pred) 456 | 457 | model.compile(loss='binary_crossentropy', 458 | optimizer=AdamW(lr=0.0003,weight_decay=0.02,), 459 | metrics=["accuracy"]) 460 | # model.summary() 461 | return model 462 | 463 | 464 | def aux_esim_model(embed_matrix, model_weight_path): 465 | base_model = get_model(embed_matrix) 466 | base_model.load_weights(model_weight_path) 467 | input_q, input_a = base_model.inputs 468 | input_f = Input((19,)) 469 | hidden_esim = base_model.get_layer(index=15).output 470 | merged = Concatenate()([hidden_esim, input_f]) 471 | #dense = BatchNormalization()(merged) 472 | dense = Dense(512, activation='relu')(merged) 473 | #dense = BatchNormalization()(dense) 474 | dense = Dropout(0.5)(dense) 475 | dense = Dense(256, activation='relu')(dense) 476 | #dense = BatchNormalization()(dense) 477 | dense = Dropout(0.5)(dense) 478 | out_ = Dense(1, activation='sigmoid')(dense) 479 | 480 | model = Model(inputs=[input_q, input_a, input_f], outputs=out_) 481 | model.compile(loss='binary_crossentropy', 482 | optimizer=AdamW(lr=0.0003, weight_decay=0.02), 483 | metrics=["accuracy"]) 484 | return model 
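# --- Illustrative sketch (not the original implementation) -------------------
# `gen_train`, used in the training section below, is defined earlier in this
# script. The function here only sketches the kind of chunked-CSV generator the
# fit_generator/predict_generator calls expect. It assumes a fitted Keras
# `tokenizer` and the `maxlen_query`/`maxlen_answer` globals used by get_model(),
# and yields ([query_seqs, title_seqs, features], labels).
def gen_train_sketch(path, feature, batch_size=2048, label_tag=True,
                     chunk_size=5000, shuffle=True):
    names = ['query_id', 'query', 'query_title_id', 'title']
    if label_tag:
        names = names + ['label']
    feat = np.asarray(feature)
    while True:                       # Keras generators are expected to loop forever
        offset = 0
        for chunk in pd.read_csv(path, names=names, chunksize=chunk_size):
            q = pad_sequences(tokenizer.texts_to_sequences(chunk['query'].astype(str)),
                              maxlen=maxlen_query)
            t = pad_sequences(tokenizer.texts_to_sequences(chunk['title'].astype(str)),
                              maxlen=maxlen_answer)
            f = feat[offset:offset + len(chunk)]
            offset += len(chunk)
            idx = np.arange(len(chunk))
            if shuffle:
                np.random.shuffle(idx)
            for start in range(0, len(chunk), batch_size):
                batch = idx[start:start + batch_size]
                inputs = [q[batch], t[batch], f[batch]]
                if label_tag:
                    yield inputs, chunk['label'].values[batch]
                else:
                    yield inputs
# ------------------------------------------------------------------------------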
485 |
486 |
487 | #### Model training
488 | train_gen = gen_train(path='/home/kesci/zhifeng/train.smaller.csv', feature=train_feature, batch_size=2048,
489 |                       label_tag=True, chunk_size=5000)
490 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature, batch_size=2048,
491 |                     label_tag=True, chunk_size=5000)
492 | print("train...")
493 | print("###"*30)
494 | gc.collect()
495 | K.clear_session()
496 | weight_path = '/home/kesci/work/zhifeng/zhifeng_rnn_weight_1_0.668621638244629.h5'
497 | model = aux_esim_model(embed_matrix, weight_path)
498 | lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
499 | lookahead.inject(model)  # add into model
500 | model.summary()
501 | early_stopping = EarlyStopping(
502 |     monitor='val_loss', min_delta=0.0001, patience=2, mode='min', verbose=1)
503 | reduce_lr = ReduceLROnPlateau(
504 |     monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
505 | bst_model_path = '/home/kesci/work/zhifeng/zhifeng_aux_fasttext_esim_finetune_{epoch}_{val_loss}.h5'
506 | checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
507 |                              save_best_only=False,
508 |                              verbose=1, save_weights_only=True, period=1)
509 | callbacks = [checkpoint, reduce_lr, early_stopping]
510 | # print("load weight....")
511 |
512 | hist = model.fit_generator(train_gen, steps_per_epoch=int(np.ceil(99000000/2048)),
513 |                            epochs=10, verbose=1, callbacks=callbacks,
514 |                            validation_data=val_gen, validation_steps=int(
515 |                                np.ceil(1000000/2048)),
516 |                            max_queue_size=10, workers=1, use_multiprocessing=False)
517 |
518 | val_gen = gen_train(path='/home/kesci/zhifeng/val.csv', feature=val_feature,
519 |                     batch_size=4096, label_tag=True, chunk_size=1000, shuffle=False)
520 | val_prob = model.predict_generator(
521 |     val_gen, steps=int(np.ceil(1000000/4096)), verbose=1)
522 |
523 | f = open('/home/kesci/zhifeng/val.csv', 'r')
524 | q, a, l = [], [], []
525 | for line in f:
526 |     qid, _, aid, _, label = line.strip().split(',')
527 |     q.append(qid)
528 |     a.append(aid)
529 |     l.append(int(label))
530 |
531 | val_df = pd.DataFrame({'qid': q, 'aid': a, 'label': l})
532 | val_df['prob'] = val_prob.flatten()
533 |
534 | print("overall auc:", roc_auc_score(val_df['label'], val_df['prob']))
535 |
536 |
537 | def perauc(df):
538 |     temp = pd.Series()
539 |     try:
540 |         temp['auc'] = roc_auc_score(df['label'], df['prob'])
541 |     except Exception:
542 |         temp['auc'] = 0.5
543 |     return temp
544 |
545 |
546 | eval_df = val_df.groupby("qid").apply(perauc)
547 | eval_df.index = range(len(eval_df))
548 | print("qauc:", eval_df['auc'].mean())
549 |
550 | test_gen = gen_train(path='/home/kesci/input/bytedance/test_final_part1.csv',
551 |                      feature=testa_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
552 | prob = model.predict_generator(
553 |     test_gen, steps=int(np.ceil(20000000/4096)), verbose=1)
554 | sub = pd.read_csv('/home/kesci/work/chizhu/submit_rnn.csv',
555 |                   names=['qid', 'aid', 'prob'])
556 | sub['prob'] = prob.flatten()
557 | sub.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testa.csv', index=False, header=False)
558 | test_gen = gen_train(path='/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
559 |                      feature=testb_feature, batch_size=4096, label_tag=False, chunk_size=1, shuffle=False)
560 | prob = model.predict_generator(test_gen, steps=int(
561 |     np.ceil(100000000/4096)), verbose=1)
562 | final = pd.read_csv(path+"bytedance_contest.final_2.csv", names=[
563 |     'query_id', 'query', 'query_title_id', 'title'])[['query_id', 'query_title_id']]
564 | final['prob'] = prob.flatten()
565 | final.to_csv('/home/kesci/work/chizhu/finetuning_fasttext_esim_testb.csv', index=False, header=False)
566 |
--------------------------------------------------------------------------------
/gen_feature.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm, tqdm_notebook
2 | from sklearn.model_selection import StratifiedKFold, GroupKFold
3 | import numpy as np
4 | import os
5 | import Levenshtein
6 | import logging
7 | from gensim.models import Word2Vec
8 | import time
9 | import gc
10 | import keras
11 | from keras.initializers import *
12 | from keras import initializers, regularizers, constraints, optimizers, layers
13 | from keras.engine.topology import Layer
14 | from keras.models import *
15 | from keras.layers import *
16 | from keras.callbacks import *
17 | from keras.optimizers import *
18 | from keras import backend as K
19 | from keras.optimizers import Adam
20 | from keras.preprocessing.sequence import pad_sequences
21 | from keras.preprocessing.text import Tokenizer
22 | from sklearn.metrics import roc_auc_score
23 | import tensorflow as tf
24 | import random as rn
25 | import pandas as pd
26 | tqdm.pandas()
27 | np.random.seed(1017)
28 | rn.seed(1017)
29 | tf.set_random_seed(1017)
30 | path = "/home/kesci/input/bytedance/"
31 | out = '/home/kesci/work/chizhu/'
32 | print(os.listdir(path))
33 |
34 | train = pd.read_csv(path+"train_final.csv",skiprows=900000000,nrows=100000000,names=['query_id','query','query_title_id','title','label'])
35 |
36 | testa = pd.read_csv(path+"test_final_part1.csv",names=['query_id','query','query_title_id','title'])
37 | testb = pd.read_csv(path+"bytedance_contest.final_2.csv",names=['query_id','query','query_title_id','title'])
38 |
39 | testa['label']=-1
40 | testb['label']=-2
41 | test=pd.concat([testa,testb],ignore_index=True)
42 | del testa,testb
43 | gc.collect()
44 |
45 | train['title']=train['title'].apply(lambda x:str(x).replace("\t",""),1)
46 | test['title']=test['title'].apply(lambda x:str(x).replace("\t",""),1)
47 | data_all=pd.concat([train,test],ignore_index=True)
48 | del train,test
49 | gc.collect()
50 |
51 | # Build feature set f1
52 | def get_union_data(row):
53 |     title_list = row['title'].split(' ')
54 |     query_list = row['query'].split(' ')
55 |     return len(list(set(title_list).intersection(set(query_list))))
56 |
57 | def same_1(row):
58 |     title_list = row['title'].split(' ')
59 |     query_list = row['query'].split(' ')
60 |     if title_list[0] == query_list[0]:
61 |         return 1
62 |     else:
63 |         return 0
64 |
65 | def same_2(row):
66 |     title_list = row['title'].split(' ')
67 |     query_list = row['query'].split(' ')
68 |     if ' '.join(title_list[:2]) == ' '.join(query_list[:2]):
69 |         return 1
70 |     else:
71 |         return 0
72 |
73 | def same_3(row):
74 |     title_list = row['title'].split(' ')
75 |     query_list = row['query'].split(' ')
76 |     if ' '.join(title_list[:3]) == ' '.join(query_list[:3]):
77 |         return 1
78 |     else:
79 |         return 0
80 |
81 | def is_all_in(row):
82 |     if row['query'] in row['title']:
83 |         return 1
84 |     else:
85 |         return 0
86 |
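# Quick sanity check added for illustration (not part of the original script):
# the f1 helpers applied to a toy row, with values that are easy to verify by hand.
_row = {'query': '1 2 3', 'title': '1 2 4 5'}
assert get_union_data(_row) == 2   # shared tokens: {'1', '2'}
assert same_1(_row) == 1           # first token matches
assert same_2(_row) == 1           # first two tokens match
assert same_3(_row) == 0           # '1 2 3' != '1 2 4'
assert is_all_in(_row) == 0        # substring test on the raw strings, not a set test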
87 | feature = pd.DataFrame()
88 | feature['问题长度'] = data_all['query'].progress_apply(lambda row:len(row.split(' ')))  # query length
89 | feature['标题长度'] = data_all['title'].progress_apply(lambda row:len(row.split(' ')))  # title length
90 | feature['标题长度-问题长度'] = feature['标题长度'] - feature['问题长度']  # title length - query length
91 | feature['问题是否全部在标题里面'] = data_all.progress_apply(lambda row:is_all_in(row), axis=1)  # query fully contained in title
92 | feature['标题和问题的交集个数'] = data_all.progress_apply(lambda row:get_union_data(row), axis=1)  # number of shared tokens
93 | feature['标题问题词语的交集个数/问题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['问题长度']), 8)  # shared tokens / query length
94 | feature['标题问题词语的交集个数/标题长度'] = np.around(np.divide(feature['标题和问题的交集个数'], feature['标题长度']), 8)  # shared tokens / title length
95 | feature['编辑距离'] = data_all.progress_apply(lambda row:Levenshtein.distance(row['query'], row['title']), axis=1)  # edit distance
96 | feature['前一个词语是否相同'] = data_all.progress_apply(lambda row:same_1(row), axis=1)  # first token identical
97 | feature['前两个词语是否相同'] = data_all.progress_apply(lambda row:same_2(row), axis=1)  # first two tokens identical
98 | feature['前三个词语是否相同'] = data_all.progress_apply(lambda row:same_3(row), axis=1)  # first three tokens identical
99 | feature.to_csv(out + 'f1.csv', index=False)
100 |
101 | # Build feature set f2
102 | def pos_1(row):
103 |     title_list = row['title'].split(' ')
104 |     query_list = row['query'].split(' ')
105 |     value = -1
106 |     try:
107 |         value = title_list.index(query_list[0])
108 |     except Exception:
109 |         value = -1
110 |     return value
111 |
112 | def pos_2(row):
113 |     title_list = row['title'].split(' ')
114 |     query_list = row['query'].split(' ')
115 |     if len(query_list) <= 1:
116 |         return -1
117 |     try:
118 |         value = title_list.index(query_list[1])
119 |     except Exception:
120 |         value = -1
121 |     return value
122 |
123 | def pos_3(row):
124 |     title_list = row['title'].split(' ')
125 |     query_list = row['query'].split(' ')
126 |     if len(query_list) <= 2:
127 |         return -1
128 |     try:
129 |         value = title_list.index(query_list[2])
130 |     except Exception:
131 |         value = -1
132 |     return value
133 |
134 | feature = pd.DataFrame()
135 | feature['第一个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_1(row), axis=1)  # position of 1st query token in title
136 | feature['第二个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_2(row), axis=1)  # position of 2nd query token in title
137 | feature['第三个词语在标题里面出现位置'] = data_all.progress_apply(lambda row:pos_3(row), axis=1)  # position of 3rd query token in title
138 | feature.to_csv(out + 'f2.csv', index=False)
139 |
140 | feature = pd.DataFrame()
141 | feature['标题求组合后词语'] = data_all.groupby('title').query.transform('nunique')  # unique queries per title
142 | feature['词语求组合后标题'] = data_all.groupby('query').title.transform('nunique')  # unique titles per query
143 | feature.to_csv(out + 'f3.csv', index=False)
144 |
145 |
146 | # data_all = data_all.fillna(-1)
147 | # data_all.to_pickle(out+"data.pickle")
148 |
149 | # data_all = pd.read_pickle(out+"data.pickle")
150 | # f5: raw word2vec similarity between query and title
151 | from gensim.models import Word2Vec
152 | import gensim
153 | import logging
154 | feature = pd.DataFrame()
155 | w2v = Word2Vec.load(out + 'new_skip_w2v_all_300.model')
156 | def get_new_w2v(seq1, seq2):
157 |     seq1 = seq1.split(' ')
158 |     seq2 = seq2.split(' ')
159 |     try:
160 |         return w2v.n_similarity(seq1, seq2)
161 |     except Exception:
162 |         return -1
163 |
164 | f3 = pd.read_csv(out + 'f3.csv')
165 | f3['w2v本身相似度'] = data_all.progress_apply(lambda row:get_new_w2v(row['query'], row['title']), axis=1)  # raw word2vec similarity
166 | f3.to_csv(out + 'f3.csv', index=False)
167 |
168 |
169 |
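# Added illustration (not part of the original script): gensim's n_similarity is,
# roughly, the cosine similarity of the two sentences' mean word vectors; the
# sketch below mirrors that computation so the meaning of the f5 feature is
# explicit. n_similarity raises for out-of-vocabulary tokens, which is why
# get_new_w2v falls back to -1.
def n_similarity_sketch(words1, words2):
    v1 = np.mean([w2v.wv[w] for w in words1], axis=0)
    v2 = np.mean([w2v.wv[w] for w in words2], axis=0)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))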
--------------------------------------------------------------------------------
/get_corpus.py:
--------------------------------------------------------------------------------
from tqdm import tqdm   # needed for the progress bars below
# NOTE: `path` (input dir) and `out` (output dir) are assumed to be defined as
# in the other scripts of this repo.
1 | fout = open(out + "corpus.csv",'w')
2 | with open(path+"train_final.csv",'r') as fin:
3 |     q_last = ''
4 |     for line in tqdm(fin):
5 |         _,q,_,t,_ = line.strip().split(',')
6 |         if q!=q_last:
7 |             q_last = q
8 |             fout.write(q + '\n')
9 |         fout.write(t + '\n')
10 | with open(path+"test_final_part1.csv",'r') as fin:
11 |     q_last = ''
12 |     for line in tqdm(fin):
13 |         _,q,_,t = line.strip().split(',')
14 |         if q!=q_last:
15 |             q_last = q
16 |             fout.write(q + '\n')
17 |         fout.write(t + '\n')
18 | fout.close()
19 | """
20 | Format of corpus.csv
21 | // one utterance per line, tokens separated by spaces, e.g.:
22 | 我 鄂温克 三打底裤 是是
23 | 说的
24 | 是对的是
25 | 时代大厦 是对的
26 | 是赛事方 说的
27 |
28 | """
29 |
--------------------------------------------------------------------------------
/train_fasttext.py:
--------------------------------------------------------------------------------
1 | import fasttext
import numpy as np   # needed by get_embedding_matrix below
2 | w2v = fasttext.train_unsupervised(input=out+"corpus.csv")
3 | w2v.save_model(out+'corpus.fasttext.model')
4 | w2v = fasttext.load_model(out+'corpus.fasttext.model')
5 | word2index = {word: index+1 for index, word in enumerate(w2v.words)}
6 | index2word = {index+1: word for index, word in enumerate(w2v.words)}
7 |
8 |
9 | def get_embedding_matrix():
10 |     m = np.zeros(shape=(len(index2word)+1, 100))  # 100 = fastText's default vector size
11 |     for i, w in index2word.items():
12 |         m[i, :] = w2v[w]
13 |     return m
14 |
--------------------------------------------------------------------------------
/train_w2v.py:
--------------------------------------------------------------------------------
1 | from gensim.models import Word2Vec
2 | import logging
3 | from gensim.models import word2vec
from tqdm import tqdm   # needed for the corpus-building loop below
# NOTE: `path` and `out` are assumed to be defined as in the other scripts of this repo.
4 | logging.basicConfig(
5 |     format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
6 | sent=word2vec.Text8Corpus("/home/kesci/work/zhifeng/corpus.csv")
7 | word2vecModel = word2vec.Word2Vec(sent, size=300, window=5, min_count=1,iter=5,
8 |                                   sg=1,workers=8)
9 | word2vecModel.save(out+"skip_w2v_all_300.model")
10 |
11 | # ##### further train
12 | from gensim.models import word2vec
13 | model = word2vec.Word2Vec.load(out+"skip_w2v_all_300.model")
14 | fout = open(out + "new_corpus.csv",'w')
15 | with open(path+"bytedance_contest.final_2.csv",'r') as fin:
16 |     q_last = ''
17 |     for line in tqdm(fin):
18 |         _,q,_,t = line.strip().split(',')
19 |         if q!=q_last:
20 |             q_last = q
21 |             fout.write(q + '\n')
22 |         fout.write(t + '\n')
23 | fout.close()
24 | logging.basicConfig(
25 |     format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
26 | sent=word2vec.Text8Corpus(out + "new_corpus.csv")
27 | model.build_vocab(sent, update=True)
28 | model.train(sent,total_examples=model.corpus_count, epochs=5)
29 | model.save(out+"new_skip_w2v_all_300.model")
30 |
--------------------------------------------------------------------------------
/w2v_cos.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | import random as rn
5 | from tqdm import tqdm, tqdm_notebook
6 | from sklearn.metrics import roc_auc_score
7 | from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
8 | import gc
9 | import time
10 | from gensim.models import Word2Vec
11 | import fasttext
12 | from gensim.models import Word2Vec
13 | import scipy.spatial.distance as ssd
14 | tqdm.pandas()
15 | input_path = "/home/kesci/input/bytedance/"
16 | out_work_path = '/home/kesci/work/zhifeng/'
17 | out_path = '/home/kesci/zhifeng/'
18 |
19 | w2v = Word2Vec.load('/home/kesci/work/chizhu/skip_w2v_all_300.model')
20 |
21 |
22 | def get_sentence_embeddings(text, sep=' ', dim=300):
23 |     v = np.zeros(dim)
24 |     words = text.strip().split(sep)
25 |     cnt = 0
26 |     for word in words:
27 |         if word in w2v:
28 |             v += w2v[word]
29 |             cnt += 1
30 |     return v/cnt if cnt != 0 else v
31 |
32 |
33 | train_cosine_list = []
34 | with open(out_path+'train.smaller.csv', 'r') as fin:
35 |     for line in tqdm(fin):
36 |         _, q, _, a, _ = line.strip().split(',')
37 |         v1 = get_sentence_embeddings(q)
38 |         v2 = get_sentence_embeddings(a)
39 |         train_cosine_list.append(ssd.cosine(v1, v2))
40 | pd.to_pickle(np.array(train_cosine_list), out_work_path+'train.cosine.w2v.pkl')
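# Added note: scipy's ssd.cosine returns the cosine *distance*
# (1 - cosine similarity), so smaller values mean more similar query/title
# pairs; use `1.0 - d` if a similarity is needed downstream.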
41 | val_cosine_list = []
42 | with open(out_path+'val.csv', 'r') as fin:
43 |     for line in tqdm(fin):
44 |         _, q, _, a, _ = line.strip().split(',')
45 |         v1 = get_sentence_embeddings(q)
46 |         v2 = get_sentence_embeddings(a)
47 |         val_cosine_list.append(ssd.cosine(v1, v2))
48 | pd.to_pickle(np.array(val_cosine_list), out_work_path+'val.cosine.w2v.pkl')
49 | test_cosine_list = []
50 | with open(input_path+'test_final_part1.csv', 'r') as fin:
51 |     for line in tqdm(fin):
52 |         _, q, _, a = line.strip().split(',')
53 |         v1 = get_sentence_embeddings(q)
54 |         v2 = get_sentence_embeddings(a)
55 |         test_cosine_list.append(ssd.cosine(v1, v2))
56 | pd.to_pickle(np.array(test_cosine_list), out_path+'test.cosine.w2v.pkl')
57 |
--------------------------------------------------------------------------------