├── README.md ├── config.py ├── data_util.py ├── models ├── __init__.py ├── bimpm.py ├── layers.py └── multi_perspective.py └── train_model.py /README.md: -------------------------------------------------------------------------------- 1 | # BiMPM_keras 2 | Keras implementation of Bilateral Multi-Perspective Matching [1], used in the [Quora Question Duplicate Pairs Competition](https://www.kaggle.com/c/quora-question-pairs). The original TensorFlow implementation is available [here](https://github.com/zhiguowang/BiMPM). 3 | 4 | ## Description 5 | 6 | `models/bimpm.py` - the BiMPM model graph. 7 | 8 | `models/multi_perspective.py` - the multi-perspective matching layer. 9 | 10 | `models/layers.py` - other layers: word embedding layer, context layer, prediction layer, etc. 11 | 12 | `train_model.py` - train and test the BiMPM model. 13 | 14 | `config.py` - hyper-parameters. 15 | 16 | If you find any bugs, please create an issue. Thanks! 17 | 18 | ## Requirements 19 | 20 | - python 2.7 21 | - tensorflow 1.1.0 22 | - keras 2.0.3 23 | - numpy 1.12.1 24 | - pandas 0.19.2 25 | - nltk 3.2.2 26 | - gensim 1.0.1 27 | 28 | ## References 29 | 30 | [[1]](https://arxiv.org/pdf/1702.03814) Zhiguo Wang, Wael Hamza and Radu Florian. "Bilateral Multi-Perspective Matching for Natural Language Sentences." 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Model configs. 3 | """ 4 | import os 5 | 6 | 7 | class DirConfig(object): 8 | DEBUG = 0 9 | W2V_FILE = '../embeddings/GoogleNews-vectors-negative300.bin' 10 | GLOVE_FILE = '../embeddings/glove.840B.300d.txt' 11 | BASE_DIR = '../' 12 | DATA_DIR = '../dataset/' 13 | TRAIN_FILE = DATA_DIR + 'train.csv' 14 | TEST_FILE = DATA_DIR + 'test.csv' 15 | TRAIN_FEATURES_FILE = DATA_DIR + 'train_xgb_features.csv' 16 | TEST_FEATURES_FILE = DATA_DIR + 'test_xgb_features.csv' 17 | SAMPLE_TRAIN_FILE = DATA_DIR + 'sample_train.csv' 18 | SAMPLE_TEST_FILE = DATA_DIR + 'sample_test.csv' 19 | SAMPLE_TRAIN_FEATURES_FILE = DATA_DIR + 'sample_train_xgb_features.csv' 20 | SAMPLE_TEST_FEATURES_FILE = DATA_DIR + 'sample_test_xgb_features.csv' 21 | HISTORYA_DIR = os.path.join(BASE_DIR, 'history') 22 | SUBM_DIR = '../subm/' 23 | Q1_CACHE_TRAIN = '../dataset/cache_train_q1.npy' 24 | Q2_CACHE_TRAIN = '../dataset/cache_train_q2.npy' 25 | Q1_CACHE_TEST = '../dataset/cache_test_q1.npy' 26 | Q2_CACHE_TEST = '../dataset/cache_test_q2.npy' 27 | CHAR1_CACHE_TRAIN = '../dataset/cache_train_char1.npy' 28 | CHAR2_CACHE_TRAIN = '../dataset/cache_train_char2.npy' 29 | CHAR1_CACHE_TEST = '../dataset/cache_test_char1.npy' 30 | CHAR2_CACHE_TEST = '../dataset/cache_test_char2.npy' 31 | CHAR_INDEX_CACHE = '../dataset/char_index.npy' 32 | W2V_CACHE = '../dataset/w2v_matrix.npy' 33 | GLOVE_CACHE = '../dataset/glove_matrix.npy' 34 | WORD_INDEX_CACHE = '../dataset/word_index.npy' 35 | TARGETS_CACHE = '../dataset/cache_targets.npy' 36 | TEST_ID_CACHE = '../dataset/cache_test_id.npy' 37 | 38 | 39 | class TrainConfig(object): 40 | TEST_SIZE = 0.1 41 | RE_WEIGHT = True 42 | BATCH_SIZE = 1024 43 | NB_EPOCH = 5 if DirConfig.DEBUG else 50 44 | CLASS_WEIGHT = {0: 1.0, 1: 1.708574797505075} 45 | SHARE_RNN = 1 46 | USE_CHAR = 0 47 | REMOVE_STOPWORDS = 0 48 | USE_STEM = 0 49 | W2V_TYPE = 'word2vec' 50 | KFOLD = 1 51 | MAX_SEQUENCE_LENGTH = 40 52 | MAX_NB_WORDS = 200000 53 | WORD_EMBEDDING_DIM = 300 54 | MAX_NB_CHARS = 50 55 | MAX_CHAR_PER_WORD = 10 56 | CHAR_EMBEDDING_DIM = 20 57
| CHAR_LSTM_DIM = 50 58 | VALIDATION_SPLIT = 0.1 59 | 60 | 61 | class TestConfig(object): 62 | RE_WEIGHT = True 63 | BATCH_SIZE = 1024 64 | CLASS_WEIGHT = {0: 1.309028344, 1: 0.472001959} 65 | 66 | 67 | class BiMPMConfig(object): 68 | SEED = 2017 + 6 69 | MODEL = 'BiMPM' 70 | RNN_UNIT = 'gru' 71 | TRIAL = 5 72 | BASE_DIR = '../models/' 73 | CONTEXT_LSTM_DIM = 100 74 | AGGREGATION_LSTM_DIM = 300 75 | DENSE_DIM = 100 76 | RATE_DROP_REPRES = 0.4 77 | DROP_RATE = 0.4 78 | WITH_HIGHWAY = 1 79 | MP_DIM = 10 80 | CHECKPOINT = '../checkpoint/{}_trial_{}_db_{}.h5'.format(MODEL, TRIAL, DirConfig.DEBUG) 81 | INFO = '%s_rnn_%s_seq_%d_context_%d_mp_%d_aggreg_%d_highway_%d_shareRNN_%d_drop_%.2f \ 82 | _char_%d_k_%d' % \ 83 | (MODEL, RNN_UNIT, TrainConfig.MAX_SEQUENCE_LENGTH, CONTEXT_LSTM_DIM, MP_DIM, 84 | AGGREGATION_LSTM_DIM, WITH_HIGHWAY, TrainConfig.SHARE_RNN, DROP_RATE, 85 | TrainConfig.USE_CHAR, TrainConfig.KFOLD) 86 | W2V_TYPE = 'word2vector' 87 | -------------------------------------------------------------------------------- /data_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | from nltk.stem import SnowballStemmer 4 | from config import ( 5 | DirConfig, TrainConfig 6 | ) 7 | from keras.preprocessing.text import Tokenizer 8 | from keras.preprocessing.sequence import pad_sequences 9 | from keras.models import load_model 10 | import pandas as pd 11 | from tqdm import tqdm 12 | import numpy as np 13 | from gensim.models import KeyedVectors 14 | import datetime 15 | import os 16 | 17 | 18 | def get_text_sequence(): 19 | if os.path.isfile(DirConfig.CHAR1_CACHE_TRAIN): 20 | print('---- Load data from cache.') 21 | train_x1 = np.load(open(DirConfig.Q1_CACHE_TRAIN, 'rb')) 22 | train_x2 = np.load(open(DirConfig.Q2_CACHE_TRAIN, 'rb')) 23 | test_x1 = np.load(open(DirConfig.Q1_CACHE_TEST, 'rb')) 24 | test_x2 = np.load(open(DirConfig.Q2_CACHE_TEST, 'rb')) 25 | labels = np.load(open(DirConfig.TARGETS_CACHE, 'rb')) 26 | test_ids = np.load(open(DirConfig.TEST_ID_CACHE, 'rb')) 27 | word_index = np.load(open(DirConfig.WORD_INDEX_CACHE, 'rb')).item() 28 | char_index = None 29 | 30 | # use char representation 31 | if TrainConfig.USE_CHAR: 32 | train_words1 = np.load(open(DirConfig.CHAR1_CACHE_TRAIN, 'rb')) 33 | train_words2 = np.load(open(DirConfig.CHAR2_CACHE_TRAIN, 'rb')) 34 | test_words1 = np.load(open(DirConfig.CHAR1_CACHE_TEST, 'rb')) 35 | test_words2 = np.load(open(DirConfig.CHAR2_CACHE_TEST, 'rb')) 36 | char_index = np.load(open(DirConfig.CHAR_INDEX_CACHE, 'rb')).item() 37 | else: 38 | # load data from csv 39 | if DirConfig.DEBUG: 40 | train_data = pd.read_csv(DirConfig.SAMPLE_TRAIN_FILE) 41 | test_data = pd.read_csv(DirConfig.SAMPLE_TEST_FILE) 42 | else: 43 | train_data = pd.read_csv(DirConfig.TRAIN_FILE) 44 | test_data = pd.read_csv(DirConfig.TEST_FILE) 45 | 46 | # train and text text 47 | train_ori1 = list(train_data.question1.values.astype(str)) 48 | train_ori2 = list(train_data.question2.values.astype(str)) 49 | test_ori1 = list(test_data.question1.values.astype(str)) 50 | test_ori2 = list(test_data.question2.values.astype(str)) 51 | 52 | # target labels 53 | labels = train_data.is_duplicate.values 54 | test_ids = test_data.test_id 55 | np.save(open(DirConfig.TARGETS_CACHE, 'wb'), labels) 56 | np.save(open(DirConfig.TEST_ID_CACHE, 'wb'), test_ids) 57 | 58 | train_ori1 = preprocess_texts(train_ori1) 59 | train_ori2 = preprocess_texts(train_ori2) 60 | test_ori1 = preprocess_texts(test_ori1) 61 | test_ori2 = 
preprocess_texts(test_ori2) 62 | 63 | train_x1, train_x2, test_x1, test_x2, word_index = \ 64 | get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2) 65 | 66 | if TrainConfig.USE_CHAR: 67 | train_words1, train_words2, test_words1, test_words2, char_index = \ 68 | get_char_seq(train_ori1, train_ori2, test_ori1, test_ori2) 69 | else: 70 | char_index = None 71 | 72 | if TrainConfig.USE_CHAR: 73 | # concatenate inputs 74 | train_x1 = (train_x1, train_words1) 75 | train_x2 = (train_x2, train_words2) 76 | test_x1 = (test_x1, test_words1) 77 | test_x2 = (test_x2, test_words2) 78 | 79 | return train_x1, train_x2, test_x1, test_x2, labels, test_ids, word_index, char_index 80 | 81 | 82 | def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2): 83 | # fit tokenizer 84 | tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS) 85 | tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2) 86 | word_index = tk.word_index 87 | 88 | # q1, q2 training text sequence 89 | # (sentence_len, MAX_SEQUENCE_LENGTH) 90 | train_x1 = tk.texts_to_sequences(train_ori1) 91 | train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 92 | train_x2 = tk.texts_to_sequences(train_ori2) 93 | train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 94 | 95 | # q1, q2 testing text sequence 96 | test_x1 = tk.texts_to_sequences(test_ori1) 97 | test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 98 | test_x2 = tk.texts_to_sequences(test_ori2) 99 | test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 100 | 101 | np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1) 102 | np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2) 103 | np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1) 104 | np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2) 105 | np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index) 106 | return train_x1, train_x2, test_x1, test_x2, word_index 107 | 108 | 109 | def words_to_char_sequence(words_list, tk): 110 | """Convert words list to chars sequence 111 | 112 | # Arguments 113 | words: word list, (sentence_len, word_len) 114 | 115 | # Output shape 116 | (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD) 117 | """ 118 | c_seqs = np.zeros((len(words_list), 119 | TrainConfig.MAX_SEQUENCE_LENGTH, 120 | TrainConfig.MAX_CHAR_PER_WORD), dtype='int32') 121 | for w_i in xrange(len(words_list)): 122 | words = words_list[w_i] 123 | fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH, 124 | TrainConfig.MAX_CHAR_PER_WORD), dtype='int32') 125 | ws = tk.texts_to_sequences(words) 126 | ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD) 127 | if TrainConfig.MAX_SEQUENCE_LENGTH < len(words): 128 | max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH 129 | else: 130 | max_word_len = len(words) 131 | fixed_ws[:max_word_len, :] = ws[:max_word_len, :] 132 | c_seqs[w_i] = fixed_ws 133 | return c_seqs 134 | 135 | 136 | def get_char_seq(train_ori1, train_ori2, test_ori1, test_ori2): 137 | # extract words from each text 138 | train_words1 = extract_words(train_ori1) 139 | train_words2 = extract_words(train_ori2) 140 | test_words1 = extract_words(test_ori1) 141 | test_words2 = extract_words(test_ori2) 142 | 143 | # fit tokenizer 144 | tk = Tokenizer(num_words=TrainConfig.MAX_NB_CHARS, char_level=True) 145 | tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2) 146 | char_index = tk.word_index 147 | 148 | # q1, q2 training word sequence 149 | train_s1 = words_to_char_sequence(train_words1, tk) 150 | train_s2 = 
words_to_char_sequence(train_words2, tk) 151 | 152 | # q1, q2 testing word sequence 153 | test_s1 = words_to_char_sequence(test_words1, tk) 154 | test_s2 = words_to_char_sequence(test_words2, tk) 155 | 156 | # save cache 157 | np.save(open(DirConfig.CHAR1_CACHE_TRAIN, 'wb'), train_s1) 158 | np.save(open(DirConfig.CHAR2_CACHE_TRAIN, 'wb'), train_s2) 159 | np.save(open(DirConfig.CHAR1_CACHE_TEST, 'wb'), test_s1) 160 | np.save(open(DirConfig.CHAR2_CACHE_TEST, 'wb'), test_s2) 161 | np.save(open(DirConfig.CHAR_INDEX_CACHE, 'wb'), char_index) 162 | return train_s1, train_s2, test_s1, test_s2, char_index 163 | 164 | 165 | # from https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text 166 | def text_to_wordlist(text, remove_stopwords=False, stem_words=False): 167 | # Convert words to lower case and split them 168 | text = str(text).lower().split() 169 | 170 | # Optionally, remove stop words 171 | if remove_stopwords: 172 | stops = set(stopwords.words("english")) 173 | text = [w for w in text if not w in stops] 174 | 175 | text = " ".join(text) 176 | 177 | # Clean the text 178 | text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) 179 | text = re.sub(r"what's", "what is ", text) 180 | text = re.sub(r"\'s", " ", text) 181 | text = re.sub(r"\'ve", " have ", text) 182 | text = re.sub(r"can't", "cannot ", text) 183 | text = re.sub(r"n't", " not ", text) 184 | text = re.sub(r"i'm", "i am ", text) 185 | text = re.sub(r"\'re", " are ", text) 186 | text = re.sub(r"\'d", " would ", text) 187 | text = re.sub(r"\'ll", " will ", text) 188 | text = re.sub(r" e g ", " eg ", text) 189 | text = re.sub(r" b g ", " bg ", text) 190 | text = re.sub(r"e-mail", "email", text) 191 | text = re.sub(r"imrovement", "improvement", text) 192 | text = re.sub(r"intially", "initially", text) 193 | text = re.sub(r"demonitization", "demonetization", text) 194 | text = re.sub(r"actived", "active", text) 195 | 196 | text = re.sub(r",", " ", text) 197 | text = re.sub(r"\.", " ", text) 198 | text = re.sub(r"!", " ! 
", text) 199 | text = re.sub(r"\/", " ", text) 200 | text = re.sub(r"\^", " ^ ", text) 201 | text = re.sub(r"\+", " + ", text) 202 | text = re.sub(r"\-", " - ", text) 203 | text = re.sub(r"\=", " = ", text) 204 | text = re.sub(r"'", " ", text) 205 | text = re.sub(r"(\d+)(k)", r"\g<1>000", text) 206 | text = re.sub(r":", " : ", text) 207 | text = re.sub(r" u s ", " american ", text) 208 | text = re.sub(r"\0s", "0", text) 209 | text = re.sub(r" 9 11 ", " 911 ", text) 210 | text = re.sub(r"e - mail", "email", text) 211 | text = re.sub(r"j k", "jk", text) 212 | text = re.sub(r"\s{2,}", " ", text) 213 | 214 | # Optionally, shorten words to their stems 215 | if stem_words: 216 | text = text.split() 217 | stemmer = SnowballStemmer('english') 218 | stemmed_words = [stemmer.stem(word) for word in text] 219 | text = " ".join(stemmed_words) 220 | 221 | # Return a list of words 222 | return(text) 223 | 224 | 225 | def preprocess_texts(texts): 226 | processed = [] 227 | for t in texts: 228 | processed.append(text_to_wordlist( 229 | t, remove_stopwords=TrainConfig.REMOVE_STOPWORDS, stem_words=TrainConfig.USE_STEM)) 230 | return processed 231 | 232 | 233 | def split_train_data(train_x1, train_x2, labels, train_index, val_index): 234 | if TrainConfig.USE_CHAR: 235 | train_w1 = train_x1[0][train_index] 236 | train_w2 = train_x2[0][train_index] 237 | train_c1 = train_x1[1][train_index] 238 | train_c2 = train_x2[1][train_index] 239 | train_data = [train_w1, train_w2, train_c1, train_c2] 240 | 241 | val_w1 = train_x1[0][val_index] 242 | val_w2 = train_x2[0][val_index] 243 | val_c1 = train_x1[1][val_index] 244 | val_c2 = train_x2[1][val_index] 245 | val_data = [val_w1, val_w2, val_c1, val_c2] 246 | else: 247 | train_data = [train_x1[train_index], train_x2[train_index]] 248 | val_data = [train_x1[val_index], train_x2[val_index]] 249 | 250 | train_labels = labels[train_index] 251 | val_labels = labels[val_index] 252 | return train_data, train_labels, val_data, val_labels 253 | 254 | 255 | def extract_words(sentences): 256 | """Extract chars from each sentence 257 | 258 | # Arguments 259 | sentences: list of sentences 260 | """ 261 | w_seqs = [] 262 | for s in sentences: 263 | s = re.sub(r"[?^,!.\/'+-=()]", " ", s) 264 | s = s.strip() 265 | words = [] 266 | for word in re.split('\\s+', s): 267 | words.append(word) 268 | w_seqs.append(words) 269 | return w_seqs 270 | 271 | 272 | def load_word_embedding(type, vec_file, word_index, config): 273 | if type == 'glove': 274 | return load_glove_matrix(vec_file, word_index, config) 275 | else: 276 | return load_word2vec_matrix(vec_file, word_index, config) 277 | 278 | 279 | def load_glove_matrix(vec_file, word_index, config): 280 | if os.path.isfile(DirConfig.GLOVE_CACHE): 281 | print('---- Load word vectors from cache.') 282 | embedding_matrix = np.load(open(DirConfig.GLOVE_CACHE, 'rb')) 283 | return embedding_matrix 284 | 285 | print('---- loading glove ...') 286 | embeddings_index = {} 287 | f = open(vec_file) 288 | for line in tqdm(f): 289 | values = line.split() 290 | word = values[0] 291 | coefs = np.asarray(values[1:], dtype='float32') 292 | embeddings_index[word] = coefs 293 | f.close() 294 | 295 | print('Found %s word vectors.' 
% len(embeddings_index)) 296 | 297 | nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1 298 | embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM)) 299 | for word, i in tqdm(word_index.items()): 300 | embedding_vector = embeddings_index.get(word) 301 | if embedding_vector is not None: 302 | embedding_matrix[i] = embedding_vector 303 | 304 | # check the words which not in embedding vectors 305 | not_found_words = [] 306 | for word, i in word_index.items(): 307 | if word not in embeddings_index: 308 | not_found_words.append(word) 309 | 310 | np.save(open(DirConfig.GLOVE_CACHE, 'wb'), embedding_matrix) 311 | return embedding_matrix 312 | 313 | 314 | def load_word2vec_matrix(vec_file, word_index, config): 315 | if os.path.isfile(DirConfig.W2V_CACHE): 316 | print('---- Load word vectors from cache.') 317 | embedding_matrix = np.load(open(DirConfig.W2V_CACHE, 'rb')) 318 | return embedding_matrix 319 | 320 | print('---- loading word2vec ...') 321 | word2vec = KeyedVectors.load_word2vec_format( 322 | vec_file, binary=True) 323 | print('Found %s word vectors of word2vec' % len(word2vec.vocab)) 324 | 325 | nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1 326 | embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM)) 327 | for word, i in word_index.items(): 328 | if word in word2vec.vocab: 329 | embedding_matrix[i] = word2vec.word_vec(word) 330 | print('Null word embeddings: %d' % \ 331 | np.sum(np.sum(embedding_matrix, axis=1) == 0)) 332 | 333 | # check the words which not in embedding vectors 334 | not_found_words = [] 335 | for word, i in word_index.items(): 336 | if word not in word2vec.vocab: 337 | not_found_words.append(word) 338 | 339 | np.save(open(DirConfig.W2V_CACHE, 'wb'), embedding_matrix) 340 | return embedding_matrix 341 | 342 | 343 | def save_training_history(path, config, history, fold=0): 344 | values = np.array(history.history.values()) 345 | results = pd.DataFrame(values.transpose(), columns=[history.history.keys()]) 346 | now = datetime.datetime.now() 347 | suffix = str(now.strftime("%Y-%m-%d-%H-%M")) 348 | path = os.path.join( 349 | path, 'his_{}_trial_{}_db_{}_k_{}-{}.csv'.format( 350 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold, suffix)) 351 | results.to_csv(path) 352 | 353 | 354 | def create_submission(path, config, preds, test_ids, low_threhold=0.05): 355 | print('----- Create submission for {}'.format(config.MODEL)) 356 | if preds.shape[1] > 1: 357 | preds = preds[:, 1] 358 | preds = preds.clip(low_threhold, 1.0 - low_threhold) 359 | submission = pd.DataFrame(test_ids, columns=['test_id']) 360 | submission.loc[:, 'is_duplicate'] = preds.ravel() 361 | now = datetime.datetime.now() 362 | subm_file = os.path.join(path, 'subm_{}_trial_{}_db_{}-{}.csv'.format( 363 | config.INFO, config.TRIAL, DirConfig.DEBUG, str(now.strftime("%Y-%m-%d-%H-%M")))) 364 | submission.to_csv(subm_file, index=False) 365 | return subm_file 366 | 367 | 368 | def save_model(model, config, fold=0): 369 | m_file = os.path.join( 370 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_model.h5'.format( 371 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 372 | w_file = os.path.join( 373 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_weight.h5'.format( 374 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 375 | model.save(m_file) 376 | model.save_weights(w_file) 377 | print('--- Saved model.') 378 | 379 | 380 | def load_keras_model(config, custom_objects=None, fold=0): 381 | m_file = os.path.join( 382 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_model.h5'.format( 383 | 
config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 384 | if os.path.isfile(m_file): 385 | model = load_model(m_file, custom_objects) 386 | return model 387 | else: 388 | return None 389 | 390 | 391 | def merge_several_folds_mean(data, nfolds): 392 | print('------ Merge several folds results to mean. -----') 393 | a = np.array(data[0]) 394 | for i in range(1, nfolds): 395 | a += np.array(data[i]) 396 | a /= nfolds 397 | return a.tolist() 398 | 399 | 400 | def load_trained_models(config): 401 | models = [] 402 | for k in range(TrainConfig.KFOLD): 403 | model = load_keras_model(config, fold=k + 1) 404 | if model is None: 405 | break 406 | # Compile model 407 | model.compile(loss='binary_crossentropy', 408 | optimizer='nadam', 409 | metrics=['accuracy']) 410 | models.append(model) 411 | return models 412 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ijinmao/BiMPM_keras/73245b76a4f4f53424ad1aa0a79df7f864181c8c/models/__init__.py -------------------------------------------------------------------------------- /models/bimpm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Model graph of Bilateral Multi-Perspective Matching. 3 | 4 | References: 5 | Bilateral Multi-Perspective Matching for Natural Language Sentences 6 | """ 7 | import numpy as np 8 | from keras.layers import Input 9 | from keras.models import Model 10 | from keras.layers.merge import concatenate 11 | import keras.backend as K 12 | from config import ( 13 | BiMPMConfig, TrainConfig 14 | ) 15 | from models.multi_perspective import MultiPerspective 16 | from models.layers import ( 17 | WordRepresLayer, CharRepresLayer, ContextLayer, PredictLayer 18 | ) 19 | 20 | np.random.seed(BiMPMConfig.SEED) 21 | 22 | 23 | def build_model(embedding_matrix, word_index, char_index=None): 24 | print('--- Building model...') 25 | 26 | # Parameters 27 | sequence_length = TrainConfig.MAX_SEQUENCE_LENGTH 28 | nb_per_word = TrainConfig.MAX_CHAR_PER_WORD 29 | rnn_unit = BiMPMConfig.RNN_UNIT 30 | nb_words = min(TrainConfig.MAX_NB_WORDS, len(word_index)) + 1 31 | word_embedding_dim = TrainConfig.WORD_EMBEDDING_DIM 32 | dropout = BiMPMConfig.DROP_RATE 33 | context_rnn_dim = BiMPMConfig.CONTEXT_LSTM_DIM 34 | mp_dim = BiMPMConfig.MP_DIM 35 | highway = BiMPMConfig.WITH_HIGHWAY 36 | aggregate_rnn_dim = BiMPMConfig.AGGREGATION_LSTM_DIM 37 | dense_dim = BiMPMConfig.DENSE_DIM 38 | if TrainConfig.USE_CHAR: 39 | nb_chars = min(TrainConfig.MAX_NB_CHARS, len(char_index)) + 1 40 | char_embedding_dim = TrainConfig.CHAR_EMBEDDING_DIM 41 | char_rnn_dim = TrainConfig.CHAR_LSTM_DIM 42 | 43 | # Model words input 44 | w1 = Input(shape=(sequence_length,), dtype='int32') 45 | w2 = Input(shape=(sequence_length,), dtype='int32') 46 | if TrainConfig.USE_CHAR: 47 | c1 = Input(shape=(sequence_length, nb_per_word), dtype='int32') 48 | c2 = Input(shape=(sequence_length, nb_per_word), dtype='int32') 49 | 50 | # Build word representation layer 51 | word_layer = WordRepresLayer( 52 | sequence_length, nb_words, word_embedding_dim, embedding_matrix) 53 | w_res1 = word_layer(w1) 54 | w_res2 = word_layer(w2) 55 | 56 | # Model chars input 57 | if TrainConfig.USE_CHAR: 58 | char_layer = CharRepresLayer( 59 | sequence_length, nb_chars, nb_per_word, char_embedding_dim, 60 | char_rnn_dim, rnn_unit=rnn_unit, dropout=dropout) 61 | c_res1 = char_layer(c1) 62 |
c_res2 = char_layer(c2) 63 | sequence1 = concatenate([w_res1, c_res1]) 64 | sequence2 = concatenate([w_res2, c_res2]) 65 | else: 66 | sequence1 = w_res1 67 | sequence2 = w_res2 68 | 69 | # Build context representation layer 70 | context_layer = ContextLayer( 71 | context_rnn_dim, rnn_unit=rnn_unit, dropout=dropout, highway=highway, 72 | input_shape=(sequence_length, K.int_shape(sequence1)[-1],), 73 | return_sequences=True) 74 | context1 = context_layer(sequence1) 75 | context2 = context_layer(sequence2) 76 | 77 | # Build matching layer 78 | matching_layer = MultiPerspective(mp_dim) 79 | matching1 = matching_layer([context1, context2]) 80 | matching2 = matching_layer([context2, context1]) 81 | matching = concatenate([matching1, matching2]) 82 | 83 | # Build aggregation layer 84 | aggregate_layer = ContextLayer( 85 | aggregate_rnn_dim, rnn_unit=rnn_unit, dropout=dropout, highway=highway, 86 | input_shape=(sequence_length, K.int_shape(matching)[-1],), 87 | return_sequences=False) 88 | aggregation = aggregate_layer(matching) 89 | 90 | # Build prediction layer 91 | pred = PredictLayer(dense_dim, 92 | input_dim=K.int_shape(aggregation)[-1], 93 | dropout=dropout)(aggregation) 94 | # Build model 95 | if TrainConfig.USE_CHAR: 96 | inputs = (w1, w2, c1, c2) 97 | else: 98 | inputs = (w1, w2) 99 | 100 | # Build model graph 101 | model = Model(inputs=inputs, 102 | outputs=pred) 103 | 104 | # Compile model 105 | model.compile(loss='binary_crossentropy', 106 | optimizer='adam', 107 | metrics=['accuracy']) 108 | print(model.summary()) 109 | return model 110 | -------------------------------------------------------------------------------- /models/layers.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.embeddings import Embedding 3 | from keras.layers.core import Lambda, Dense, Dropout 4 | from keras.layers.recurrent import LSTM, GRU 5 | from keras.layers.wrappers import Bidirectional 6 | from keras.legacy.layers import Highway 7 | from keras.layers import TimeDistributed 8 | import keras.backend as K 9 | from keras.layers.normalization import BatchNormalization 10 | 11 | 12 | class WordRepresLayer(object): 13 | """Word embedding representation layer 14 | """ 15 | def __init__(self, sequence_length, nb_words, 16 | word_embedding_dim, embedding_matrix): 17 | self.model = Sequential() 18 | self.model.add(Embedding(nb_words, 19 | word_embedding_dim, 20 | weights=[embedding_matrix], 21 | input_length=sequence_length, 22 | trainable=False)) 23 | 24 | def __call__(self, inputs): 25 | return self.model(inputs) 26 | 27 | 28 | class CharRepresLayer(object): 29 | """Char embedding representation layer 30 | """ 31 | def __init__(self, sequence_length, nb_chars, nb_per_word, 32 | embedding_dim, rnn_dim, rnn_unit='gru', dropout=0.0): 33 | def _collapse_input(x, nb_per_word=0): 34 | x = K.reshape(x, (-1, nb_per_word)) 35 | return x 36 | 37 | def _unroll_input(x, sequence_length=0, rnn_dim=0): 38 | x = K.reshape(x, (-1, sequence_length, rnn_dim)) 39 | return x 40 | 41 | if rnn_unit == 'gru': 42 | rnn = GRU 43 | else: 44 | rnn = LSTM 45 | self.model = Sequential() 46 | self.model.add(Lambda(_collapse_input, 47 | arguments={'nb_per_word': nb_per_word}, 48 | output_shape=(nb_per_word,), 49 | input_shape=(sequence_length, nb_per_word,))) 50 | self.model.add(Embedding(nb_chars, 51 | embedding_dim, 52 | input_length=nb_per_word, 53 | trainable=True)) 54 | self.model.add(rnn(rnn_dim, 55 | dropout=dropout, 56 | 
recurrent_dropout=dropout)) 57 | self.model.add(Lambda(_unroll_input, 58 | arguments={'sequence_length': sequence_length, 59 | 'rnn_dim': rnn_dim}, 60 | output_shape=(sequence_length, rnn_dim))) 61 | 62 | def __call__(self, inputs): 63 | return self.model(inputs) 64 | 65 | 66 | class ContextLayer(object): 67 | """Word context layer 68 | """ 69 | def __init__(self, rnn_dim, rnn_unit='gru', input_shape=(0,), 70 | dropout=0.0, highway=False, return_sequences=False, 71 | dense_dim=0): 72 | if rnn_unit == 'gru': 73 | rnn = GRU 74 | else: 75 | rnn = LSTM 76 | self.model = Sequential() 77 | self.model.add( 78 | Bidirectional(rnn(rnn_dim, 79 | dropout=dropout, 80 | recurrent_dropout=dropout, 81 | return_sequences=return_sequences), 82 | input_shape=input_shape)) 83 | # self.model.add(rnn(rnn_dim, 84 | # dropout=dropout, 85 | # recurrent_dropout=dropout, 86 | # return_sequences=return_sequences, 87 | # input_shape=input_shape)) 88 | if highway: 89 | if return_sequences: 90 | self.model.add(TimeDistributed(Highway(activation='tanh'))) 91 | else: 92 | self.model.add(Highway(activation='tanh')) 93 | 94 | if dense_dim > 0: 95 | self.model.add(TimeDistributed(Dense(dense_dim, 96 | activation='relu'))) 97 | self.model.add(TimeDistributed(Dropout(dropout))) 98 | self.model.add(TimeDistributed(BatchNormalization())) 99 | 100 | def __call__(self, inputs): 101 | return self.model(inputs) 102 | 103 | 104 | class PredictLayer(object): 105 | """Prediction layer. 106 | 107 | """ 108 | def __init__(self, dense_dim, input_dim=0, 109 | dropout=0.0): 110 | self.model = Sequential() 111 | self.model.add(Dense(dense_dim, 112 | activation='relu', 113 | input_shape=(input_dim,))) 114 | self.model.add(Dropout(dropout)) 115 | self.model.add(BatchNormalization()) 116 | self.model.add(Dense(1, activation='sigmoid')) 117 | 118 | def __call__(self, inputs): 119 | return self.model(inputs) 120 | -------------------------------------------------------------------------------- /models/multi_perspective.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Multi-perspective Matching Layer. 4 | 5 | Reference: Bilateral Multi-Perspective Matching for Natural Language Sentences. 6 | """ 7 | 8 | import keras.backend as K 9 | from keras.engine.topology import Layer 10 | 11 | 12 | class MultiPerspective(Layer): 13 | """Multi-perspective Matching Layer. 14 | 15 | # Arguments 16 | mp_dim: single forward/backward multi-perspective dimention 17 | """ 18 | 19 | def __init__(self, mp_dim, epsilon=1e-6, **kwargs): 20 | self.mp_dim = mp_dim 21 | self.epsilon = 1e-6 22 | self.strategy = 4 23 | super(MultiPerspective, self).__init__(**kwargs) 24 | 25 | def build(self, input_shape): 26 | if isinstance(input_shape, list): 27 | input_shape = input_shape[0] 28 | embedding_size = input_shape[-1] / 2 29 | # Create a trainable weight variable for this layer. 
30 | # input_shape is bidirectional RNN input shape 31 | # kernel shape (mp_dim * 2 * self.strategy, embedding_size) 32 | self.kernel = self.add_weight((self.mp_dim, 33 | embedding_size * 2 * self.strategy), 34 | name='kernel', 35 | initializer='glorot_uniform', 36 | trainable=True) 37 | self.kernel_full_fw = self.kernel[:, :embedding_size] 38 | self.kernel_full_bw = self.kernel[:, embedding_size: embedding_size * 2] 39 | self.kernel_attentive_fw = self.kernel[:, embedding_size * 2: embedding_size * 3] 40 | self.kernel_attentive_bw = self.kernel[:, embedding_size * 3: embedding_size * 4] 41 | self.kernel_max_attentive_fw = self.kernel[:, embedding_size * 4: embedding_size * 5] 42 | self.kernel_max_attentive_bw = self.kernel[:, embedding_size * 5: embedding_size * 6] 43 | self.kernel_max_pool_fw = self.kernel[:, embedding_size * 6: embedding_size * 7] 44 | self.kernel_max_pool_bw = self.kernel[:, embedding_size * 7:] 45 | self.built = True 46 | super(MultiPerspective, self).build(input_shape) 47 | 48 | def compute_output_shape(self, input_shape): 49 | if isinstance(input_shape, list): 50 | input_shape = input_shape[0] 51 | return (input_shape[0], input_shape[1], self.mp_dim * 2 * self.strategy) 52 | 53 | def get_config(self): 54 | config = {'mp_dim': self.mp_dim, 55 | 'epsilon': self.epsilon} 56 | base_config = super(MultiPerspective, self).get_config() 57 | return dict(list(base_config.items()) + list(config.items())) 58 | 59 | def call(self, inputs): 60 | # h1, h2: bidirectional LSTM hidden states, include forward and backward states 61 | # (batch_size, timesteps, embedding_size * 2) 62 | h1 = inputs[0] 63 | h2 = inputs[1] 64 | embedding_size = K.int_shape(h1)[-1] / 2 65 | h1_fw = h1[:, :, :embedding_size] 66 | h1_bw = h1[:, :, embedding_size:] 67 | h2_fw = h2[:, :, :embedding_size] 68 | h2_bw = h2[:, :, embedding_size:] 69 | 70 | # 4 matching strategy 71 | list_matching = [] 72 | 73 | # full matching ops 74 | matching_fw = self._full_matching(h1_fw, h2_fw, self.kernel_full_fw) 75 | matching_bw = self._full_matching(h1_bw, h2_bw, self.kernel_full_bw) 76 | list_matching.extend([matching_fw, matching_bw]) 77 | 78 | # cosine matrix 79 | cosine_matrix_fw = self._cosine_matrix(h1_fw, h2_fw) 80 | cosine_matrix_bw = self._cosine_matrix(h1_bw, h2_bw) 81 | 82 | # attentive matching ops 83 | matching_fw = self._attentive_matching( 84 | h1_fw, h2_fw, cosine_matrix_fw, self.kernel_attentive_fw) 85 | matching_bw = self._attentive_matching( 86 | h1_bw, h2_bw, cosine_matrix_bw, self.kernel_attentive_bw) 87 | list_matching.extend([matching_fw, matching_bw]) 88 | 89 | # max attentive matching ops 90 | matching_fw = self._max_attentive_matching( 91 | h1_fw, h2_fw, cosine_matrix_fw, self.kernel_max_attentive_fw) 92 | matching_bw = self._max_attentive_matching( 93 | h1_bw, h2_bw, cosine_matrix_bw, self.kernel_max_attentive_bw) 94 | list_matching.extend([matching_fw, matching_bw]) 95 | 96 | # max pooling matching ops 97 | matching_fw = self._max_pooling_matching(h1_fw, h2_fw, self.kernel_max_pool_fw) 98 | matching_bw = self._max_pooling_matching(h1_bw, h2_bw, self.kernel_max_pool_bw) 99 | list_matching.extend([matching_fw, matching_bw]) 100 | 101 | return K.concatenate(list_matching, axis=-1) 102 | 103 | def _cosine_similarity(self, x1, x2): 104 | """Compute cosine similarity. 
105 | 106 | # Arguments: 107 | x1: (..., embedding_size) 108 | x2: (..., embedding_size) 109 | """ 110 | cos = K.sum(x1 * x2, axis=-1) 111 | x1_norm = K.sqrt(K.maximum(K.sum(K.square(x1), axis=-1), self.epsilon)) 112 | x2_norm = K.sqrt(K.maximum(K.sum(K.square(x2), axis=-1), self.epsilon)) 113 | cos = cos / x1_norm / x2_norm 114 | return cos 115 | 116 | def _cosine_matrix(self, x1, x2): 117 | """Cosine similarity matrix. 118 | 119 | Calculate the cosine similarities between each forward (or backward) 120 | contextual embedding h_i_p and every forward (or backward) 121 | contextual embeddings of the other sentence 122 | 123 | # Arguments 124 | x1: (batch_size, x1_timesteps, embedding_size) 125 | x2: (batch_size, x2_timesteps, embedding_size) 126 | 127 | # Output shape 128 | (batch_size, x1_timesteps, x2_timesteps) 129 | """ 130 | # expand h1 shape to (batch_size, x1_timesteps, 1, embedding_size) 131 | x1 = K.expand_dims(x1, axis=2) 132 | # expand x2 shape to (batch_size, 1, x2_timesteps, embedding_size) 133 | x2 = K.expand_dims(x2, axis=1) 134 | # cosine matrix (batch_size, h1_timesteps, h2_timesteps) 135 | cos_matrix = self._cosine_similarity(x1, x2) 136 | return cos_matrix 137 | 138 | def _mean_attentive_vectors(self, x2, cosine_matrix): 139 | """Mean attentive vectors. 140 | 141 | Calculate mean attentive vector for the entire sentence by weighted 142 | summing all the contextual embeddings of the entire sentence 143 | 144 | # Arguments 145 | x2: sequence vectors, (batch_size, x2_timesteps, embedding_size) 146 | cosine_matrix: cosine similarities matrix of x1 and x2, 147 | (batch_size, x1_timesteps, x2_timesteps) 148 | 149 | # Output shape 150 | (batch_size, x1_timesteps, embedding_size) 151 | """ 152 | # (batch_size, x1_timesteps, x2_timesteps, 1) 153 | expanded_cosine_matrix = K.expand_dims(cosine_matrix, axis=-1) 154 | # (batch_size, 1, x2_timesteps, embedding_size) 155 | x2 = K.expand_dims(x2, axis=1) 156 | # (batch_size, x1_timesteps, embedding_size) 157 | weighted_sum = K.sum(expanded_cosine_matrix * x2, axis=2) 158 | # (batch_size, x1_timesteps, 1) 159 | sum_cosine = K.expand_dims(K.sum(cosine_matrix, axis=-1) + self.epsilon, axis=-1) 160 | # (batch_size, x1_timesteps, embedding_size) 161 | attentive_vector = weighted_sum / sum_cosine 162 | return attentive_vector 163 | 164 | def _max_attentive_vectors(self, x2, cosine_matrix): 165 | """Max attentive vectors. 166 | 167 | Calculate max attentive vector for the entire sentence by picking 168 | the contextual embedding with the highest cosine similarity 169 | as the attentive vector. 
170 | 171 | # Arguments 172 | x2: sequence vectors, (batch_size, x2_timesteps, embedding_size) 173 | cosine_matrix: cosine similarities matrix of x1 and x2, 174 | (batch_size, x1_timesteps, x2_timesteps) 175 | 176 | # Output shape 177 | (batch_size, x1_timesteps, embedding_size) 178 | """ 179 | # (batch_size, x1_timesteps) 180 | max_x2_step = K.argmax(cosine_matrix, axis=-1) 181 | 182 | embedding_size = K.int_shape(x2)[-1] 183 | timesteps = K.int_shape(max_x2_step)[-1] 184 | if timesteps is None: 185 | timesteps = K.shape(max_x2_step)[-1] 186 | 187 | # collapse time dimension and batch dimension together 188 | # collapse x2 to (batch_size * x2_timestep, embedding_size) 189 | x2 = K.reshape(x2, (-1, embedding_size)) 190 | # collapse max_x2_step to (batch_size * h1_timesteps) 191 | max_x2_step = K.reshape(max_x2_step, (-1,)) 192 | # (batch_size * x1_timesteps, embedding_size) 193 | max_x2 = K.gather(x2, max_x2_step) 194 | # reshape max_x2, (batch_size, x1_timesteps, embedding_size) 195 | attentive_vector = K.reshape(max_x2, K.stack([-1, timesteps, embedding_size])) 196 | return attentive_vector 197 | 198 | def _time_distributed_multiply(self, x, w): 199 | """Element-wise multiply vector and weights. 200 | 201 | # Arguments 202 | x: sequence of hidden states, (batch_size, ?, embedding_size) 203 | w: weights of one matching strategy of one direction, 204 | (mp_dim, embedding_size) 205 | 206 | # Output shape 207 | (?, mp_dim, embedding_size) 208 | """ 209 | # dimension of vector 210 | n_dim = K.ndim(x) 211 | embedding_size = K.int_shape(x)[-1] 212 | timesteps = K.int_shape(x)[1] 213 | if timesteps is None: 214 | timesteps = K.shape(x)[1] 215 | 216 | # collapse time dimension and batch dimension together 217 | x = K.reshape(x, (-1, embedding_size)) 218 | # reshape to (?, 1, embedding_size) 219 | x = K.expand_dims(x, axis=1) 220 | # reshape weights to (1, mp_dim, embedding_size) 221 | w = K.expand_dims(w, axis=0) 222 | # element-wise multiply 223 | x = x * w 224 | # reshape to original shape 225 | if n_dim == 3: 226 | x = K.reshape(x, K.stack([-1, timesteps, self.mp_dim, embedding_size])) 227 | x.set_shape([None, None, None, embedding_size]) 228 | elif n_dim == 2: 229 | x = K.reshape(x, K.stack([-1, self.mp_dim, embedding_size])) 230 | x.set_shape([None, None, embedding_size]) 231 | return x 232 | 233 | def _full_matching(self, h1, h2, w): 234 | """Full matching operation. 235 | 236 | # Arguments 237 | h1: (batch_size, h1_timesteps, embedding_size) 238 | h2: (batch_size, h2_timesteps, embedding_size) 239 | w: weights of one direction, (mp_dim, embedding_size) 240 | 241 | # Output shape 242 | (batch_size, h1_timesteps, mp_dim) 243 | """ 244 | # h2 forward last step hidden vector, (batch_size, embedding_size) 245 | h2_last_state = h2[:, -1, :] 246 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 247 | h1 = self._time_distributed_multiply(h1, w) 248 | # h2_last_state * weights, (batch_size, mp_dim, embedding_size) 249 | h2 = self._time_distributed_multiply(h2_last_state, w) 250 | # reshape to (batch_size, 1, mp_dim, embedding_size) 251 | h2 = K.expand_dims(h2, axis=1) 252 | # matching vector, (batch_size, h1_timesteps, mp_dim) 253 | matching = self._cosine_similarity(h1, h2) 254 | return matching 255 | 256 | def _max_pooling_matching(self, h1, h2, w): 257 | """Max pooling matching operation. 
258 | 259 | # Arguments 260 | h1: (batch_size, h1_timesteps, embedding_size) 261 | h2: (batch_size, h2_timesteps, embedding_size) 262 | w: weights of one direction, (mp_dim, embedding_size) 263 | 264 | # Output shape 265 | (batch_size, h1_timesteps, mp_dim) 266 | """ 267 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 268 | h1 = self._time_distributed_multiply(h1, w) 269 | # h2 * weights, (batch_size, h2_timesteps, mp_dim, embedding_size) 270 | h2 = self._time_distributed_multiply(h2, w) 271 | # reshape v1 to (batch_size, h1_timesteps, 1, mp_dim, embedding_size) 272 | h1 = K.expand_dims(h1, axis=2) 273 | # reshape v1 to (batch_size, 1, h2_timesteps, mp_dim, embedding_size) 274 | h2 = K.expand_dims(h2, axis=1) 275 | # cosine similarity, (batch_size, h1_timesteps, h2_timesteps, mp_dim) 276 | cos = self._cosine_similarity(h1, h2) 277 | # (batch_size, h1_timesteps, mp_dim) 278 | matching = K.max(cos, axis=2) 279 | return matching 280 | 281 | def _attentive_matching(self, h1, h2, cosine_matrix, w): 282 | """Attentive matching operation. 283 | 284 | # Arguments 285 | h1: (batch_size, h1_timesteps, embedding_size) 286 | h2: (batch_size, h2_timesteps, embedding_size) 287 | cosine_matrix: weights of hidden state h2, 288 | (batch_size, h1_timesteps, h2_timesteps) 289 | w: weights of one direction, (mp_dim, embedding_size) 290 | 291 | # Output shape 292 | (batch_size, h1_timesteps, mp_dim) 293 | """ 294 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 295 | h1 = self._time_distributed_multiply(h1, w) 296 | # attentive vector (batch_size, h1_timesteps, embedding_szie) 297 | attentive_vec = self._mean_attentive_vectors(h2, cosine_matrix) 298 | # attentive_vec * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 299 | attentive_vec = self._time_distributed_multiply(attentive_vec, w) 300 | # matching vector, (batch_size, h1_timesteps, mp_dim) 301 | matching = self._cosine_similarity(h1, attentive_vec) 302 | return matching 303 | 304 | def _max_attentive_matching(self, h1, h2, cosine_matrix, w): 305 | """Max attentive matching operation. 
306 | 307 | # Arguments 308 | h1: (batch_size, h1_timesteps, embedding_size) 309 | h2: (batch_size, h2_timesteps, embedding_size) 310 | cosine_matrix: weights of hidden state h2, 311 | (batch_size, h1_timesteps, h2_timesteps) 312 | w: weights of one direction, (mp_dim, embedding_size) 313 | 314 | # Output shape 315 | (batch_size, h1_timesteps, mp_dim) 316 | """ 317 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 318 | h1 = self._time_distributed_multiply(h1, w) 319 | # max attentive vector (batch_size, h1_timesteps, embedding_szie) 320 | max_attentive_vec = self._max_attentive_vectors(h2, cosine_matrix) 321 | # max_attentive_vec * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 322 | max_attentive_vec = self._time_distributed_multiply(max_attentive_vec, w) 323 | # matching vector, (batch_size, h1_timesteps, mp_dim) 324 | matching = self._cosine_similarity(h1, max_attentive_vec) 325 | return matching 326 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.callbacks import ModelCheckpoint, EarlyStopping 3 | from sklearn.model_selection import KFold 4 | from models.bimpm import build_model as build_bimpm 5 | from config import ( 6 | DirConfig, TrainConfig, TestConfig, BiMPMConfig 7 | ) 8 | from data_util import ( 9 | get_text_sequence, save_training_history, create_submission, 10 | save_model, load_trained_models, load_word2vec_matrix, 11 | merge_several_folds_mean, split_train_data 12 | ) 13 | 14 | 15 | def train_model(): 16 | print('###### Start training for {}. ######'.format( 17 | 'debugging' if DirConfig.DEBUG else 'production')) 18 | 19 | # Get model config 20 | config = BiMPMConfig 21 | 22 | # Load trained model from cache 23 | models = load_trained_models(config) 24 | if len(models) > 0: 25 | print('--- load model from cache.') 26 | # Compile model 27 | for m in models: 28 | m.compile(loss='binary_crossentropy', 29 | optimizer='nadam', 30 | metrics=['accuracy']) 31 | return models, None, None, None 32 | 33 | # Load train/test data set 34 | train_x1, train_x2, test_x1, test_x2, labels, test_ids, word_index, char_index = \ 35 | get_text_sequence() 36 | 37 | # Load pretrained word embedding vectors 38 | embedding_matrix = load_word2vec_matrix( 39 | DirConfig.W2V_FILE, word_index, config) 40 | 41 | # Reweight params 42 | if TestConfig.RE_WEIGHT: 43 | class_weight = TestConfig.CLASS_WEIGHT 44 | else: 45 | class_weight = None 46 | 47 | # Split dataset indices 48 | kf = KFold(n_splits=10, shuffle=True) 49 | kf_gen = kf.split(labels) 50 | fold = 1 51 | models = [] 52 | 53 | # Cross-validation train model 54 | for train_index, val_index in kf_gen: 55 | # Load current fold dataset 56 | train_data, train_labels, val_data, val_labels = split_train_data( 57 | train_x1, train_x2, labels, train_index, val_index) 58 | 59 | # Define validation sample weight 60 | val_weight = np.ones(len(val_labels)) 61 | if TestConfig.RE_WEIGHT: 62 | val_weight *= TrainConfig.CLASS_WEIGHT[0] 63 | val_weight[val_labels == 0] = TrainConfig.CLASS_WEIGHT[1] 64 | 65 | # Build model 66 | model = build_model(embedding_matrix, word_index, char_index) 67 | 68 | # Define model callbacks 69 | early_stopping = EarlyStopping(monitor='val_loss', patience=5) 70 | model_checkpoint = ModelCheckpoint( 71 | config.CHECKPOINT, save_best_only=True, save_weights_only=True) 72 | 73 | # Training 74 | history = model.fit( 75 | train_data, 
y=train_labels, 76 | validation_data=(val_data, val_labels, val_weight), 77 | # validation_split=TrainConfig.VALIDATION_SPLIT, 78 | epochs=TrainConfig.NB_EPOCH, 79 | batch_size=TrainConfig.BATCH_SIZE, shuffle=True, 80 | class_weight=class_weight, 81 | callbacks=[early_stopping, model_checkpoint]) 82 | save_model(model, config, fold=fold) 83 | save_training_history(DirConfig.HISTORYA_DIR, config, history, fold=fold) 84 | fold += 1 85 | models.append(model) 86 | if fold > TrainConfig.KFOLD: 87 | break 88 | return models, test_x1, test_x2, test_ids 89 | 90 | 91 | def test_model(models=[], test_x1=None, test_x2=None, test_ids=None): 92 | print('###### Start testing for {}. ######'.format( 93 | 'debugging' if DirConfig.DEBUG else 'production')) 94 | 95 | config = BiMPMConfig 96 | 97 | # Load models from cache 98 | if len(models) == 0: 99 | models = load_trained_models(config) 100 | 101 | # Load test data from cache 102 | if test_x1 is None: 103 | _, _, test_x1, test_x2, _, test_ids, _, _ = \ 104 | get_text_sequence() 105 | 106 | if TrainConfig.USE_CHAR: 107 | test_data = [test_x1[0], test_x2[0], test_x1[1], test_x2[1]] 108 | else: 109 | test_data = [test_x1, test_x2] 110 | 111 | predictions = [] 112 | 113 | # Testing 114 | for model in models: 115 | preds = model.predict( 116 | test_data, 117 | batch_size=TestConfig.BATCH_SIZE, verbose=1) 118 | predictions.append(preds) 119 | 120 | preds_mean = np.array(merge_several_folds_mean(predictions, len(models))) 121 | create_submission(DirConfig.SUBM_DIR, config, preds_mean, test_ids) 122 | 123 | 124 | def build_model(embedding_matrix, word_index, char_index): 125 | return build_bimpm(embedding_matrix, word_index, char_index) 126 | 127 | 128 | def main(): 129 | models, test_x1, test_x2, test_ids = train_model() 130 | test_model(models, test_x1, test_x2, test_ids) 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | --------------------------------------------------------------------------------
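For readers who want the core matching math outside of Keras, below is a minimal NumPy sketch of the full-matching strategy from the BiMPM paper, mirroring what `MultiPerspective._full_matching` and `_time_distributed_multiply` compute for one direction of one sentence pair. The function name, argument names, and toy dimensions are illustrative only and are not part of the repository.

import numpy as np

def full_matching(h1, h2_last, w, eps=1e-6):
    # h1:      (timesteps, embedding_size) contextual states of sentence 1 (one direction)
    # h2_last: (embedding_size,)           last contextual state of sentence 2 (same direction)
    # w:       (mp_dim, embedding_size)    one learned perspective per row
    # Scale every hidden state by every perspective (element-wise).
    v1 = h1[:, None, :] * w[None, :, :]          # (timesteps, mp_dim, embedding_size)
    v2 = h2_last[None, :] * w                    # (mp_dim, embedding_size)
    # Cosine similarity along the embedding axis, with the same epsilon
    # clipping the layer applies to the squared norms.
    dot = np.sum(v1 * v2[None, :, :], axis=-1)   # (timesteps, mp_dim)
    n1 = np.sqrt(np.maximum(np.sum(v1 ** 2, axis=-1), eps))
    n2 = np.sqrt(np.maximum(np.sum(v2 ** 2, axis=-1), eps))
    return dot / (n1 * n2[None, :])              # (timesteps, mp_dim)

# Toy check: 5 timesteps, 4-dim hidden states, 3 perspectives.
rng = np.random.RandomState(0)
print(full_matching(rng.randn(5, 4), rng.randn(4), rng.randn(3, 4)).shape)  # (5, 3)

In the layer itself the same computation runs over an extra batch dimension using Keras backend ops, with a separate slice of the trainable kernel for each of the four matching strategies in each direction.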