├── README.md ├── config.py ├── data_util.py ├── models ├── __init__.py ├── bimpm.py ├── layers.py └── multi_perspective.py └── train_model.py /README.md: -------------------------------------------------------------------------------- 1 | # BiMPM_keras 2 | Keras implementation of Bilateral Multi-Perspective Matching [1], used in the [Quora Question Duplicate Pairs Competition](https://www.kaggle.com/c/quora-question-pairs). The original TensorFlow implementation is available [here](https://github.com/zhiguowang/BiMPM). 3 | 4 | ## Description 5 | 6 | `models/bimpm.py` - the BiMPM model graph. 7 | 8 | `models/multi_perspective.py` - the multi-perspective matching layer. 9 | 10 | `models/layers.py` - other layers: word embedding layer, context layer, prediction layer, etc. 11 | 12 | `train_model.py` - train and test the BiMPM model. 13 | 14 | `config.py` - hyper-parameters. 15 | 16 | If you find any bugs, please create an issue. Thanks! 17 | 18 | ## Requirements 19 | 20 | - python 2.7 21 | - tensorflow 1.1.0 22 | - keras 2.0.3 23 | - numpy 1.12.1 24 | - pandas 0.19.2 25 | - nltk 3.2.2 26 | - gensim 1.0.1 27 | 28 | ## References 29 | 30 | [[1]](https://arxiv.org/pdf/1702.03814) Zhiguo Wang, Wael Hamza and Radu Florian. "Bilateral Multi-Perspective Matching for Natural Language Sentences." 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Model configs. 3 | """ 4 | import os 5 | 6 | 7 | class DirConfig(object): 8 | DEBUG = 0 9 | W2V_FILE = '../embeddings/GoogleNews-vectors-negative300.bin' 10 | GLOVE_FILE = '../embeddings/glove.840B.300d.txt' 11 | BASE_DIR = '../' 12 | DATA_DIR = '../dataset/' 13 | TRAIN_FILE = DATA_DIR + 'train.csv' 14 | TEST_FILE = DATA_DIR + 'test.csv' 15 | TRAIN_FEATURES_FILE = DATA_DIR + 'train_xgb_features.csv' 16 | TEST_FEATURES_FILE = DATA_DIR + 'test_xgb_features.csv' 17 | SAMPLE_TRAIN_FILE = DATA_DIR + 'sample_train.csv' 18 | SAMPLE_TEST_FILE = DATA_DIR + 'sample_test.csv' 19 | SAMPLE_TRAIN_FEATURES_FILE = DATA_DIR + 'sample_train_xgb_features.csv' 20 | SAMPLE_TEST_FEATURES_FILE = DATA_DIR + 'sample_test_xgb_features.csv' 21 | HISTORYA_DIR = os.path.join(BASE_DIR, 'history') 22 | SUBM_DIR = '../subm/' 23 | Q1_CACHE_TRAIN = '../dataset/cache_train_q1.npy' 24 | Q2_CACHE_TRAIN = '../dataset/cache_train_q2.npy' 25 | Q1_CACHE_TEST = '../dataset/cache_test_q1.npy' 26 | Q2_CACHE_TEST = '../dataset/cache_test_q2.npy' 27 | CHAR1_CACHE_TRAIN = '../dataset/cache_train_char1.npy' 28 | CHAR2_CACHE_TRAIN = '../dataset/cache_train_char2.npy' 29 | CHAR1_CACHE_TEST = '../dataset/cache_test_char1.npy' 30 | CHAR2_CACHE_TEST = '../dataset/cache_test_char2.npy' 31 | CHAR_INDEX_CACHE = '../dataset/char_index.npy' 32 | W2V_CACHE = '../dataset/w2v_matrix.npy' 33 | GLOVE_CACHE = '../dataset/glove_matrix.npy' 34 | WORD_INDEX_CACHE = '../dataset/word_index.npy' 35 | TARGETS_CACHE = '../dataset/cache_targets.npy' 36 | TEST_ID_CACHE = '../dataset/cache_test_id.npy' 37 | 38 | 39 | class TrainConfig(object): 40 | TEST_SIZE = 0.1 41 | RE_WEIGHT = True 42 | BATCH_SIZE = 1024 43 | NB_EPOCH = 5 if DirConfig.DEBUG else 50 44 | CLASS_WEIGHT = {0: 1.0, 1: 1.708574797505075} 45 | SHARE_RNN = 1 46 | USE_CHAR = 0 47 | REMOVE_STOPWORDS = 0 48 | USE_STEM = 0 49 | W2V_TYPE = 'word2vec' 50 | KFOLD = 1 51 | MAX_SEQUENCE_LENGTH = 40 52 | MAX_NB_WORDS = 200000 53 | WORD_EMBEDDING_DIM = 300 54 | MAX_NB_CHARS = 50 55 | MAX_CHAR_PER_WORD = 10 56 | CHAR_EMBEDDING_DIM = 20 57
| CHAR_LSTM_DIM = 50 58 | VALIDATION_SPLIT = 0.1 59 | 60 | 61 | class TestConfig(object): 62 | RE_WEIGHT = True 63 | BATCH_SIZE = 1024 64 | CLASS_WEIGHT = {0: 1.309028344, 1: 0.472001959} 65 | 66 | 67 | class BiMPMConfig(object): 68 | SEED = 2017 + 6 69 | MODEL = 'BiMPM' 70 | RNN_UNIT = 'gru' 71 | TRIAL = 5 72 | BASE_DIR = '../models/' 73 | CONTEXT_LSTM_DIM = 100 74 | AGGREGATION_LSTM_DIM = 300 75 | DENSE_DIM = 100 76 | RATE_DROP_REPRES = 0.4 77 | DROP_RATE = 0.4 78 | WITH_HIGHWAY = 1 79 | MP_DIM = 10 80 | CHECKPOINT = '../checkpoint/{}_trial_{}_db_{}.h5'.format(MODEL, TRIAL, DirConfig.DEBUG) 81 | INFO = '%s_rnn_%s_seq_%d_context_%d_mp_%d_aggreg_%d_highway_%d_shareRNN_%d_drop_%.2f \ 82 | _char_%d_k_%d' % \ 83 | (MODEL, RNN_UNIT, TrainConfig.MAX_SEQUENCE_LENGTH, CONTEXT_LSTM_DIM, MP_DIM, 84 | AGGREGATION_LSTM_DIM, WITH_HIGHWAY, TrainConfig.SHARE_RNN, DROP_RATE, 85 | TrainConfig.USE_CHAR, TrainConfig.KFOLD) 86 | W2V_TYPE = 'word2vector' 87 | -------------------------------------------------------------------------------- /data_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | from nltk.stem import SnowballStemmer 4 | from config import ( 5 | DirConfig, TrainConfig 6 | ) 7 | from keras.preprocessing.text import Tokenizer 8 | from keras.preprocessing.sequence import pad_sequences 9 | from keras.models import load_model 10 | import pandas as pd 11 | from tqdm import tqdm 12 | import numpy as np 13 | from gensim.models import KeyedVectors 14 | import datetime 15 | import os 16 | 17 | 18 | def get_text_sequence(): 19 | if os.path.isfile(DirConfig.CHAR1_CACHE_TRAIN): 20 | print('---- Load data from cache.') 21 | train_x1 = np.load(open(DirConfig.Q1_CACHE_TRAIN, 'rb')) 22 | train_x2 = np.load(open(DirConfig.Q2_CACHE_TRAIN, 'rb')) 23 | test_x1 = np.load(open(DirConfig.Q1_CACHE_TEST, 'rb')) 24 | test_x2 = np.load(open(DirConfig.Q2_CACHE_TEST, 'rb')) 25 | labels = np.load(open(DirConfig.TARGETS_CACHE, 'rb')) 26 | test_ids = np.load(open(DirConfig.TEST_ID_CACHE, 'rb')) 27 | word_index = np.load(open(DirConfig.WORD_INDEX_CACHE, 'rb')).item() 28 | char_index = None 29 | 30 | # use char representation 31 | if TrainConfig.USE_CHAR: 32 | train_words1 = np.load(open(DirConfig.CHAR1_CACHE_TRAIN, 'rb')) 33 | train_words2 = np.load(open(DirConfig.CHAR2_CACHE_TRAIN, 'rb')) 34 | test_words1 = np.load(open(DirConfig.CHAR1_CACHE_TEST, 'rb')) 35 | test_words2 = np.load(open(DirConfig.CHAR2_CACHE_TEST, 'rb')) 36 | char_index = np.load(open(DirConfig.CHAR_INDEX_CACHE, 'rb')).item() 37 | else: 38 | # load data from csv 39 | if DirConfig.DEBUG: 40 | train_data = pd.read_csv(DirConfig.SAMPLE_TRAIN_FILE) 41 | test_data = pd.read_csv(DirConfig.SAMPLE_TEST_FILE) 42 | else: 43 | train_data = pd.read_csv(DirConfig.TRAIN_FILE) 44 | test_data = pd.read_csv(DirConfig.TEST_FILE) 45 | 46 | # train and text text 47 | train_ori1 = list(train_data.question1.values.astype(str)) 48 | train_ori2 = list(train_data.question2.values.astype(str)) 49 | test_ori1 = list(test_data.question1.values.astype(str)) 50 | test_ori2 = list(test_data.question2.values.astype(str)) 51 | 52 | # target labels 53 | labels = train_data.is_duplicate.values 54 | test_ids = test_data.test_id 55 | np.save(open(DirConfig.TARGETS_CACHE, 'wb'), labels) 56 | np.save(open(DirConfig.TEST_ID_CACHE, 'wb'), test_ids) 57 | 58 | train_ori1 = preprocess_texts(train_ori1) 59 | train_ori2 = preprocess_texts(train_ori2) 60 | test_ori1 = preprocess_texts(test_ori1) 61 | test_ori2 = 
preprocess_texts(test_ori2) 62 | 63 | train_x1, train_x2, test_x1, test_x2, word_index = \ 64 | get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2) 65 | 66 | if TrainConfig.USE_CHAR: 67 | train_words1, train_words2, test_words1, test_words2, char_index = \ 68 | get_char_seq(train_ori1, train_ori2, test_ori1, test_ori2) 69 | else: 70 | char_index = None 71 | 72 | if TrainConfig.USE_CHAR: 73 | # concatenate inputs 74 | train_x1 = (train_x1, train_words1) 75 | train_x2 = (train_x2, train_words2) 76 | test_x1 = (test_x1, test_words1) 77 | test_x2 = (test_x2, test_words2) 78 | 79 | return train_x1, train_x2, test_x1, test_x2, labels, test_ids, word_index, char_index 80 | 81 | 82 | def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2): 83 | # fit tokenizer 84 | tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS) 85 | tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2) 86 | word_index = tk.word_index 87 | 88 | # q1, q2 training text sequence 89 | # (sentence_len, MAX_SEQUENCE_LENGTH) 90 | train_x1 = tk.texts_to_sequences(train_ori1) 91 | train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 92 | train_x2 = tk.texts_to_sequences(train_ori2) 93 | train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 94 | 95 | # q1, q2 testing text sequence 96 | test_x1 = tk.texts_to_sequences(test_ori1) 97 | test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 98 | test_x2 = tk.texts_to_sequences(test_ori2) 99 | test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH) 100 | 101 | np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1) 102 | np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2) 103 | np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1) 104 | np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2) 105 | np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index) 106 | return train_x1, train_x2, test_x1, test_x2, word_index 107 | 108 | 109 | def words_to_char_sequence(words_list, tk): 110 | """Convert words list to chars sequence 111 | 112 | # Arguments 113 | words: word list, (sentence_len, word_len) 114 | 115 | # Output shape 116 | (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD) 117 | """ 118 | c_seqs = np.zeros((len(words_list), 119 | TrainConfig.MAX_SEQUENCE_LENGTH, 120 | TrainConfig.MAX_CHAR_PER_WORD), dtype='int32') 121 | for w_i in xrange(len(words_list)): 122 | words = words_list[w_i] 123 | fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH, 124 | TrainConfig.MAX_CHAR_PER_WORD), dtype='int32') 125 | ws = tk.texts_to_sequences(words) 126 | ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD) 127 | if TrainConfig.MAX_SEQUENCE_LENGTH < len(words): 128 | max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH 129 | else: 130 | max_word_len = len(words) 131 | fixed_ws[:max_word_len, :] = ws[:max_word_len, :] 132 | c_seqs[w_i] = fixed_ws 133 | return c_seqs 134 | 135 | 136 | def get_char_seq(train_ori1, train_ori2, test_ori1, test_ori2): 137 | # extract words from each text 138 | train_words1 = extract_words(train_ori1) 139 | train_words2 = extract_words(train_ori2) 140 | test_words1 = extract_words(test_ori1) 141 | test_words2 = extract_words(test_ori2) 142 | 143 | # fit tokenizer 144 | tk = Tokenizer(num_words=TrainConfig.MAX_NB_CHARS, char_level=True) 145 | tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2) 146 | char_index = tk.word_index 147 | 148 | # q1, q2 training word sequence 149 | train_s1 = words_to_char_sequence(train_words1, tk) 150 | train_s2 = 
words_to_char_sequence(train_words2, tk) 151 | 152 | # q1, q2 testing word sequence 153 | test_s1 = words_to_char_sequence(test_words1, tk) 154 | test_s2 = words_to_char_sequence(test_words2, tk) 155 | 156 | # save cache 157 | np.save(open(DirConfig.CHAR1_CACHE_TRAIN, 'wb'), train_s1) 158 | np.save(open(DirConfig.CHAR2_CACHE_TRAIN, 'wb'), train_s2) 159 | np.save(open(DirConfig.CHAR1_CACHE_TEST, 'wb'), test_s1) 160 | np.save(open(DirConfig.CHAR2_CACHE_TEST, 'wb'), test_s2) 161 | np.save(open(DirConfig.CHAR_INDEX_CACHE, 'wb'), char_index) 162 | return train_s1, train_s2, test_s1, test_s2, char_index 163 | 164 | 165 | # from https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text 166 | def text_to_wordlist(text, remove_stopwords=False, stem_words=False): 167 | # Convert words to lower case and split them 168 | text = str(text).lower().split() 169 | 170 | # Optionally, remove stop words 171 | if remove_stopwords: 172 | stops = set(stopwords.words("english")) 173 | text = [w for w in text if not w in stops] 174 | 175 | text = " ".join(text) 176 | 177 | # Clean the text 178 | text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) 179 | text = re.sub(r"what's", "what is ", text) 180 | text = re.sub(r"\'s", " ", text) 181 | text = re.sub(r"\'ve", " have ", text) 182 | text = re.sub(r"can't", "cannot ", text) 183 | text = re.sub(r"n't", " not ", text) 184 | text = re.sub(r"i'm", "i am ", text) 185 | text = re.sub(r"\'re", " are ", text) 186 | text = re.sub(r"\'d", " would ", text) 187 | text = re.sub(r"\'ll", " will ", text) 188 | text = re.sub(r" e g ", " eg ", text) 189 | text = re.sub(r" b g ", " bg ", text) 190 | text = re.sub(r"e-mail", "email", text) 191 | text = re.sub(r"imrovement", "improvement", text) 192 | text = re.sub(r"intially", "initially", text) 193 | text = re.sub(r"demonitization", "demonetization", text) 194 | text = re.sub(r"actived", "active", text) 195 | 196 | text = re.sub(r",", " ", text) 197 | text = re.sub(r"\.", " ", text) 198 | text = re.sub(r"!", " ! 
", text) 199 | text = re.sub(r"\/", " ", text) 200 | text = re.sub(r"\^", " ^ ", text) 201 | text = re.sub(r"\+", " + ", text) 202 | text = re.sub(r"\-", " - ", text) 203 | text = re.sub(r"\=", " = ", text) 204 | text = re.sub(r"'", " ", text) 205 | text = re.sub(r"(\d+)(k)", r"\g<1>000", text) 206 | text = re.sub(r":", " : ", text) 207 | text = re.sub(r" u s ", " american ", text) 208 | text = re.sub(r"\0s", "0", text) 209 | text = re.sub(r" 9 11 ", " 911 ", text) 210 | text = re.sub(r"e - mail", "email", text) 211 | text = re.sub(r"j k", "jk", text) 212 | text = re.sub(r"\s{2,}", " ", text) 213 | 214 | # Optionally, shorten words to their stems 215 | if stem_words: 216 | text = text.split() 217 | stemmer = SnowballStemmer('english') 218 | stemmed_words = [stemmer.stem(word) for word in text] 219 | text = " ".join(stemmed_words) 220 | 221 | # Return a list of words 222 | return(text) 223 | 224 | 225 | def preprocess_texts(texts): 226 | processed = [] 227 | for t in texts: 228 | processed.append(text_to_wordlist( 229 | t, remove_stopwords=TrainConfig.REMOVE_STOPWORDS, stem_words=TrainConfig.USE_STEM)) 230 | return processed 231 | 232 | 233 | def split_train_data(train_x1, train_x2, labels, train_index, val_index): 234 | if TrainConfig.USE_CHAR: 235 | train_w1 = train_x1[0][train_index] 236 | train_w2 = train_x2[0][train_index] 237 | train_c1 = train_x1[1][train_index] 238 | train_c2 = train_x2[1][train_index] 239 | train_data = [train_w1, train_w2, train_c1, train_c2] 240 | 241 | val_w1 = train_x1[0][val_index] 242 | val_w2 = train_x2[0][val_index] 243 | val_c1 = train_x1[1][val_index] 244 | val_c2 = train_x2[1][val_index] 245 | val_data = [val_w1, val_w2, val_c1, val_c2] 246 | else: 247 | train_data = [train_x1[train_index], train_x2[train_index]] 248 | val_data = [train_x1[val_index], train_x2[val_index]] 249 | 250 | train_labels = labels[train_index] 251 | val_labels = labels[val_index] 252 | return train_data, train_labels, val_data, val_labels 253 | 254 | 255 | def extract_words(sentences): 256 | """Extract chars from each sentence 257 | 258 | # Arguments 259 | sentences: list of sentences 260 | """ 261 | w_seqs = [] 262 | for s in sentences: 263 | s = re.sub(r"[?^,!.\/'+-=()]", " ", s) 264 | s = s.strip() 265 | words = [] 266 | for word in re.split('\\s+', s): 267 | words.append(word) 268 | w_seqs.append(words) 269 | return w_seqs 270 | 271 | 272 | def load_word_embedding(type, vec_file, word_index, config): 273 | if type == 'glove': 274 | return load_glove_matrix(vec_file, word_index, config) 275 | else: 276 | return load_word2vec_matrix(vec_file, word_index, config) 277 | 278 | 279 | def load_glove_matrix(vec_file, word_index, config): 280 | if os.path.isfile(DirConfig.GLOVE_CACHE): 281 | print('---- Load word vectors from cache.') 282 | embedding_matrix = np.load(open(DirConfig.GLOVE_CACHE, 'rb')) 283 | return embedding_matrix 284 | 285 | print('---- loading glove ...') 286 | embeddings_index = {} 287 | f = open(vec_file) 288 | for line in tqdm(f): 289 | values = line.split() 290 | word = values[0] 291 | coefs = np.asarray(values[1:], dtype='float32') 292 | embeddings_index[word] = coefs 293 | f.close() 294 | 295 | print('Found %s word vectors.' 
% len(embeddings_index)) 296 | 297 | nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1 298 | embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM)) 299 | for word, i in tqdm(word_index.items()): 300 | embedding_vector = embeddings_index.get(word) 301 | if embedding_vector is not None: 302 | embedding_matrix[i] = embedding_vector 303 | 304 | # check the words which not in embedding vectors 305 | not_found_words = [] 306 | for word, i in word_index.items(): 307 | if word not in embeddings_index: 308 | not_found_words.append(word) 309 | 310 | np.save(open(DirConfig.GLOVE_CACHE, 'wb'), embedding_matrix) 311 | return embedding_matrix 312 | 313 | 314 | def load_word2vec_matrix(vec_file, word_index, config): 315 | if os.path.isfile(DirConfig.W2V_CACHE): 316 | print('---- Load word vectors from cache.') 317 | embedding_matrix = np.load(open(DirConfig.W2V_CACHE, 'rb')) 318 | return embedding_matrix 319 | 320 | print('---- loading word2vec ...') 321 | word2vec = KeyedVectors.load_word2vec_format( 322 | vec_file, binary=True) 323 | print('Found %s word vectors of word2vec' % len(word2vec.vocab)) 324 | 325 | nb_words = min(config.MAX_NB_WORDS, len(word_index)) + 1 326 | embedding_matrix = np.zeros((nb_words, config.WORD_EMBEDDING_DIM)) 327 | for word, i in word_index.items(): 328 | if word in word2vec.vocab: 329 | embedding_matrix[i] = word2vec.word_vec(word) 330 | print('Null word embeddings: %d' % \ 331 | np.sum(np.sum(embedding_matrix, axis=1) == 0)) 332 | 333 | # check the words which not in embedding vectors 334 | not_found_words = [] 335 | for word, i in word_index.items(): 336 | if word not in word2vec.vocab: 337 | not_found_words.append(word) 338 | 339 | np.save(open(DirConfig.W2V_CACHE, 'wb'), embedding_matrix) 340 | return embedding_matrix 341 | 342 | 343 | def save_training_history(path, config, history, fold=0): 344 | values = np.array(history.history.values()) 345 | results = pd.DataFrame(values.transpose(), columns=[history.history.keys()]) 346 | now = datetime.datetime.now() 347 | suffix = str(now.strftime("%Y-%m-%d-%H-%M")) 348 | path = os.path.join( 349 | path, 'his_{}_trial_{}_db_{}_k_{}-{}.csv'.format( 350 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold, suffix)) 351 | results.to_csv(path) 352 | 353 | 354 | def create_submission(path, config, preds, test_ids, low_threhold=0.05): 355 | print('----- Create submission for {}'.format(config.MODEL)) 356 | if preds.shape[1] > 1: 357 | preds = preds[:, 1] 358 | preds = preds.clip(low_threhold, 1.0 - low_threhold) 359 | submission = pd.DataFrame(test_ids, columns=['test_id']) 360 | submission.loc[:, 'is_duplicate'] = preds.ravel() 361 | now = datetime.datetime.now() 362 | subm_file = os.path.join(path, 'subm_{}_trial_{}_db_{}-{}.csv'.format( 363 | config.INFO, config.TRIAL, DirConfig.DEBUG, str(now.strftime("%Y-%m-%d-%H-%M")))) 364 | submission.to_csv(subm_file, index=False) 365 | return subm_file 366 | 367 | 368 | def save_model(model, config, fold=0): 369 | m_file = os.path.join( 370 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_model.h5'.format( 371 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 372 | w_file = os.path.join( 373 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_weight.h5'.format( 374 | config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 375 | model.save(m_file) 376 | model.save_weights(w_file) 377 | print('--- Saved model.') 378 | 379 | 380 | def load_keras_model(config, custom_objects=None, fold=0): 381 | m_file = os.path.join( 382 | config.BASE_DIR, '{}_trial_{}_db_{}_k_{}_model.h5'.format( 383 | 
config.INFO, config.TRIAL, DirConfig.DEBUG, fold)) 384 | if os.path.isfile(m_file): 385 | model = load_model(m_file, custom_objects) 386 | return model 387 | else: 388 | return None 389 | 390 | 391 | def merge_several_folds_mean(data, nfolds): 392 | print('------ Merge several folds results to mean. -----') 393 | a = np.array(data[0]) 394 | for i in range(1, nfolds): 395 | a += np.array(data[i]) 396 | a /= nfolds 397 | return a.tolist() 398 | 399 | 400 | def load_trained_models(config): 401 | models = [] 402 | for k in range(TrainConfig.KFOLD): 403 | model = load_keras_model(config, fold=k + 1) 404 | if model is None: 405 | break 406 | # Compile model 407 | model.compile(loss='binary_crossentropy', 408 | optimizer='nadam', 409 | metrics=['accuracy']) 410 | models.append(model) 411 | return models 412 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ijinmao/BiMPM_keras/73245b76a4f4f53424ad1aa0a79df7f864181c8c/models/__init__.py -------------------------------------------------------------------------------- /models/bimpm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Model graph of Bilateral Multi-Perspective Matching. 3 | 4 | References: 5 | Bilateral Multi-Perspective Matching for Natural Language Sentences 6 | """ 7 | import numpy as np 8 | from keras.layers import Input 9 | from keras.models import Model 10 | from keras.layers.merge import concatenate 11 | import keras.backend as K 12 | from config import ( 13 | BiMPMConfig, TrainConfig 14 | ) 15 | from models.multi_perspective import MultiPerspective 16 | from models.layers import ( 17 | WordRepresLayer, CharRepresLayer, ContextLayer, PredictLayer 18 | ) 19 | 20 | np.random.seed(BiMPMConfig.SEED) 21 | 22 | 23 | def build_model(embedding_matrix, word_index, char_index=None): 24 | print('--- Building model...') 25 | 26 | # Parameters 27 | sequence_length = TrainConfig.MAX_SEQUENCE_LENGTH 28 | nb_per_word = TrainConfig.MAX_CHAR_PER_WORD 29 | rnn_unit = BiMPMConfig.RNN_UNIT 30 | nb_words = min(TrainConfig.MAX_NB_WORDS, len(word_index)) + 1 31 | word_embedding_dim = TrainConfig.WORD_EMBEDDING_DIM 32 | dropout = BiMPMConfig.DROP_RATE 33 | context_rnn_dim = BiMPMConfig.CONTEXT_LSTM_DIM 34 | mp_dim = BiMPMConfig.MP_DIM 35 | highway = BiMPMConfig.WITH_HIGHWAY 36 | aggregate_rnn_dim = BiMPMConfig.AGGREGATION_LSTM_DIM 37 | dense_dim = BiMPMConfig.DENSE_DIM 38 | if TrainConfig.USE_CHAR: 39 | nb_chars = min(TrainConfig.MAX_NB_CHARS, len(char_index)) + 1 40 | char_embedding_dim = TrainConfig.CHAR_EMBEDDING_DIM 41 | char_rnn_dim = TrainConfig.CHAR_LSTM_DIM 42 | 43 | # Model words input 44 | w1 = Input(shape=(sequence_length,), dtype='int32') 45 | w2 = Input(shape=(sequence_length,), dtype='int32') 46 | if TrainConfig.USE_CHAR: 47 | c1 = Input(shape=(sequence_length, nb_per_word), dtype='int32') 48 | c2 = Input(shape=(sequence_length, nb_per_word), dtype='int32') 49 | 50 | # Build word representation layer 51 | word_layer = WordRepresLayer( 52 | sequence_length, nb_words, word_embedding_dim, embedding_matrix) 53 | w_res1 = word_layer(w1) 54 | w_res2 = word_layer(w2) 55 | 56 | # Model chars input 57 | if TrainConfig.USE_CHAR: 58 | char_layer = CharRepresLayer( 59 | sequence_length, nb_chars, nb_per_word, char_embedding_dim, 60 | char_rnn_dim, rnn_unit=rnn_unit, dropout=dropout) 61 | c_res1 = char_layer(c1) 62 |
c_res2 = char_layer(c2) 63 | sequence1 = concatenate([w_res1, c_res1]) 64 | sequence2 = concatenate([w_res2, c_res2]) 65 | else: 66 | sequence1 = w_res1 67 | sequence2 = w_res2 68 | 69 | # Build context representation layer 70 | context_layer = ContextLayer( 71 | context_rnn_dim, rnn_unit=rnn_unit, dropout=dropout, highway=highway, 72 | input_shape=(sequence_length, K.int_shape(sequence1)[-1],), 73 | return_sequences=True) 74 | context1 = context_layer(sequence1) 75 | context2 = context_layer(sequence2) 76 | 77 | # Build matching layer 78 | matching_layer = MultiPerspective(mp_dim) 79 | matching1 = matching_layer([context1, context2]) 80 | matching2 = matching_layer([context2, context1]) 81 | matching = concatenate([matching1, matching2]) 82 | 83 | # Build aggregation layer 84 | aggregate_layer = ContextLayer( 85 | aggregate_rnn_dim, rnn_unit=rnn_unit, dropout=dropout, highway=highway, 86 | input_shape=(sequence_length, K.int_shape(matching)[-1],), 87 | return_sequences=False) 88 | aggregation = aggregate_layer(matching) 89 | 90 | # Build prediction layer 91 | pred = PredictLayer(dense_dim, 92 | input_dim=K.int_shape(aggregation)[-1], 93 | dropout=dropout)(aggregation) 94 | # Build model 95 | if TrainConfig.USE_CHAR: 96 | inputs = (w1, w2, c1, c2) 97 | else: 98 | inputs = (w1, w2) 99 | 100 | # Build model graph 101 | model = Model(inputs=inputs, 102 | outputs=pred) 103 | 104 | # Compile model 105 | model.compile(loss='binary_crossentropy', 106 | optimizer='adam', 107 | metrics=['accuracy']) 108 | print(model.summary()) 109 | return model 110 | -------------------------------------------------------------------------------- /models/layers.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.embeddings import Embedding 3 | from keras.layers.core import Lambda, Dense, Dropout 4 | from keras.layers.recurrent import LSTM, GRU 5 | from keras.layers.wrappers import Bidirectional 6 | from keras.legacy.layers import Highway 7 | from keras.layers import TimeDistributed 8 | import keras.backend as K 9 | from keras.layers.normalization import BatchNormalization 10 | 11 | 12 | class WordRepresLayer(object): 13 | """Word embedding representation layer 14 | """ 15 | def __init__(self, sequence_length, nb_words, 16 | word_embedding_dim, embedding_matrix): 17 | self.model = Sequential() 18 | self.model.add(Embedding(nb_words, 19 | word_embedding_dim, 20 | weights=[embedding_matrix], 21 | input_length=sequence_length, 22 | trainable=False)) 23 | 24 | def __call__(self, inputs): 25 | return self.model(inputs) 26 | 27 | 28 | class CharRepresLayer(object): 29 | """Char embedding representation layer 30 | """ 31 | def __init__(self, sequence_length, nb_chars, nb_per_word, 32 | embedding_dim, rnn_dim, rnn_unit='gru', dropout=0.0): 33 | def _collapse_input(x, nb_per_word=0): 34 | x = K.reshape(x, (-1, nb_per_word)) 35 | return x 36 | 37 | def _unroll_input(x, sequence_length=0, rnn_dim=0): 38 | x = K.reshape(x, (-1, sequence_length, rnn_dim)) 39 | return x 40 | 41 | if rnn_unit == 'gru': 42 | rnn = GRU 43 | else: 44 | rnn = LSTM 45 | self.model = Sequential() 46 | self.model.add(Lambda(_collapse_input, 47 | arguments={'nb_per_word': nb_per_word}, 48 | output_shape=(nb_per_word,), 49 | input_shape=(sequence_length, nb_per_word,))) 50 | self.model.add(Embedding(nb_chars, 51 | embedding_dim, 52 | input_length=nb_per_word, 53 | trainable=True)) 54 | self.model.add(rnn(rnn_dim, 55 | dropout=dropout, 56 | 
recurrent_dropout=dropout)) 57 | self.model.add(Lambda(_unroll_input, 58 | arguments={'sequence_length': sequence_length, 59 | 'rnn_dim': rnn_dim}, 60 | output_shape=(sequence_length, rnn_dim))) 61 | 62 | def __call__(self, inputs): 63 | return self.model(inputs) 64 | 65 | 66 | class ContextLayer(object): 67 | """Word context layer 68 | """ 69 | def __init__(self, rnn_dim, rnn_unit='gru', input_shape=(0,), 70 | dropout=0.0, highway=False, return_sequences=False, 71 | dense_dim=0): 72 | if rnn_unit == 'gru': 73 | rnn = GRU 74 | else: 75 | rnn = LSTM 76 | self.model = Sequential() 77 | self.model.add( 78 | Bidirectional(rnn(rnn_dim, 79 | dropout=dropout, 80 | recurrent_dropout=dropout, 81 | return_sequences=return_sequences), 82 | input_shape=input_shape)) 83 | # self.model.add(rnn(rnn_dim, 84 | # dropout=dropout, 85 | # recurrent_dropout=dropout, 86 | # return_sequences=return_sequences, 87 | # input_shape=input_shape)) 88 | if highway: 89 | if return_sequences: 90 | self.model.add(TimeDistributed(Highway(activation='tanh'))) 91 | else: 92 | self.model.add(Highway(activation='tanh')) 93 | 94 | if dense_dim > 0: 95 | self.model.add(TimeDistributed(Dense(dense_dim, 96 | activation='relu'))) 97 | self.model.add(TimeDistributed(Dropout(dropout))) 98 | self.model.add(TimeDistributed(BatchNormalization())) 99 | 100 | def __call__(self, inputs): 101 | return self.model(inputs) 102 | 103 | 104 | class PredictLayer(object): 105 | """Prediction layer. 106 | 107 | """ 108 | def __init__(self, dense_dim, input_dim=0, 109 | dropout=0.0): 110 | self.model = Sequential() 111 | self.model.add(Dense(dense_dim, 112 | activation='relu', 113 | input_shape=(input_dim,))) 114 | self.model.add(Dropout(dropout)) 115 | self.model.add(BatchNormalization()) 116 | self.model.add(Dense(1, activation='sigmoid')) 117 | 118 | def __call__(self, inputs): 119 | return self.model(inputs) 120 | -------------------------------------------------------------------------------- /models/multi_perspective.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Multi-perspective Matching Layer. 4 | 5 | Reference: Bilateral Multi-Perspective Matching for Natural Language Sentences. 6 | """ 7 | 8 | import keras.backend as K 9 | from keras.engine.topology import Layer 10 | 11 | 12 | class MultiPerspective(Layer): 13 | """Multi-perspective Matching Layer. 14 | 15 | # Arguments 16 | mp_dim: single forward/backward multi-perspective dimention 17 | """ 18 | 19 | def __init__(self, mp_dim, epsilon=1e-6, **kwargs): 20 | self.mp_dim = mp_dim 21 | self.epsilon = 1e-6 22 | self.strategy = 4 23 | super(MultiPerspective, self).__init__(**kwargs) 24 | 25 | def build(self, input_shape): 26 | if isinstance(input_shape, list): 27 | input_shape = input_shape[0] 28 | embedding_size = input_shape[-1] / 2 29 | # Create a trainable weight variable for this layer. 
30 | # input_shape is bidirectional RNN input shape 31 | # kernel shape (mp_dim * 2 * self.strategy, embedding_size) 32 | self.kernel = self.add_weight((self.mp_dim, 33 | embedding_size * 2 * self.strategy), 34 | name='kernel', 35 | initializer='glorot_uniform', 36 | trainable=True) 37 | self.kernel_full_fw = self.kernel[:, :embedding_size] 38 | self.kernel_full_bw = self.kernel[:, embedding_size: embedding_size * 2] 39 | self.kernel_attentive_fw = self.kernel[:, embedding_size * 2: embedding_size * 3] 40 | self.kernel_attentive_bw = self.kernel[:, embedding_size * 3: embedding_size * 4] 41 | self.kernel_max_attentive_fw = self.kernel[:, embedding_size * 4: embedding_size * 5] 42 | self.kernel_max_attentive_bw = self.kernel[:, embedding_size * 5: embedding_size * 6] 43 | self.kernel_max_pool_fw = self.kernel[:, embedding_size * 6: embedding_size * 7] 44 | self.kernel_max_pool_bw = self.kernel[:, embedding_size * 7:] 45 | self.built = True 46 | super(MultiPerspective, self).build(input_shape) 47 | 48 | def compute_output_shape(self, input_shape): 49 | if isinstance(input_shape, list): 50 | input_shape = input_shape[0] 51 | return (input_shape[0], input_shape[1], self.mp_dim * 2 * self.strategy) 52 | 53 | def get_config(self): 54 | config = {'mp_dim': self.mp_dim, 55 | 'epsilon': self.epsilon} 56 | base_config = super(MultiPerspective, self).get_config() 57 | return dict(list(base_config.items()) + list(config.items())) 58 | 59 | def call(self, inputs): 60 | # h1, h2: bidirectional LSTM hidden states, include forward and backward states 61 | # (batch_size, timesteps, embedding_size * 2) 62 | h1 = inputs[0] 63 | h2 = inputs[1] 64 | embedding_size = K.int_shape(h1)[-1] / 2 65 | h1_fw = h1[:, :, :embedding_size] 66 | h1_bw = h1[:, :, embedding_size:] 67 | h2_fw = h2[:, :, :embedding_size] 68 | h2_bw = h2[:, :, embedding_size:] 69 | 70 | # 4 matching strategy 71 | list_matching = [] 72 | 73 | # full matching ops 74 | matching_fw = self._full_matching(h1_fw, h2_fw, self.kernel_full_fw) 75 | matching_bw = self._full_matching(h1_bw, h2_bw, self.kernel_full_bw) 76 | list_matching.extend([matching_fw, matching_bw]) 77 | 78 | # cosine matrix 79 | cosine_matrix_fw = self._cosine_matrix(h1_fw, h2_fw) 80 | cosine_matrix_bw = self._cosine_matrix(h1_bw, h2_bw) 81 | 82 | # attentive matching ops 83 | matching_fw = self._attentive_matching( 84 | h1_fw, h2_fw, cosine_matrix_fw, self.kernel_attentive_fw) 85 | matching_bw = self._attentive_matching( 86 | h1_bw, h2_bw, cosine_matrix_bw, self.kernel_attentive_bw) 87 | list_matching.extend([matching_fw, matching_bw]) 88 | 89 | # max attentive matching ops 90 | matching_fw = self._max_attentive_matching( 91 | h1_fw, h2_fw, cosine_matrix_fw, self.kernel_max_attentive_fw) 92 | matching_bw = self._max_attentive_matching( 93 | h1_bw, h2_bw, cosine_matrix_bw, self.kernel_max_attentive_bw) 94 | list_matching.extend([matching_fw, matching_bw]) 95 | 96 | # max pooling matching ops 97 | matching_fw = self._max_pooling_matching(h1_fw, h2_fw, self.kernel_max_pool_fw) 98 | matching_bw = self._max_pooling_matching(h1_bw, h2_bw, self.kernel_max_pool_bw) 99 | list_matching.extend([matching_fw, matching_bw]) 100 | 101 | return K.concatenate(list_matching, axis=-1) 102 | 103 | def _cosine_similarity(self, x1, x2): 104 | """Compute cosine similarity. 
105 | 106 | # Arguments: 107 | x1: (..., embedding_size) 108 | x2: (..., embedding_size) 109 | """ 110 | cos = K.sum(x1 * x2, axis=-1) 111 | x1_norm = K.sqrt(K.maximum(K.sum(K.square(x1), axis=-1), self.epsilon)) 112 | x2_norm = K.sqrt(K.maximum(K.sum(K.square(x2), axis=-1), self.epsilon)) 113 | cos = cos / x1_norm / x2_norm 114 | return cos 115 | 116 | def _cosine_matrix(self, x1, x2): 117 | """Cosine similarity matrix. 118 | 119 | Calculate the cosine similarities between each forward (or backward) 120 | contextual embedding h_i_p and every forward (or backward) 121 | contextual embeddings of the other sentence 122 | 123 | # Arguments 124 | x1: (batch_size, x1_timesteps, embedding_size) 125 | x2: (batch_size, x2_timesteps, embedding_size) 126 | 127 | # Output shape 128 | (batch_size, x1_timesteps, x2_timesteps) 129 | """ 130 | # expand h1 shape to (batch_size, x1_timesteps, 1, embedding_size) 131 | x1 = K.expand_dims(x1, axis=2) 132 | # expand x2 shape to (batch_size, 1, x2_timesteps, embedding_size) 133 | x2 = K.expand_dims(x2, axis=1) 134 | # cosine matrix (batch_size, h1_timesteps, h2_timesteps) 135 | cos_matrix = self._cosine_similarity(x1, x2) 136 | return cos_matrix 137 | 138 | def _mean_attentive_vectors(self, x2, cosine_matrix): 139 | """Mean attentive vectors. 140 | 141 | Calculate mean attentive vector for the entire sentence by weighted 142 | summing all the contextual embeddings of the entire sentence 143 | 144 | # Arguments 145 | x2: sequence vectors, (batch_size, x2_timesteps, embedding_size) 146 | cosine_matrix: cosine similarities matrix of x1 and x2, 147 | (batch_size, x1_timesteps, x2_timesteps) 148 | 149 | # Output shape 150 | (batch_size, x1_timesteps, embedding_size) 151 | """ 152 | # (batch_size, x1_timesteps, x2_timesteps, 1) 153 | expanded_cosine_matrix = K.expand_dims(cosine_matrix, axis=-1) 154 | # (batch_size, 1, x2_timesteps, embedding_size) 155 | x2 = K.expand_dims(x2, axis=1) 156 | # (batch_size, x1_timesteps, embedding_size) 157 | weighted_sum = K.sum(expanded_cosine_matrix * x2, axis=2) 158 | # (batch_size, x1_timesteps, 1) 159 | sum_cosine = K.expand_dims(K.sum(cosine_matrix, axis=-1) + self.epsilon, axis=-1) 160 | # (batch_size, x1_timesteps, embedding_size) 161 | attentive_vector = weighted_sum / sum_cosine 162 | return attentive_vector 163 | 164 | def _max_attentive_vectors(self, x2, cosine_matrix): 165 | """Max attentive vectors. 166 | 167 | Calculate max attentive vector for the entire sentence by picking 168 | the contextual embedding with the highest cosine similarity 169 | as the attentive vector. 
170 | 171 | # Arguments 172 | x2: sequence vectors, (batch_size, x2_timesteps, embedding_size) 173 | cosine_matrix: cosine similarities matrix of x1 and x2, 174 | (batch_size, x1_timesteps, x2_timesteps) 175 | 176 | # Output shape 177 | (batch_size, x1_timesteps, embedding_size) 178 | """ 179 | # (batch_size, x1_timesteps) 180 | max_x2_step = K.argmax(cosine_matrix, axis=-1) 181 | 182 | embedding_size = K.int_shape(x2)[-1] 183 | timesteps = K.int_shape(max_x2_step)[-1] 184 | if timesteps is None: 185 | timesteps = K.shape(max_x2_step)[-1] 186 | 187 | # collapse time dimension and batch dimension together 188 | # collapse x2 to (batch_size * x2_timestep, embedding_size) 189 | x2 = K.reshape(x2, (-1, embedding_size)) 190 | # collapse max_x2_step to (batch_size * h1_timesteps) 191 | max_x2_step = K.reshape(max_x2_step, (-1,)) 192 | # (batch_size * x1_timesteps, embedding_size) 193 | max_x2 = K.gather(x2, max_x2_step) 194 | # reshape max_x2, (batch_size, x1_timesteps, embedding_size) 195 | attentive_vector = K.reshape(max_x2, K.stack([-1, timesteps, embedding_size])) 196 | return attentive_vector 197 | 198 | def _time_distributed_multiply(self, x, w): 199 | """Element-wise multiply vector and weights. 200 | 201 | # Arguments 202 | x: sequence of hidden states, (batch_size, ?, embedding_size) 203 | w: weights of one matching strategy of one direction, 204 | (mp_dim, embedding_size) 205 | 206 | # Output shape 207 | (?, mp_dim, embedding_size) 208 | """ 209 | # dimension of vector 210 | n_dim = K.ndim(x) 211 | embedding_size = K.int_shape(x)[-1] 212 | timesteps = K.int_shape(x)[1] 213 | if timesteps is None: 214 | timesteps = K.shape(x)[1] 215 | 216 | # collapse time dimension and batch dimension together 217 | x = K.reshape(x, (-1, embedding_size)) 218 | # reshape to (?, 1, embedding_size) 219 | x = K.expand_dims(x, axis=1) 220 | # reshape weights to (1, mp_dim, embedding_size) 221 | w = K.expand_dims(w, axis=0) 222 | # element-wise multiply 223 | x = x * w 224 | # reshape to original shape 225 | if n_dim == 3: 226 | x = K.reshape(x, K.stack([-1, timesteps, self.mp_dim, embedding_size])) 227 | x.set_shape([None, None, None, embedding_size]) 228 | elif n_dim == 2: 229 | x = K.reshape(x, K.stack([-1, self.mp_dim, embedding_size])) 230 | x.set_shape([None, None, embedding_size]) 231 | return x 232 | 233 | def _full_matching(self, h1, h2, w): 234 | """Full matching operation. 235 | 236 | # Arguments 237 | h1: (batch_size, h1_timesteps, embedding_size) 238 | h2: (batch_size, h2_timesteps, embedding_size) 239 | w: weights of one direction, (mp_dim, embedding_size) 240 | 241 | # Output shape 242 | (batch_size, h1_timesteps, mp_dim) 243 | """ 244 | # h2 forward last step hidden vector, (batch_size, embedding_size) 245 | h2_last_state = h2[:, -1, :] 246 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 247 | h1 = self._time_distributed_multiply(h1, w) 248 | # h2_last_state * weights, (batch_size, mp_dim, embedding_size) 249 | h2 = self._time_distributed_multiply(h2_last_state, w) 250 | # reshape to (batch_size, 1, mp_dim, embedding_size) 251 | h2 = K.expand_dims(h2, axis=1) 252 | # matching vector, (batch_size, h1_timesteps, mp_dim) 253 | matching = self._cosine_similarity(h1, h2) 254 | return matching 255 | 256 | def _max_pooling_matching(self, h1, h2, w): 257 | """Max pooling matching operation. 
258 | 259 | # Arguments 260 | h1: (batch_size, h1_timesteps, embedding_size) 261 | h2: (batch_size, h2_timesteps, embedding_size) 262 | w: weights of one direction, (mp_dim, embedding_size) 263 | 264 | # Output shape 265 | (batch_size, h1_timesteps, mp_dim) 266 | """ 267 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 268 | h1 = self._time_distributed_multiply(h1, w) 269 | # h2 * weights, (batch_size, h2_timesteps, mp_dim, embedding_size) 270 | h2 = self._time_distributed_multiply(h2, w) 271 | # reshape v1 to (batch_size, h1_timesteps, 1, mp_dim, embedding_size) 272 | h1 = K.expand_dims(h1, axis=2) 273 | # reshape v1 to (batch_size, 1, h2_timesteps, mp_dim, embedding_size) 274 | h2 = K.expand_dims(h2, axis=1) 275 | # cosine similarity, (batch_size, h1_timesteps, h2_timesteps, mp_dim) 276 | cos = self._cosine_similarity(h1, h2) 277 | # (batch_size, h1_timesteps, mp_dim) 278 | matching = K.max(cos, axis=2) 279 | return matching 280 | 281 | def _attentive_matching(self, h1, h2, cosine_matrix, w): 282 | """Attentive matching operation. 283 | 284 | # Arguments 285 | h1: (batch_size, h1_timesteps, embedding_size) 286 | h2: (batch_size, h2_timesteps, embedding_size) 287 | cosine_matrix: weights of hidden state h2, 288 | (batch_size, h1_timesteps, h2_timesteps) 289 | w: weights of one direction, (mp_dim, embedding_size) 290 | 291 | # Output shape 292 | (batch_size, h1_timesteps, mp_dim) 293 | """ 294 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 295 | h1 = self._time_distributed_multiply(h1, w) 296 | # attentive vector (batch_size, h1_timesteps, embedding_szie) 297 | attentive_vec = self._mean_attentive_vectors(h2, cosine_matrix) 298 | # attentive_vec * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 299 | attentive_vec = self._time_distributed_multiply(attentive_vec, w) 300 | # matching vector, (batch_size, h1_timesteps, mp_dim) 301 | matching = self._cosine_similarity(h1, attentive_vec) 302 | return matching 303 | 304 | def _max_attentive_matching(self, h1, h2, cosine_matrix, w): 305 | """Max attentive matching operation. 
306 | 307 | # Arguments 308 | h1: (batch_size, h1_timesteps, embedding_size) 309 | h2: (batch_size, h2_timesteps, embedding_size) 310 | cosine_matrix: weights of hidden state h2, 311 | (batch_size, h1_timesteps, h2_timesteps) 312 | w: weights of one direction, (mp_dim, embedding_size) 313 | 314 | # Output shape 315 | (batch_size, h1_timesteps, mp_dim) 316 | """ 317 | # h1 * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 318 | h1 = self._time_distributed_multiply(h1, w) 319 | # max attentive vector (batch_size, h1_timesteps, embedding_szie) 320 | max_attentive_vec = self._max_attentive_vectors(h2, cosine_matrix) 321 | # max_attentive_vec * weights, (batch_size, h1_timesteps, mp_dim, embedding_size) 322 | max_attentive_vec = self._time_distributed_multiply(max_attentive_vec, w) 323 | # matching vector, (batch_size, h1_timesteps, mp_dim) 324 | matching = self._cosine_similarity(h1, max_attentive_vec) 325 | return matching 326 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.callbacks import ModelCheckpoint, EarlyStopping 3 | from sklearn.model_selection import KFold 4 | from models.bimpm import build_model as build_bimpm 5 | from config import ( 6 | DirConfig, TrainConfig, TestConfig, BiMPMConfig 7 | ) 8 | from data_util import ( 9 | get_text_sequence, save_training_history, create_submission, 10 | save_model, load_trained_models, load_word2vec_matrix, 11 | merge_several_folds_mean, split_train_data 12 | ) 13 | 14 | 15 | def train_model(): 16 | print('###### Start training for {}. ######'.format( 17 | 'debugging' if DirConfig.DEBUG else 'production')) 18 | 19 | # Get model config 20 | config = BiMPMConfig 21 | 22 | # Load trained model from cache 23 | models = load_trained_models(config) 24 | if len(models) > 0: 25 | print('--- load model from cache.') 26 | # Compile model 27 | for m in models: 28 | m.compile(loss='binary_crossentropy', 29 | optimizer='nadam', 30 | metrics=['accuracy']) 31 | return models, None, None, None 32 | 33 | # Load train/test data set 34 | train_x1, train_x2, test_x1, test_x2, labels, test_ids, word_index, char_index = \ 35 | get_text_sequence() 36 | 37 | # Load pretrained word embedding vectors 38 | embedding_matrix = load_word2vec_matrix( 39 | DirConfig.W2V_FILE, word_index, config) 40 | 41 | # Reweight params 42 | if TestConfig.RE_WEIGHT: 43 | class_weight = TestConfig.CLASS_WEIGHT 44 | else: 45 | class_weight = None 46 | 47 | # Split dataset indices 48 | kf = KFold(n_splits=10, shuffle=True) 49 | kf_gen = kf.split(labels) 50 | fold = 1 51 | models = [] 52 | 53 | # Cross-validation train model 54 | for train_index, val_index in kf_gen: 55 | # Load current fold dataset 56 | train_data, train_labels, val_data, val_labels = split_train_data( 57 | train_x1, train_x2, labels, train_index, val_index) 58 | 59 | # Define validation sample weight 60 | val_weight = np.ones(len(val_labels)) 61 | if TestConfig.RE_WEIGHT: 62 | val_weight *= TrainConfig.CLASS_WEIGHT[0] 63 | val_weight[val_labels == 0] = TrainConfig.CLASS_WEIGHT[1] 64 | 65 | # Build model 66 | model = build_model(embedding_matrix, word_index, char_index) 67 | 68 | # Define model callbacks 69 | early_stopping = EarlyStopping(monitor='val_loss', patience=5) 70 | model_checkpoint = ModelCheckpoint( 71 | config.CHECKPOINT, save_best_only=True, save_weights_only=True) 72 | 73 | # Training 74 | history = model.fit( 75 | train_data, 
y=train_labels, 76 | validation_data=(val_data, val_labels, val_weight), 77 | # validation_split=TrainConfig.VALIDATION_SPLIT, 78 | epochs=TrainConfig.NB_EPOCH, 79 | batch_size=TrainConfig.BATCH_SIZE, shuffle=True, 80 | class_weight=class_weight, 81 | callbacks=[early_stopping, model_checkpoint]) 82 | save_model(model, config, fold=fold) 83 | save_training_history(DirConfig.HISTORYA_DIR, config, history, fold=fold) 84 | fold += 1 85 | models.append(model) 86 | if fold > TrainConfig.KFOLD: 87 | break 88 | return models, test_x1, test_x2, test_ids 89 | 90 | 91 | def test_model(models=[], test_x1=None, test_x2=None, test_ids=None): 92 | print('###### Start testing for {}. ######'.format( 93 | 'debugging' if DirConfig.DEBUG else 'production')) 94 | 95 | config = BiMPMConfig 96 | 97 | # Load models from cache 98 | if len(models) == 0: 99 | models = load_trained_models(config) 100 | 101 | # Load test data from cache 102 | if test_x1 is None: 103 | _, _, test_x1, test_x2, _, test_ids, _, _ = \ 104 | get_text_sequence() 105 | 106 | if TrainConfig.USE_CHAR: 107 | test_data = [test_x1[0], test_x2[0], test_x1[1], test_x2[1]] 108 | else: 109 | test_data = [test_x1, test_x2] 110 | 111 | predictions = [] 112 | 113 | # Testing 114 | for model in models: 115 | preds = model.predict( 116 | test_data, 117 | batch_size=TestConfig.BATCH_SIZE, verbose=1) 118 | predictions.append(preds) 119 | 120 | preds_mean = np.array(merge_several_folds_mean(predictions, len(models))) 121 | create_submission(DirConfig.SUBM_DIR, config, preds_mean, test_ids) 122 | 123 | 124 | def build_model(embedding_matrix, word_index, char_index): 125 | return build_bimpm(embedding_matrix, word_index, char_index) 126 | 127 | 128 | def main(): 129 | models, test_x1, test_x2, test_ids = train_model() 130 | test_model(models, test_x1, test_x2, test_ids) 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | --------------------------------------------------------------------------------
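For readers who want the core matching math outside of Keras, below is a minimal NumPy sketch of the full-matching strategy from the BiMPM paper, mirroring what `MultiPerspective._full_matching` and `_time_distributed_multiply` compute for one direction of one sentence pair. The function name, argument names, and toy dimensions are illustrative only and are not part of the repository.

import numpy as np

def full_matching(h1, h2_last, w, eps=1e-6):
    # h1:      (timesteps, embedding_size) contextual states of sentence 1 (one direction)
    # h2_last: (embedding_size,)           last contextual state of sentence 2 (same direction)
    # w:       (mp_dim, embedding_size)    one learned perspective per row
    # Scale every hidden state by every perspective (element-wise).
    v1 = h1[:, None, :] * w[None, :, :]          # (timesteps, mp_dim, embedding_size)
    v2 = h2_last[None, :] * w                    # (mp_dim, embedding_size)
    # Cosine similarity along the embedding axis, with the same epsilon
    # clipping the layer applies to the squared norms.
    dot = np.sum(v1 * v2[None, :, :], axis=-1)   # (timesteps, mp_dim)
    n1 = np.sqrt(np.maximum(np.sum(v1 ** 2, axis=-1), eps))
    n2 = np.sqrt(np.maximum(np.sum(v2 ** 2, axis=-1), eps))
    return dot / (n1 * n2[None, :])              # (timesteps, mp_dim)

# Toy check: 5 timesteps, 4-dim hidden states, 3 perspectives.
rng = np.random.RandomState(0)
print(full_matching(rng.randn(5, 4), rng.randn(4), rng.randn(3, 4)).shape)  # (5, 3)

In the layer itself the same computation runs over an extra batch dimension using Keras backend ops, with a separate slice of the trainable kernel for each of the four matching strategies in each direction.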