├── README.md └── code ├── config.py ├── createFeature.py ├── ensemble.py ├── nn.py ├── readdata.py ├── text.py ├── tool.py └── train_word.py /README.md: -------------------------------------------------------------------------------- 1 | # The 3rd PPDai Magic Mirror Cup (拍拍贷第三届魔镜杯大赛) 2 | Rank 6 solution for the 3rd PPDai Magic Mirror Cup competition 3 | Final ranking: https://ai.ppdai.com/mirror/goToMirrorDetail?mirrorId=1&tabindex=2 4 | 5 | ## Competition defense PPT and solution write-up 6 | https://qrfaction.github.io/2018/07/25/%E9%AD%94%E9%95%9C%E6%9D%AF%E6%AF%94%E8%B5%9B%E7%AD%94%E8%BE%A9PPT/ 7 | 8 | ## Notes on the code 9 | The teammates' weight-transfer and data-augmentation parts are missing (those two parts did not improve the score) 10 | 11 | ## Conclusion 12 | Updated 2019.02.12 13 | Through this competition I got to know many people in the community, and I kept running into familiar faces in many later competitions 14 | The circle really is small 15 | -------------------------------------------------------------------------------- /code/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | MAX_NB_WORDS = 30000 5 | n_folds = 10 6 | MAX_NUM_WORDS = 15 7 | MAX_NUM_CHARS = 25 8 | 9 | use_data = 'word' 10 | use_model = 'rnnword' 11 | use_device = '2' 12 | 13 | n_components = 32 14 | 15 | model_path = 'temp.hdf5' 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/createFeature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import defaultdict 4 | from tqdm import tqdm 5 | import networkx as nx 6 | import multiprocessing as mlp 7 | from sklearn.decomposition import TruncatedSVD 8 | 9 | tqdm.pandas() 10 | 11 | def hash_q(train_orig,test_orig,aug=None): 12 | 13 | 14 | df1 = train_orig[['q1']].copy() 15 | df2 = train_orig[['q2']].copy() 16 | df1_test = test_orig[['q1']].copy() 17 | df2_test = test_orig[['q2']].copy() 18 | 19 | df2.rename(columns={'q2': 'q1'}, inplace=True) 20 | df2_test.rename(columns={'q2': 'q1'}, inplace=True) 21 | 22 | train_questions = df1.append(df2) 23 | train_questions = train_questions.append(df1_test) 24 | train_questions = train_questions.append(df2_test) 25 | train_questions.drop_duplicates(subset=['q1'], inplace=True) 26 | 27 | train_questions.reset_index(inplace=True, drop=True) 28 | questions_dict = pd.Series(train_questions.index.values, index=train_questions.q1.values).to_dict() 29 | train_cp = train_orig.copy() 30 | test_cp = test_orig.copy() 31 | 32 | train_cp['label'] = 1 33 | test_cp['label'] = -1 34 | comb = pd.concat([train_cp, test_cp]) 35 | 36 | comb['q1_hash'] = comb['q1'].map(questions_dict) 37 | comb['q2_hash'] = comb['q2'].map(questions_dict) 38 | 39 | train_comb = comb[comb['label'] >= 0][['q1_hash', 'q2_hash']] 40 | test_comb = comb[comb['label'] == -1][['q1_hash', 'q2_hash']] 41 | 42 | train_orig = pd.concat([train_orig,train_comb], axis=1) 43 | test_orig = pd.concat([test_orig,test_comb], axis=1) 44 | 45 | return train_orig,test_orig 46 | 47 | def adj_feat_worker(data,FG,suffix): 48 | def get_weights_adj(x): 49 | q1 = x['q1'] 50 | q2 = x['q2'] 51 | q1_adj = set(FG[q1]) 52 | q2_adj = set(FG[q2]) 53 | 54 | q1_or_q2 = q1_adj | q2_adj 55 | total_weight = 0 56 | for node in q1_or_q2: 57 | if node in FG[q1]: 58 | total_weight+=FG.get_edge_data(q1,node)['weight'] 59 | if node in FG[q2]: 60 | total_weight+=FG.get_edge_data(q2,node)['weight'] 61 | x['q1q2_union'+suffix] = total_weight 62 | 63 | total_weight = 0 64 | q1_and_q2 = q1_adj & q2_adj 65 | for node in q1_and_q2: 66 | if node in FG[q1]: 67 | total_weight += FG.get_edge_data(q1, node)['weight'] 68 | if node in FG[q2]: 69 | total_weight += FG.get_edge_data(q2, node)['weight'] 70 | 
x['q1q2_inter' + suffix] = total_weight 71 | 72 | return x 73 | 74 | data = data.progress_apply(get_weights_adj, axis=1, raw=True) 75 | return data[['q1q2_inter' + suffix,'q1q2_union'+suffix]] 76 | 77 | def get_shortest_path_worker(data,FG,suffix): 78 | 79 | def get_shortest_path(x): 80 | q1 = x['q1'] 81 | q2 = x['q2'] 82 | w = FG.get_edge_data(q1, q2)['weight'] 83 | FG.remove_edge(q1, q2) 84 | try: 85 | res = nx.dijkstra_path_length(FG, q1, q2) 86 | except: 87 | res = 0 88 | FG.add_edge(q1, q2, weight=w) 89 | x['shortest_path'+suffix] = res 90 | return x 91 | data = data.progress_apply(get_shortest_path, axis=1, raw=True) 92 | return data['shortest_path'+suffix] 93 | 94 | def graph_feature(train,test,use_label,aug=None): 95 | 96 | def q_weight(data,FG,suffix): 97 | all_q_weights = {k: sum([x[1].get('weight') for x in FG[k].items()]) for k in FG.nodes} 98 | data['q1_num_adj' + suffix] = data['q1'].map(all_q_weights) 99 | data['q2_num_adj' + suffix] = data['q2'].map(all_q_weights) 100 | return data 101 | 102 | def multi_process(data,FG,suffix,feat_f): 103 | 104 | num_cpu = mlp.cpu_count() 105 | pool = mlp.Pool(num_cpu) 106 | 107 | aver_t = int(len(data) / num_cpu) + 1 108 | results = [] 109 | for i in range(num_cpu): 110 | result = pool.apply_async(feat_f,args=(data.iloc[i*aver_t:(i+1)*aver_t],FG,suffix)) 111 | results.append(result) 112 | pool.close() 113 | pool.join() 114 | 115 | feat = [] 116 | for result in results: 117 | feat.append(result.get()) 118 | feat = pd.concat(feat,axis=0) 119 | data = pd.concat([data,feat],axis=1) 120 | 121 | return data 122 | 123 | def pagerank(data,FG,suffix): 124 | pr = nx.pagerank(FG, alpha=0.85) 125 | data['q1_pr' + suffix] = data['q1'].map(pr) 126 | data['q2_pr' + suffix] = data['q2'].map(pr) 127 | return data 128 | 129 | if use_label: 130 | suffix = '_w' 131 | else: 132 | suffix = '' 133 | if aug is not None: 134 | if use_label: 135 | aug['y_pre'] = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['y_pre'])['y_pre'] 136 | else: 137 | aug['y_pre'] = 1.0 138 | 139 | if use_label: 140 | train['y_pre'] = pd.read_csv('./data/tr_graph_weight.csv')['y_pre'] 141 | test['y_pre'] = pd.read_csv('./data/te_graph_weight.csv')['y_pre'] 142 | else: 143 | train['y_pre'] = 1.0 144 | test['y_pre'] = 1.0 145 | 146 | if aug is not None: 147 | data = pd.concat([train, test,aug], ignore_index=True) 148 | else: 149 | data = pd.concat([train, test],ignore_index=True) 150 | 151 | FG = nx.Graph() 152 | FG.add_weighted_edges_from(data[['q1','q2','y_pre']].values) 153 | 154 | data = pagerank(data,FG,suffix) 155 | data = q_weight(data,FG,suffix) 156 | 157 | if use_label: 158 | data = multi_process(data, FG, suffix, get_shortest_path_worker) 159 | data = multi_process(data,FG,suffix,adj_feat_worker) 160 | 161 | data.drop(['y_pre'],inplace=True,axis=1) 162 | 163 | if aug is not None: 164 | train = data.iloc[:train.shape[0]].reset_index(drop=True) 165 | test = data.iloc[train.shape[0]:train.shape[0]+test.shape[0]].reset_index(drop=True) 166 | aug = data.iloc[train.shape[0]+test.shape[0]:].reset_index(drop=True) 167 | return train, test,aug 168 | else: 169 | train = data.iloc[:train.shape[0]].reset_index( drop=True) 170 | test = data.iloc[train.shape[0]:].reset_index( drop=True) 171 | return train,test 172 | 173 | def svd_graph(train,test,use_label,aug=None): 174 | from scipy.sparse import coo_matrix,save_npz 175 | 176 | if aug is not None: 177 | if use_label: 178 | aug['y_pre'] = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['y_pre'])['y_pre'] 179 | else: 180 | aug['y_pre'] 
= 1.0 181 | 182 | if use_label: 183 | train['y_pre'] = pd.read_csv('./data/tr_graph_weight.csv')['y_pre'] 184 | test['y_pre'] = pd.read_csv('./data/te_graph_weight.csv')['y_pre'] 185 | else: 186 | train['y_pre'] = 1.0 187 | test['y_pre'] = 1.0 188 | if aug is not None: 189 | all_samples = pd.concat([train,test,aug]).reset_index(drop=True)[['q1','q2','y_pre']] 190 | else: 191 | all_samples = pd.concat([train, test]).reset_index(drop=True)[['q1', 'q2', 'y_pre']] 192 | questions = all_samples['q1'].append(all_samples['q2']).drop_duplicates().reset_index(drop=True) 193 | 194 | 195 | q2i = pd.Series(questions.index.values, index=questions.values).to_dict() 196 | i2q = questions.to_dict() 197 | 198 | print('get coo matrix') 199 | row = [i for i in range(len(q2i))] 200 | col = [i for i in range(len(q2i))] 201 | value = [1 for i in range(len(q2i))] 202 | # row = [] 203 | # col = [] 204 | # value = [] 205 | for q1,q2,w in all_samples.values: 206 | row.append(q2i[q1]) 207 | col.append(q2i[q2]) 208 | value.append(w) 209 | 210 | row.append(q2i[q2]) 211 | col.append(q2i[q1]) 212 | value.append(w) 213 | 214 | qmatrix = coo_matrix((value, (row,col)), shape=(len(q2i),len(q2i))) 215 | # save_npz('./data/q_adj_matrix.npz', qmatrix) 216 | 217 | print('svd ...') 218 | from config import n_components 219 | svd = TruncatedSVD(n_components=n_components,algorithm='arpack',n_iter=100) 220 | q_matrix = svd.fit_transform(qmatrix) 221 | 222 | total_ratio = [] 223 | ratio = 0 224 | for i in svd.explained_variance_ratio_: 225 | ratio += i 226 | total_ratio.append(ratio) 227 | print(total_ratio) 228 | 229 | q_matrix[q_matrix<1e-5] = 0 230 | print(np.sum(q_matrix==0)/(q_matrix.shape[0]*q_matrix.shape[1])) 231 | q_matrix = pd.DataFrame(q_matrix,columns=['feat'+str(i) for i in range(n_components)]) 232 | q_matrix['qid'] = list(range(len(q2i))) 233 | q_matrix['qid'] = q_matrix['qid'].map(i2q) 234 | 235 | q_matrix.to_csv('./data/q_matrix_v2.csv',index=False) 236 | 237 | train.drop(['y_pre'],inplace=True,axis=1) 238 | test.drop(['y_pre'], inplace=True,axis=1) 239 | if aug is not None: 240 | aug.drop(['y_pre'],inplace=True,axis=1) 241 | return q_matrix 242 | 243 | def num_same_w(data): 244 | def num_of_common(x): 245 | x['words_common'] = len(set(x['words_x']) & set(x['words_y'])) 246 | x['chars_common'] = len(set(x['chars_x']) & set(x['chars_y'])) 247 | return x 248 | return data.progress_apply(num_of_common,axis=1)[['words_common','chars_common']] 249 | 250 | def lcs_worker(data): 251 | def lcs_length(a, b): 252 | table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)] 253 | for i, ca in enumerate(a, 1): 254 | for j, cb in enumerate(b, 1): 255 | table[i][j] = ( 256 | table[i - 1][j - 1] + 1 if ca == cb else 257 | max(table[i][j - 1], table[i - 1][j])) 258 | return table[-1][-1] 259 | def lcs_feat(x): 260 | x['lcs_words'] = lcs_length(x['words_x'],x['words_y']) 261 | x['lcs_chars'] = lcs_length(x['chars_x'],x['chars_y']) 262 | return x 263 | 264 | return data.progress_apply(lcs_feat, axis=1)[['lcs_words','lcs_chars']] 265 | 266 | def edit_distance(data): 267 | from pyxdameraulevenshtein import damerau_levenshtein_distance 268 | def edit_feat(x): 269 | x['edit_words'] = damerau_levenshtein_distance(x['words_x'],x['words_y']) 270 | x['edit_chars'] = damerau_levenshtein_distance(x['chars_x'],x['chars_y']) 271 | return x 272 | 273 | return data.progress_apply(edit_feat, axis=1)[['edit_words','edit_chars']] 274 | 275 | 276 | def distance_feat(train,test,aug=None): 277 | 278 | def multi_process(data,feat_f): 279 | 280 | num_cpu = 
mlp.cpu_count() 281 | pool = mlp.Pool(num_cpu) 282 | 283 | aver_t = int(len(data) / num_cpu) + 1 284 | results = [] 285 | for i in range(num_cpu): 286 | result = pool.apply_async(feat_f,args=(data.iloc[i*aver_t:(i+1)*aver_t],)) 287 | results.append(result) 288 | pool.close() 289 | pool.join() 290 | 291 | feat = [] 292 | for result in results: 293 | feat.append(result.get()) 294 | feat = pd.concat(feat,axis=0) 295 | data = pd.concat([data,feat],axis=1) 296 | 297 | return data 298 | 299 | question = pd.read_csv('./data/question.csv') 300 | 301 | if aug is not None: 302 | data = pd.concat([train, test,aug], ignore_index=True) 303 | else: 304 | data = pd.concat([train, test], ignore_index=True) 305 | data = pd.merge(data, question, left_on=['q1'], right_on=['qid'], how='left') 306 | data = pd.merge(data, question, left_on=['q2'], right_on=['qid'], how='left') 307 | data.drop(['qid_x','qid_y'],axis=1,inplace=True) 308 | '''训练集长度''' 309 | 310 | data['q1_word_len'] = data['words_x'].progress_apply(lambda x: len(x.split())) 311 | data['q2_word_len'] = data['words_y'].progress_apply(lambda x: len(x.split())) 312 | data['q1_char_len'] = data['chars_x'].progress_apply(lambda x: len(x.split())) 313 | data['q2_char_len'] = data['chars_y'].progress_apply(lambda x: len(x.split())) 314 | 315 | data['words_x'] = data['words_x'].str.split() 316 | data['words_y'] = data['words_y'].str.split() 317 | data['chars_x'] = data['chars_x'].str.split() 318 | data['chars_y'] = data['chars_y'].str.split() 319 | print(data.columns) 320 | data = multi_process(data,lcs_worker) 321 | print(data.columns) 322 | data = multi_process(data,edit_distance) 323 | print(data.columns) 324 | data = multi_process(data, num_same_w) 325 | print(data.columns) 326 | 327 | 328 | data.drop(['chars_x','chars_y','words_x','words_y'],axis=1,inplace=True) 329 | 330 | if aug is not None: 331 | train = data.iloc[:train.shape[0]].reset_index(drop=True) 332 | test = data.iloc[train.shape[0]:train.shape[0]+test.shape[0]].reset_index(drop=True) 333 | aug = data.iloc[train.shape[0]+test.shape[0]:].reset_index(drop=True) 334 | return train, test,aug 335 | else: 336 | train = data.iloc[:train.shape[0]].reset_index( drop=True) 337 | test = data.iloc[train.shape[0]:].reset_index( drop=True) 338 | return train,test 339 | 340 | 341 | 342 | train = pd.read_csv('./data/train.csv',usecols=['q1','q2','label']) 343 | test = pd.read_csv('./data/test.csv',usecols=['q1','q2']) 344 | aug = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['q1','q2','label']) 345 | labels = train['label'] 346 | aug_label = aug['label'] 347 | train.drop(['label'],axis=1,inplace=True) 348 | aug.drop(['label'],axis=1,inplace=True) 349 | 350 | print(aug.shape) 351 | print(train.shape) 352 | print(test.shape) 353 | 354 | print(aug) 355 | q_matrix = svd_graph(train,test,False,aug) 356 | 357 | # train,test = hash_q(train,test) 358 | train,test,aug = graph_feature(train,test,False,aug) 359 | print(aug) 360 | train,test,aug = graph_feature(train,test,True,aug) 361 | 362 | # train,test,aug = distance_feat(train,test,aug) 363 | train['label'] = labels 364 | aug['label'] = aug_label 365 | 366 | 367 | 368 | train.to_csv('./data/train_v2.csv',index=False) 369 | test.to_csv('./data/test_v2.csv',index=False) 370 | aug.to_csv('./data/aug_data_filter.csv',index=False) 371 | # train = pd.merge(train,q_matrix, left_on=['q1'], right_on=['qid'], how='left') 372 | # test = pd.merge(test,q_matrix, left_on=['q1'], right_on=['qid'], how='left') 373 | 374 | test['label'] = pd.read_csv('147037.csv')['y_pre'] 
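# Note: '147037.csv' appears to be a previous submission file (the name looks like a leaderboard
# log-loss score); its predictions are used as pseudo-labels for the test set so that the
# feature/label correlations printed below can be compared between train and test.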
375 | 376 | 377 | print(train.corr()) 378 | print(test.corr()) -------------------------------------------------------------------------------- /code/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from glob import glob 3 | from tqdm import tqdm 4 | import tensorflow as tf 5 | import keras.backend.tensorflow_backend as KTF 6 | config = tf.ConfigProto() 7 | config.gpu_options.allow_growth=True 8 | session = tf.Session(config=config) 9 | KTF.set_session(session) 10 | import keras.backend as K 11 | import multiprocessing as mlp 12 | 13 | def ensemble(model_name,te_word,te_char,embedding_matrix_word,emembedding_matrix_char): 14 | from nn import rnnword, aggmodel, esim, attention, rnn_res 15 | 16 | if model_name == 'rnnword': 17 | get_model = rnnword 18 | elif model_name == 'aggmodel': 19 | pass 20 | elif model_name == 'esim': 21 | get_model = esim 22 | elif model_name == 'attention': 23 | get_model = attention 24 | elif model_name == 'res': 25 | get_model = rnn_res 26 | else: 27 | raise RuntimeError("don't have this model") 28 | 29 | path = './weight_' + model_name + '/' 30 | 31 | results = [] 32 | m_char = get_model(emembedding_matrix_char,False) 33 | m_word = get_model(embedding_matrix_word,True) 34 | 35 | for model_path in tqdm(glob(path+'*.h5')): 36 | 37 | if "2018-07-15_16:15:17" not in model_path: 38 | continue 39 | if 'chars_True'in model_path or 'words_True' in model_path: 40 | ense_w = 7 41 | elif 'chars_False' in model_path: 42 | ense_w = 3 43 | elif 'words_False' in model_path: 44 | ense_w = 4 45 | else: 46 | raise RuntimeError("error model") 47 | 48 | if 'char' in model_path: 49 | m_char.load_weights(model_path) 50 | results.append((m_char.predict(te_char,batch_size=1024), ense_w)) 51 | else: 52 | m_word.load_weights(model_path) 53 | results.append((m_word.predict(te_word,batch_size=1024),ense_w)) 54 | 55 | K.clear_session() 56 | tf.reset_default_graph() 57 | 58 | submit = 0 59 | total_w = 0 60 | for y_pred,ense_w in results: 61 | submit += ense_w*y_pred 62 | total_w += ense_w 63 | 64 | return submit/total_w 65 | 66 | 67 | 68 | if __name__ == '__main__': 69 | from readdata import read_data 70 | 71 | _, te_word, embedding_matrix_word,__ = read_data('words', data_aug=False) 72 | _, te_char, embedding_matrix_char,__ = read_data('chars', data_aug=False) 73 | 74 | submit_atten = ensemble('esim',te_word,te_char,embedding_matrix_word,embedding_matrix_char) 75 | 76 | submit = pd.DataFrame() 77 | submit['y_pre'] = list(submit_atten[:, 0]) 78 | submit.to_csv('atten.csv', index=False) 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /code/nn.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import * 3 | from keras.regularizers import l2 4 | from keras.callbacks import Callback, ModelCheckpoint 5 | from keras.utils.data_utils import get_file 6 | from keras import backend as K 7 | from sklearn.model_selection import train_test_split 8 | from keras.optimizers import Nadam,RMSprop 9 | import tensorflow as tf 10 | from keras.initializers import VarianceScaling 11 | from itertools import combinations 12 | from keras.constraints import non_neg,min_max_norm 13 | 14 | 15 | def co_attention(q1, q2): 16 | 17 | dense_w = TimeDistributed(Dense(1)) 18 | atten = Lambda(lambda x: K.batch_dot(x[0], x[1]))([q1, Permute((2, 1))(q2)]) # 15 * 15 19 | 20 | atten_1 = dense_w(atten) 21 | 
atten_1 = Flatten()(atten_1) 22 | atten_1 = Activation('softmax')(atten_1) 23 | atten_1 = Reshape((1,-1))(atten_1) 24 | 25 | atten_2 = dense_w(Permute((2, 1))(atten)) 26 | atten_2 = Flatten()(atten_2) 27 | atten_2 = Activation('softmax')(atten_2) 28 | atten_2 = Reshape((1,-1))(atten_2) 29 | 30 | q1 = Lambda(lambda x: K.batch_dot(x[0], x[1]))([atten_1,q1]) # 1*300 31 | q1 = Flatten()(q1) 32 | q2 = Lambda(lambda x: K.batch_dot(x[0], x[1]))([atten_2,q2]) # 1*300 33 | q2 = Flatten()(q2) 34 | return q1, q2 35 | 36 | def unchanged_shape(input_shape): 37 | "Function for Lambda layer" 38 | return input_shape 39 | 40 | def soft_attention_alignment(input_1, input_2): 41 | attention = Dot(axes=-1)([input_1, input_2]) 42 | w_att_1 = Lambda(lambda x: K.softmax(x, axis=1), 43 | output_shape=unchanged_shape)(attention) 44 | w_att_2 = Permute((2, 1))(Lambda(lambda x: K.softmax(x, axis=2), 45 | output_shape=unchanged_shape)(attention)) 46 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 47 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 48 | return in1_aligned, in2_aligned 49 | 50 | def norm_layer(x, axis=1): 51 | return (x - K.mean(x, axis=axis, keepdims=True)) / K.std(x, axis=axis, keepdims=True) 52 | 53 | def distance(q1,q2,dist,normlize=False): 54 | if normlize: 55 | q1 = Lambda(norm_layer)(q1) 56 | q2 = Lambda(norm_layer)(q2) 57 | 58 | if dist == 'cos': 59 | return multiply([q1,q2]) 60 | 61 | elif dist == 'h_mean': 62 | def dice(x): 63 | return x[0]*x[1]/(K.sum(K.abs(x[0]),axis=1,keepdims=True)+K.sum(K.abs(x[1]),axis=1,keepdims=True)) 64 | return Lambda(dice)([q1,q2]) 65 | 66 | elif dist == 'dice': 67 | def dice(x): 68 | return x[0]*x[1]/(K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True)) 69 | return Lambda(dice)([q1,q2]) 70 | 71 | elif dist == 'jaccard': 72 | def jaccard(x): 73 | return x[0]*x[1]/( 74 | K.sum(x[0]**2,axis=1,keepdims=True)+ 75 | K.sum(x[1]**2,axis=1,keepdims=True)- 76 | K.sum(K.abs(x[0]*x[1]),axis=1,keepdims=True)) 77 | return Lambda(jaccard)([q1,q2]) 78 | elif dist == 'jac_add': 79 | def jac_add(x): 80 | a = K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True)-K.sum(K.abs(x[0]*x[1]),axis=1,keepdims=True) 81 | b = x[0]+x[1] 82 | return b/a 83 | return Lambda(jac_add)([q1,q2]) 84 | elif dist == 'dice_add': 85 | def dice_add(x): 86 | a = K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True) 87 | b = x[0]+x[1] 88 | return b/a 89 | return Lambda(dice_add)([q1,q2]) 90 | 91 | def pool_corr(q1,q2,pool_way,dist): 92 | if pool_way == 'max': 93 | pool = GlobalMaxPooling1D() 94 | elif pool_way == 'ave': 95 | pool = GlobalAveragePooling1D() 96 | else: 97 | raise RuntimeError("don't have this pool way") 98 | 99 | q1 = pool(q1) 100 | q2 = pool(q2) 101 | 102 | merged = distance(q1,q2,dist,normlize=True) 103 | 104 | 105 | return merged 106 | 107 | def weight_ave(q1,q2): 108 | 109 | down = TimeDistributed(Dense(1,use_bias=False)) 110 | 111 | q1 = down(Permute((2,1))(q1)) 112 | q1 = Flatten()(q1) 113 | q1 = Lambda(norm_layer)(q1) 114 | q2 = down(Permute((2,1))(q2)) 115 | q2 = Flatten()(q2) 116 | q2 = Lambda(norm_layer)(q2) 117 | merged = multiply([q1, q2]) 118 | return merged 119 | 120 | def simility_vec(q1,q2): 121 | simi = Lambda(lambda x: K.batch_dot(x[0], x[1]))([q1, Permute((2, 1))(q2)]) 122 | simi = Reshape((-1,))(simi) 123 | return simi 124 | 125 | def rnnword(word_embedding_matrix,use_word): 126 | if use_word: 127 | from config import MAX_NUM_WORDS 128 | text_len = MAX_NUM_WORDS 129 | else: 130 | from config import MAX_NUM_CHARS 131 | 
text_len = MAX_NUM_CHARS 132 | 133 | question1 = Input(shape=(text_len,),name='q1') 134 | question2 = Input(shape=(text_len,),name='q2') 135 | 136 | 137 | 138 | embedd_word = Embedding( 139 | len(word_embedding_matrix), 140 | word_embedding_matrix.shape[1], 141 | weights=[word_embedding_matrix], 142 | input_length=text_len, 143 | trainable=True,) 144 | 145 | 146 | gru_dim1 = 384 147 | gru_dim2 = 256 148 | 149 | 150 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 151 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True),merge_mode='sum') 152 | 153 | 154 | norm = BatchNormalization() 155 | q1 = embedd_word(question1) 156 | q1 = norm(q1) 157 | q1 = SpatialDropout1D(0.2)(q1) 158 | 159 | q2 = embedd_word(question2) 160 | q2 = norm(q2) 161 | q2 = SpatialDropout1D(0.2)(q2) 162 | 163 | q1_1 = gru_w(q1) 164 | q2_1 = gru_w(q2) 165 | 166 | q1 = gru2_w(q1_1) 167 | q2 = gru2_w(q2_1) 168 | 169 | merged_max = pool_corr(q1,q2,'max','jaccard') 170 | merged_ave = pool_corr(q1,q2,'ave','jaccard') 171 | 172 | from config import n_components 173 | q1_g = Input(shape=(n_components,),name='q1node') 174 | q2_g = Input(shape=(n_components,),name='q2node') 175 | 176 | 177 | norm = BatchNormalization() 178 | q1_node = norm(q1_g) 179 | q2_node = norm(q2_g) 180 | 181 | fc = Dense(units=2) 182 | act = PReLU() 183 | q1_node = fc(q1_node) 184 | q1_node = act(q1_node) 185 | q2_node = fc(q2_node) 186 | q2_node = act(q2_node) 187 | 188 | node_vec = multiply([q1_node,q2_node]) 189 | 190 | graph_f = Input(shape=(11,),name='gf') 191 | gf = BatchNormalization()(graph_f) 192 | gf = Dropout(0.2)(gf) 193 | 194 | merged = concatenate([merged_ave,merged_max]) 195 | merged = Dense(512,activation='relu')(merged) 196 | merged = concatenate([merged, gf,node_vec]) 197 | merged = Dense(512,activation='relu')(merged) 198 | output = Dense(1, activation='sigmoid')(merged) 199 | 200 | lr=0.0008 201 | 202 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 203 | 204 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 205 | print(lr) 206 | 207 | return model 208 | 209 | def aggmodel(word_embedding_matrix,char_embedding_matrix): 210 | 211 | def prepocess(q1,q2,embedd): 212 | norm = BatchNormalization() 213 | q1 = embedd(q1) 214 | q1 = norm(q1) 215 | q1 = SpatialDropout1D(0.2)(q1) 216 | 217 | q2 = embedd(q2) 218 | q2 = norm(q2) 219 | q2 = SpatialDropout1D(0.2)(q2) 220 | return q1,q2 221 | 222 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 223 | 224 | 225 | word1 = Input(shape=(MAX_NUM_WORDS,)) 226 | word2 = Input(shape=(MAX_NUM_WORDS,)) 227 | char1 = Input(shape=(MAX_NUM_CHARS,)) 228 | char2 = Input(shape=(MAX_NUM_CHARS,)) 229 | 230 | 231 | embedd_word = Embedding( 232 | len(word_embedding_matrix), 233 | word_embedding_matrix.shape[1], 234 | weights=[word_embedding_matrix], 235 | input_length=MAX_NUM_WORDS, 236 | trainable=True) 237 | embedd_char = Embedding( 238 | len(char_embedding_matrix), 239 | char_embedding_matrix.shape[1], 240 | weights=[char_embedding_matrix], 241 | input_length=MAX_NUM_CHARS, 242 | trainable=True) 243 | 244 | gru_dim1 = 384 245 | gru_dim2 = 256 246 | 247 | 248 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 249 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True,),merge_mode='sum') 250 | 251 | gru_wc = Bidirectional(CuDNNGRU(gru_dim1, return_sequences=True), merge_mode='sum') 252 | gru2_wc = Bidirectional(CuDNNGRU(gru_dim2, 
return_sequences=True), merge_mode='sum') 253 | 254 | q1,q2 = prepocess(word1,word2,embedd_word) 255 | qc1,qc2 = prepocess(char1,char2,embedd_char) 256 | 257 | q1 = gru_w(q1) 258 | q2 = gru_w(q2) 259 | qc1 = gru_wc(qc1) 260 | qc2 = gru_wc(qc2) 261 | 262 | q1 = gru2_w(q1) 263 | q2 = gru2_w(q2) 264 | qc1 = gru2_wc(qc1) 265 | qc2 = gru2_wc(qc2) 266 | 267 | merged_max1 = pool_corr(q1,qc2,'max') 268 | merged_max2 = pool_corr(qc1,q2,'max') 269 | merged_ave1 = pool_corr(q1,qc2,'ave') 270 | merged_ave2 = pool_corr(qc1,q2,'ave') 271 | 272 | merged_max3 = pool_corr(q1,q2, 'max') 273 | merged_max4 = pool_corr(qc1,qc2, 'max') 274 | merged_ave3 = pool_corr(q1,q2, 'ave') 275 | merged_ave4 = pool_corr(qc1,qc2, 'ave') 276 | 277 | 278 | merged = concatenate([merged_max1,merged_max2,merged_max3,merged_max4, 279 | merged_ave1,merged_ave2,merged_ave3,merged_ave4]) 280 | merged = Dense(512,activation='relu')(merged) 281 | # merged = Dropout(0.2)(merged) 282 | merged = Dense(512,activation='relu')(merged) 283 | # merged = Dropout(0.2)(merged) 284 | output = Dense(1, activation='sigmoid')(merged) 285 | 286 | 287 | 288 | lr=0.0008 289 | 290 | 291 | model = Model(inputs=[word1,word2,char1,char2], outputs=output) 292 | 293 | # model = multi_gpu_model(model,gpus=4) 294 | 295 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 296 | 297 | # model.load_weights("./data/weights_best_0.0008.hdf5") 298 | print(lr) 299 | 300 | return model 301 | 302 | def esim(word_embedding_matrix, use_word): 303 | if use_word: 304 | from config import MAX_NUM_WORDS 305 | text_len = MAX_NUM_WORDS 306 | else: 307 | from config import MAX_NUM_CHARS 308 | text_len = MAX_NUM_CHARS 309 | 310 | q1 = Input(name='q1', shape=(text_len,)) 311 | q2 = Input(name='q2', shape=(text_len,)) 312 | 313 | embedding = Embedding( 314 | len(word_embedding_matrix), 315 | word_embedding_matrix.shape[1], 316 | weights=[word_embedding_matrix], 317 | input_length=text_len, 318 | trainable=True) 319 | 320 | bn = BatchNormalization() 321 | q1_embed = bn(embedding(q1)) 322 | q1_embed = SpatialDropout1D(0.2)(q1_embed) 323 | q2_embed = bn(embedding(q2)) 324 | q2_embed = SpatialDropout1D(0.2)(q2_embed) 325 | 326 | encode = Bidirectional(CuDNNLSTM(384,return_sequences=True), merge_mode='sum') 327 | q1_encoded = encode(q1_embed) 328 | q2_encoded = encode(q2_embed) 329 | 330 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 331 | 332 | q1_combined = Concatenate()([q1_encoded, q2_aligned, multiply([q1_encoded, q2_aligned])]) 333 | q2_combined = Concatenate()([q2_encoded, q1_aligned, multiply([q2_encoded, q1_aligned])]) 334 | 335 | compose = Bidirectional(CuDNNLSTM(384,return_sequences=True), merge_mode='sum') 336 | q1_compare = compose(q1_combined) 337 | q2_compare = compose(q2_combined) 338 | 339 | 340 | merged_ave = pool_corr(q1_compare,q2_compare,'ave','dice') 341 | merged_max = pool_corr(q1_compare,q2_compare,'max','dice') 342 | 343 | from config import n_components 344 | q1_g = Input(shape=(n_components,), name='q1node') 345 | q2_g = Input(shape=(n_components,), name='q2node') 346 | 347 | norm = BatchNormalization() 348 | q1_node = norm(q1_g) 349 | q2_node = norm(q2_g) 350 | 351 | fc = Dense(units=2) 352 | act = PReLU() 353 | q1_node = fc(q1_node) 354 | q1_node = act(q1_node) 355 | q2_node = fc(q2_node) 356 | q2_node = act(q2_node) 357 | 358 | node_vec = multiply([q1_node, q2_node]) 359 | 360 | graph_f = Input(shape=(11,), name='gf') 361 | gf = BatchNormalization()(graph_f) 362 | gf = 
Dropout(0.2)(gf) 363 | 364 | merged = Concatenate()([merged_max, merged_ave]) 365 | 366 | dense = Dense(512, activation='relu')(merged) 367 | dense = concatenate([dense,gf,node_vec]) 368 | dense = Dense(512, activation='relu')(dense) 369 | out_ = Dense(1, activation='sigmoid')(dense) 370 | lr = 0.0008 371 | 372 | model = Model(inputs=[q1, q2, graph_f,q1_g,q2_g], outputs=out_) 373 | model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['binary_crossentropy', 'accuracy']) 374 | return model 375 | 376 | def attention(word_embedding_matrix,use_word): 377 | if use_word: 378 | from config import MAX_NUM_WORDS 379 | text_len = MAX_NUM_WORDS 380 | else: 381 | from config import MAX_NUM_CHARS 382 | text_len = MAX_NUM_CHARS 383 | 384 | question1 = Input(shape=(text_len,),name='q1') 385 | question2 = Input(shape=(text_len,),name='q2') 386 | 387 | 388 | 389 | embedd_word = Embedding( 390 | len(word_embedding_matrix), 391 | word_embedding_matrix.shape[1], 392 | weights=[word_embedding_matrix], 393 | input_length=text_len, 394 | trainable=True) 395 | 396 | gru_dim1 = 300 397 | gru_dim2 = 300 398 | 399 | gru_w = Bidirectional(CuDNNLSTM(gru_dim1,return_sequences=True),merge_mode='sum') 400 | gru2_w = Bidirectional(CuDNNLSTM(gru_dim2,return_sequences=True),merge_mode='sum') 401 | 402 | 403 | norm = BatchNormalization() 404 | q1 = embedd_word(question1) 405 | q1 = norm(q1) 406 | q1 = SpatialDropout1D(0.2)(q1) 407 | 408 | q2 = embedd_word(question2) 409 | q2 = norm(q2) 410 | q2 = SpatialDropout1D(0.2)(q2) 411 | 412 | q1 = gru_w(q1) 413 | q2 = gru_w(q2) 414 | 415 | q1 = gru2_w(q1) 416 | q2 = gru2_w(q2) 417 | 418 | q1_1,q2_2 = co_attention(q1,q2) 419 | merged_1 = distance(q1_1,q2_2,'dice', normlize=True) 420 | merged_3 = pool_corr(q1,q2,'max','dice') 421 | merged_4 = distance(q1_1,q2_2,'dice_add',normlize=True) 422 | 423 | from config import n_components 424 | q1_g = Input(shape=(n_components,),name='q1node') 425 | q2_g = Input(shape=(n_components,),name='q2node') 426 | 427 | norm = BatchNormalization() 428 | q1_node = norm(q1_g) 429 | q2_node = norm(q2_g) 430 | 431 | fc = Dense(units=2) 432 | act = PReLU() 433 | q1_node = fc(q1_node) 434 | q1_node = act(q1_node) 435 | q2_node = fc(q2_node) 436 | q2_node = act(q2_node) 437 | 438 | node_vec = multiply([q1_node,q2_node]) 439 | 440 | graph_f = Input(shape=(11,),name='gf') 441 | gf = BatchNormalization()(graph_f) 442 | gf = Dropout(0.2)(gf) 443 | 444 | merged = concatenate([merged_1,merged_3,merged_4]) 445 | merged = Dense(768,activation='relu')(merged) 446 | merged = Dropout(0.2)(merged) 447 | merged = concatenate([merged,gf,node_vec]) 448 | merged = Dense(768,activation='relu')(merged) 449 | output = Dense(1, activation='sigmoid')(merged) 450 | 451 | lr=0.0008 452 | 453 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 454 | 455 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 456 | print(lr) 457 | 458 | return model 459 | 460 | 461 | def rnn_res(word_embedding_matrix,use_word): 462 | if use_word: 463 | from config import MAX_NUM_WORDS 464 | text_len = MAX_NUM_WORDS 465 | else: 466 | from config import MAX_NUM_CHARS 467 | text_len = MAX_NUM_CHARS 468 | 469 | question1 = Input(shape=(text_len,),name='q1') 470 | question2 = Input(shape=(text_len,),name='q2') 471 | 472 | 473 | 474 | embedd_word = Embedding( 475 | len(word_embedding_matrix), 476 | word_embedding_matrix.shape[1], 477 | weights=[word_embedding_matrix], 478 | input_length=text_len, 479 | 
trainable=True) 480 | 481 | gru_dim1 = 300 482 | gru_dim2 = 300 483 | 484 | gru_w = Bidirectional(CuDNNLSTM(gru_dim1,return_sequences=True),merge_mode='sum') 485 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True),merge_mode='sum') 486 | 487 | 488 | norm = BatchNormalization() 489 | q1 = embedd_word(question1) 490 | q1 = norm(q1) 491 | q1 = SpatialDropout1D(0.2)(q1) 492 | 493 | q2 = embedd_word(question2) 494 | q2 = norm(q2) 495 | q2 = SpatialDropout1D(0.2)(q2) 496 | 497 | q1_0 = gru_w(q1) 498 | q2_0 = gru_w(q2) 499 | 500 | q1 = gru2_w(q1_0) 501 | q2 = gru2_w(q2_0) 502 | 503 | merged_0 = pool_corr(q1_0,q2_0,'ave','jaccard') 504 | merged_1 = pool_corr(q1,q2,'ave','dice') 505 | merged_2 = pool_corr(q1_0, q2_0, 'max', 'jaccard') 506 | merged_3 = pool_corr(q1,q2,'max','dice') 507 | 508 | from config import n_components 509 | q1_g = Input(shape=(n_components,),name='q1node') 510 | q2_g = Input(shape=(n_components,),name='q2node') 511 | 512 | norm = BatchNormalization() 513 | q1_node = norm(q1_g) 514 | q2_node = norm(q2_g) 515 | 516 | fc = Dense(units=2) 517 | act = PReLU() 518 | q1_node = fc(q1_node) 519 | q1_node = act(q1_node) 520 | q2_node = fc(q2_node) 521 | q2_node = act(q2_node) 522 | 523 | node_vec = multiply([q1_node,q2_node]) 524 | 525 | graph_f = Input(shape=(11,),name='gf') 526 | gf = BatchNormalization()(graph_f) 527 | gf = Dropout(0.2)(gf) 528 | 529 | merged = concatenate([merged_1,merged_3,merged_2,merged_0]) 530 | merged = Dense(768,activation='relu')(merged) 531 | merged = concatenate([merged, gf,node_vec]) 532 | merged = Dense(768,activation='relu')(merged) 533 | output = Dense(1, activation='sigmoid')(merged) 534 | 535 | lr=0.0008 536 | 537 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 538 | 539 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 540 | print(lr) 541 | 542 | return model 543 | -------------------------------------------------------------------------------- /code/readdata.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | # 文本处理 5 | from keras.preprocessing.text import Tokenizer 6 | from keras.preprocessing.sequence import pad_sequences 7 | from config import MAX_NB_WORDS 8 | from tqdm import tqdm 9 | from tool import get_samples 10 | from sklearn.preprocessing import LabelEncoder 11 | # 20890 12 | 13 | def get_embedding_matrix(word_index,file): 14 | embeddings_index = {} 15 | with open(file, 'r') as f: 16 | wordmat = f.read().split('\n') 17 | if wordmat[-1] == '': 18 | wordmat = wordmat[:-1] 19 | if wordmat[0] == '': 20 | wordmat = wordmat[1:] 21 | 22 | for line in tqdm(wordmat): 23 | wvec = line.strip('\n').strip(' ').split(' ') 24 | embeddings_index[wvec[0]] = np.asarray(wvec[1:], dtype='float') 25 | 26 | print('embedding', len(embeddings_index)) 27 | 28 | EMBEDDING_DIM = 300 29 | nb_words = min(MAX_NB_WORDS, len(word_index)) 30 | embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM)) 31 | for word, i in word_index.items(): 32 | if i > MAX_NB_WORDS: 33 | continue 34 | embedding_vector = embeddings_index.get(str(word).upper()) 35 | if embedding_vector is not None: 36 | embedding_matrix[i] = embedding_vector 37 | return embedding_matrix 38 | 39 | def read_data(use_data,file=None,data_aug=False): 40 | 41 | question = pd.read_csv('./data/question.csv') 42 | question = question[['qid', use_data]] 43 | 44 | 45 | if data_aug: 46 | train = pd.read_csv('./data/train.csv', 
usecols=['label', 'q1', 'q2']) 47 | samples = pd.read_csv('./data/aug_data_filter.csv',usecols=['label','q1','q2']) 48 | train = pd.concat([train,samples]).reset_index(drop=True) 49 | test = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 50 | else: 51 | train = pd.read_csv('./data/train.csv', usecols=['label', 'q1', 'q2']) 52 | test = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 53 | 54 | 55 | train = pd.merge(train, question, left_on=['q1'], right_on=['qid'], how='left') 56 | train = pd.merge(train, question, left_on=['q2'], right_on=['qid'], how='left') 57 | train = train[[use_data+'_x', use_data+'_y','label']] 58 | train.columns = ['q1', 'q2','label'] 59 | 60 | test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left') 61 | test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left') 62 | test = test[[use_data+'_x', use_data+'_y']] 63 | test.columns = ['q1', 'q2'] 64 | 65 | all = pd.concat([train, test]) 66 | 67 | # 分词 词转序列 68 | tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 69 | tokenizer.fit_on_texts(question[use_data]) 70 | 71 | word_index = tokenizer.word_index 72 | print(len(word_index)) 73 | 74 | q1_word_seq = tokenizer.texts_to_sequences(all['q1']) 75 | q2_word_seq = tokenizer.texts_to_sequences(all['q2']) 76 | 77 | if file is None: 78 | if use_data == 'words': 79 | file = './data/word_embed.txt' 80 | if use_data == 'chars': 81 | file = './data/char_embed.txt' 82 | word_embedding_matrix = get_embedding_matrix(word_index, file) 83 | 84 | 85 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 86 | if use_data == 'words': 87 | text_len = MAX_NUM_WORDS 88 | elif use_data == 'chars': 89 | text_len = MAX_NUM_CHARS 90 | else: 91 | raise RuntimeError('use data error') 92 | 93 | q1_data = pad_sequences(q1_word_seq,maxlen=text_len,truncating='post') 94 | q2_data = pad_sequences(q2_word_seq,maxlen=text_len,truncating='post') 95 | 96 | tr_q1 = q1_data[:train.shape[0]] 97 | tr_q2 = q2_data[:train.shape[0]] 98 | 99 | te_q1 = q1_data[train.shape[0]:] 100 | te_q2 = q2_data[train.shape[0]:] 101 | 102 | usecols = [ 103 | 'q1q2_union_w', 104 | 'q1q2_inter_w', 105 | 'q1_num_adj_w', 106 | 'q2_num_adj_w', 107 | 'q1q2_union', 108 | 'q1q2_inter', 109 | 'q1_num_adj', 110 | 'q2_num_adj', 111 | # 'q1_hash', 112 | # 'q2_hash', 113 | # 'q1q2_inter', 114 | 'shortest_path_w', 115 | # 'edit_words', 116 | # 'edit_chars', 117 | # 'lcs_words', 118 | # 'lcs_chars', 119 | # 'q1_word_len', 120 | # 'q2_word_len', 121 | # 'q1_char_len', 122 | # 'q2_char_len', 123 | # 'words_common', 124 | # 'chars_common', 125 | ] 126 | # if data_aug==False: 127 | usecols+=['q1_pr_w','q2_pr_w'] 128 | 129 | tr = {} 130 | tr['q1'] = tr_q1 131 | tr['q2'] = tr_q2 132 | te = {} 133 | te['q1'] = te_q1 134 | te['q2'] = te_q2 135 | 136 | if data_aug: 137 | tr['gf'] = pd.concat([pd.read_csv('./data/train.csv',usecols=usecols), 138 | pd.DataFrame(np.zeros((len(samples),11)),columns=usecols)]).values 139 | te['gf'] = pd.read_csv('./data/test.csv', usecols=usecols).values 140 | else: 141 | tr['gf'] = pd.read_csv('./data/train.csv', usecols=usecols).values 142 | te['gf'] = pd.read_csv('./data/test.csv', usecols=usecols).values 143 | 144 | 145 | if data_aug: 146 | q_tr = pd.concat([pd.read_csv('./data/train.csv',usecols=['q1','q2']), 147 | pd.read_csv('./data/aug_data_filter.csv',usecols=['q1','q2'])]).reset_index(drop=True) 148 | q_te = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 149 | else: 150 | q_tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']) 151 | q_te = 
pd.read_csv('./data/test.csv',usecols=['q1','q2']) 152 | 153 | if data_aug: 154 | questions = pd.read_csv('./data/q_matrix.csv') 155 | else: 156 | questions = pd.read_csv('./data/q_matrix.csv') 157 | 158 | from config import n_components 159 | feat = ["feat"+str(i) for i in range(n_components)] 160 | 161 | tr['q1node'] = pd.merge(q_tr, questions, left_on=['q1'], right_on=['qid'], how='left').loc[:,feat].values 162 | tr['q2node'] = pd.merge(q_tr, questions, left_on=['q2'], right_on=['qid'], how='left').loc[:,feat].values 163 | te['q1node'] = pd.merge(q_te, questions, left_on=['q1'], right_on=['qid'], how='left').loc[:,feat].values 164 | te['q2node'] = pd.merge(q_te, questions, left_on=['q2'], right_on=['qid'], how='left').loc[:,feat].values 165 | 166 | # q_embed = questions.loc[:,['feat'+str(i) for i in range(128)]].values 167 | 168 | return tr,te, word_embedding_matrix,train['label'] 169 | 170 | def save_data_tree(use_data,file=None): 171 | question = pd.read_csv('./data/question.csv') 172 | question = question[['qid', use_data]] 173 | 174 | train = pd.read_csv('./data/train.csv') 175 | test = pd.read_csv('./data/test.csv') 176 | train = pd.merge(train, question, left_on=['q1'], right_on=['qid'], how='left') 177 | train = pd.merge(train, question, left_on=['q2'], right_on=['qid'], how='left') 178 | train = train[[use_data + '_x', use_data + '_y', 'label']] 179 | train.columns = ['q1', 'q2', 'label'] 180 | 181 | test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left') 182 | test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left') 183 | test = test[[use_data + '_x', use_data + '_y']] 184 | test.columns = ['q1', 'q2'] 185 | 186 | all = pd.concat([train, test]) 187 | 188 | tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 189 | tokenizer.fit_on_texts(question[use_data]) 190 | 191 | word_index = tokenizer.word_index 192 | print(len(word_index)) 193 | 194 | q1_word_seq = tokenizer.texts_to_sequences(all['q1']) 195 | q2_word_seq = tokenizer.texts_to_sequences(all['q2']) 196 | 197 | if file is None: 198 | if use_data == 'words': 199 | file = './data/word_embed.txt' 200 | if use_data == 'chars': 201 | file = './data/char_embed.txt' 202 | embedding_matrix = get_embedding_matrix(word_index, file) 203 | 204 | from config import MAX_NUM_WORDS, MAX_NUM_CHARS 205 | if use_data == 'words': 206 | text_len = MAX_NUM_WORDS 207 | elif use_data == 'chars': 208 | text_len = MAX_NUM_CHARS 209 | else: 210 | raise RuntimeError('use data error') 211 | 212 | q1_data = pad_sequences(q1_word_seq, maxlen=text_len, truncating='post') 213 | q2_data = pad_sequences(q2_word_seq, maxlen=text_len, truncating='post') 214 | 215 | q1_matrix = np.zeros((len(q1_data),300*text_len),dtype=np.float16) 216 | q2_matrix = np.zeros((len(q2_data),300*text_len),dtype=np.float16) 217 | 218 | embedding_matrix = embedding_matrix.astype(np.float16) 219 | for i,(q1,q2) in tqdm(enumerate(zip(q1_data,q2_data))): 220 | for j in range(text_len): 221 | if q1[j] != 0: 222 | w_v = embedding_matrix[q1[j]] 223 | q1_matrix[i,j*300:(j+1)*300] = w_v 224 | if q2[j] != 0: 225 | w_v = embedding_matrix[q2[j]] 226 | q2_matrix[i,j*300:(j+1)*300] = w_v 227 | 228 | from scipy.sparse import csr_matrix,save_npz 229 | 230 | save_npz('./data/q1_matrix'+use_data+'.npz',csr_matrix(q1_matrix)) 231 | save_npz('./data/q2_matrix'+use_data+'.npz',csr_matrix(q2_matrix)) 232 | 233 | print('success') 234 | 235 | if __name__ == '__main__': 236 | save_data_tree('words') 237 | save_data_tree('chars') 238 | 239 | 240 | 241 | 242 | 243 | 244 | 
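# Usage sketch for read_data (mirrors how text.py and ensemble.py call it; shapes follow the
# defaults in config.py: MAX_NUM_WORDS=15, MAX_NUM_CHARS=25, n_components=32):
#   tr, te, embedding_matrix, labels = read_data('words', data_aug=False)
#   tr['q1'], tr['q2']          -> padded word-id sequences of shape (n_train, 15)
#   tr['gf']                    -> the 11 handcrafted graph features listed in usecols
#   tr['q1node'], tr['q2node']  -> 32-dim SVD node vectors merged in from q_matrix.csv
#   te holds the same keys for the test set; embedding_matrix feeds the Embedding layers in nn.py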
-------------------------------------------------------------------------------- /code/text.py: -------------------------------------------------------------------------------- 1 | import os 2 | from config import use_device 3 | os.environ["CUDA_VISIBLE_DEVICES"] = use_device 4 | import tensorflow as tf 5 | import keras.backend.tensorflow_backend as KTF 6 | config = tf.ConfigProto() 7 | config.gpu_options.allow_growth=True 8 | session = tf.Session(config=config) 9 | KTF.set_session(session) 10 | from keras import backend as K 11 | import pandas as pd 12 | import numpy as np 13 | from keras.callbacks import EarlyStopping,ModelCheckpoint,Callback,LearningRateScheduler 14 | import warnings 15 | warnings.filterwarnings('ignore') 16 | from sklearn.metrics import log_loss 17 | import datetime 18 | 19 | def lr_de(epoch,lr): 20 | if epoch==0: 21 | return lr 22 | elif lr>0.0002: 23 | return lr/2 24 | else: 25 | return lr 26 | 27 | class epochHistory(Callback): 28 | 29 | def on_train_begin(self, logs=None): 30 | self.epochs = [] 31 | 32 | def on_epoch_end(self, epoch, logs=None): 33 | self.epochs.append(epoch) 34 | 35 | def iter_ense(epochs,model,te): 36 | 37 | result = 0 38 | for e in epochs[-3:]: 39 | model.load_weights('./weight/weights.'+str(e+1)+'.hdf5') 40 | result += model.predict(te, batch_size=1024) 41 | return result/3 42 | 43 | 44 | def train(use_data,semi_sv,output,data_aug,use_model): 45 | 46 | def get_subset(dataset,idx): 47 | data = {} 48 | for key,value in dataset.items(): 49 | data[key] = value[idx] 50 | return data 51 | 52 | def concat_data(data1,data2): 53 | result = {} 54 | for k in data1.keys(): 55 | result[k] = np.concatenate([data1[k],data2[k]]) 56 | return result 57 | 58 | def get_aug_data(tr_x, tr_y): 59 | tr_q1 = tr_x['q1'] 60 | tr_q2 = tr_x['q2'] 61 | tr_gf = tr_x['gf'] 62 | tr_q1node = tr_x['q1node'] 63 | tr_q2node = tr_x['q2node'] 64 | 65 | res_q1 = [] 66 | res_q2 = [] 67 | res_gf = [] 68 | res_q1node = [] 69 | res_q2node = [] 70 | res_y = [] 71 | 72 | for q1, q2, gf, q1node, q2node, y in zip(tr_q1, tr_q2, tr_gf, tr_q1node, tr_q2node, tr_y): 73 | r1 = q1[np.in1d(q1, q2, invert=True)] 74 | len1 = len(r1) 75 | if len1 < 4 or len1==len(q1[q1!=0]): 76 | continue 77 | 78 | r2 = q2[np.in1d(q2, q1, invert=True)] 79 | len2 = len(r2) 80 | if len2 < 4 or len2==len(q2[q2!=0]): 81 | continue 82 | 83 | out1 = np.zeros(15, dtype=np.int32) 84 | out2 = np.zeros(15, dtype=np.int32) 85 | out1[-len1:] = r1 86 | out2[-len2:] = r2 87 | 88 | res_q1.append(out1) 89 | res_q2.append(out2) 90 | res_gf.append(gf) 91 | res_q1node.append(q1node) 92 | res_q2node.append(q2node) 93 | res_y.append(y) 94 | 95 | 96 | res_x = { 97 | 'q1': np.asarray(res_q1), 98 | 'q2': np.asarray(res_q2), 99 | 'gf': np.asarray(res_gf), 100 | 'q1node': np.asarray(res_q1node), 101 | 'q2node': np.asarray(res_q2node) 102 | } 103 | res_y = np.asarray(res_y) 104 | return res_x, res_y 105 | 106 | from nn import rnnword, aggmodel, esim,attention,rnn_res 107 | if use_model == 'rnnword': 108 | get_model = rnnword 109 | elif use_model == 'aggmodel': 110 | pass 111 | elif use_model == 'esim': 112 | get_model = esim 113 | elif use_model == 'attention': 114 | get_model = attention 115 | elif use_model == 'res': 116 | get_model = rnn_res 117 | else: 118 | raise RuntimeError("don't have this model") 119 | 120 | from readdata import read_data 121 | 122 | model_name = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')+'_'+use_data+'_'+str(semi_sv)+'_'+str(data_aug)+'_' 123 | 124 | tr,te, embedding_matrix, labels = 
read_data(use_data,data_aug=data_aug) 125 | 126 | print(use_data) 127 | print('Shape of label tensor:', labels.shape) 128 | 129 | y = labels 130 | 131 | from config import model_path 132 | from sklearn.cross_validation import StratifiedKFold, KFold 133 | from config import n_folds 134 | 135 | y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values 136 | y_pos_ = y_pred == 1 137 | y_neg_ = y_pred == 0 138 | add_idx = np.any([y_pos_, y_neg_], axis=0) 139 | add_y = y_pred[add_idx] 140 | 141 | 142 | y_pos = y_pred > 0.75 143 | y_neg = y_pred < 0.25 144 | y_idx = np.any([y_pos, y_neg], axis=0) 145 | y_pred = y_pred[y_idx] 146 | print(y_idx.shape) 147 | 148 | 149 | folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True) 150 | result = np.zeros((len(te['q1']), 1)) 151 | 152 | oof_y = np.zeros((len(y), 1)) 153 | for n_fold, (tr_idx, val_idx) in enumerate(folds): 154 | tr_x = get_subset(tr,tr_idx) 155 | tr_y = y[tr_idx] 156 | # if data_aug: 157 | # res_x,res_y = get_aug_data(tr_x,tr_y) 158 | # tr_x = concat_data(tr_x,res_x) 159 | # tr_y = np.concatenate([tr_y,res_y]) 160 | 161 | if semi_sv: 162 | te_x = get_subset(te, y_idx) 163 | tr_data = concat_data(tr_x,te_x) 164 | tr_y = np.concatenate([tr_y,y_pred]) 165 | patience = 3 166 | else: 167 | add_data = get_subset(te,add_idx) 168 | tr_data = concat_data(tr_x,add_data) 169 | tr_y = np.concatenate([tr_y, add_y]) 170 | patience = 2 171 | # tr_data = tr_x 172 | # tr_y = y[tr_idx] 173 | 174 | val_x = get_subset(tr, val_idx) 175 | val_y = y[val_idx] 176 | 177 | use_word = True 178 | if use_data!='words': 179 | use_word = False 180 | model = get_model(word_embedding_matrix=embedding_matrix,use_word=use_word) 181 | if n_fold == 0: 182 | print(model.summary()) 183 | 184 | # hist = epochHistory() 185 | print(n_fold) 186 | model.fit(tr_data, 187 | tr_y, 188 | epochs=1000, 189 | validation_data=[val_x,val_y], 190 | verbose=1, 191 | batch_size=256, 192 | callbacks=[ 193 | EarlyStopping(patience=patience, monitor='val_binary_crossentropy'), 194 | # LearningRateScheduler(lr_de,verbose=1) 195 | # hist, 196 | # ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',monitor='val_binary_crossentropy',save_weights_only=True) 197 | ]) 198 | # result += iter_ense(hist.epochs,model,te) 199 | result += model.predict(te, batch_size=1024) 200 | 201 | model.save_weights('./weight/'+model_name+str(n_fold)+'.h5') 202 | # oof_y[val_idx] = model.predict(val_x, batch_size=2048) 203 | 204 | K.clear_session() 205 | tf.reset_default_graph() 206 | 207 | # 提交结果 208 | result /= n_folds 209 | submit = pd.DataFrame() 210 | submit['y_pre'] = list(result[:, 0]) 211 | submit.to_csv(output, index=False) 212 | 213 | 214 | ## 保存预测的训练标签 215 | # oof_y = oof_y[:,0] 216 | # oof_y_ = oof_y.round().astype(int) 217 | # 218 | # error_idx = oof_y_!=y 219 | # print(np.sum(error_idx)) 220 | # oof_y[error_idx] = 1-oof_y[error_idx] 221 | submit = pd.DataFrame() 222 | submit['y_pre'] = oof_y[:,0] 223 | submit.to_csv('./data/oofy.csv',index=False) 224 | 225 | 226 | """ 227 | train('words',False,'esim_word0_2.csv',False,'esim') 228 | train('words',True,'esim_word1_2.csv',False,'esim') 229 | train('chars',False,'esim_char0_2.csv',False,'esim') 230 | train('chars',True,'esim_char1_2.csv',False,'esim') 231 | 232 | train('words',False,'esim_word0_3.csv',False,'esim') 233 | train('words',True,'esim_word1_3.csv',False,'esim') 234 | train('chars',False,'esim_char0_3.csv',False,'esim') 235 | train('chars',True,'esim_char1_3.csv',False,'esim') 236 | 237 | train('words',False,'attention_word0_0.csv',False,'attention') 
238 | train('chars',True,'attention_char1_0.csv',False,'attention') 239 | 240 | train('words',True,'attention_word1_0.csv',False,'attention') 241 | train('chars',False,'attention_char0_0.csv',False,'attention') 242 | """ 243 | 244 | """ 245 | train('words',False,'attention_word0_1.csv',False,'attention') 246 | train('chars',True,'attention_char1_1.csv',False,'attention') 247 | 248 | train('words',True,'attention_word1_1.csv',False,'attention') 249 | train('chars',False,'attention_char0_1.csv',False,'attention') 250 | 251 | """ 252 | 253 | """ 254 | train('words',False,'attention_word0_2.csv',False,'attention') 255 | train('chars',True,'attention_char1_2.csv',False,'attention') 256 | 257 | train('words',True,'attention_word1_2.csv',False,'attention') 258 | train('chars',False,'attention_char0_2.csv',False,'attention') 259 | """ 260 | 261 | """ 262 | train('words',False,'attention_word0_3.csv',False,'attention') 263 | train('chars',True,'attention_char1_3.csv',False,'attention') 264 | 265 | train('words',True,'attention_word1_3.csv',False,'attention') 266 | train('chars',False,'attention_char0_3.csv',False,'attention') 267 | """ 268 | 269 | 270 | train('words',False,'res_word0_2.csv',False,'res') 271 | train('chars',True,'res_char1_2.csv',False,'res') 272 | 273 | train('words',True,'res_word1_2.csv',False,'res') 274 | train('chars',False,'res_char0_2.csv',False,'res') 275 | 276 | 277 | """ 278 | train('words',False,'res_word0_3.csv',False,'res') 279 | train('chars',True,'res_char1_3.csv',False,'res') 280 | 281 | train('words',True,'res_word1_3.csv',False,'res') 282 | train('chars',False,'res_char0_3.csv',False,'res') 283 | """ -------------------------------------------------------------------------------- /code/tool.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | import json 4 | import numpy as np 5 | import multiprocessing as mlp 6 | import gc 7 | 8 | def cluster_pos(file='train'): 9 | 10 | 11 | tr = pd.read_csv('./data/'+file+'.csv') 12 | # tr = tr.append(pd.read_csv('save_sample.csv')).reset_index(drop=True) 13 | y_idx = tr['label'] == 1 14 | 15 | 16 | # y_pred = pd.read_csv('./149367.csv')['y_pre'].values 17 | # y_pos = y_pred < 1 18 | # y_neg = y_pred > 0.5 19 | # y_idx = np.logical_and(y_pos,y_neg) 20 | 21 | 22 | tr = tr.loc[y_idx,['q1','q2']] 23 | print(tr.shape) 24 | tr = tr.sort_values(by=['q1', 'q2']).values 25 | for i in range(len(tr)): 26 | if tr[i][0]>tr[i][1]: 27 | tr[i] = [tr[i][1],tr[i][0]] 28 | 29 | q2group = {} 30 | num_group = 0 31 | error = 0 32 | for q1,q2 in tr: 33 | assert q1 < q2 34 | if q1 not in q2group and q2 not in q2group: 35 | q2group[q1] = num_group 36 | q2group[q2] = num_group 37 | num_group += 1 38 | elif q2 in q2group and q1 not in q2group: 39 | q2group[q1] = q2group[q2] 40 | elif q1 in q2group and q2 not in q2group: 41 | q2group[q2] = q2group[q1] 42 | else: 43 | if q2group[q2] != q2group[q1]: 44 | error+=1 45 | print(error) 46 | while error != 0: 47 | for q1,q2 in tr: 48 | if q2group[q1] != q2group[q2]: 49 | group_id = min(q2group[q1],q2group[q2]) 50 | q2group[q1] = group_id 51 | q2group[q2] = group_id 52 | error = 0 53 | for q1,q2 in tr: 54 | if q2group[q1] != q2group[q2]: 55 | error+=1 56 | print(error) 57 | 58 | 59 | with open('./info/q2group.json','w') as f: 60 | f.write(json.dumps(q2group,sort_keys=True,indent=4, separators=(',', ': '))) 61 | 62 | group2q = [{} for i in range(num_group)] 63 | for q,g_id in q2group.items(): 64 | group2q[g_id][q] = 1 65 | 66 | 
with open('./info/group2q.json', 'w') as f: 67 | f.write(json.dumps(group2q, sort_keys=True, indent=4, separators=(',', ': '))) 68 | 69 | 70 | group_n = {} 71 | for i,q in enumerate(group2q): 72 | group_n[str(i)] = len(q) 73 | group_n = sorted(group_n.items(),key=lambda x:x[1]) 74 | with open('./info/group_samples_num.json', 'w') as f: 75 | f.write(json.dumps(group_n, sort_keys=True, indent=4, separators=(',', ': '))) 76 | 77 | 78 | 79 | def cluster_neg(): 80 | 81 | tr = pd.read_csv('./data/train.csv') 82 | tr = tr.loc[tr['label'] == 0, ['q1', 'q2']].values 83 | 84 | with open('./info/q2group.json','r') as f: 85 | q2group = json.loads(f.read()) 86 | 87 | neg_pair = {} 88 | for q1,q2 in tr: 89 | if q1 in q2group and q2 in q2group: 90 | if q2group[q1]<q2group[q2]: 91 | neg_pair[str(q2group[q1])+'_'+str(q2group[q2])] = 1 92 | elif q2group[q1]>q2group[q2]: 93 | neg_pair[str(q2group[q2])+'_'+str(q2group[q1])] = 1 94 | 95 | with open('./info/neg_rule.json','w') as f: 96 | f.write(json.dumps(neg_pair,sort_keys=True,indent=4,separators=(',', ': '))) 97 | 98 | te = pd.read_csv('./data/test.csv').values 99 | need_rule = {} 100 | for q1, q2 in te: 101 | if q1 in q2group and q2 in q2group: 102 | if q2group[q1] < q2group[q2]: 103 | pair = str(q2group[q1]) + '_' + str(q2group[q2]) 104 | elif q2group[q1] > q2group[q2]: 105 | pair = str(q2group[q2]) + '_' + str(q2group[q1]) 106 | else: 107 | continue 108 | if pair not in neg_pair: 109 | if pair not in need_rule: 110 | need_rule[pair] = 0 111 | need_rule[pair]+=1 112 | need_rule = sorted(need_rule.items(),key=lambda x:x[1]) 113 | with open('./info/need_rule.json','w') as f: 114 | f.write(json.dumps(need_rule,sort_keys=True,indent=4,separators=(',', ': '))) 115 | 116 | 117 | 118 | def create_pos_sample(): 119 | 120 | with open('./info/q_te_dict.json','r') as f: 121 | q_te = json.loads(f.read()) 122 | with open('./info/q_tr_dict.json','r') as f: 123 | q_tr = json.loads(f.read()) 124 | 125 | with open('./info/group2q.json','r') as f: 126 | group2q = json.loads(f.read()) 127 | 128 | from itertools import combinations 129 | 130 | samples_dict = {} 131 | 132 | for questions in tqdm(group2q): 133 | if len(questions) == 2: 134 | continue 135 | if len(questions) == 0: 136 | continue 137 | for q1,q2 in combinations(list(questions.keys()),2): 138 | samples_dict[q1 + '_' + q2] = 1 139 | 140 | 141 | tr = pd.read_csv('./data/train.csv') 142 | te = pd.read_csv('./data/test.csv') 143 | tr.append(te) 144 | tr = tr[['q1','q2']].values 145 | for q1,q2 in tr: 146 | a = q1 + '_' + q2 in samples_dict 147 | b = q2 + '_' + q1 in samples_dict 148 | assert (a and b) == False 149 | if q1 + '_' + q2 in samples_dict: 150 | samples_dict.pop(q1 + '_' + q2) 151 | elif q2 + '_' + q1 in samples_dict: 152 | samples_dict.pop(q2 + '_' + q1) 153 | 154 | samples = [] 155 | for k in samples_dict.keys(): 156 | samples.append(k.split("_")) 157 | 158 | print(len(samples)) 159 | 160 | train_extend = pd.DataFrame(samples,columns=['q1','q2']) 161 | train_extend.to_csv('./info/pos_sample.csv',index=False) 162 | 163 | 164 | def create_neg_sample(): 165 | 166 | with open('./info/group2q.json','r') as f: 167 | group2q = json.loads(f.read()) 168 | with open('./info/neg_rule.json','r') as f: 169 | neg_pair = json.loads(f.read()) 170 | 171 | from itertools import product 172 | 173 | samples_dict = {} 174 | num_sample = 0 175 | for pair in tqdm(neg_pair): 176 | c1,c2 = pair.split('_') 177 | for q1,q2 in product(group2q[int(c1)],group2q[int(c2)]): 178 | samples_dict[q1 + '_' + q2] = 1 179 | num_sample+=1 180 | 181 | print(num_sample) 182 | tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']) 
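# As in create_pos_sample above, candidate pairs that already occur in train.csv or test.csv
# are dropped below so the augmented negatives do not duplicate existing pairs.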
te = pd.read_csv('./data/test.csv',usecols=['q1','q2']) 184 | tr.append(te) 185 | tr = tr[['q1','q2']].values 186 | for q1,q2 in tr: 187 | a = q1 + '_' + q2 in samples_dict 188 | b = q2 + '_' + q1 in samples_dict 189 | assert (a and b) == False 190 | if q1 + '_' + q2 in samples_dict: 191 | samples_dict.pop(q1 + '_' + q2) 192 | elif q2 + '_' + q1 in samples_dict: 193 | samples_dict.pop(q2 + '_' + q1) 194 | 195 | samples = [] 196 | for k in samples_dict.keys(): 197 | samples.append(k.split("_")) 198 | 199 | print(len(samples)) 200 | del samples_dict 201 | import gc 202 | gc.collect() 203 | train_extend = pd.DataFrame(samples,columns=['q1','q2']) 204 | train_extend.to_csv('./info/neg_sample.csv',index=False) 205 | 206 | 207 | def post_process(file,output='baseline.csv'): 208 | 209 | with open('./info/q2group.json','r') as f: 210 | q2group = json.loads(f.read()) 211 | 212 | with open('./info/group2q.json','r') as f: 213 | group2q = json.loads(f.read()) 214 | 215 | te = pd.read_csv('./data/test.csv',usecols=['q1','q2']).values 216 | y_pre = pd.read_csv(file) 217 | 218 | 219 | "正例修正" 220 | n = 0 221 | loss = 0 222 | 223 | save_samples = [] 224 | s = 0 225 | for i, (q1, q2) in enumerate(te): 226 | if q1 in q2group and q2 in q2group: 227 | if q2group[q1] == q2group[q2]: 228 | n += 1 229 | loss = loss - np.log(y_pre.iloc[i,0]) 230 | y_pre.iloc[i, 0] = 1 231 | save_samples.append([1,q1, q2]) 232 | 233 | # save_samples = pd.DataFrame(save_samples,columns=['label','q1','q2']) 234 | # save_samples.to_csv('./info/save_sample.csv',index=False) 235 | print('n:',n) 236 | 237 | print(s) 238 | "负例修正" 239 | with open('./info/neg_rule.json','r') as f: 240 | neg_pair = json.loads(f.read()) 241 | n = 0 242 | for i, (q1, q2) in tqdm(enumerate(te)): 243 | if q1 in q2group and q2 in q2group: 244 | if q2group[q1] < q2group[q2]: 245 | pair = str(q2group[q1]) + '_' + str(q2group[q2]) 246 | elif q2group[q1] > q2group[q2]: 247 | pair = str(q2group[q2]) + '_' + str(q2group[q1]) 248 | else: 249 | pair = '' 250 | if pair in neg_pair: 251 | loss = loss - np.log(1-y_pre.iloc[i, 0]) 252 | y_pre.iloc[i, 0] = 0 253 | n += 1 254 | print('loss:', loss / len(te)) 255 | print(n) 256 | 257 | y_pre.to_csv(output, index=False) 258 | 259 | return y_pre 260 | 261 | def q_distr(): 262 | 263 | te = pd.read_csv('./data/test.csv').values 264 | 265 | q_dict = {} 266 | for q1, q2 in te: 267 | if q1 not in q_dict: 268 | q_dict[q1] = 0 269 | q_dict[q1] += 1 270 | if q2 not in q_dict: 271 | q_dict[q2] = 0 272 | q_dict[q2] += 1 273 | te_q = sorted(q_dict.items(), key=lambda x: x[1]) 274 | with open('./info/te_q.json', 'w') as f: 275 | f.write(json.dumps(te_q, sort_keys=True, indent=4, separators=(',', ': '))) 276 | with open('./info/q_te_dict.json', 'w') as f: 277 | f.write(json.dumps(q_dict, sort_keys=True, indent=4, separators=(',', ': '))) 278 | 279 | tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']).values 280 | 281 | q_dict = {} 282 | for q1, q2 in tr: 283 | if q1 not in q_dict: 284 | q_dict[q1] = 0 285 | q_dict[q1] += 1 286 | if q2 not in q_dict: 287 | q_dict[q2] = 0 288 | q_dict[q2] += 1 289 | tr_q = sorted(q_dict.items(), key=lambda x: x[1]) 290 | with open('./info/tr_q.json', 'w') as f: 291 | f.write(json.dumps(tr_q, sort_keys=True, indent=4, separators=(',', ': '))) 292 | with open('./info/q_tr_dict.json', 'w') as f: 293 | f.write(json.dumps(q_dict, sort_keys=True, indent=4, separators=(',', ': '))) 294 | 295 | def te_test(): 296 | 297 | with open('q2group.json','r') as f: 298 | q2group = json.loads(f.read()) 299 | with 
open('neg_rule.json','r') as f: 300 | neg_pair = json.loads(f.read()) 301 | 302 | te = pd.read_csv('./data/test.csv').values 303 | 304 | 305 | def get_samples(): 306 | 307 | with open('./info/q_te_dict.json', 'r') as f: 308 | q_te = json.loads(f.read()) 309 | with open('./info/q_tr_dict.json', 'r') as f: 310 | q_tr = json.loads(f.read()) 311 | 312 | pos_samples = pd.read_csv('./info/pos_sample.csv',usecols=['q1','q2']).sample(frac=1).reset_index(drop=True).values 313 | neg_samples = pd.read_csv('./info/neg_sample.csv',usecols=['q1','q2']).sample(frac=1).reset_index(drop=True).values 314 | 315 | te_q = pd.read_csv("./data/test.csv",usecols=['q1','q2']) 316 | te_q = list(set(te_q['q1'].tolist() + te_q['q2'].tolist())) 317 | 318 | data = [] 319 | for q in te_q: 320 | data.append([1, q, q]) 321 | for i,samples in [(1,pos_samples),(0,neg_samples)]: 322 | q_freq = {} 323 | num_sample = 0 324 | for q1,q2 in tqdm(samples): 325 | if q1 not in q_te or q2 not in q_te: 326 | continue 327 | # if q1 not in q_freq: 328 | # q_freq[q1] = 0 329 | # if q2 not in q_freq: 330 | # q_freq[q2] = 0 331 | # if q_freq[q1] > min(2-q_tr[q1]/30+q_te[q1]/10,4): 332 | # continue 333 | # if q_freq[q2] > min(2-q_tr[q2]/30+q_te[q2]/10,4): 334 | # continue 335 | # q_freq[q1] += 1 336 | # q_freq[q2] += 1 337 | data.append([i,q1,q2]) 338 | num_sample += 1 339 | print(num_sample) 340 | 341 | data = pd.DataFrame(data,columns=['label','q1','q2']) 342 | data.to_csv('./data/aug_data.csv',index=False) 343 | return data 344 | 345 | def test(): 346 | 347 | te = pd.read_csv("./data/test.csv",usecols=['q1','q2']) 348 | te['y_pre'] = pd.read_csv("./145192.csv")['y_pre'] 349 | 350 | te = te.loc[te['y_pre']<1] 351 | te = te.loc[te['y_pre']>0] 352 | 353 | q = {} 354 | for q1,q2 in te[['q1','q2']].values: 355 | if q1 not in q: 356 | q[q1] = 0 357 | if q2 not in q: 358 | q[q2] = 0 359 | q[q1] +=1 360 | q[q2] +=1 361 | import json 362 | q = sorted(q.items(), key=lambda x: x[1]) 363 | with open("./info/q.json",'w') as f: 364 | f.write(json.dumps(q,sort_keys=True, indent=4, separators=(',', ': '))) 365 | 366 | 367 | def sample_filter(): 368 | 369 | samples = pd.read_csv("./data/aug_data.csv") 370 | samples['y_pre'] = pd.read_csv("./data/submit.csv")['y_pre'] 371 | pos_samples = samples.loc[samples['label']==1] 372 | neg_samples = samples.loc[samples['label']==0] 373 | 374 | del samples 375 | gc.collect() 376 | 377 | pos_samples = pos_samples.loc[pos_samples['y_pre']<0.5] 378 | neg_samples = neg_samples.loc[neg_samples['y_pre']>0.5] 379 | print(pos_samples.shape) 380 | print(neg_samples.shape) 381 | 382 | with open('./info/q_te_dict.json', 'r') as f: 383 | q_te = json.loads(f.read()) 384 | with open('./info/q_tr_dict.json', 'r') as f: 385 | q_tr = json.loads(f.read()) 386 | data = [] 387 | for i, samples in [(1, pos_samples), (0, neg_samples)]: 388 | q_freq = {} 389 | num_sample = 0 390 | for q1, q2 in tqdm(samples[['q1','q2']].values): 391 | if i==0: 392 | if q1 not in q_freq: 393 | q_freq[q1] = 0 394 | if q2 not in q_freq: 395 | q_freq[q2] = 0 396 | if q_freq[q1] > min(2-q_tr[q1]/30+q_te[q1]/10,3): 397 | continue 398 | if q_freq[q2] > min(2-q_tr[q2]/30+q_te[q2]/10,3): 399 | continue 400 | q_freq[q1] += 1 401 | q_freq[q2] += 1 402 | data.append([i, q1, q2]) 403 | num_sample += 1 404 | print(num_sample) 405 | 406 | data = pd.DataFrame(data, columns=['label', 'q1', 'q2']) 407 | data.to_csv('./data/aug_data_filter.csv', index=False) 408 | 409 | if __name__=='__main__': 410 | # q_distr() 411 | 412 | # create_pos_sample() 413 | # create_neg_sample() 
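# A plausible end-to-end run order for the helpers above, inferred only from the ./info and
# ./data files each function reads and writes (a sketch, not the authors' documented workflow;
# cluster_pos() is the function whose tail, writing group2q.json, appears at the top of this file
# and which presumably also writes q2group.json):
#
#   q_distr()                    # ./info/q_tr_dict.json, ./info/q_te_dict.json
#   cluster_pos()                # ./info/group2q.json (and, presumably, ./info/q2group.json)
#   cluster_neg()                # ./info/neg_rule.json, ./info/need_rule.json
#   create_pos_sample()          # ./info/pos_sample.csv  (within-cluster transitive positives)
#   create_neg_sample()          # ./info/neg_sample.csv  (cross-cluster negatives from neg_rule)
#   get_samples()                # ./data/aug_data.csv    (augmentation pairs restricted to test questions)
#   post_process('./base.csv')   # overwrite test predictions with the cluster rules -> baseline.csv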
414 | # cluster_pos() 415 | # cluster_neg() 416 | # get_samples() 417 | # from glob import glob 418 | # path= './resultv4/' 419 | # files = glob(path+'*.csv') 420 | # for f in files: 421 | # post_process(f) 422 | # sample_filter() 423 | a = post_process('./base.csv') 424 | print(a.describe()) 425 | 426 | # a = pd.read_csv('./ensemble/144392.csv') 427 | # b = pd.read_csv('./ensemble/ense2_14459.csv') 428 | # a['y_pre'] = 2*a['y_pre']/3 + b['y_pre']/3 429 | # print(a.describe()) 430 | 431 | # post_process_v2() 432 | # test() 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | -------------------------------------------------------------------------------- /code/train_word.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from keras.preprocessing.text import Tokenizer 3 | from keras.preprocessing.sequence import pad_sequences 4 | from config import MAX_NB_WORDS,MAX_NUM_CHARS,MAX_NUM_WORDS 5 | from tqdm import tqdm 6 | import numpy as np 7 | from keras.optimizers import Nadam 8 | 9 | 10 | def get_embedd(word_index,file): 11 | embeddings_index = {} 12 | with open(file, 'r') as f: 13 | wordmat = f.read().split('\n') 14 | if wordmat[-1] == '': 15 | wordmat = wordmat[:-1] 16 | if wordmat[0] == '': 17 | wordmat = wordmat[1:] 18 | 19 | for line in tqdm(wordmat): 20 | wvec = line.strip('\n').strip(' ').split(' ') 21 | embeddings_index[wvec[0]] = np.asarray(wvec[1:], dtype='float') 22 | 23 | print('embedding', len(embeddings_index)) 24 | 25 | EMBEDDING_DIM = 300 26 | nb_words = len(word_index) 27 | embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM)) 28 | for word, i in word_index.items(): 29 | embedding_vector = embeddings_index.get(str(word).upper()) 30 | if embedding_vector is not None: 31 | embedding_matrix[i] = embedding_vector 32 | 33 | return embedding_matrix 34 | 35 | def get_model(word_embedding_matrix,char_embedding_matrix): 36 | from keras.models import Model 37 | from keras.layers import Input,Lambda,BatchNormalization 38 | from keras.layers import CuDNNGRU, Bidirectional,GlobalMaxPooling1D 39 | from keras.layers import Embedding,SpatialDropout1D,Dense 40 | from keras import backend as K 41 | 42 | 43 | def loss(y_true,y_pred): 44 | return -K.mean(y_pred * y_true) 45 | 46 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 47 | 48 | 49 | word = Input(shape=(MAX_NUM_WORDS,)) 50 | char = Input(shape=(MAX_NUM_CHARS,)) 51 | 52 | 53 | embedd_word = Embedding( 54 | len(word_embedding_matrix), 55 | word_embedding_matrix.shape[1], 56 | weights=[word_embedding_matrix], 57 | input_length=MAX_NUM_WORDS, 58 | trainable=True,name='word_weight') 59 | embedd_char = Embedding( 60 | len(char_embedding_matrix), 61 | char_embedding_matrix.shape[1], 62 | weights=[char_embedding_matrix], 63 | input_length=MAX_NUM_CHARS, 64 | trainable=True,name='char_weight') 65 | 66 | gru_dim1 = 384 67 | 68 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 69 | 70 | gru_c = Bidirectional(CuDNNGRU(gru_dim1, return_sequences=True), merge_mode='sum') 71 | 72 | w = embedd_word(word) 73 | c = embedd_char(char) 74 | w = BatchNormalization()(w) 75 | c = BatchNormalization()(c) 76 | w = SpatialDropout1D(0.2)(w) 77 | c = SpatialDropout1D(0.2)(c) 78 | 79 | w = gru_w(w) 80 | c = gru_c(c) 81 | 82 | w = GlobalMaxPooling1D()(w) 83 | c = GlobalMaxPooling1D()(c) 84 | 85 | def jaccard(x): 86 | x0_2 = K.sum(x[0] ** 2, axis=1, keepdims=True) 87 | x1_2 = K.sum(x[1] ** 2, axis=1, keepdims=True) 88 | x01_ = K.sum(K.abs(x[0] * x[1]), axis=1, 
keepdims=True) 89 | 90 | return x[0] * x[1]/(x0_2+x1_2-x01_) 91 | 92 | 93 | output = Lambda(jaccard)([w,c]) 94 | output = Dense(1,activation='sigmoid')(output) 95 | model = Model(inputs=[word,char], outputs=output) 96 | 97 | model.compile(loss='binary_crossentropy',optimizer=Nadam()) 98 | 99 | return model 100 | 101 | def train(): 102 | question = pd.read_csv('./data/question.csv') 103 | 104 | 105 | toke_word = Tokenizer(num_words=MAX_NB_WORDS) 106 | toke_word.fit_on_texts(question['words']) 107 | q_word = toke_word.texts_to_sequences(question['words']) 108 | q_word = pad_sequences(q_word, maxlen=MAX_NUM_WORDS, truncating='post') 109 | q_word = np.array(list(q_word)*2) # two copies of every question's word sequence 110 | word_index = toke_word.word_index 111 | word_embedd = get_embedd(word_index,'./data/word_embed.txt') 112 | 113 | 114 | toke_char = Tokenizer(num_words=MAX_NB_WORDS) 115 | toke_char.fit_on_texts(question['chars']) 116 | q_char = toke_char.texts_to_sequences(question['chars']) 117 | q_char = pad_sequences(q_char, maxlen=MAX_NUM_CHARS, truncating='post') 118 | q_char = np.array(list(q_char) + list(q_char)[::-1]) # second copy is reversed, so those word/char pairs are mismatched 119 | char_index = toke_char.word_index 120 | char_embedd = get_embedd(char_index, './data/char_embed.txt') 121 | 122 | 123 | model = get_model(word_embedd,char_embedd) 124 | y = np.ones(len(q_char)) 125 | y[len(question):] = 0 # aligned word/char pairs are positives, the mismatched second half are negatives 126 | model.fit([q_word,q_char],y,verbose=1,epochs=2,batch_size=512,shuffle=True) 127 | 128 | word_embedd = model.get_layer('word_weight').get_weights()[0] # get_weights() returns a list; take the embedding matrix 129 | char_embedd = model.get_layer('char_weight').get_weights()[0] 130 | 131 | 132 | print('save fine-tuned embeddings') 133 | word_mat = '' 134 | for w, i in word_index.items(): 135 | if i >= len(word_embedd): 136 | continue 137 | vec_str = ' '.join([str(w).upper()] + [str(x) for x in word_embedd[i]]) # uppercase tokens to match the layout of word_embed.txt 138 | vec_str+='\n' 139 | word_mat+=vec_str 140 | 141 | with open('./data/word_embed1.txt','w') as f: 142 | f.write(word_mat) 143 | 144 | 145 | 146 | char_mat = '' 147 | for c, i in char_index.items(): 148 | if i >= len(char_embedd): 149 | continue 150 | vec_str = ' '.join([str(c).upper()] + [str(x) for x in char_embedd[i]]) 151 | vec_str += '\n' 152 | char_mat += vec_str 153 | 154 | 155 | with open('./data/char_embed1.txt', 'w') as f: 156 | f.write(char_mat) 157 | 158 | 159 | if __name__ == '__main__': 160 | train() 161 | 162 | 163 | 164 | 165 | 166 | --------------------------------------------------------------------------------
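Note on the embedding round-trip in train_word.py: the fine-tuned files word_embed1.txt / char_embed1.txt are written in the same plain "TOKEN v1 ... v300" layout that get_embedd() parses, so they can be dropped in wherever word_embed.txt / char_embed.txt are read. Below is a minimal reload check (a sketch, not part of the original repo; it assumes it is run from the code/ directory with ./data/question.csv and the fine-tuned files already in place).

import pandas as pd
from keras.preprocessing.text import Tokenizer

from config import MAX_NB_WORDS
from train_word import get_embedd

# Rebuild the same word index that train() used, then reload the fine-tuned vectors.
question = pd.read_csv('./data/question.csv')
toke_word = Tokenizer(num_words=MAX_NB_WORDS)
toke_word.fit_on_texts(question['words'])

finetuned = get_embedd(toke_word.word_index, './data/word_embed1.txt')
print(finetuned.shape)  # (len(word_index) + 1, 300); rows for tokens missing from the file stay all-zero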