├── README.md └── code ├── config.py ├── createFeature.py ├── ensemble.py ├── nn.py ├── readdata.py ├── text.py ├── tool.py └── train_word.py /README.md: -------------------------------------------------------------------------------- 1 | # The 3rd PPDai Magic Mirror Cup (拍拍贷第三届魔镜杯大赛) 2 | Rank 6 solution for the 3rd PPDai Magic Mirror Cup competition 3 | Final ranking: https://ai.ppdai.com/mirror/goToMirrorDetail?mirrorId=1&tabindex=2 4 | 5 | ## Competition defense PPT and solution write-up 6 | https://qrfaction.github.io/2018/07/25/%E9%AD%94%E9%95%9C%E6%9D%AF%E6%AF%94%E8%B5%9B%E7%AD%94%E8%BE%A9PPT/ 7 | 8 | ## Notes on the code 9 | The teammates' weight-transfer and data-augmentation parts are missing (those two parts did not improve the score) 10 | 11 | ## Conclusion 12 | Updated 2019.02.12 13 | Through this competition I got to know many people in the community, and I kept running into familiar faces in many later competitions 14 | The circle really is small 15 | -------------------------------------------------------------------------------- /code/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | MAX_NB_WORDS = 30000 5 | n_folds = 10 6 | MAX_NUM_WORDS = 15 7 | MAX_NUM_CHARS = 25 8 | 9 | use_data = 'word' 10 | use_model = 'rnnword' 11 | use_device = '2' 12 | 13 | n_components = 32 14 | 15 | model_path = 'temp.hdf5' 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/createFeature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import defaultdict 4 | from tqdm import tqdm 5 | import networkx as nx 6 | import multiprocessing as mlp 7 | from sklearn.decomposition import TruncatedSVD 8 | 9 | tqdm.pandas() 10 | 11 | def hash_q(train_orig,test_orig,aug=None): 12 | 13 | 14 | df1 = train_orig[['q1']].copy() 15 | df2 = train_orig[['q2']].copy() 16 | df1_test = test_orig[['q1']].copy() 17 | df2_test = test_orig[['q2']].copy() 18 | 19 | df2.rename(columns={'q2': 'q1'}, inplace=True) 20 | df2_test.rename(columns={'q2': 'q1'}, inplace=True) 21 | 22 | train_questions = df1.append(df2) 23 | train_questions = train_questions.append(df1_test) 24 | train_questions = train_questions.append(df2_test) 25 | train_questions.drop_duplicates(subset=['q1'], inplace=True) 26 | 27 | train_questions.reset_index(inplace=True, drop=True) 28 | questions_dict = pd.Series(train_questions.index.values, index=train_questions.q1.values).to_dict() 29 | train_cp = train_orig.copy() 30 | test_cp = test_orig.copy() 31 | 32 | train_cp['label'] = 1 33 | test_cp['label'] = -1 34 | comb = pd.concat([train_cp, test_cp]) 35 | 36 | comb['q1_hash'] = comb['q1'].map(questions_dict) 37 | comb['q2_hash'] = comb['q2'].map(questions_dict) 38 | 39 | train_comb = comb[comb['label'] >= 0][['q1_hash', 'q2_hash']] 40 | test_comb = comb[comb['label'] == -1][['q1_hash', 'q2_hash']] 41 | 42 | train_orig = pd.concat([train_orig,train_comb], axis=1) 43 | test_orig = pd.concat([test_orig,test_comb], axis=1) 44 | 45 | return train_orig,test_orig 46 | 47 | def adj_feat_worker(data,FG,suffix): 48 | def get_weights_adj(x): 49 | q1 = x['q1'] 50 | q2 = x['q2'] 51 | q1_adj = set(FG[q1]) 52 | q2_adj = set(FG[q2]) 53 | 54 | q1_or_q2 = q1_adj | q2_adj 55 | total_weight = 0 56 | for node in q1_or_q2: 57 | if node in FG[q1]: 58 | total_weight+=FG.get_edge_data(q1,node)['weight'] 59 | if node in FG[q2]: 60 | total_weight+=FG.get_edge_data(q2,node)['weight'] 61 | x['q1q2_union'+suffix] = total_weight 62 | 63 | total_weight = 0 64 | q1_and_q2 = q1_adj & q2_adj 65 | for node in q1_and_q2: 66 | if node in FG[q1]: 67 | total_weight += FG.get_edge_data(q1, node)['weight'] 68 | if node in FG[q2]: 69 | total_weight += FG.get_edge_data(q2, node)['weight'] 70 | 
x['q1q2_inter' + suffix] = total_weight 71 | 72 | return x 73 | 74 | data = data.progress_apply(get_weights_adj, axis=1, raw=True) 75 | return data[['q1q2_inter' + suffix,'q1q2_union'+suffix]] 76 | 77 | def get_shortest_path_worker(data,FG,suffix): 78 | 79 | def get_shortest_path(x): 80 | q1 = x['q1'] 81 | q2 = x['q2'] 82 | w = FG.get_edge_data(q1, q2)['weight'] 83 | FG.remove_edge(q1, q2) 84 | try: 85 | res = nx.dijkstra_path_length(FG, q1, q2) 86 | except: 87 | res = 0 88 | FG.add_edge(q1, q2, weight=w) 89 | x['shortest_path'+suffix] = res 90 | return x 91 | data = data.progress_apply(get_shortest_path, axis=1, raw=True) 92 | return data['shortest_path'+suffix] 93 | 94 | def graph_feature(train,test,use_label,aug=None): 95 | 96 | def q_weight(data,FG,suffix): 97 | all_q_weights = {k: sum([x[1].get('weight') for x in FG[k].items()]) for k in FG.nodes} 98 | data['q1_num_adj' + suffix] = data['q1'].map(all_q_weights) 99 | data['q2_num_adj' + suffix] = data['q2'].map(all_q_weights) 100 | return data 101 | 102 | def multi_process(data,FG,suffix,feat_f): 103 | 104 | num_cpu = mlp.cpu_count() 105 | pool = mlp.Pool(num_cpu) 106 | 107 | aver_t = int(len(data) / num_cpu) + 1 108 | results = [] 109 | for i in range(num_cpu): 110 | result = pool.apply_async(feat_f,args=(data.iloc[i*aver_t:(i+1)*aver_t],FG,suffix)) 111 | results.append(result) 112 | pool.close() 113 | pool.join() 114 | 115 | feat = [] 116 | for result in results: 117 | feat.append(result.get()) 118 | feat = pd.concat(feat,axis=0) 119 | data = pd.concat([data,feat],axis=1) 120 | 121 | return data 122 | 123 | def pagerank(data,FG,suffix): 124 | pr = nx.pagerank(FG, alpha=0.85) 125 | data['q1_pr' + suffix] = data['q1'].map(pr) 126 | data['q2_pr' + suffix] = data['q2'].map(pr) 127 | return data 128 | 129 | if use_label: 130 | suffix = '_w' 131 | else: 132 | suffix = '' 133 | if aug is not None: 134 | if use_label: 135 | aug['y_pre'] = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['y_pre'])['y_pre'] 136 | else: 137 | aug['y_pre'] = 1.0 138 | 139 | if use_label: 140 | train['y_pre'] = pd.read_csv('./data/tr_graph_weight.csv')['y_pre'] 141 | test['y_pre'] = pd.read_csv('./data/te_graph_weight.csv')['y_pre'] 142 | else: 143 | train['y_pre'] = 1.0 144 | test['y_pre'] = 1.0 145 | 146 | if aug is not None: 147 | data = pd.concat([train, test,aug], ignore_index=True) 148 | else: 149 | data = pd.concat([train, test],ignore_index=True) 150 | 151 | FG = nx.Graph() 152 | FG.add_weighted_edges_from(data[['q1','q2','y_pre']].values) 153 | 154 | data = pagerank(data,FG,suffix) 155 | data = q_weight(data,FG,suffix) 156 | 157 | if use_label: 158 | data = multi_process(data, FG, suffix, get_shortest_path_worker) 159 | data = multi_process(data,FG,suffix,adj_feat_worker) 160 | 161 | data.drop(['y_pre'],inplace=True,axis=1) 162 | 163 | if aug is not None: 164 | train = data.iloc[:train.shape[0]].reset_index(drop=True) 165 | test = data.iloc[train.shape[0]:train.shape[0]+test.shape[0]].reset_index(drop=True) 166 | aug = data.iloc[train.shape[0]+test.shape[0]:].reset_index(drop=True) 167 | return train, test,aug 168 | else: 169 | train = data.iloc[:train.shape[0]].reset_index( drop=True) 170 | test = data.iloc[train.shape[0]:].reset_index( drop=True) 171 | return train,test 172 | 173 | def svd_graph(train,test,use_label,aug=None): 174 | from scipy.sparse import coo_matrix,save_npz 175 | 176 | if aug is not None: 177 | if use_label: 178 | aug['y_pre'] = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['y_pre'])['y_pre'] 179 | else: 180 | aug['y_pre'] 
= 1.0 181 | 182 | if use_label: 183 | train['y_pre'] = pd.read_csv('./data/tr_graph_weight.csv')['y_pre'] 184 | test['y_pre'] = pd.read_csv('./data/te_graph_weight.csv')['y_pre'] 185 | else: 186 | train['y_pre'] = 1.0 187 | test['y_pre'] = 1.0 188 | if aug is not None: 189 | all_samples = pd.concat([train,test,aug]).reset_index(drop=True)[['q1','q2','y_pre']] 190 | else: 191 | all_samples = pd.concat([train, test]).reset_index(drop=True)[['q1', 'q2', 'y_pre']] 192 | questions = all_samples['q1'].append(all_samples['q2']).drop_duplicates().reset_index(drop=True) 193 | 194 | 195 | q2i = pd.Series(questions.index.values, index=questions.values).to_dict() 196 | i2q = questions.to_dict() 197 | 198 | print('get coo matrix') 199 | row = [i for i in range(len(q2i))] 200 | col = [i for i in range(len(q2i))] 201 | value = [1 for i in range(len(q2i))] 202 | # row = [] 203 | # col = [] 204 | # value = [] 205 | for q1,q2,w in all_samples.values: 206 | row.append(q2i[q1]) 207 | col.append(q2i[q2]) 208 | value.append(w) 209 | 210 | row.append(q2i[q2]) 211 | col.append(q2i[q1]) 212 | value.append(w) 213 | 214 | qmatrix = coo_matrix((value, (row,col)), shape=(len(q2i),len(q2i))) 215 | # save_npz('./data/q_adj_matrix.npz', qmatrix) 216 | 217 | print('svd ...') 218 | from config import n_components 219 | svd = TruncatedSVD(n_components=n_components,algorithm='arpack',n_iter=100) 220 | q_matrix = svd.fit_transform(qmatrix) 221 | 222 | total_ratio = [] 223 | ratio = 0 224 | for i in svd.explained_variance_ratio_: 225 | ratio += i 226 | total_ratio.append(ratio) 227 | print(total_ratio) 228 | 229 | q_matrix[q_matrix<1e-5] = 0 230 | print(np.sum(q_matrix==0)/(q_matrix.shape[0]*q_matrix.shape[1])) 231 | q_matrix = pd.DataFrame(q_matrix,columns=['feat'+str(i) for i in range(n_components)]) 232 | q_matrix['qid'] = list(range(len(q2i))) 233 | q_matrix['qid'] = q_matrix['qid'].map(i2q) 234 | 235 | q_matrix.to_csv('./data/q_matrix_v2.csv',index=False) 236 | 237 | train.drop(['y_pre'],inplace=True,axis=1) 238 | test.drop(['y_pre'], inplace=True,axis=1) 239 | if aug is not None: 240 | aug.drop(['y_pre'],inplace=True,axis=1) 241 | return q_matrix 242 | 243 | def num_same_w(data): 244 | def num_of_common(x): 245 | x['words_common'] = len(set(x['words_x']) & set(x['words_y'])) 246 | x['chars_common'] = len(set(x['chars_x']) & set(x['chars_y'])) 247 | return x 248 | return data.progress_apply(num_of_common,axis=1)[['words_common','chars_common']] 249 | 250 | def lcs_worker(data): 251 | def lcs_length(a, b): 252 | table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)] 253 | for i, ca in enumerate(a, 1): 254 | for j, cb in enumerate(b, 1): 255 | table[i][j] = ( 256 | table[i - 1][j - 1] + 1 if ca == cb else 257 | max(table[i][j - 1], table[i - 1][j])) 258 | return table[-1][-1] 259 | def lcs_feat(x): 260 | x['lcs_words'] = lcs_length(x['words_x'],x['words_y']) 261 | x['lcs_chars'] = lcs_length(x['chars_x'],x['chars_y']) 262 | return x 263 | 264 | return data.progress_apply(lcs_feat, axis=1)[['lcs_words','lcs_chars']] 265 | 266 | def edit_distance(data): 267 | from pyxdameraulevenshtein import damerau_levenshtein_distance 268 | def edit_feat(x): 269 | x['edit_words'] = damerau_levenshtein_distance(x['words_x'],x['words_y']) 270 | x['edit_chars'] = damerau_levenshtein_distance(x['chars_x'],x['chars_y']) 271 | return x 272 | 273 | return data.progress_apply(edit_feat, axis=1)[['edit_words','edit_chars']] 274 | 275 | 276 | def distance_feat(train,test,aug=None): 277 | 278 | def multi_process(data,feat_f): 279 | 280 | num_cpu = 
mlp.cpu_count() 281 | pool = mlp.Pool(num_cpu) 282 | 283 | aver_t = int(len(data) / num_cpu) + 1 284 | results = [] 285 | for i in range(num_cpu): 286 | result = pool.apply_async(feat_f,args=(data.iloc[i*aver_t:(i+1)*aver_t],)) 287 | results.append(result) 288 | pool.close() 289 | pool.join() 290 | 291 | feat = [] 292 | for result in results: 293 | feat.append(result.get()) 294 | feat = pd.concat(feat,axis=0) 295 | data = pd.concat([data,feat],axis=1) 296 | 297 | return data 298 | 299 | question = pd.read_csv('./data/question.csv') 300 | 301 | if aug is not None: 302 | data = pd.concat([train, test,aug], ignore_index=True) 303 | else: 304 | data = pd.concat([train, test], ignore_index=True) 305 | data = pd.merge(data, question, left_on=['q1'], right_on=['qid'], how='left') 306 | data = pd.merge(data, question, left_on=['q2'], right_on=['qid'], how='left') 307 | data.drop(['qid_x','qid_y'],axis=1,inplace=True) 308 | '''训练集长度''' 309 | 310 | data['q1_word_len'] = data['words_x'].progress_apply(lambda x: len(x.split())) 311 | data['q2_word_len'] = data['words_y'].progress_apply(lambda x: len(x.split())) 312 | data['q1_char_len'] = data['chars_x'].progress_apply(lambda x: len(x.split())) 313 | data['q2_char_len'] = data['chars_y'].progress_apply(lambda x: len(x.split())) 314 | 315 | data['words_x'] = data['words_x'].str.split() 316 | data['words_y'] = data['words_y'].str.split() 317 | data['chars_x'] = data['chars_x'].str.split() 318 | data['chars_y'] = data['chars_y'].str.split() 319 | print(data.columns) 320 | data = multi_process(data,lcs_worker) 321 | print(data.columns) 322 | data = multi_process(data,edit_distance) 323 | print(data.columns) 324 | data = multi_process(data, num_same_w) 325 | print(data.columns) 326 | 327 | 328 | data.drop(['chars_x','chars_y','words_x','words_y'],axis=1,inplace=True) 329 | 330 | if aug is not None: 331 | train = data.iloc[:train.shape[0]].reset_index(drop=True) 332 | test = data.iloc[train.shape[0]:train.shape[0]+test.shape[0]].reset_index(drop=True) 333 | aug = data.iloc[train.shape[0]+test.shape[0]:].reset_index(drop=True) 334 | return train, test,aug 335 | else: 336 | train = data.iloc[:train.shape[0]].reset_index( drop=True) 337 | test = data.iloc[train.shape[0]:].reset_index( drop=True) 338 | return train,test 339 | 340 | 341 | 342 | train = pd.read_csv('./data/train.csv',usecols=['q1','q2','label']) 343 | test = pd.read_csv('./data/test.csv',usecols=['q1','q2']) 344 | aug = pd.read_csv('./data/aug_data_with_pre.csv',usecols=['q1','q2','label']) 345 | labels = train['label'] 346 | aug_label = aug['label'] 347 | train.drop(['label'],axis=1,inplace=True) 348 | aug.drop(['label'],axis=1,inplace=True) 349 | 350 | print(aug.shape) 351 | print(train.shape) 352 | print(test.shape) 353 | 354 | print(aug) 355 | q_matrix = svd_graph(train,test,False,aug) 356 | 357 | # train,test = hash_q(train,test) 358 | train,test,aug = graph_feature(train,test,False,aug) 359 | print(aug) 360 | train,test,aug = graph_feature(train,test,True,aug) 361 | 362 | # train,test,aug = distance_feat(train,test,aug) 363 | train['label'] = labels 364 | aug['label'] = aug_label 365 | 366 | 367 | 368 | train.to_csv('./data/train_v2.csv',index=False) 369 | test.to_csv('./data/test_v2.csv',index=False) 370 | aug.to_csv('./data/aug_data_filter.csv',index=False) 371 | # train = pd.merge(train,q_matrix, left_on=['q1'], right_on=['qid'], how='left') 372 | # test = pd.merge(test,q_matrix, left_on=['q1'], right_on=['qid'], how='left') 373 | 374 | test['label'] = pd.read_csv('147037.csv')['y_pre'] 
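# Note: '147037.csv' appears to be a previous submission file (the name looks like a leaderboard
# log-loss score); its predictions are used as pseudo-labels for the test set so that the
# feature/label correlations printed below can be compared between train and test.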
375 | 376 | 377 | print(train.corr()) 378 | print(test.corr()) -------------------------------------------------------------------------------- /code/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from glob import glob 3 | from tqdm import tqdm 4 | import tensorflow as tf 5 | import keras.backend.tensorflow_backend as KTF 6 | config = tf.ConfigProto() 7 | config.gpu_options.allow_growth=True 8 | session = tf.Session(config=config) 9 | KTF.set_session(session) 10 | import keras.backend as K 11 | import multiprocessing as mlp 12 | 13 | def ensemble(model_name,te_word,te_char,embedding_matrix_word,emembedding_matrix_char): 14 | from nn import rnnword, aggmodel, esim, attention, rnn_res 15 | 16 | if model_name == 'rnnword': 17 | get_model = rnnword 18 | elif model_name == 'aggmodel': 19 | pass 20 | elif model_name == 'esim': 21 | get_model = esim 22 | elif model_name == 'attention': 23 | get_model = attention 24 | elif model_name == 'res': 25 | get_model = rnn_res 26 | else: 27 | raise RuntimeError("don't have this model") 28 | 29 | path = './weight_' + model_name + '/' 30 | 31 | results = [] 32 | m_char = get_model(emembedding_matrix_char,False) 33 | m_word = get_model(embedding_matrix_word,True) 34 | 35 | for model_path in tqdm(glob(path+'*.h5')): 36 | 37 | if "2018-07-15_16:15:17" not in model_path: 38 | continue 39 | if 'chars_True'in model_path or 'words_True' in model_path: 40 | ense_w = 7 41 | elif 'chars_False' in model_path: 42 | ense_w = 3 43 | elif 'words_False' in model_path: 44 | ense_w = 4 45 | else: 46 | raise RuntimeError("error model") 47 | 48 | if 'char' in model_path: 49 | m_char.load_weights(model_path) 50 | results.append((m_char.predict(te_char,batch_size=1024), ense_w)) 51 | else: 52 | m_word.load_weights(model_path) 53 | results.append((m_word.predict(te_word,batch_size=1024),ense_w)) 54 | 55 | K.clear_session() 56 | tf.reset_default_graph() 57 | 58 | submit = 0 59 | total_w = 0 60 | for y_pred,ense_w in results: 61 | submit += ense_w*y_pred 62 | total_w += ense_w 63 | 64 | return submit/total_w 65 | 66 | 67 | 68 | if __name__ == '__main__': 69 | from readdata import read_data 70 | 71 | _, te_word, embedding_matrix_word,__ = read_data('words', data_aug=False) 72 | _, te_char, embedding_matrix_char,__ = read_data('chars', data_aug=False) 73 | 74 | submit_atten = ensemble('esim',te_word,te_char,embedding_matrix_word,embedding_matrix_char) 75 | 76 | submit = pd.DataFrame() 77 | submit['y_pre'] = list(submit_atten[:, 0]) 78 | submit.to_csv('atten.csv', index=False) 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /code/nn.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import * 3 | from keras.regularizers import l2 4 | from keras.callbacks import Callback, ModelCheckpoint 5 | from keras.utils.data_utils import get_file 6 | from keras import backend as K 7 | from sklearn.model_selection import train_test_split 8 | from keras.optimizers import Nadam,RMSprop 9 | import tensorflow as tf 10 | from keras.initializers import VarianceScaling 11 | from itertools import combinations 12 | from keras.constraints import non_neg,min_max_norm 13 | 14 | 15 | def co_attention(q1, q2): 16 | 17 | dense_w = TimeDistributed(Dense(1)) 18 | atten = Lambda(lambda x: K.batch_dot(x[0], x[1]))([q1, Permute((2, 1))(q2)]) # 15 * 15 19 | 20 | atten_1 = dense_w(atten) 21 | 
atten_1 = Flatten()(atten_1) 22 | atten_1 = Activation('softmax')(atten_1) 23 | atten_1 = Reshape((1,-1))(atten_1) 24 | 25 | atten_2 = dense_w(Permute((2, 1))(atten)) 26 | atten_2 = Flatten()(atten_2) 27 | atten_2 = Activation('softmax')(atten_2) 28 | atten_2 = Reshape((1,-1))(atten_2) 29 | 30 | q1 = Lambda(lambda x: K.batch_dot(x[0], x[1]))([atten_1,q1]) # 1*300 31 | q1 = Flatten()(q1) 32 | q2 = Lambda(lambda x: K.batch_dot(x[0], x[1]))([atten_2,q2]) # 1*300 33 | q2 = Flatten()(q2) 34 | return q1, q2 35 | 36 | def unchanged_shape(input_shape): 37 | "Function for Lambda layer" 38 | return input_shape 39 | 40 | def soft_attention_alignment(input_1, input_2): 41 | attention = Dot(axes=-1)([input_1, input_2]) 42 | w_att_1 = Lambda(lambda x: K.softmax(x, axis=1), 43 | output_shape=unchanged_shape)(attention) 44 | w_att_2 = Permute((2, 1))(Lambda(lambda x: K.softmax(x, axis=2), 45 | output_shape=unchanged_shape)(attention)) 46 | in1_aligned = Dot(axes=1)([w_att_1, input_1]) 47 | in2_aligned = Dot(axes=1)([w_att_2, input_2]) 48 | return in1_aligned, in2_aligned 49 | 50 | def norm_layer(x, axis=1): 51 | return (x - K.mean(x, axis=axis, keepdims=True)) / K.std(x, axis=axis, keepdims=True) 52 | 53 | def distance(q1,q2,dist,normlize=False): 54 | if normlize: 55 | q1 = Lambda(norm_layer)(q1) 56 | q2 = Lambda(norm_layer)(q2) 57 | 58 | if dist == 'cos': 59 | return multiply([q1,q2]) 60 | 61 | elif dist == 'h_mean': 62 | def dice(x): 63 | return x[0]*x[1]/(K.sum(K.abs(x[0]),axis=1,keepdims=True)+K.sum(K.abs(x[1]),axis=1,keepdims=True)) 64 | return Lambda(dice)([q1,q2]) 65 | 66 | elif dist == 'dice': 67 | def dice(x): 68 | return x[0]*x[1]/(K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True)) 69 | return Lambda(dice)([q1,q2]) 70 | 71 | elif dist == 'jaccard': 72 | def jaccard(x): 73 | return x[0]*x[1]/( 74 | K.sum(x[0]**2,axis=1,keepdims=True)+ 75 | K.sum(x[1]**2,axis=1,keepdims=True)- 76 | K.sum(K.abs(x[0]*x[1]),axis=1,keepdims=True)) 77 | return Lambda(jaccard)([q1,q2]) 78 | elif dist == 'jac_add': 79 | def jac_add(x): 80 | a = K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True)-K.sum(K.abs(x[0]*x[1]),axis=1,keepdims=True) 81 | b = x[0]+x[1] 82 | return b/a 83 | return Lambda(jac_add)([q1,q2]) 84 | elif dist == 'dice_add': 85 | def dice_add(x): 86 | a = K.sum(x[0]**2,axis=1,keepdims=True)+K.sum(x[1]**2,axis=1,keepdims=True) 87 | b = x[0]+x[1] 88 | return b/a 89 | return Lambda(dice_add)([q1,q2]) 90 | 91 | def pool_corr(q1,q2,pool_way,dist): 92 | if pool_way == 'max': 93 | pool = GlobalMaxPooling1D() 94 | elif pool_way == 'ave': 95 | pool = GlobalAveragePooling1D() 96 | else: 97 | raise RuntimeError("don't have this pool way") 98 | 99 | q1 = pool(q1) 100 | q2 = pool(q2) 101 | 102 | merged = distance(q1,q2,dist,normlize=True) 103 | 104 | 105 | return merged 106 | 107 | def weight_ave(q1,q2): 108 | 109 | down = TimeDistributed(Dense(1,use_bias=False)) 110 | 111 | q1 = down(Permute((2,1))(q1)) 112 | q1 = Flatten()(q1) 113 | q1 = Lambda(norm_layer)(q1) 114 | q2 = down(Permute((2,1))(q2)) 115 | q2 = Flatten()(q2) 116 | q2 = Lambda(norm_layer)(q2) 117 | merged = multiply([q1, q2]) 118 | return merged 119 | 120 | def simility_vec(q1,q2): 121 | simi = Lambda(lambda x: K.batch_dot(x[0], x[1]))([q1, Permute((2, 1))(q2)]) 122 | simi = Reshape((-1,))(simi) 123 | return simi 124 | 125 | def rnnword(word_embedding_matrix,use_word): 126 | if use_word: 127 | from config import MAX_NUM_WORDS 128 | text_len = MAX_NUM_WORDS 129 | else: 130 | from config import MAX_NUM_CHARS 131 | 
text_len = MAX_NUM_CHARS 132 | 133 | question1 = Input(shape=(text_len,),name='q1') 134 | question2 = Input(shape=(text_len,),name='q2') 135 | 136 | 137 | 138 | embedd_word = Embedding( 139 | len(word_embedding_matrix), 140 | word_embedding_matrix.shape[1], 141 | weights=[word_embedding_matrix], 142 | input_length=text_len, 143 | trainable=True,) 144 | 145 | 146 | gru_dim1 = 384 147 | gru_dim2 = 256 148 | 149 | 150 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 151 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True),merge_mode='sum') 152 | 153 | 154 | norm = BatchNormalization() 155 | q1 = embedd_word(question1) 156 | q1 = norm(q1) 157 | q1 = SpatialDropout1D(0.2)(q1) 158 | 159 | q2 = embedd_word(question2) 160 | q2 = norm(q2) 161 | q2 = SpatialDropout1D(0.2)(q2) 162 | 163 | q1_1 = gru_w(q1) 164 | q2_1 = gru_w(q2) 165 | 166 | q1 = gru2_w(q1_1) 167 | q2 = gru2_w(q2_1) 168 | 169 | merged_max = pool_corr(q1,q2,'max','jaccard') 170 | merged_ave = pool_corr(q1,q2,'ave','jaccard') 171 | 172 | from config import n_components 173 | q1_g = Input(shape=(n_components,),name='q1node') 174 | q2_g = Input(shape=(n_components,),name='q2node') 175 | 176 | 177 | norm = BatchNormalization() 178 | q1_node = norm(q1_g) 179 | q2_node = norm(q2_g) 180 | 181 | fc = Dense(units=2) 182 | act = PReLU() 183 | q1_node = fc(q1_node) 184 | q1_node = act(q1_node) 185 | q2_node = fc(q2_node) 186 | q2_node = act(q2_node) 187 | 188 | node_vec = multiply([q1_node,q2_node]) 189 | 190 | graph_f = Input(shape=(11,),name='gf') 191 | gf = BatchNormalization()(graph_f) 192 | gf = Dropout(0.2)(gf) 193 | 194 | merged = concatenate([merged_ave,merged_max]) 195 | merged = Dense(512,activation='relu')(merged) 196 | merged = concatenate([merged, gf,node_vec]) 197 | merged = Dense(512,activation='relu')(merged) 198 | output = Dense(1, activation='sigmoid')(merged) 199 | 200 | lr=0.0008 201 | 202 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 203 | 204 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 205 | print(lr) 206 | 207 | return model 208 | 209 | def aggmodel(word_embedding_matrix,char_embedding_matrix): 210 | 211 | def prepocess(q1,q2,embedd): 212 | norm = BatchNormalization() 213 | q1 = embedd(q1) 214 | q1 = norm(q1) 215 | q1 = SpatialDropout1D(0.2)(q1) 216 | 217 | q2 = embedd(q2) 218 | q2 = norm(q2) 219 | q2 = SpatialDropout1D(0.2)(q2) 220 | return q1,q2 221 | 222 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 223 | 224 | 225 | word1 = Input(shape=(MAX_NUM_WORDS,)) 226 | word2 = Input(shape=(MAX_NUM_WORDS,)) 227 | char1 = Input(shape=(MAX_NUM_CHARS,)) 228 | char2 = Input(shape=(MAX_NUM_CHARS,)) 229 | 230 | 231 | embedd_word = Embedding( 232 | len(word_embedding_matrix), 233 | word_embedding_matrix.shape[1], 234 | weights=[word_embedding_matrix], 235 | input_length=MAX_NUM_WORDS, 236 | trainable=True) 237 | embedd_char = Embedding( 238 | len(char_embedding_matrix), 239 | char_embedding_matrix.shape[1], 240 | weights=[char_embedding_matrix], 241 | input_length=MAX_NUM_CHARS, 242 | trainable=True) 243 | 244 | gru_dim1 = 384 245 | gru_dim2 = 256 246 | 247 | 248 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 249 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True,),merge_mode='sum') 250 | 251 | gru_wc = Bidirectional(CuDNNGRU(gru_dim1, return_sequences=True), merge_mode='sum') 252 | gru2_wc = Bidirectional(CuDNNGRU(gru_dim2, 
return_sequences=True), merge_mode='sum') 253 | 254 | q1,q2 = prepocess(word1,word2,embedd_word) 255 | qc1,qc2 = prepocess(char1,char2,embedd_char) 256 | 257 | q1 = gru_w(q1) 258 | q2 = gru_w(q2) 259 | qc1 = gru_wc(qc1) 260 | qc2 = gru_wc(qc2) 261 | 262 | q1 = gru2_w(q1) 263 | q2 = gru2_w(q2) 264 | qc1 = gru2_wc(qc1) 265 | qc2 = gru2_wc(qc2) 266 | 267 | merged_max1 = pool_corr(q1,qc2,'max') 268 | merged_max2 = pool_corr(qc1,q2,'max') 269 | merged_ave1 = pool_corr(q1,qc2,'ave') 270 | merged_ave2 = pool_corr(qc1,q2,'ave') 271 | 272 | merged_max3 = pool_corr(q1,q2, 'max') 273 | merged_max4 = pool_corr(qc1,qc2, 'max') 274 | merged_ave3 = pool_corr(q1,q2, 'ave') 275 | merged_ave4 = pool_corr(qc1,qc2, 'ave') 276 | 277 | 278 | merged = concatenate([merged_max1,merged_max2,merged_max3,merged_max4, 279 | merged_ave1,merged_ave2,merged_ave3,merged_ave4]) 280 | merged = Dense(512,activation='relu')(merged) 281 | # merged = Dropout(0.2)(merged) 282 | merged = Dense(512,activation='relu')(merged) 283 | # merged = Dropout(0.2)(merged) 284 | output = Dense(1, activation='sigmoid')(merged) 285 | 286 | 287 | 288 | lr=0.0008 289 | 290 | 291 | model = Model(inputs=[word1,word2,char1,char2], outputs=output) 292 | 293 | # model = multi_gpu_model(model,gpus=4) 294 | 295 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 296 | 297 | # model.load_weights("./data/weights_best_0.0008.hdf5") 298 | print(lr) 299 | 300 | return model 301 | 302 | def esim(word_embedding_matrix, use_word): 303 | if use_word: 304 | from config import MAX_NUM_WORDS 305 | text_len = MAX_NUM_WORDS 306 | else: 307 | from config import MAX_NUM_CHARS 308 | text_len = MAX_NUM_CHARS 309 | 310 | q1 = Input(name='q1', shape=(text_len,)) 311 | q2 = Input(name='q2', shape=(text_len,)) 312 | 313 | embedding = Embedding( 314 | len(word_embedding_matrix), 315 | word_embedding_matrix.shape[1], 316 | weights=[word_embedding_matrix], 317 | input_length=text_len, 318 | trainable=True) 319 | 320 | bn = BatchNormalization() 321 | q1_embed = bn(embedding(q1)) 322 | q1_embed = SpatialDropout1D(0.2)(q1_embed) 323 | q2_embed = bn(embedding(q2)) 324 | q2_embed = SpatialDropout1D(0.2)(q2_embed) 325 | 326 | encode = Bidirectional(CuDNNLSTM(384,return_sequences=True), merge_mode='sum') 327 | q1_encoded = encode(q1_embed) 328 | q2_encoded = encode(q2_embed) 329 | 330 | q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded) 331 | 332 | q1_combined = Concatenate()([q1_encoded, q2_aligned, multiply([q1_encoded, q2_aligned])]) 333 | q2_combined = Concatenate()([q2_encoded, q1_aligned, multiply([q2_encoded, q1_aligned])]) 334 | 335 | compose = Bidirectional(CuDNNLSTM(384,return_sequences=True), merge_mode='sum') 336 | q1_compare = compose(q1_combined) 337 | q2_compare = compose(q2_combined) 338 | 339 | 340 | merged_ave = pool_corr(q1_compare,q2_compare,'ave','dice') 341 | merged_max = pool_corr(q1_compare,q2_compare,'max','dice') 342 | 343 | from config import n_components 344 | q1_g = Input(shape=(n_components,), name='q1node') 345 | q2_g = Input(shape=(n_components,), name='q2node') 346 | 347 | norm = BatchNormalization() 348 | q1_node = norm(q1_g) 349 | q2_node = norm(q2_g) 350 | 351 | fc = Dense(units=2) 352 | act = PReLU() 353 | q1_node = fc(q1_node) 354 | q1_node = act(q1_node) 355 | q2_node = fc(q2_node) 356 | q2_node = act(q2_node) 357 | 358 | node_vec = multiply([q1_node, q2_node]) 359 | 360 | graph_f = Input(shape=(11,), name='gf') 361 | gf = BatchNormalization()(graph_f) 362 | gf = 
Dropout(0.2)(gf) 363 | 364 | merged = Concatenate()([merged_max, merged_ave]) 365 | 366 | dense = Dense(512, activation='relu')(merged) 367 | dense = concatenate([dense,gf,node_vec]) 368 | dense = Dense(512, activation='relu')(dense) 369 | out_ = Dense(1, activation='sigmoid')(dense) 370 | lr = 0.0008 371 | 372 | model = Model(inputs=[q1, q2, graph_f,q1_g,q2_g], outputs=out_) 373 | model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['binary_crossentropy', 'accuracy']) 374 | return model 375 | 376 | def attention(word_embedding_matrix,use_word): 377 | if use_word: 378 | from config import MAX_NUM_WORDS 379 | text_len = MAX_NUM_WORDS 380 | else: 381 | from config import MAX_NUM_CHARS 382 | text_len = MAX_NUM_CHARS 383 | 384 | question1 = Input(shape=(text_len,),name='q1') 385 | question2 = Input(shape=(text_len,),name='q2') 386 | 387 | 388 | 389 | embedd_word = Embedding( 390 | len(word_embedding_matrix), 391 | word_embedding_matrix.shape[1], 392 | weights=[word_embedding_matrix], 393 | input_length=text_len, 394 | trainable=True) 395 | 396 | gru_dim1 = 300 397 | gru_dim2 = 300 398 | 399 | gru_w = Bidirectional(CuDNNLSTM(gru_dim1,return_sequences=True),merge_mode='sum') 400 | gru2_w = Bidirectional(CuDNNLSTM(gru_dim2,return_sequences=True),merge_mode='sum') 401 | 402 | 403 | norm = BatchNormalization() 404 | q1 = embedd_word(question1) 405 | q1 = norm(q1) 406 | q1 = SpatialDropout1D(0.2)(q1) 407 | 408 | q2 = embedd_word(question2) 409 | q2 = norm(q2) 410 | q2 = SpatialDropout1D(0.2)(q2) 411 | 412 | q1 = gru_w(q1) 413 | q2 = gru_w(q2) 414 | 415 | q1 = gru2_w(q1) 416 | q2 = gru2_w(q2) 417 | 418 | q1_1,q2_2 = co_attention(q1,q2) 419 | merged_1 = distance(q1_1,q2_2,'dice', normlize=True) 420 | merged_3 = pool_corr(q1,q2,'max','dice') 421 | merged_4 = distance(q1_1,q2_2,'dice_add',normlize=True) 422 | 423 | from config import n_components 424 | q1_g = Input(shape=(n_components,),name='q1node') 425 | q2_g = Input(shape=(n_components,),name='q2node') 426 | 427 | norm = BatchNormalization() 428 | q1_node = norm(q1_g) 429 | q2_node = norm(q2_g) 430 | 431 | fc = Dense(units=2) 432 | act = PReLU() 433 | q1_node = fc(q1_node) 434 | q1_node = act(q1_node) 435 | q2_node = fc(q2_node) 436 | q2_node = act(q2_node) 437 | 438 | node_vec = multiply([q1_node,q2_node]) 439 | 440 | graph_f = Input(shape=(11,),name='gf') 441 | gf = BatchNormalization()(graph_f) 442 | gf = Dropout(0.2)(gf) 443 | 444 | merged = concatenate([merged_1,merged_3,merged_4]) 445 | merged = Dense(768,activation='relu')(merged) 446 | merged = Dropout(0.2)(merged) 447 | merged = concatenate([merged,gf,node_vec]) 448 | merged = Dense(768,activation='relu')(merged) 449 | output = Dense(1, activation='sigmoid')(merged) 450 | 451 | lr=0.0008 452 | 453 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 454 | 455 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 456 | print(lr) 457 | 458 | return model 459 | 460 | 461 | def rnn_res(word_embedding_matrix,use_word): 462 | if use_word: 463 | from config import MAX_NUM_WORDS 464 | text_len = MAX_NUM_WORDS 465 | else: 466 | from config import MAX_NUM_CHARS 467 | text_len = MAX_NUM_CHARS 468 | 469 | question1 = Input(shape=(text_len,),name='q1') 470 | question2 = Input(shape=(text_len,),name='q2') 471 | 472 | 473 | 474 | embedd_word = Embedding( 475 | len(word_embedding_matrix), 476 | word_embedding_matrix.shape[1], 477 | weights=[word_embedding_matrix], 478 | input_length=text_len, 479 | 
trainable=True) 480 | 481 | gru_dim1 = 300 482 | gru_dim2 = 300 483 | 484 | gru_w = Bidirectional(CuDNNLSTM(gru_dim1,return_sequences=True),merge_mode='sum') 485 | gru2_w = Bidirectional(CuDNNGRU(gru_dim2,return_sequences=True),merge_mode='sum') 486 | 487 | 488 | norm = BatchNormalization() 489 | q1 = embedd_word(question1) 490 | q1 = norm(q1) 491 | q1 = SpatialDropout1D(0.2)(q1) 492 | 493 | q2 = embedd_word(question2) 494 | q2 = norm(q2) 495 | q2 = SpatialDropout1D(0.2)(q2) 496 | 497 | q1_0 = gru_w(q1) 498 | q2_0 = gru_w(q2) 499 | 500 | q1 = gru2_w(q1_0) 501 | q2 = gru2_w(q2_0) 502 | 503 | merged_0 = pool_corr(q1_0,q2_0,'ave','jaccard') 504 | merged_1 = pool_corr(q1,q2,'ave','dice') 505 | merged_2 = pool_corr(q1_0, q2_0, 'max', 'jaccard') 506 | merged_3 = pool_corr(q1,q2,'max','dice') 507 | 508 | from config import n_components 509 | q1_g = Input(shape=(n_components,),name='q1node') 510 | q2_g = Input(shape=(n_components,),name='q2node') 511 | 512 | norm = BatchNormalization() 513 | q1_node = norm(q1_g) 514 | q2_node = norm(q2_g) 515 | 516 | fc = Dense(units=2) 517 | act = PReLU() 518 | q1_node = fc(q1_node) 519 | q1_node = act(q1_node) 520 | q2_node = fc(q2_node) 521 | q2_node = act(q2_node) 522 | 523 | node_vec = multiply([q1_node,q2_node]) 524 | 525 | graph_f = Input(shape=(11,),name='gf') 526 | gf = BatchNormalization()(graph_f) 527 | gf = Dropout(0.2)(gf) 528 | 529 | merged = concatenate([merged_1,merged_3,merged_2,merged_0]) 530 | merged = Dense(768,activation='relu')(merged) 531 | merged = concatenate([merged, gf,node_vec]) 532 | merged = Dense(768,activation='relu')(merged) 533 | output = Dense(1, activation='sigmoid')(merged) 534 | 535 | lr=0.0008 536 | 537 | model = Model(inputs=[question1,question2,graph_f,q1_g,q2_g], outputs=output) 538 | 539 | model.compile(loss='binary_crossentropy',optimizer=Nadam(lr),metrics=['binary_crossentropy','accuracy']) 540 | print(lr) 541 | 542 | return model 543 | -------------------------------------------------------------------------------- /code/readdata.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | # 文本处理 5 | from keras.preprocessing.text import Tokenizer 6 | from keras.preprocessing.sequence import pad_sequences 7 | from config import MAX_NB_WORDS 8 | from tqdm import tqdm 9 | from tool import get_samples 10 | from sklearn.preprocessing import LabelEncoder 11 | # 20890 12 | 13 | def get_embedding_matrix(word_index,file): 14 | embeddings_index = {} 15 | with open(file, 'r') as f: 16 | wordmat = f.read().split('\n') 17 | if wordmat[-1] == '': 18 | wordmat = wordmat[:-1] 19 | if wordmat[0] == '': 20 | wordmat = wordmat[1:] 21 | 22 | for line in tqdm(wordmat): 23 | wvec = line.strip('\n').strip(' ').split(' ') 24 | embeddings_index[wvec[0]] = np.asarray(wvec[1:], dtype='float') 25 | 26 | print('embedding', len(embeddings_index)) 27 | 28 | EMBEDDING_DIM = 300 29 | nb_words = min(MAX_NB_WORDS, len(word_index)) 30 | embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM)) 31 | for word, i in word_index.items(): 32 | if i > MAX_NB_WORDS: 33 | continue 34 | embedding_vector = embeddings_index.get(str(word).upper()) 35 | if embedding_vector is not None: 36 | embedding_matrix[i] = embedding_vector 37 | return embedding_matrix 38 | 39 | def read_data(use_data,file=None,data_aug=False): 40 | 41 | question = pd.read_csv('./data/question.csv') 42 | question = question[['qid', use_data]] 43 | 44 | 45 | if data_aug: 46 | train = pd.read_csv('./data/train.csv', 
usecols=['label', 'q1', 'q2']) 47 | samples = pd.read_csv('./data/aug_data_filter.csv',usecols=['label','q1','q2']) 48 | train = pd.concat([train,samples]).reset_index(drop=True) 49 | test = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 50 | else: 51 | train = pd.read_csv('./data/train.csv', usecols=['label', 'q1', 'q2']) 52 | test = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 53 | 54 | 55 | train = pd.merge(train, question, left_on=['q1'], right_on=['qid'], how='left') 56 | train = pd.merge(train, question, left_on=['q2'], right_on=['qid'], how='left') 57 | train = train[[use_data+'_x', use_data+'_y','label']] 58 | train.columns = ['q1', 'q2','label'] 59 | 60 | test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left') 61 | test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left') 62 | test = test[[use_data+'_x', use_data+'_y']] 63 | test.columns = ['q1', 'q2'] 64 | 65 | all = pd.concat([train, test]) 66 | 67 | # 分词 词转序列 68 | tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 69 | tokenizer.fit_on_texts(question[use_data]) 70 | 71 | word_index = tokenizer.word_index 72 | print(len(word_index)) 73 | 74 | q1_word_seq = tokenizer.texts_to_sequences(all['q1']) 75 | q2_word_seq = tokenizer.texts_to_sequences(all['q2']) 76 | 77 | if file is None: 78 | if use_data == 'words': 79 | file = './data/word_embed.txt' 80 | if use_data == 'chars': 81 | file = './data/char_embed.txt' 82 | word_embedding_matrix = get_embedding_matrix(word_index, file) 83 | 84 | 85 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 86 | if use_data == 'words': 87 | text_len = MAX_NUM_WORDS 88 | elif use_data == 'chars': 89 | text_len = MAX_NUM_CHARS 90 | else: 91 | raise RuntimeError('use data error') 92 | 93 | q1_data = pad_sequences(q1_word_seq,maxlen=text_len,truncating='post') 94 | q2_data = pad_sequences(q2_word_seq,maxlen=text_len,truncating='post') 95 | 96 | tr_q1 = q1_data[:train.shape[0]] 97 | tr_q2 = q2_data[:train.shape[0]] 98 | 99 | te_q1 = q1_data[train.shape[0]:] 100 | te_q2 = q2_data[train.shape[0]:] 101 | 102 | usecols = [ 103 | 'q1q2_union_w', 104 | 'q1q2_inter_w', 105 | 'q1_num_adj_w', 106 | 'q2_num_adj_w', 107 | 'q1q2_union', 108 | 'q1q2_inter', 109 | 'q1_num_adj', 110 | 'q2_num_adj', 111 | # 'q1_hash', 112 | # 'q2_hash', 113 | # 'q1q2_inter', 114 | 'shortest_path_w', 115 | # 'edit_words', 116 | # 'edit_chars', 117 | # 'lcs_words', 118 | # 'lcs_chars', 119 | # 'q1_word_len', 120 | # 'q2_word_len', 121 | # 'q1_char_len', 122 | # 'q2_char_len', 123 | # 'words_common', 124 | # 'chars_common', 125 | ] 126 | # if data_aug==False: 127 | usecols+=['q1_pr_w','q2_pr_w'] 128 | 129 | tr = {} 130 | tr['q1'] = tr_q1 131 | tr['q2'] = tr_q2 132 | te = {} 133 | te['q1'] = te_q1 134 | te['q2'] = te_q2 135 | 136 | if data_aug: 137 | tr['gf'] = pd.concat([pd.read_csv('./data/train.csv',usecols=usecols), 138 | pd.DataFrame(np.zeros((len(samples),11)),columns=usecols)]).values 139 | te['gf'] = pd.read_csv('./data/test.csv', usecols=usecols).values 140 | else: 141 | tr['gf'] = pd.read_csv('./data/train.csv', usecols=usecols).values 142 | te['gf'] = pd.read_csv('./data/test.csv', usecols=usecols).values 143 | 144 | 145 | if data_aug: 146 | q_tr = pd.concat([pd.read_csv('./data/train.csv',usecols=['q1','q2']), 147 | pd.read_csv('./data/aug_data_filter.csv',usecols=['q1','q2'])]).reset_index(drop=True) 148 | q_te = pd.read_csv('./data/test.csv', usecols=['q1', 'q2']) 149 | else: 150 | q_tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']) 151 | q_te = 
pd.read_csv('./data/test.csv',usecols=['q1','q2']) 152 | 153 | if data_aug: 154 | questions = pd.read_csv('./data/q_matrix.csv') 155 | else: 156 | questions = pd.read_csv('./data/q_matrix.csv') 157 | 158 | from config import n_components 159 | feat = ["feat"+str(i) for i in range(n_components)] 160 | 161 | tr['q1node'] = pd.merge(q_tr, questions, left_on=['q1'], right_on=['qid'], how='left').loc[:,feat].values 162 | tr['q2node'] = pd.merge(q_tr, questions, left_on=['q2'], right_on=['qid'], how='left').loc[:,feat].values 163 | te['q1node'] = pd.merge(q_te, questions, left_on=['q1'], right_on=['qid'], how='left').loc[:,feat].values 164 | te['q2node'] = pd.merge(q_te, questions, left_on=['q2'], right_on=['qid'], how='left').loc[:,feat].values 165 | 166 | # q_embed = questions.loc[:,['feat'+str(i) for i in range(128)]].values 167 | 168 | return tr,te, word_embedding_matrix,train['label'] 169 | 170 | def save_data_tree(use_data,file=None): 171 | question = pd.read_csv('./data/question.csv') 172 | question = question[['qid', use_data]] 173 | 174 | train = pd.read_csv('./data/train.csv') 175 | test = pd.read_csv('./data/test.csv') 176 | train = pd.merge(train, question, left_on=['q1'], right_on=['qid'], how='left') 177 | train = pd.merge(train, question, left_on=['q2'], right_on=['qid'], how='left') 178 | train = train[[use_data + '_x', use_data + '_y', 'label']] 179 | train.columns = ['q1', 'q2', 'label'] 180 | 181 | test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left') 182 | test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left') 183 | test = test[[use_data + '_x', use_data + '_y']] 184 | test.columns = ['q1', 'q2'] 185 | 186 | all = pd.concat([train, test]) 187 | 188 | tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 189 | tokenizer.fit_on_texts(question[use_data]) 190 | 191 | word_index = tokenizer.word_index 192 | print(len(word_index)) 193 | 194 | q1_word_seq = tokenizer.texts_to_sequences(all['q1']) 195 | q2_word_seq = tokenizer.texts_to_sequences(all['q2']) 196 | 197 | if file is None: 198 | if use_data == 'words': 199 | file = './data/word_embed.txt' 200 | if use_data == 'chars': 201 | file = './data/char_embed.txt' 202 | embedding_matrix = get_embedding_matrix(word_index, file) 203 | 204 | from config import MAX_NUM_WORDS, MAX_NUM_CHARS 205 | if use_data == 'words': 206 | text_len = MAX_NUM_WORDS 207 | elif use_data == 'chars': 208 | text_len = MAX_NUM_CHARS 209 | else: 210 | raise RuntimeError('use data error') 211 | 212 | q1_data = pad_sequences(q1_word_seq, maxlen=text_len, truncating='post') 213 | q2_data = pad_sequences(q2_word_seq, maxlen=text_len, truncating='post') 214 | 215 | q1_matrix = np.zeros((len(q1_data),300*text_len),dtype=np.float16) 216 | q2_matrix = np.zeros((len(q2_data),300*text_len),dtype=np.float16) 217 | 218 | embedding_matrix = embedding_matrix.astype(np.float16) 219 | for i,(q1,q2) in tqdm(enumerate(zip(q1_data,q2_data))): 220 | for j in range(text_len): 221 | if q1[j] != 0: 222 | w_v = embedding_matrix[q1[j]] 223 | q1_matrix[i,j*300:(j+1)*300] = w_v 224 | if q2[j] != 0: 225 | w_v = embedding_matrix[q2[j]] 226 | q2_matrix[i,j*300:(j+1)*300] = w_v 227 | 228 | from scipy.sparse import csr_matrix,save_npz 229 | 230 | save_npz('./data/q1_matrix'+use_data+'.npz',csr_matrix(q1_matrix)) 231 | save_npz('./data/q2_matrix'+use_data+'.npz',csr_matrix(q2_matrix)) 232 | 233 | print('success') 234 | 235 | if __name__ == '__main__': 236 | save_data_tree('words') 237 | save_data_tree('chars') 238 | 239 | 240 | 241 | 242 | 243 | 244 | 
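# Usage sketch for read_data (mirrors how text.py and ensemble.py call it; shapes follow the
# defaults in config.py: MAX_NUM_WORDS=15, MAX_NUM_CHARS=25, n_components=32):
#   tr, te, embedding_matrix, labels = read_data('words', data_aug=False)
#   tr['q1'], tr['q2']          -> padded word-id sequences of shape (n_train, 15)
#   tr['gf']                    -> the 11 handcrafted graph features listed in usecols
#   tr['q1node'], tr['q2node']  -> 32-dim SVD node vectors merged in from q_matrix.csv
#   te holds the same keys for the test set; embedding_matrix feeds the Embedding layers in nn.py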
-------------------------------------------------------------------------------- /code/text.py: -------------------------------------------------------------------------------- 1 | import os 2 | from config import use_device 3 | os.environ["CUDA_VISIBLE_DEVICES"] = use_device 4 | import tensorflow as tf 5 | import keras.backend.tensorflow_backend as KTF 6 | config = tf.ConfigProto() 7 | config.gpu_options.allow_growth=True 8 | session = tf.Session(config=config) 9 | KTF.set_session(session) 10 | from keras import backend as K 11 | import pandas as pd 12 | import numpy as np 13 | from keras.callbacks import EarlyStopping,ModelCheckpoint,Callback,LearningRateScheduler 14 | import warnings 15 | warnings.filterwarnings('ignore') 16 | from sklearn.metrics import log_loss 17 | import datetime 18 | 19 | def lr_de(epoch,lr): 20 | if epoch==0: 21 | return lr 22 | elif lr>0.0002: 23 | return lr/2 24 | else: 25 | return lr 26 | 27 | class epochHistory(Callback): 28 | 29 | def on_train_begin(self, logs=None): 30 | self.epochs = [] 31 | 32 | def on_epoch_end(self, epoch, logs=None): 33 | self.epochs.append(epoch) 34 | 35 | def iter_ense(epochs,model,te): 36 | 37 | result = 0 38 | for e in epochs[-3:]: 39 | model.load_weights('./weight/weights.'+str(e+1)+'.hdf5') 40 | result += model.predict(te, batch_size=1024) 41 | return result/3 42 | 43 | 44 | def train(use_data,semi_sv,output,data_aug,use_model): 45 | 46 | def get_subset(dataset,idx): 47 | data = {} 48 | for key,value in dataset.items(): 49 | data[key] = value[idx] 50 | return data 51 | 52 | def concat_data(data1,data2): 53 | result = {} 54 | for k in data1.keys(): 55 | result[k] = np.concatenate([data1[k],data2[k]]) 56 | return result 57 | 58 | def get_aug_data(tr_x, tr_y): 59 | tr_q1 = tr_x['q1'] 60 | tr_q2 = tr_x['q2'] 61 | tr_gf = tr_x['gf'] 62 | tr_q1node = tr_x['q1node'] 63 | tr_q2node = tr_x['q2node'] 64 | 65 | res_q1 = [] 66 | res_q2 = [] 67 | res_gf = [] 68 | res_q1node = [] 69 | res_q2node = [] 70 | res_y = [] 71 | 72 | for q1, q2, gf, q1node, q2node, y in zip(tr_q1, tr_q2, tr_gf, tr_q1node, tr_q2node, tr_y): 73 | r1 = q1[np.in1d(q1, q2, invert=True)] 74 | len1 = len(r1) 75 | if len1 < 4 or len1==len(q1[q1!=0]): 76 | continue 77 | 78 | r2 = q2[np.in1d(q2, q1, invert=True)] 79 | len2 = len(r2) 80 | if len2 < 4 or len2==len(q2[q2!=0]): 81 | continue 82 | 83 | out1 = np.zeros(15, dtype=np.int32) 84 | out2 = np.zeros(15, dtype=np.int32) 85 | out1[-len1:] = r1 86 | out2[-len2:] = r2 87 | 88 | res_q1.append(out1) 89 | res_q2.append(out2) 90 | res_gf.append(gf) 91 | res_q1node.append(q1node) 92 | res_q2node.append(q2node) 93 | res_y.append(y) 94 | 95 | 96 | res_x = { 97 | 'q1': np.asarray(res_q1), 98 | 'q2': np.asarray(res_q2), 99 | 'gf': np.asarray(res_gf), 100 | 'q1node': np.asarray(res_q1node), 101 | 'q2node': np.asarray(res_q2node) 102 | } 103 | res_y = np.asarray(res_y) 104 | return res_x, res_y 105 | 106 | from nn import rnnword, aggmodel, esim,attention,rnn_res 107 | if use_model == 'rnnword': 108 | get_model = rnnword 109 | elif use_model == 'aggmodel': 110 | pass 111 | elif use_model == 'esim': 112 | get_model = esim 113 | elif use_model == 'attention': 114 | get_model = attention 115 | elif use_model == 'res': 116 | get_model = rnn_res 117 | else: 118 | raise RuntimeError("don't have this model") 119 | 120 | from readdata import read_data 121 | 122 | model_name = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')+'_'+use_data+'_'+str(semi_sv)+'_'+str(data_aug)+'_' 123 | 124 | tr,te, embedding_matrix, labels = 
read_data(use_data,data_aug=data_aug) 125 | 126 | print(use_data) 127 | print('Shape of label tensor:', labels.shape) 128 | 129 | y = labels 130 | 131 | from config import model_path 132 | from sklearn.cross_validation import StratifiedKFold, KFold 133 | from config import n_folds 134 | 135 | y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values 136 | y_pos_ = y_pred == 1 137 | y_neg_ = y_pred == 0 138 | add_idx = np.any([y_pos_, y_neg_], axis=0) 139 | add_y = y_pred[add_idx] 140 | 141 | 142 | y_pos = y_pred > 0.75 143 | y_neg = y_pred < 0.25 144 | y_idx = np.any([y_pos, y_neg], axis=0) 145 | y_pred = y_pred[y_idx] 146 | print(y_idx.shape) 147 | 148 | 149 | folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True) 150 | result = np.zeros((len(te['q1']), 1)) 151 | 152 | oof_y = np.zeros((len(y), 1)) 153 | for n_fold, (tr_idx, val_idx) in enumerate(folds): 154 | tr_x = get_subset(tr,tr_idx) 155 | tr_y = y[tr_idx] 156 | # if data_aug: 157 | # res_x,res_y = get_aug_data(tr_x,tr_y) 158 | # tr_x = concat_data(tr_x,res_x) 159 | # tr_y = np.concatenate([tr_y,res_y]) 160 | 161 | if semi_sv: 162 | te_x = get_subset(te, y_idx) 163 | tr_data = concat_data(tr_x,te_x) 164 | tr_y = np.concatenate([tr_y,y_pred]) 165 | patience = 3 166 | else: 167 | add_data = get_subset(te,add_idx) 168 | tr_data = concat_data(tr_x,add_data) 169 | tr_y = np.concatenate([tr_y, add_y]) 170 | patience = 2 171 | # tr_data = tr_x 172 | # tr_y = y[tr_idx] 173 | 174 | val_x = get_subset(tr, val_idx) 175 | val_y = y[val_idx] 176 | 177 | use_word = True 178 | if use_data!='words': 179 | use_word = False 180 | model = get_model(word_embedding_matrix=embedding_matrix,use_word=use_word) 181 | if n_fold == 0: 182 | print(model.summary()) 183 | 184 | # hist = epochHistory() 185 | print(n_fold) 186 | model.fit(tr_data, 187 | tr_y, 188 | epochs=1000, 189 | validation_data=[val_x,val_y], 190 | verbose=1, 191 | batch_size=256, 192 | callbacks=[ 193 | EarlyStopping(patience=patience, monitor='val_binary_crossentropy'), 194 | # LearningRateScheduler(lr_de,verbose=1) 195 | # hist, 196 | # ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',monitor='val_binary_crossentropy',save_weights_only=True) 197 | ]) 198 | # result += iter_ense(hist.epochs,model,te) 199 | result += model.predict(te, batch_size=1024) 200 | 201 | model.save_weights('./weight/'+model_name+str(n_fold)+'.h5') 202 | # oof_y[val_idx] = model.predict(val_x, batch_size=2048) 203 | 204 | K.clear_session() 205 | tf.reset_default_graph() 206 | 207 | # 提交结果 208 | result /= n_folds 209 | submit = pd.DataFrame() 210 | submit['y_pre'] = list(result[:, 0]) 211 | submit.to_csv(output, index=False) 212 | 213 | 214 | ## 保存预测的训练标签 215 | # oof_y = oof_y[:,0] 216 | # oof_y_ = oof_y.round().astype(int) 217 | # 218 | # error_idx = oof_y_!=y 219 | # print(np.sum(error_idx)) 220 | # oof_y[error_idx] = 1-oof_y[error_idx] 221 | submit = pd.DataFrame() 222 | submit['y_pre'] = oof_y[:,0] 223 | submit.to_csv('./data/oofy.csv',index=False) 224 | 225 | 226 | """ 227 | train('words',False,'esim_word0_2.csv',False,'esim') 228 | train('words',True,'esim_word1_2.csv',False,'esim') 229 | train('chars',False,'esim_char0_2.csv',False,'esim') 230 | train('chars',True,'esim_char1_2.csv',False,'esim') 231 | 232 | train('words',False,'esim_word0_3.csv',False,'esim') 233 | train('words',True,'esim_word1_3.csv',False,'esim') 234 | train('chars',False,'esim_char0_3.csv',False,'esim') 235 | train('chars',True,'esim_char1_3.csv',False,'esim') 236 | 237 | train('words',False,'attention_word0_0.csv',False,'attention') 
238 | train('chars',True,'attention_char1_0.csv',False,'attention') 239 | 240 | train('words',True,'attention_word1_0.csv',False,'attention') 241 | train('chars',False,'attention_char0_0.csv',False,'attention') 242 | """ 243 | 244 | """ 245 | train('words',False,'attention_word0_1.csv',False,'attention') 246 | train('chars',True,'attention_char1_1.csv',False,'attention') 247 | 248 | train('words',True,'attention_word1_1.csv',False,'attention') 249 | train('chars',False,'attention_char0_1.csv',False,'attention') 250 | 251 | """ 252 | 253 | """ 254 | train('words',False,'attention_word0_2.csv',False,'attention') 255 | train('chars',True,'attention_char1_2.csv',False,'attention') 256 | 257 | train('words',True,'attention_word1_2.csv',False,'attention') 258 | train('chars',False,'attention_char0_2.csv',False,'attention') 259 | """ 260 | 261 | """ 262 | train('words',False,'attention_word0_3.csv',False,'attention') 263 | train('chars',True,'attention_char1_3.csv',False,'attention') 264 | 265 | train('words',True,'attention_word1_3.csv',False,'attention') 266 | train('chars',False,'attention_char0_3.csv',False,'attention') 267 | """ 268 | 269 | 270 | train('words',False,'res_word0_2.csv',False,'res') 271 | train('chars',True,'res_char1_2.csv',False,'res') 272 | 273 | train('words',True,'res_word1_2.csv',False,'res') 274 | train('chars',False,'res_char0_2.csv',False,'res') 275 | 276 | 277 | """ 278 | train('words',False,'res_word0_3.csv',False,'res') 279 | train('chars',True,'res_char1_3.csv',False,'res') 280 | 281 | train('words',True,'res_word1_3.csv',False,'res') 282 | train('chars',False,'res_char0_3.csv',False,'res') 283 | """ -------------------------------------------------------------------------------- /code/tool.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | import json 4 | import numpy as np 5 | import multiprocessing as mlp 6 | import gc 7 | 8 | def cluster_pos(file='train'): 9 | 10 | 11 | tr = pd.read_csv('./data/'+file+'.csv') 12 | # tr = tr.append(pd.read_csv('save_sample.csv')).reset_index(drop=True) 13 | y_idx = tr['label'] == 1 14 | 15 | 16 | # y_pred = pd.read_csv('./149367.csv')['y_pre'].values 17 | # y_pos = y_pred < 1 18 | # y_neg = y_pred > 0.5 19 | # y_idx = np.logical_and(y_pos,y_neg) 20 | 21 | 22 | tr = tr.loc[y_idx,['q1','q2']] 23 | print(tr.shape) 24 | tr = tr.sort_values(by=['q1', 'q2']).values 25 | for i in range(len(tr)): 26 | if tr[i][0]>tr[i][1]: 27 | tr[i] = [tr[i][1],tr[i][0]] 28 | 29 | q2group = {} 30 | num_group = 0 31 | error = 0 32 | for q1,q2 in tr: 33 | assert q1 < q2 34 | if q1 not in q2group and q2 not in q2group: 35 | q2group[q1] = num_group 36 | q2group[q2] = num_group 37 | num_group += 1 38 | elif q2 in q2group and q1 not in q2group: 39 | q2group[q1] = q2group[q2] 40 | elif q1 in q2group and q2 not in q2group: 41 | q2group[q2] = q2group[q1] 42 | else: 43 | if q2group[q2] != q2group[q1]: 44 | error+=1 45 | print(error) 46 | while error != 0: 47 | for q1,q2 in tr: 48 | if q2group[q1] != q2group[q2]: 49 | group_id = min(q2group[q1],q2group[q2]) 50 | q2group[q1] = group_id 51 | q2group[q2] = group_id 52 | error = 0 53 | for q1,q2 in tr: 54 | if q2group[q1] != q2group[q2]: 55 | error+=1 56 | print(error) 57 | 58 | 59 | with open('./info/q2group.json','w') as f: 60 | f.write(json.dumps(q2group,sort_keys=True,indent=4, separators=(',', ': '))) 61 | 62 | group2q = [{} for i in range(num_group)] 63 | for q,g_id in q2group.items(): 64 | group2q[g_id][q] = 1 65 | 66 | 
with open('./info/group2q.json', 'w') as f: 67 | f.write(json.dumps(group2q, sort_keys=True, indent=4, separators=(',', ': '))) 68 | 69 | 70 | group_n = {} 71 | for i,q in enumerate(group2q): 72 | group_n[str(i)] = len(q) 73 | group_n = sorted(group_n.items(),key=lambda x:x[1]) 74 | with open('./info/group_samples_num.json', 'w') as f: 75 | f.write(json.dumps(group_n, sort_keys=True, indent=4, separators=(',', ': '))) 76 | 77 | 78 | 79 | def cluster_neg(): 80 | 81 | tr = pd.read_csv('./data/train.csv') 82 | tr = tr.loc[tr['label'] == 0, ['q1', 'q2']].values 83 | 84 | with open('./info/q2group.json','r') as f: 85 | q2group = json.loads(f.read()) 86 | 87 | neg_pair = {} 88 | for q1,q2 in tr: 89 | if q1 in q2group and q2 in q2group: 90 | if q2group[q1]<q2group[q2]: 91 | neg_pair[str(q2group[q1])+'_'+str(q2group[q2])] = 1 92 | elif q2group[q1]>q2group[q2]: 93 | neg_pair[str(q2group[q2])+'_'+str(q2group[q1])] = 1 94 | 95 | with open('./info/neg_rule.json','w') as f: 96 | f.write(json.dumps(neg_pair,sort_keys=True,indent=4,separators=(',', ': '))) 97 | 98 | te = pd.read_csv('./data/test.csv').values 99 | need_rule = {} 100 | for q1, q2 in te: 101 | if q1 in q2group and q2 in q2group: 102 | if q2group[q1] < q2group[q2]: 103 | pair = str(q2group[q1]) + '_' + str(q2group[q2]) 104 | elif q2group[q1] > q2group[q2]: 105 | pair = str(q2group[q2]) + '_' + str(q2group[q1]) 106 | else: 107 | continue 108 | if pair not in neg_pair: 109 | if pair not in need_rule: 110 | need_rule[pair] = 0 111 | need_rule[pair]+=1 112 | need_rule = sorted(need_rule.items(),key=lambda x:x[1]) 113 | with open('./info/need_rule.json','w') as f: 114 | f.write(json.dumps(need_rule,sort_keys=True,indent=4,separators=(',', ': '))) 115 | 116 | 117 | 118 | def create_pos_sample(): 119 | 120 | with open('./info/q_te_dict.json','r') as f: 121 | q_te = json.loads(f.read()) 122 | with open('./info/q_tr_dict.json','r') as f: 123 | q_tr = json.loads(f.read()) 124 | 125 | with open('./info/group2q.json','r') as f: 126 | group2q = json.loads(f.read()) 127 | 128 | from itertools import combinations 129 | 130 | samples_dict = {} 131 | 132 | for questions in tqdm(group2q): 133 | if len(questions) == 2: 134 | continue 135 | if len(questions) == 0: 136 | continue 137 | for q1,q2 in combinations(list(questions.keys()),2): 138 | samples_dict[q1 + '_' + q2] = 1 139 | 140 | 141 | tr = pd.read_csv('./data/train.csv') 142 | te = pd.read_csv('./data/test.csv') 143 | tr.append(te) 144 | tr = tr[['q1','q2']].values 145 | for q1,q2 in tr: 146 | a = q1 + '_' + q2 in samples_dict 147 | b = q2 + '_' + q1 in samples_dict 148 | assert (a and b) == False 149 | if q1 + '_' + q2 in samples_dict: 150 | samples_dict.pop(q1 + '_' + q2) 151 | elif q2 + '_' + q1 in samples_dict: 152 | samples_dict.pop(q2 + '_' + q1) 153 | 154 | samples = [] 155 | for k in samples_dict.keys(): 156 | samples.append(k.split("_")) 157 | 158 | print(len(samples)) 159 | 160 | train_extend = pd.DataFrame(samples,columns=['q1','q2']) 161 | train_extend.to_csv('./info/pos_sample.csv',index=False) 162 | 163 | 164 | def create_neg_sample(): 165 | 166 | with open('./info/group2q.json','r') as f: 167 | group2q = json.loads(f.read()) 168 | with open('./info/neg_rule.json','r') as f: 169 | neg_pair = json.loads(f.read()) 170 | 171 | from itertools import product 172 | 173 | samples_dict = {} 174 | num_sample = 0 175 | for pair in tqdm(neg_pair): 176 | c1,c2 = pair.split('_') 177 | for q1,q2 in product(group2q[int(c1)],group2q[int(c2)]): 178 | samples_dict[q1 + '_' + q2] = 1 179 | num_sample+=1 180 | 181 | print(num_sample) 182 | tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']) 
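# As in create_pos_sample above, candidate pairs that already occur in train.csv or test.csv
# are dropped below so the augmented negatives do not duplicate existing pairs.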
te = pd.read_csv('./data/test.csv',usecols=['q1','q2']) 184 | tr.append(te) 185 | tr = tr[['q1','q2']].values 186 | for q1,q2 in tr: 187 | a = q1 + '_' + q2 in samples_dict 188 | b = q2 + '_' + q1 in samples_dict 189 | assert (a and b) == False 190 | if q1 + '_' + q2 in samples_dict: 191 | samples_dict.pop(q1 + '_' + q2) 192 | elif q2 + '_' + q1 in samples_dict: 193 | samples_dict.pop(q2 + '_' + q1) 194 | 195 | samples = [] 196 | for k in samples_dict.keys(): 197 | samples.append(k.split("_")) 198 | 199 | print(len(samples)) 200 | del samples_dict 201 | import gc 202 | gc.collect() 203 | train_extend = pd.DataFrame(samples,columns=['q1','q2']) 204 | train_extend.to_csv('./info/neg_sample.csv',index=False) 205 | 206 | 207 | def post_process(file,output='baseline.csv'): 208 | 209 | with open('./info/q2group.json','r') as f: 210 | q2group = json.loads(f.read()) 211 | 212 | with open('./info/group2q.json','r') as f: 213 | group2q = json.loads(f.read()) 214 | 215 | te = pd.read_csv('./data/test.csv',usecols=['q1','q2']).values 216 | y_pre = pd.read_csv(file) 217 | 218 | 219 | "正例修正" 220 | n = 0 221 | loss = 0 222 | 223 | save_samples = [] 224 | s = 0 225 | for i, (q1, q2) in enumerate(te): 226 | if q1 in q2group and q2 in q2group: 227 | if q2group[q1] == q2group[q2]: 228 | n += 1 229 | loss = loss - np.log(y_pre.iloc[i,0]) 230 | y_pre.iloc[i, 0] = 1 231 | save_samples.append([1,q1, q2]) 232 | 233 | # save_samples = pd.DataFrame(save_samples,columns=['label','q1','q2']) 234 | # save_samples.to_csv('./info/save_sample.csv',index=False) 235 | print('n:',n) 236 | 237 | print(s) 238 | "负例修正" 239 | with open('./info/neg_rule.json','r') as f: 240 | neg_pair = json.loads(f.read()) 241 | n = 0 242 | for i, (q1, q2) in tqdm(enumerate(te)): 243 | if q1 in q2group and q2 in q2group: 244 | if q2group[q1] < q2group[q2]: 245 | pair = str(q2group[q1]) + '_' + str(q2group[q2]) 246 | elif q2group[q1] > q2group[q2]: 247 | pair = str(q2group[q2]) + '_' + str(q2group[q1]) 248 | else: 249 | pair = '' 250 | if pair in neg_pair: 251 | loss = loss - np.log(1-y_pre.iloc[i, 0]) 252 | y_pre.iloc[i, 0] = 0 253 | n += 1 254 | print('loss:', loss / len(te)) 255 | print(n) 256 | 257 | y_pre.to_csv(output, index=False) 258 | 259 | return y_pre 260 | 261 | def q_distr(): 262 | 263 | te = pd.read_csv('./data/test.csv').values 264 | 265 | q_dict = {} 266 | for q1, q2 in te: 267 | if q1 not in q_dict: 268 | q_dict[q1] = 0 269 | q_dict[q1] += 1 270 | if q2 not in q_dict: 271 | q_dict[q2] = 0 272 | q_dict[q2] += 1 273 | te_q = sorted(q_dict.items(), key=lambda x: x[1]) 274 | with open('./info/te_q.json', 'w') as f: 275 | f.write(json.dumps(te_q, sort_keys=True, indent=4, separators=(',', ': '))) 276 | with open('./info/q_te_dict.json', 'w') as f: 277 | f.write(json.dumps(q_dict, sort_keys=True, indent=4, separators=(',', ': '))) 278 | 279 | tr = pd.read_csv('./data/train.csv',usecols=['q1','q2']).values 280 | 281 | q_dict = {} 282 | for q1, q2 in tr: 283 | if q1 not in q_dict: 284 | q_dict[q1] = 0 285 | q_dict[q1] += 1 286 | if q2 not in q_dict: 287 | q_dict[q2] = 0 288 | q_dict[q2] += 1 289 | tr_q = sorted(q_dict.items(), key=lambda x: x[1]) 290 | with open('./info/tr_q.json', 'w') as f: 291 | f.write(json.dumps(tr_q, sort_keys=True, indent=4, separators=(',', ': '))) 292 | with open('./info/q_tr_dict.json', 'w') as f: 293 | f.write(json.dumps(q_dict, sort_keys=True, indent=4, separators=(',', ': '))) 294 | 295 | def te_test(): 296 | 297 | with open('q2group.json','r') as f: 298 | q2group = json.loads(f.read()) 299 | with 
open('neg_rule.json','r') as f: 300 | neg_pair = json.loads(f.read()) 301 | 302 | te = pd.read_csv('./data/test.csv').values 303 | 304 | 305 | def get_samples(): 306 | 307 | with open('./info/q_te_dict.json', 'r') as f: 308 | q_te = json.loads(f.read()) 309 | with open('./info/q_tr_dict.json', 'r') as f: 310 | q_tr = json.loads(f.read()) 311 | 312 | pos_samples = pd.read_csv('./info/pos_sample.csv',usecols=['q1','q2']).sample(frac=1).reset_index(drop=True).values 313 | neg_samples = pd.read_csv('./info/neg_sample.csv',usecols=['q1','q2']).sample(frac=1).reset_index(drop=True).values 314 | 315 | te_q = pd.read_csv("./data/test.csv",usecols=['q1','q2']) 316 | te_q = list(set(te_q['q1'].tolist() + te_q['q2'].tolist())) 317 | 318 | data = [] 319 | for q in te_q: 320 | data.append([1, q, q]) 321 | for i,samples in [(1,pos_samples),(0,neg_samples)]: 322 | q_freq = {} 323 | num_sample = 0 324 | for q1,q2 in tqdm(samples): 325 | if q1 not in q_te or q2 not in q_te: 326 | continue 327 | # if q1 not in q_freq: 328 | # q_freq[q1] = 0 329 | # if q2 not in q_freq: 330 | # q_freq[q2] = 0 331 | # if q_freq[q1] > min(2-q_tr[q1]/30+q_te[q1]/10,4): 332 | # continue 333 | # if q_freq[q2] > min(2-q_tr[q2]/30+q_te[q2]/10,4): 334 | # continue 335 | # q_freq[q1] += 1 336 | # q_freq[q2] += 1 337 | data.append([i,q1,q2]) 338 | num_sample += 1 339 | print(num_sample) 340 | 341 | data = pd.DataFrame(data,columns=['label','q1','q2']) 342 | data.to_csv('./data/aug_data.csv',index=False) 343 | return data 344 | 345 | def test(): 346 | 347 | te = pd.read_csv("./data/test.csv",usecols=['q1','q2']) 348 | te['y_pre'] = pd.read_csv("./145192.csv")['y_pre'] 349 | 350 | te = te.loc[te['y_pre']<1] 351 | te = te.loc[te['y_pre']>0] 352 | 353 | q = {} 354 | for q1,q2 in te[['q1','q2']].values: 355 | if q1 not in q: 356 | q[q1] = 0 357 | if q2 not in q: 358 | q[q2] = 0 359 | q[q1] +=1 360 | q[q2] +=1 361 | import json 362 | q = sorted(q.items(), key=lambda x: x[1]) 363 | with open("./info/q.json",'w') as f: 364 | f.write(json.dumps(q,sort_keys=True, indent=4, separators=(',', ': '))) 365 | 366 | 367 | def sample_filter(): 368 | 369 | samples = pd.read_csv("./data/aug_data.csv") 370 | samples['y_pre'] = pd.read_csv("./data/submit.csv")['y_pre'] 371 | pos_samples = samples.loc[samples['label']==1] 372 | neg_samples = samples.loc[samples['label']==0] 373 | 374 | del samples 375 | gc.collect() 376 | 377 | pos_samples = pos_samples.loc[pos_samples['y_pre']<0.5] 378 | neg_samples = neg_samples.loc[neg_samples['y_pre']>0.5] 379 | print(pos_samples.shape) 380 | print(neg_samples.shape) 381 | 382 | with open('./info/q_te_dict.json', 'r') as f: 383 | q_te = json.loads(f.read()) 384 | with open('./info/q_tr_dict.json', 'r') as f: 385 | q_tr = json.loads(f.read()) 386 | data = [] 387 | for i, samples in [(1, pos_samples), (0, neg_samples)]: 388 | q_freq = {} 389 | num_sample = 0 390 | for q1, q2 in tqdm(samples[['q1','q2']].values): 391 | if i==0: 392 | if q1 not in q_freq: 393 | q_freq[q1] = 0 394 | if q2 not in q_freq: 395 | q_freq[q2] = 0 396 | if q_freq[q1] > min(2-q_tr[q1]/30+q_te[q1]/10,3): 397 | continue 398 | if q_freq[q2] > min(2-q_tr[q2]/30+q_te[q2]/10,3): 399 | continue 400 | q_freq[q1] += 1 401 | q_freq[q2] += 1 402 | data.append([i, q1, q2]) 403 | num_sample += 1 404 | print(num_sample) 405 | 406 | data = pd.DataFrame(data, columns=['label', 'q1', 'q2']) 407 | data.to_csv('./data/aug_data_filter.csv', index=False) 408 | 409 | if __name__=='__main__': 410 | # q_distr() 411 | 412 | # create_pos_sample() 413 | # create_neg_sample() 
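# A plausible end-to-end run order for the helpers above, inferred only from the ./info and
# ./data files each function reads and writes (a sketch, not the authors' documented workflow;
# cluster_pos() is the function whose tail, writing group2q.json, appears at the top of this file
# and which presumably also writes q2group.json):
#
#   q_distr()                    # ./info/q_tr_dict.json, ./info/q_te_dict.json
#   cluster_pos()                # ./info/group2q.json (and, presumably, ./info/q2group.json)
#   cluster_neg()                # ./info/neg_rule.json, ./info/need_rule.json
#   create_pos_sample()          # ./info/pos_sample.csv  (within-cluster transitive positives)
#   create_neg_sample()          # ./info/neg_sample.csv  (cross-cluster negatives from neg_rule)
#   get_samples()                # ./data/aug_data.csv    (augmentation pairs restricted to test questions)
#   post_process('./base.csv')   # overwrite test predictions with the cluster rules -> baseline.csv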
414 | # cluster_pos() 415 | # cluster_neg() 416 | # get_samples() 417 | # from glob import glob 418 | # path= './resultv4/' 419 | # files = glob(path+'*.csv') 420 | # for f in files: 421 | # post_process(f) 422 | # sample_filter() 423 | a = post_process('./base.csv') 424 | print(a.describe()) 425 | 426 | # a = pd.read_csv('./ensemble/144392.csv') 427 | # b = pd.read_csv('./ensemble/ense2_14459.csv') 428 | # a['y_pre'] = 2*a['y_pre']/3 + b['y_pre']/3 429 | # print(a.describe()) 430 | 431 | # post_process_v2() 432 | # test() 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | -------------------------------------------------------------------------------- /code/train_word.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from keras.preprocessing.text import Tokenizer 3 | from keras.preprocessing.sequence import pad_sequences 4 | from config import MAX_NB_WORDS,MAX_NUM_CHARS,MAX_NUM_WORDS 5 | from tqdm import tqdm 6 | import numpy as np 7 | from keras.optimizers import Nadam 8 | 9 | 10 | def get_embedd(word_index,file): 11 | embeddings_index = {} 12 | with open(file, 'r') as f: 13 | wordmat = f.read().split('\n') 14 | if wordmat[-1] == '': 15 | wordmat = wordmat[:-1] 16 | if wordmat[0] == '': 17 | wordmat = wordmat[1:] 18 | 19 | for line in tqdm(wordmat): 20 | wvec = line.strip('\n').strip(' ').split(' ') 21 | embeddings_index[wvec[0]] = np.asarray(wvec[1:], dtype='float') 22 | 23 | print('embedding', len(embeddings_index)) 24 | 25 | EMBEDDING_DIM = 300 26 | nb_words = len(word_index) 27 | embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM)) 28 | for word, i in word_index.items(): 29 | embedding_vector = embeddings_index.get(str(word).upper()) 30 | if embedding_vector is not None: 31 | embedding_matrix[i] = embedding_vector 32 | 33 | return embedding_matrix 34 | 35 | def get_model(word_embedding_matrix,char_embedding_matrix): 36 | from keras.models import Model 37 | from keras.layers import Input,Lambda,BatchNormalization 38 | from keras.layers import CuDNNGRU, Bidirectional,GlobalMaxPooling1D 39 | from keras.layers import Embedding,SpatialDropout1D,Dense 40 | from keras import backend as K 41 | 42 | 43 | def loss(y_true,y_pred): 44 | return -K.mean(y_pred * y_true) 45 | 46 | from config import MAX_NUM_WORDS,MAX_NUM_CHARS 47 | 48 | 49 | word = Input(shape=(MAX_NUM_WORDS,)) 50 | char = Input(shape=(MAX_NUM_CHARS,)) 51 | 52 | 53 | embedd_word = Embedding( 54 | len(word_embedding_matrix), 55 | word_embedding_matrix.shape[1], 56 | weights=[word_embedding_matrix], 57 | input_length=MAX_NUM_WORDS, 58 | trainable=True,name='word_weight') 59 | embedd_char = Embedding( 60 | len(char_embedding_matrix), 61 | char_embedding_matrix.shape[1], 62 | weights=[char_embedding_matrix], 63 | input_length=MAX_NUM_CHARS, 64 | trainable=True,name='char_weight') 65 | 66 | gru_dim1 = 384 67 | 68 | gru_w = Bidirectional(CuDNNGRU(gru_dim1,return_sequences=True),merge_mode='sum') 69 | 70 | gru_c = Bidirectional(CuDNNGRU(gru_dim1, return_sequences=True), merge_mode='sum') 71 | 72 | w = embedd_word(word) 73 | c = embedd_char(char) 74 | w = BatchNormalization()(w) 75 | c = BatchNormalization()(c) 76 | w = SpatialDropout1D(0.2)(w) 77 | c = SpatialDropout1D(0.2)(c) 78 | 79 | w = gru_w(w) 80 | c = gru_c(c) 81 | 82 | w = GlobalMaxPooling1D()(w) 83 | c = GlobalMaxPooling1D()(c) 84 | 85 | def jaccard(x): 86 | x0_2 = K.sum(x[0] ** 2, axis=1, keepdims=True) 87 | x1_2 = K.sum(x[1] ** 2, axis=1, keepdims=True) 88 | x01_ = K.sum(K.abs(x[0] * x[1]), axis=1, 
keepdims=True) 89 | 90 | return x[0] * x[1]/(x0_2+x1_2-x01_) 91 | 92 | 93 | output = Lambda(jaccard)([w,c]) 94 | output = Dense(1,activation='sigmoid')(output) 95 | model = Model(inputs=[word,char], outputs=output) 96 | 97 | model.compile(loss='binary_crossentropy',optimizer=Nadam()) 98 | 99 | return model 100 | 101 | def train(): 102 | question = pd.read_csv('./data/question.csv') 103 | 104 | 105 | toke_word = Tokenizer(num_words=MAX_NB_WORDS) 106 | toke_word.fit_on_texts(question['words']) 107 | q_word = toke_word.texts_to_sequences(question['words']) 108 | q_word = pad_sequences(q_word, maxlen=MAX_NUM_WORDS, truncating='post') 109 | q_word = np.array(list(q_word)*2) # two copies of every question's word sequence 110 | word_index = toke_word.word_index 111 | word_embedd = get_embedd(word_index,'./data/word_embed.txt') 112 | 113 | 114 | toke_char = Tokenizer(num_words=MAX_NB_WORDS) 115 | toke_char.fit_on_texts(question['chars']) 116 | q_char = toke_char.texts_to_sequences(question['chars']) 117 | q_char = pad_sequences(q_char, maxlen=MAX_NUM_CHARS, truncating='post') 118 | q_char = np.array(list(q_char) + list(q_char)[::-1]) # second copy is reversed, so those word/char pairs are mismatched 119 | char_index = toke_char.word_index 120 | char_embedd = get_embedd(char_index, './data/char_embed.txt') 121 | 122 | 123 | model = get_model(word_embedd,char_embedd) 124 | y = np.ones(len(q_char)) 125 | y[len(question):] = 0 # aligned word/char pairs are positives, the mismatched second half are negatives 126 | model.fit([q_word,q_char],y,verbose=1,epochs=2,batch_size=512,shuffle=True) 127 | 128 | word_embedd = model.get_layer('word_weight').get_weights()[0] # get_weights() returns a list; take the embedding matrix 129 | char_embedd = model.get_layer('char_weight').get_weights()[0] 130 | 131 | 132 | print('save fine-tuned embeddings') 133 | word_mat = '' 134 | for w, i in word_index.items(): 135 | if i >= len(word_embedd): 136 | continue 137 | vec_str = ' '.join([str(w).upper()] + [str(x) for x in word_embedd[i]]) # uppercase tokens to match the layout of word_embed.txt 138 | vec_str+='\n' 139 | word_mat+=vec_str 140 | 141 | with open('./data/word_embed1.txt','w') as f: 142 | f.write(word_mat) 143 | 144 | 145 | 146 | char_mat = '' 147 | for c, i in char_index.items(): 148 | if i >= len(char_embedd): 149 | continue 150 | vec_str = ' '.join([str(c).upper()] + [str(x) for x in char_embedd[i]]) 151 | vec_str += '\n' 152 | char_mat += vec_str 153 | 154 | 155 | with open('./data/char_embed1.txt', 'w') as f: 156 | f.write(char_mat) 157 | 158 | 159 | if __name__ == '__main__': 160 | train() 161 | 162 | 163 | 164 | 165 | 166 | --------------------------------------------------------------------------------
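Note on the embedding round-trip in train_word.py: the fine-tuned files word_embed1.txt / char_embed1.txt are written in the same plain "TOKEN v1 ... v300" layout that get_embedd() parses, so they can be dropped in wherever word_embed.txt / char_embed.txt are read. Below is a minimal reload check (a sketch, not part of the original repo; it assumes it is run from the code/ directory with ./data/question.csv and the fine-tuned files already in place).

import pandas as pd
from keras.preprocessing.text import Tokenizer

from config import MAX_NB_WORDS
from train_word import get_embedd

# Rebuild the same word index that train() used, then reload the fine-tuned vectors.
question = pd.read_csv('./data/question.csv')
toke_word = Tokenizer(num_words=MAX_NB_WORDS)
toke_word.fit_on_texts(question['words'])

finetuned = get_embedd(toke_word.word_index, './data/word_embed1.txt')
print(finetuned.shape)  # (len(word_index) + 1, 300); rows for tokens missing from the file stay all-zero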