├── baseline ├── eda.py ├── gensim_d2v.py ├── ioUtil.py ├── keras_bgru_cnn_sim.py ├── keras_bilstm_sim.py ├── keras_cnn_sim.py ├── keras_lstm_sim.py ├── keras_seq2seq_sim.py ├── seg.py └── tf_bilstm_sim.py ├── config.py ├── feature_engineering.py ├── requirements.txt ├── tf_TextCNN.py ├── tf_TextRNN.py ├── tf_model └── tf_train_lstm.py ├── tf_train.py └── upload ├── keras_main1.py ├── run.sh └── train.txt /baseline/eda.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | input_file="../input/process.csv" 5 | df = pd.read_csv(input_file,encoding="utf-8") 6 | print('Total number of question pairs for training: {}'.format(len(df))) 7 | 8 | qids = pd.Series(df['question1'].tolist() + df['question2'].tolist()) 9 | print('Total number of questions in the training data: {}'.format(len( 10 | np.unique(qids)))) 11 | 12 | print('Number of questions that appear multiple times: {}'.format(np.sum(qids.value_counts() > 1))) 13 | 14 | print('equal intent pairs: {}%'.format(round(df['label'].mean()*100, 2))) 15 | 16 | plt.figure(figsize=(12, 5)) 17 | plt.hist(qids.value_counts(), bins=50) 18 | plt.yscale('log', nonposy='clip') 19 | plt.title('Log-Histogram of question appearance counts') 20 | plt.xlabel('Number of occurences of question') 21 | plt.ylabel('Number of questions') 22 | plt.show() -------------------------------------------------------------------------------- /baseline/gensim_d2v.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | import logging 4 | import sys 5 | import multiprocessing 6 | import numpy as np 7 | 8 | from gensim.models import Word2Vec 9 | from gensim.models.word2vec import LineSentence 10 | from gensim.models import doc2vec 11 | 12 | embedding_dims=128 13 | if __name__ == '__main__': 14 | 15 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 16 | 17 | 18 | r = np.random.randint(100000,999999,size = (1,)) 19 | print (r[0]) 20 | sents = doc2vec.TaggedLineDocument("./fc.dat") 21 | print (sents) 22 | model = doc2vec.Doc2Vec(sents, size = embedding_dims, window = 9, min_count=1, iter=45, hs=0, negative=11, seed=r[0]) 23 | model.wv.save_word2vec_format("w2v.txt", binary=False) 24 | # model.save("d2v.model") 25 | 26 | -------------------------------------------------------------------------------- /baseline/ioUtil.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pandas as pd 3 | import jieba 4 | input_file='../input/atec_nlp_sim_train.csv' 5 | ret=[] 6 | jieba.add_word('花呗') 7 | jieba.add_word('借呗') 8 | jieba.add_word('余额宝') 9 | jieba.add_word('***') 10 | 11 | def seg(text): 12 | seg_list = jieba.cut(text) 13 | return " ".join(seg_list) 14 | 15 | with open(input_file,encoding="utf-8") as fp: 16 | for line in fp: 17 | q={} 18 | lines=line.split("\t") 19 | if(len(lines)==3): 20 | q['question1']=seg(lines[0].strip()) 21 | q['question2']=seg(lines[1].strip()) 22 | q['label']=lines[2].strip() 23 | else: 24 | print(line) 25 | ret.append(q) 26 | df = pd.DataFrame(ret) 27 | df.to_csv("../input/process.csv",encoding="utf-8",index=False) 28 | 29 | -------------------------------------------------------------------------------- /baseline/keras_bgru_cnn_sim.py: -------------------------------------------------------------------------------- 1 | input_file = "../input/process.csv" 2 | w2vpath = 
'../data/baike.128.no_truncate.glove.txt' 3 | embedding_matrix_path = './temp_no_truncate.npy' 4 | kernel_name = "bilstm" 5 | import pandas as pd 6 | import numpy as np 7 | import keras 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 9 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 10 | 11 | MAX_TEXT_LENGTH = 50 12 | MAX_FEATURES = 10000 13 | embedding_dims = 128 14 | dr = 0.2 15 | 16 | from keras import backend as K 17 | 18 | 19 | def f1_score_metrics(y_true, y_pred): 20 | def recall(y_true, y_pred): 21 | """Recall metric. 22 | 23 | Only computes a batch-wise average of recall. 24 | 25 | Computes the recall, a metric for multi-label classification of 26 | how many relevant items are selected. 27 | """ 28 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 29 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 30 | recall = true_positives / (possible_positives + K.epsilon()) 31 | return recall 32 | 33 | def precision(y_true, y_pred): 34 | """Precision metric. 35 | 36 | Only computes a batch-wise average of precision. 37 | 38 | Computes the precision, a metric for multi-label classification of 39 | how many selected items are relevant. 40 | """ 41 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 42 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 43 | precision = true_positives / (predicted_positives + K.epsilon()) 44 | return precision 45 | 46 | precision = precision(y_true, y_pred) 47 | recall = recall(y_true, y_pred) 48 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 49 | 50 | 51 | def get_model(embedding_matrix, nb_words): 52 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 53 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 54 | words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 55 | weights=[embedding_matrix], 56 | input_length=MAX_TEXT_LENGTH, 57 | trainable=True) 58 | seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr, return_sequences=True)) 59 | cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform") 60 | 61 | x1=words_embedding_layer(input1_tensor) 62 | x1=seq_embedding_layer(x1) 63 | x1=cnn1d_layer(x1) 64 | 65 | x2 = words_embedding_layer(input2_tensor) 66 | x2 = seq_embedding_layer(x2) 67 | x2=cnn1d_layer(x2) 68 | # pooled_gru_cnn= lambda tensor: cnn1d_layer(seq_embedding_layer(words_embedding_layer(tensor))) 69 | avg_pool = keras.layers.GlobalAveragePooling1D() 70 | max_pool = keras.layers.GlobalMaxPooling1D() 71 | x1=keras.layers.concatenate([avg_pool(x1),max_pool(x1)]) 72 | x2=keras.layers.concatenate([avg_pool(x2),max_pool(x2)]) 73 | # seq_embedding = lambda tensor: [avg_pool(pooled_gru_cnn(tensor)),max_pool(pooled_gru_cnn(tensor))] 74 | merge_layer = keras.layers.multiply([x1, x2]) 75 | merge_layer = keras.layers.Dropout(dr)(merge_layer) 76 | dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer) 77 | ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 78 | model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer) 79 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics]) 80 | model.summary() 81 | return model 82 | 83 | 84 | from tqdm import tqdm 85 | import mmap 86 | import os 87 | 88 | 89 | def get_num_lines(file_path): 90 | fp = open(file_path, "r+") 91 | buf = mmap.mmap(fp.fileno(), 0) 92 | lines = 0 93 
| while buf.readline(): 94 | lines += 1 95 | return lines 96 | 97 | 98 | def get_embedding_matrix(word_index, Emed_path, Embed_npy): 99 | if (os.path.exists(Embed_npy)): 100 | return np.load(Embed_npy) 101 | print('Indexing word vectors') 102 | embeddings_index = {} 103 | file_line = get_num_lines(Emed_path) 104 | print('lines ', file_line) 105 | with open(Emed_path, encoding='utf-8') as f: 106 | for line in tqdm(f, total=file_line): 107 | values = line.split() 108 | if (len(values) < embedding_dims): 109 | print(values) 110 | continue 111 | word = ' '.join(values[:-embedding_dims]) 112 | coefs = np.asarray(values[-embedding_dims:], dtype='float32') 113 | embeddings_index[word] = coefs 114 | f.close() 115 | 116 | print('Total %s word vectors.' % len(embeddings_index)) 117 | print('Preparing embedding matrix') 118 | nb_words = MAX_FEATURES # min(MAX_FEATURES, len(word_index)) 119 | all_embs = np.stack(embeddings_index.values()) 120 | print(all_embs.shape) 121 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 122 | embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims)) 123 | 124 | # embedding_matrix = np.zeros((nb_words, embedding_dims)) 125 | count = 0 126 | for word, i in tqdm(word_index.items()): 127 | if i >= MAX_FEATURES: 128 | continue 129 | embedding_vector = embeddings_index.get(word) 130 | if embedding_vector is not None: 131 | # words not found in embedding index will be all-zeros. 132 | embedding_matrix[i] = embedding_vector 133 | count += 1 134 | np.save(Embed_npy, embedding_matrix) 135 | print('Null word embeddings: %d' % (nb_words - count)) 136 | print('not Null word embeddings: %d' % count) 137 | print('embedding_matrix shape', embedding_matrix.shape) 138 | # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0)) 139 | return embedding_matrix 140 | 141 | 142 | df = pd.read_csv(input_file, encoding="utf-8") 143 | 144 | question1 = df['question1'].values 145 | question2 = df['question2'].values 146 | y = df['label'].values 147 | from keras.preprocessing.sequence import pad_sequences 148 | from keras.preprocessing.text import Tokenizer 149 | 150 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 151 | tokenizer.fit_on_texts(list(question1) + list(question2)) 152 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 153 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 154 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 155 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 156 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 157 | print("nb_words", nb_words) 158 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path) 159 | seed = 20180426 160 | cv_folds = 10 161 | from sklearn.model_selection import StratifiedKFold 162 | 163 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 164 | pred_oob = np.zeros(shape=(len(y), 1)) 165 | # print(pred_oob.shape) 166 | count = 0 167 | for ind_tr, ind_te in skf.split(X_train_q1, y): 168 | x_train_q1 = X_train_q1[ind_tr] 169 | x_train_q2 = X_train_q2[ind_tr] 170 | x_val_q1 = X_train_q1[ind_te] 171 | x_val_q2 = X_train_q2[ind_te] 172 | y_train = y[ind_tr] 173 | y_val = y[ind_te] 174 | model = get_model(embedding_matrix1, nb_words) 175 | early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1) 176 | bst_model_path = kernel_name + '_weight_%d.h5' % count 177 | model_checkpoint = 
ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 178 | save_best_only=True, verbose=1, save_weights_only=True) 179 | hist = model.fit([x_train_q1, x_train_q2], y_train, 180 | validation_data=([x_val_q1, x_val_q2], y_val), 181 | epochs=20, batch_size=256, shuffle=True, 182 | class_weight={0: 1.2233, 1: 0.4472}, 183 | callbacks=[early_stopping, model_checkpoint]) 184 | model.load_weights(bst_model_path) 185 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 186 | pred_oob[ind_te] = y_predict 187 | y_predict = (y_predict > 0.5).astype(int) 188 | recall = recall_score(y_val, y_predict) 189 | print(count, "recal", recall) 190 | precision = precision_score(y_val, y_predict) 191 | print(count, "precision", precision) 192 | accuracy = accuracy_score(y_val, y_predict) 193 | print(count, "accuracy ", accuracy) 194 | f1 = f1_score(y_val, y_predict) 195 | print(count, "f1", f1) 196 | count += 1 197 | pred_oob1 = (pred_oob > 0.5).astype(int) 198 | recall = recall_score(y, pred_oob1) 199 | print("recal", recall) 200 | precision = precision_score(y, pred_oob1) 201 | print("precision", precision) 202 | accuracy = accuracy_score(y, pred_oob1) 203 | print("accuracy", accuracy) 204 | f1 = f1_score(y, pred_oob1) 205 | print("f1", f1) 206 | -------------------------------------------------------------------------------- /baseline/keras_bilstm_sim.py: -------------------------------------------------------------------------------- 1 | input_file = "../input/process.csv" 2 | w2vpath = '../data/baike.128.no_truncate.glove.txt' 3 | embedding_matrix_path = './temp_no_truncate.npy' 4 | kernel_name = "bilstm" 5 | word_index_path="worddict.pkl" 6 | import pandas as pd 7 | import numpy as np 8 | import keras 9 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 10 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 11 | 12 | MAX_TEXT_LENGTH = 50 13 | MAX_FEATURES = 10000 14 | embedding_dims = 128 15 | dr = 0.2 16 | 17 | from keras import backend as K 18 | 19 | 20 | def f1_score_metrics(y_true, y_pred): 21 | def recall(y_true, y_pred): 22 | """Recall metric. 23 | 24 | Only computes a batch-wise average of recall. 25 | 26 | Computes the recall, a metric for multi-label classification of 27 | how many relevant items are selected. 28 | """ 29 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 30 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 31 | recall = true_positives / (possible_positives + K.epsilon()) 32 | return recall 33 | 34 | def precision(y_true, y_pred): 35 | """Precision metric. 36 | 37 | Only computes a batch-wise average of precision. 38 | 39 | Computes the precision, a metric for multi-label classification of 40 | how many selected items are relevant. 
41 | """ 42 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 43 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 44 | precision = true_positives / (predicted_positives + K.epsilon()) 45 | return precision 46 | 47 | precision = precision(y_true, y_pred) 48 | recall = recall(y_true, y_pred) 49 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 50 | 51 | 52 | class F1ScoreCallback(Callback): 53 | def __init__(self, predict_batch_size=1024, include_on_batch=False): 54 | super(F1ScoreCallback, self).__init__() 55 | self.predict_batch_size = predict_batch_size 56 | self.include_on_batch = include_on_batch 57 | 58 | def on_batch_begin(self, batch, logs={}): 59 | pass 60 | 61 | def on_train_begin(self, logs={}): 62 | pass 63 | 64 | def on_batch_end(self, batch, logs={}): 65 | pass 66 | 67 | def on_epoch_end(self, epoch, logs={}): 68 | if (self.validation_data): 69 | y_predict = self.model.predict([self.validation_data[0], self.validation_data[1]], 70 | batch_size=self.predict_batch_size) 71 | y_predict = (y_predict > 0.5).astype(int) 72 | accuracy=accuracy_score(self.validation_data[2], y_predict) 73 | precision=precision_score(self.validation_data[2], y_predict) 74 | recall = recall_score(self.validation_data[2], y_predict) 75 | f1 = f1_score(self.validation_data[2], y_predict) 76 | print("precision %.3f recall %.3f f1_score %.3f accuracy %.3f "% (precision, recall,f1,accuracy)) 77 | 78 | 79 | def get_model(embedding_matrix, nb_words): 80 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 81 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 82 | words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 83 | weights=[embedding_matrix], 84 | input_length=MAX_TEXT_LENGTH, 85 | trainable=True) 86 | seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr)) 87 | seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor)) 88 | merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)]) 89 | merge_layer = keras.layers.Dropout(dr)(merge_layer) 90 | dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer) 91 | ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 92 | model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer) 93 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics]) 94 | model.summary() 95 | return model 96 | 97 | 98 | from tqdm import tqdm 99 | import mmap 100 | import os 101 | 102 | 103 | def get_num_lines(file_path): 104 | fp = open(file_path, "r+") 105 | buf = mmap.mmap(fp.fileno(), 0) 106 | lines = 0 107 | while buf.readline(): 108 | lines += 1 109 | return lines 110 | 111 | 112 | def get_embedding_matrix(word_index, Emed_path, Embed_npy): 113 | if (os.path.exists(Embed_npy)): 114 | return np.load(Embed_npy) 115 | print('Indexing word vectors') 116 | embeddings_index = {} 117 | file_line = get_num_lines(Emed_path) 118 | print('lines ', file_line) 119 | with open(Emed_path, encoding='utf-8') as f: 120 | for line in tqdm(f, total=file_line): 121 | values = line.split() 122 | if (len(values) < embedding_dims): 123 | print(values) 124 | continue 125 | word = ' '.join(values[:-embedding_dims]) 126 | coefs = np.asarray(values[-embedding_dims:], dtype='float32') 127 | embeddings_index[word] = coefs 128 | f.close() 129 | 130 | print('Total %s word vectors.' 
% len(embeddings_index)) 131 | print('Preparing embedding matrix') 132 | nb_words = MAX_FEATURES # min(MAX_FEATURES, len(word_index)) 133 | all_embs = np.stack(embeddings_index.values()) 134 | print(all_embs.shape) 135 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 136 | embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims)) 137 | 138 | # embedding_matrix = np.zeros((nb_words, embedding_dims)) 139 | count = 0 140 | for word, i in tqdm(word_index.items()): 141 | if i >= MAX_FEATURES: 142 | continue 143 | embedding_vector = embeddings_index.get(word) 144 | if embedding_vector is not None: 145 | # words not found in embedding index will be all-zeros. 146 | embedding_matrix[i] = embedding_vector 147 | count += 1 148 | np.save(Embed_npy, embedding_matrix) 149 | print('Null word embeddings: %d' % (nb_words - count)) 150 | print('not Null word embeddings: %d' % count) 151 | print('embedding_matrix shape', embedding_matrix.shape) 152 | # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0)) 153 | return embedding_matrix 154 | 155 | 156 | df = pd.read_csv(input_file, encoding="utf-8") 157 | 158 | question1 = df['question1'].values 159 | question2 = df['question2'].values 160 | y = df['label'].values 161 | from keras.preprocessing.sequence import pad_sequences 162 | from keras.preprocessing.text import Tokenizer 163 | 164 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 165 | tokenizer.fit_on_texts(list(question1) + list(question2)) 166 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 167 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 168 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 169 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 170 | 171 | inpath="test1.txt" 172 | test_data1 = [] 173 | test_data2 = [] 174 | linenos=[] 175 | import jieba 176 | jieba.add_word('花呗') 177 | jieba.add_word('借呗') 178 | jieba.add_word('余额宝') 179 | 180 | def seg(text): 181 | seg_list = jieba.cut(text) 182 | return " ".join(seg_list) 183 | 184 | with open(inpath, 'r') as fin: 185 | for line in fin: 186 | lineno, sen1, sen2 = line.strip().split('\t') 187 | test_data1.append(seg(sen1)) 188 | test_data2.append(seg(sen2)) 189 | linenos.append(lineno) 190 | 191 | list_tokenized_question1 = tokenizer.texts_to_sequences(test_data1) 192 | list_tokenized_question2 = tokenizer.texts_to_sequences(test_data2) 193 | x_val_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 194 | x_val_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 195 | 196 | # import pickle 197 | # with open(word_index_path, 'wb') as fw: 198 | # pickle.dumps(tokenizer,fw) 199 | 200 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 201 | print("nb_words", nb_words) 202 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path) 203 | seed = 20180426 204 | cv_folds = 10 205 | from sklearn.model_selection import StratifiedKFold 206 | 207 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 208 | pred_oob = np.zeros(shape=(len(y), 1)) 209 | # print(pred_oob.shape) 210 | count = 0 211 | for ind_tr, ind_te in skf.split(X_train_q1, y): 212 | x_train_q1 = X_train_q1[ind_tr] 213 | x_train_q2 = X_train_q2[ind_tr] 214 | x_val_q1 = X_train_q1[ind_te] 215 | x_val_q2 = X_train_q2[ind_te] 216 | y_train = y[ind_tr] 217 | y_val = y[ind_te] 218 | model = get_model(embedding_matrix1, nb_words) 219 
| early_stopping = EarlyStopping(monitor='val_f1_score_metrics', patience=5, mode='max', verbose=1) 220 | bst_model_path = kernel_name + '_weight_%d.h5' % count 221 | model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_f1_score_metrics', mode='max', 222 | save_best_only=True, verbose=1, save_weights_only=True) 223 | hist = model.fit([x_train_q1, x_train_q2], y_train, 224 | validation_data=([x_val_q1, x_val_q2], y_val), 225 | epochs=6, batch_size=32, shuffle=True, 226 | class_weight={0: 1.2233, 1: 0.4472}, 227 | callbacks=[early_stopping, model_checkpoint,F1ScoreCallback()]) 228 | model.load_weights(bst_model_path) 229 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 230 | # y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 231 | pred_oob[ind_te] = y_predict 232 | # pred_oob += y_predict 233 | y_predict = (y_predict > 0.5).astype(int) 234 | recall = recall_score(y, y_predict) 235 | print(count, "recal", recall) 236 | precision = precision_score(y, y_predict) 237 | print(count, "precision", precision) 238 | accuracy = accuracy_score(y, y_predict) 239 | print(count, "accuracy ", accuracy) 240 | f1 = f1_score(y, y_predict) 241 | print(count, "f1", f1) 242 | count += 1 243 | pred_oob/=cv_folds 244 | pred_oob1 = (pred_oob > 0.5).astype(int) 245 | recall = recall_score(y, pred_oob1) 246 | print("recal", recall) 247 | precision = precision_score(y, pred_oob1) 248 | print("precision", precision) 249 | accuracy = accuracy_score(y, pred_oob1) 250 | print("accuracy", accuracy) 251 | f1 = f1_score(y, pred_oob1) 252 | print("f1", f1) 253 | -------------------------------------------------------------------------------- /baseline/keras_cnn_sim.py: -------------------------------------------------------------------------------- 1 | input_file = "../input/process.csv" 2 | w2vpath = '../data/baike.128.no_truncate.glove.txt' 3 | embedding_matrix_path = './temp_no_truncate.npy' 4 | kernel_name = "CNN_mutilwin" 5 | import pandas as pd 6 | import numpy as np 7 | import keras 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 9 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 10 | 11 | MAX_TEXT_LENGTH = 50 12 | MAX_FEATURES = 10000 13 | embedding_dims = 128 14 | dr = 0.2 15 | cnn_filters = 64 16 | kernel_sizes = [2, 3, 8, 9] 17 | from keras import backend as K 18 | 19 | 20 | def f1_score_metrics(y_true, y_pred): 21 | def recall(y_true, y_pred): 22 | """Recall metric. 23 | 24 | Only computes a batch-wise average of recall. 25 | 26 | Computes the recall, a metric for multi-label classification of 27 | how many relevant items are selected. 28 | """ 29 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 30 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 31 | recall = true_positives / (possible_positives + K.epsilon()) 32 | return recall 33 | 34 | def precision(y_true, y_pred): 35 | """Precision metric. 36 | 37 | Only computes a batch-wise average of precision. 38 | 39 | Computes the precision, a metric for multi-label classification of 40 | how many selected items are relevant. 
41 | """ 42 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 43 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 44 | precision = true_positives / (predicted_positives + K.epsilon()) 45 | return precision 46 | 47 | precision = precision(y_true, y_pred) 48 | recall = recall(y_true, y_pred) 49 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 50 | 51 | 52 | def get_model(embedding_matrix, nb_words): 53 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 54 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 55 | words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 56 | weights=[embedding_matrix], 57 | input_length=MAX_TEXT_LENGTH, 58 | trainable=True) 59 | embedded_sequences1=words_embedding_layer(input1_tensor) 60 | x1=[] 61 | for win in kernel_sizes: 62 | xi = keras.layers.Conv1D(filters=cnn_filters, 63 | filter_length=win, 64 | padding='same', 65 | activation='relu' 66 | )(embedded_sequences1) 67 | x1.append(xi) 68 | 69 | 70 | x1 = keras.layers.add(x1) 71 | x1 = keras.layers.GlobalMaxPooling1D()(x1) 72 | 73 | embedded_sequences2=words_embedding_layer(input2_tensor) 74 | x2 = [] 75 | for win in kernel_sizes: 76 | xi = keras.layers.Conv1D(filters=cnn_filters, 77 | filter_length=win, 78 | padding='same', 79 | activation='relu' 80 | )(embedded_sequences2) 81 | x2.append(xi) 82 | 83 | x2 = keras.layers.add(x2) 84 | x2 = keras.layers.GlobalMaxPooling1D()(x2) 85 | merge_layer = keras.layers.multiply([x1, x2]) 86 | merge_layer = keras.layers.Dropout(dr)(merge_layer) 87 | dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer) 88 | ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 89 | model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer) 90 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics]) 91 | model.summary() 92 | return model 93 | 94 | 95 | from tqdm import tqdm 96 | import mmap 97 | import os 98 | 99 | 100 | def get_num_lines(file_path): 101 | fp = open(file_path, "r+") 102 | buf = mmap.mmap(fp.fileno(), 0) 103 | lines = 0 104 | while buf.readline(): 105 | lines += 1 106 | return lines 107 | 108 | 109 | def get_embedding_matrix(word_index, Emed_path, Embed_npy): 110 | if (os.path.exists(Embed_npy)): 111 | return np.load(Embed_npy) 112 | print('Indexing word vectors') 113 | embeddings_index = {} 114 | file_line = get_num_lines(Emed_path) 115 | print('lines ', file_line) 116 | with open(Emed_path, encoding='utf-8') as f: 117 | for line in tqdm(f, total=file_line): 118 | values = line.split() 119 | if (len(values) < embedding_dims): 120 | print(values) 121 | continue 122 | word = ' '.join(values[:-embedding_dims]) 123 | coefs = np.asarray(values[-embedding_dims:], dtype='float32') 124 | embeddings_index[word] = coefs 125 | f.close() 126 | 127 | print('Total %s word vectors.' 
% len(embeddings_index)) 128 | print('Preparing embedding matrix') 129 | nb_words = MAX_FEATURES # min(MAX_FEATURES, len(word_index)) 130 | all_embs = np.stack(embeddings_index.values()) 131 | print(all_embs.shape) 132 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 133 | embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims)) 134 | 135 | # embedding_matrix = np.zeros((nb_words, embedding_dims)) 136 | count = 0 137 | for word, i in tqdm(word_index.items()): 138 | if i >= MAX_FEATURES: 139 | continue 140 | embedding_vector = embeddings_index.get(word) 141 | if embedding_vector is not None: 142 | # words not found in embedding index will be all-zeros. 143 | embedding_matrix[i] = embedding_vector 144 | count += 1 145 | np.save(Embed_npy, embedding_matrix) 146 | print('Null word embeddings: %d' % (nb_words - count)) 147 | print('not Null word embeddings: %d' % count) 148 | print('embedding_matrix shape', embedding_matrix.shape) 149 | # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0)) 150 | return embedding_matrix 151 | 152 | 153 | df = pd.read_csv(input_file, encoding="utf-8") 154 | 155 | question1 = df['question1'].values 156 | question2 = df['question2'].values 157 | y = df['label'].values 158 | from keras.preprocessing.sequence import pad_sequences 159 | from keras.preprocessing.text import Tokenizer 160 | 161 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 162 | tokenizer.fit_on_texts(list(question1) + list(question2)) 163 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 164 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 165 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 166 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 167 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 168 | print("nb_words", nb_words) 169 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path) 170 | seed = 20180426 171 | cv_folds = 10 172 | from sklearn.model_selection import StratifiedKFold 173 | 174 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 175 | pred_oob = np.zeros(shape=(len(y), 1)) 176 | # print(pred_oob.shape) 177 | count = 0 178 | for ind_tr, ind_te in skf.split(X_train_q1, y): 179 | x_train_q1 = X_train_q1[ind_tr] 180 | x_train_q2 = X_train_q2[ind_tr] 181 | x_val_q1 = X_train_q1[ind_te] 182 | x_val_q2 = X_train_q2[ind_te] 183 | y_train = y[ind_tr] 184 | y_val = y[ind_te] 185 | model = get_model(embedding_matrix1, nb_words) 186 | early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1) 187 | bst_model_path = kernel_name + '_weight_%d.h5' % count 188 | model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 189 | save_best_only=True, verbose=1, save_weights_only=True) 190 | hist = model.fit([x_train_q1, x_train_q2], y_train, 191 | validation_data=([x_val_q1, x_val_q2], y_val), 192 | epochs=20, batch_size=256, shuffle=True, 193 | class_weight={0: 1.2233, 1: 0.4472}, 194 | callbacks=[early_stopping, model_checkpoint]) 195 | model.load_weights(bst_model_path) 196 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 197 | pred_oob[ind_te] = y_predict 198 | y_predict = (y_predict > 0.5).astype(int) 199 | recall = recall_score(y_val, y_predict) 200 | print(count, "recal", recall) 201 | precision = precision_score(y_val, y_predict) 202 | print(count, "precision", precision) 
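# --- annotation (hedged illustration, not part of the original script) ---
# pred_oob collects out-of-fold probabilities: each fold writes its validation
# predictions into the rows indexed by ind_te, so once the loop finishes every
# training pair carries one prediction from a model that never saw it, and the
# metrics printed after the loop are cross-validated scores over the full set.
# The 0.5 cut-off used here is a default. One optional refinement (a sketch only,
# relying solely on the numpy and sklearn.metrics names already imported above)
# is to sweep the decision threshold on the finished pred_oob array and keep the
# value that maximizes F1; best_f1_threshold below is a hypothetical helper name:
#
#   def best_f1_threshold(y_true, oob_probs):
#       thresholds = np.arange(0.05, 0.95, 0.01)   # candidate cut-offs
#       scores = [f1_score(y_true, (oob_probs > t).astype(int)) for t in thresholds]
#       return thresholds[int(np.argmax(scores))]  # cut-off with the highest F1
#
#   # usage after the CV loop: thr = best_f1_threshold(y, pred_oob)
# --- end annotation ---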
203 | accuracy = accuracy_score(y_val, y_predict) 204 | print(count, "accuracy ", accuracy) 205 | f1 = f1_score(y_val, y_predict) 206 | print(count, "f1", f1) 207 | count += 1 208 | pred_oob1 = (pred_oob > 0.5).astype(int) 209 | recall = recall_score(y, pred_oob1) 210 | print("recal", recall) 211 | precision = precision_score(y, pred_oob1) 212 | print("precision", precision) 213 | accuracy = accuracy_score(y, pred_oob1) 214 | print("accuracy", accuracy) 215 | f1 = f1_score(y, pred_oob1) 216 | print("f1", f1) 217 | -------------------------------------------------------------------------------- /baseline/keras_lstm_sim.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install tensorflow 3 | pip install keras 4 | pip install numpy 5 | pip install tqdm 6 | 7 | """ 8 | input_file = "../input/process.csv" 9 | w2vpath = '../data/baike.128.truncate.glove.txt' 10 | embedding_matrix_path = './temp.npy' 11 | kernel_name="lstm" 12 | import pandas as pd 13 | import numpy as np 14 | import keras 15 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 16 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 17 | from keras.optimizers import Adam 18 | MAX_TEXT_LENGTH = 50 19 | MAX_FEATURES = 10000 20 | embedding_dims = 128 21 | dr = 0.2 22 | 23 | 24 | from keras import backend as K 25 | 26 | def f1_score_metrics(y_true, y_pred): 27 | def recall(y_true, y_pred): 28 | """Recall metric. 29 | 30 | Only computes a batch-wise average of recall. 31 | 32 | Computes the recall, a metric for multi-label classification of 33 | how many relevant items are selected. 34 | """ 35 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 36 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 37 | recall = true_positives / (possible_positives + K.epsilon()) 38 | return recall 39 | 40 | def precision(y_true, y_pred): 41 | """Precision metric. 42 | 43 | Only computes a batch-wise average of precision. 44 | 45 | Computes the precision, a metric for multi-label classification of 46 | how many selected items are relevant. 
47 | """ 48 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 49 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 50 | precision = true_positives / (predicted_positives + K.epsilon()) 51 | return precision 52 | 53 | precision = precision(y_true, y_pred) 54 | recall = recall(y_true, y_pred) 55 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 56 | 57 | def get_model(embedding_matrix,nb_words): 58 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 59 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 60 | words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 61 | weights=[embedding_matrix], 62 | input_length=MAX_TEXT_LENGTH, 63 | trainable=True) 64 | seq_embedding_layer = keras.layers.LSTM(256, activation='tanh',recurrent_dropout=dr) 65 | seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor)) 66 | merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)]) 67 | dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer) 68 | ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 69 | model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer) 70 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy",f1_score_metrics]) 71 | model.summary() 72 | return model 73 | 74 | from tqdm import tqdm 75 | import mmap 76 | import os 77 | 78 | 79 | def get_num_lines(file_path): 80 | fp = open(file_path, "r+") 81 | buf = mmap.mmap(fp.fileno(), 0) 82 | lines = 0 83 | while buf.readline(): 84 | lines += 1 85 | return lines 86 | 87 | 88 | def get_embedding_matrix(word_index, Emed_path, Embed_npy): 89 | if (os.path.exists(Embed_npy)): 90 | return np.load(Embed_npy) 91 | print('Indexing word vectors') 92 | embeddings_index = {} 93 | file_line = get_num_lines(Emed_path) 94 | print('lines ', file_line) 95 | with open(Emed_path, encoding='utf-8') as f: 96 | for line in tqdm(f, total=file_line): 97 | values = line.split() 98 | if(len(values)= MAX_FEATURES: 118 | continue 119 | embedding_vector = embeddings_index.get(word) 120 | if embedding_vector is not None: 121 | # words not found in embedding index will be all-zeros. 
122 | embedding_matrix[i] = embedding_vector 123 | count+=1 124 | np.save(Embed_npy, embedding_matrix) 125 | print('Null word embeddings: %d' % (nb_words-count)) 126 | print('not Null word embeddings: %d' % count) 127 | print('embedding_matrix shape', embedding_matrix.shape) 128 | # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0)) 129 | return embedding_matrix 130 | 131 | 132 | df = pd.read_csv(input_file, encoding="utf-8") 133 | 134 | question1 = df['question1'].values 135 | question2 = df['question2'].values 136 | y = df['label'].values 137 | from keras.preprocessing.sequence import pad_sequences 138 | from keras.preprocessing.text import Tokenizer 139 | 140 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 141 | tokenizer.fit_on_texts(list(question1) + list(question2)) 142 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 143 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 144 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 145 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 146 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 147 | print("nb_words",nb_words) 148 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path) 149 | seed = 20180426 150 | cv_folds = 10 151 | from sklearn.model_selection import StratifiedKFold 152 | 153 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 154 | pred_oob = np.zeros(shape=(len(y), 1)) 155 | # print(pred_oob.shape) 156 | count = 0 157 | for ind_tr, ind_te in skf.split(X_train_q1, y): 158 | x_train_q1 = X_train_q1[ind_tr] 159 | x_train_q2 = X_train_q2[ind_tr] 160 | x_val_q1 = X_train_q1[ind_te] 161 | x_val_q2 = X_train_q2[ind_te] 162 | y_train = y[ind_tr] 163 | y_val = y[ind_te] 164 | model = get_model(embedding_matrix1,nb_words) 165 | early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1) 166 | bst_model_path =kernel_name+'_weight_%d.h5' % count 167 | model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 168 | save_best_only=True, verbose=1, save_weights_only=True) 169 | hist = model.fit([x_train_q1,x_train_q2], y_train, 170 | validation_data=([x_val_q1,x_val_q2], y_val), 171 | epochs=5, batch_size=256, shuffle=True, 172 | callbacks=[early_stopping, model_checkpoint]) 173 | model.load_weights(bst_model_path) 174 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 175 | pred_oob[ind_te] = y_predict 176 | y_predict = (y_predict > 0.5).astype(int) 177 | recall = recall_score(y_val, y_predict) 178 | print(count, "recal", recall) 179 | precision = precision_score(y_val, y_predict) 180 | print(count, "precision", precision) 181 | accuracy = accuracy_score(y_val, y_predict) 182 | print(count, "accuracy ", accuracy) 183 | f1 = f1_score(y_val, y_predict) 184 | print(count, "f1", f1) 185 | count += 1 186 | pred_label = (pred_oob > 0.5).astype(int) 187 | recall = recall_score(y, pred_label) 188 | print("recal", recall) 189 | precision = precision_score(y, pred_label) 190 | print("precision", precision) 191 | accuracy = accuracy_score(y, pred_label) 192 | print("accuracy", accuracy) 193 | f1 = f1_score(y, pred_label) 194 | print("f1", f1) 195 | -------------------------------------------------------------------------------- /baseline/keras_seq2seq_sim.py: -------------------------------------------------------------------------------- 1 | from keras.layers import 
Bidirectional, GRU, Dropout, Merge 2 | 3 | input_file = "../input/process.csv" 4 | w2vpath = '../data/baike.128.truncate.glove.txt' 5 | embedding_matrix_path = './temp.npy' 6 | kernel_name="seq2seq" 7 | import pandas as pd 8 | import numpy as np 9 | import keras 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback 11 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 12 | 13 | MAX_TEXT_LENGTH = 50 14 | MAX_FEATURES = 10000 15 | embedding_dims = 128 16 | dr = 0.2 17 | lstm_size=64 18 | from keras import backend as K 19 | def f1_score_metrics(y_true, y_pred): 20 | def recall(y_true, y_pred): 21 | """Recall metric. 22 | 23 | Only computes a batch-wise average of recall. 24 | 25 | Computes the recall, a metric for multi-label classification of 26 | how many relevant items are selected. 27 | """ 28 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 29 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 30 | recall = true_positives / (possible_positives + K.epsilon()) 31 | return recall 32 | 33 | def precision(y_true, y_pred): 34 | """Precision metric. 35 | 36 | Only computes a batch-wise average of precision. 37 | 38 | Computes the precision, a metric for multi-label classification of 39 | how many selected items are relevant. 40 | """ 41 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 42 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 43 | precision = true_positives / (predicted_positives + K.epsilon()) 44 | return precision 45 | 46 | precision = precision(y_true, y_pred) 47 | recall = recall(y_true, y_pred) 48 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 49 | 50 | def exponent_neg_manhattan_distance(left, right): 51 | return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True)) 52 | def distance(left, right): 53 | return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True)) 54 | def get_model(embedding_matrix,nb_words): 55 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 56 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 57 | embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 58 | weights=[embedding_matrix], 59 | input_length=MAX_TEXT_LENGTH, 60 | trainable=True) 61 | shared_encode = Bidirectional(GRU(lstm_size, return_sequences=False)) 62 | embedded_sequences = embedding_layer(input1_tensor) 63 | l_lstm1 = shared_encode(embedded_sequences) 64 | l_lstm1 = Dropout(dr)(l_lstm1) 65 | 66 | embedded_sequences1 = embedding_layer(input2_tensor) 67 | l_lstm2 = shared_encode(embedded_sequences1) 68 | l_lstm2 = Dropout(dr)(l_lstm2) 69 | 70 | # Calculates the distance as defined by the MaLSTM model 71 | malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), 72 | output_shape=lambda x: (x[0][0], 1))([l_lstm1, l_lstm2]) 73 | 74 | # dense1_layer = keras.layers.Dense(64, activation='relu')(malstm_distance) 75 | # ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 76 | model = keras.models.Model([input1_tensor, input2_tensor], [malstm_distance]) 77 | model.compile(loss='mean_squared_error', optimizer='adam', metrics=["accuracy",f1_score_metrics]) 78 | model.summary() 79 | return model 80 | 81 | 82 | 83 | 84 | 85 | from tqdm import tqdm 86 | import mmap 87 | import os 88 | 89 | 90 | def get_num_lines(file_path): 91 | fp = open(file_path, "r+") 92 | buf = mmap.mmap(fp.fileno(), 0) 93 | lines = 0 94 | while buf.readline(): 95 | lines += 1 96 | return lines 97 | 98 | 99 | 
def get_embedding_matrix(word_index, Emed_path, Embed_npy): 100 | if (os.path.exists(Embed_npy)): 101 | return np.load(Embed_npy) 102 | print('Indexing word vectors') 103 | embeddings_index = {} 104 | file_line = get_num_lines(Emed_path) 105 | print('lines ', file_line) 106 | with open(Emed_path, encoding='utf-8') as f: 107 | for line in tqdm(f, total=file_line): 108 | values = line.split() 109 | if(len(values)<128): 110 | print(values) 111 | continue 112 | word = ' '.join(values[:-128]) 113 | coefs = np.asarray(values[-128:], dtype='float32') 114 | embeddings_index[word] = coefs 115 | f.close() 116 | 117 | print('Total %s word vectors.' % len(embeddings_index)) 118 | print('Preparing embedding matrix') 119 | nb_words = MAX_FEATURES#min(MAX_FEATURES, len(word_index)) 120 | all_embs = np.stack(embeddings_index.values()) 121 | print(all_embs.shape) 122 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 123 | embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims)) 124 | 125 | # embedding_matrix = np.zeros((nb_words, embedding_dims)) 126 | count=0 127 | for word, i in tqdm(word_index.items()): 128 | if i >= MAX_FEATURES: 129 | continue 130 | embedding_vector = embeddings_index.get(word) 131 | if embedding_vector is not None: 132 | # words not found in embedding index will be all-zeros. 133 | embedding_matrix[i] = embedding_vector 134 | count+=1 135 | np.save(Embed_npy, embedding_matrix) 136 | print('Null word embeddings: %d' % (nb_words-count)) 137 | print('not Null word embeddings: %d' % count) 138 | print('embedding_matrix shape', embedding_matrix.shape) 139 | # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0)) 140 | return embedding_matrix 141 | 142 | 143 | df = pd.read_csv(input_file, encoding="utf-8") 144 | 145 | question1 = df['question1'].values 146 | question2 = df['question2'].values 147 | y = df['label'].values 148 | from keras.preprocessing.sequence import pad_sequences 149 | from keras.preprocessing.text import Tokenizer 150 | 151 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 152 | tokenizer.fit_on_texts(list(question1) + list(question2)) 153 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 154 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 155 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 156 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 157 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 158 | print("nb_words",nb_words) 159 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path) 160 | seed = 20180426 161 | cv_folds = 10 162 | from sklearn.model_selection import StratifiedKFold 163 | 164 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 165 | pred_oob = np.zeros(shape=(len(y), 1)) 166 | # print(pred_oob.shape) 167 | count = 0 168 | for ind_tr, ind_te in skf.split(X_train_q1, y): 169 | x_train_q1 = X_train_q1[ind_tr] 170 | x_train_q2 = X_train_q2[ind_tr] 171 | x_val_q1 = X_train_q1[ind_te] 172 | x_val_q2 = X_train_q2[ind_te] 173 | y_train = y[ind_tr] 174 | y_val = y[ind_te] 175 | model = get_model(embedding_matrix1,nb_words) 176 | early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min') 177 | bst_model_path =kernel_name+'_weight_%d.h5' % count 178 | model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 179 | save_best_only=True, verbose=1, save_weights_only=True) 180 | hist = 
model.fit([x_train_q1,x_train_q2], y_train, 181 | validation_data=([x_val_q1,x_val_q2], y_val), 182 | epochs=15, batch_size=32, shuffle=True, 183 | class_weight={0: 1.2233, 1: 0.4472}, 184 | callbacks=[early_stopping, model_checkpoint]) 185 | model.load_weights(bst_model_path) 186 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 187 | pred_oob[ind_te] = y_predict 188 | y_predict = (y_predict > 0.5).astype(int) 189 | recall = recall_score(y_val, y_predict) 190 | print(count, "recal", recall) 191 | precision = precision_score(y_val, y_predict) 192 | print(count, "precision", precision) 193 | accuracy = accuracy_score(y_val, y_predict) 194 | print(count, "accuracy ", accuracy) 195 | f1 = f1_score(y_val, y_predict) 196 | print(count, "f1", f1) 197 | count += 1 198 | pred_oob1 = (pred_oob > 0.5).astype(int) 199 | recall = recall_score(y, pred_oob1) 200 | print("recal", recall) 201 | precision = precision_score(y, pred_oob1) 202 | print("precision", precision) 203 | accuracy = accuracy_score(y, pred_oob1) 204 | print("accuracy", accuracy) 205 | f1 = f1_score(y, pred_oob1) 206 | print("f1", f1) 207 | -------------------------------------------------------------------------------- /baseline/seg.py: -------------------------------------------------------------------------------- 1 | #py3 2 | import sys 3 | import jieba 4 | import re 5 | import numpy as np 6 | jieba.add_word('花呗') 7 | jieba.add_word('借呗') 8 | jieba.add_word('余额宝') 9 | 10 | input_file="..\\data\\answers.txt" 11 | # input_file="..\\input\\atec_nlp_sim_train.csv" 12 | output_file="fc2.txt" 13 | def seg(text): 14 | seg_list = jieba.cut(text.strip()) 15 | return " ".join(seg_list) 16 | 17 | # dict_file_name="../data/dict.txt" 18 | # jieba.load_userdict(input_file) 19 | # # jieba.add_word('花呗') 20 | # df = pd.read_csv(input_file,encoding="utf-8") 21 | # q=df["question1"] 22 | # for s in q: 23 | # seg_list=jieba.cut(s) 24 | # print("/ ".join(seg_list)) 25 | # break 26 | 27 | # a=[0.1,0.5,0.8] 28 | # l=[0,1,1] 29 | # b=np.array(a) 30 | # d = (b>0.5).astype(int) 31 | # # d=np.stack((b,c),axis=1) 32 | # print(d) 33 | # print(d.shape) 34 | # from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score 35 | # s=f1_score(l,d) 36 | # # b=d.argmax(axis=-1) 37 | # print(s) 38 | special_character_removal = re.compile(r'[@#$%^&*,.【】[]{};‘,。、?!? 
\\/"\']', re.IGNORECASE) 39 | replace_numbers = re.compile(r'\d+', re.IGNORECASE) 40 | if __name__ == '__main__': 41 | 42 | with open(input_file,encoding="utf-8") as fp,open(output_file,"w",encoding="utf-8") as fw: 43 | for line in fp: 44 | line = special_character_removal.sub('', line) 45 | line = replace_numbers.sub('NUMBER_REPLACE', line) 46 | lines=line.strip().split(" ++$++ ") 47 | if(len(lines)==3): 48 | line=lines[1] 49 | fw.write(seg(line)) 50 | fw.write("\n") 51 | 52 | 53 | -------------------------------------------------------------------------------- /baseline/tf_bilstm_sim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | #py2 4 | from __future__ import print_function 5 | input_file = "../input/process.csv" 6 | w2vpath = '../data/baike.128.no_truncate.glove.txt' 7 | embedding_matrix_path = './temp_no_truncate.npy' 8 | kernel_name="bilstm" 9 | import pandas as pd 10 | import numpy as np 11 | import os 12 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 13 | import tensorflow as tf 14 | 15 | MAX_TEXT_LENGTH = 50 16 | MAX_FEATURES = 10000 17 | embedding_dims = 128 18 | dr = 0.2 19 | batch_size = 256 20 | save_dir = 'checkpoints/textrnn' 21 | save_path = os.path.join(save_dir, 'best_validation') # 最佳验证结果保存路径 22 | 23 | class TRNNConfig(object): 24 | """RNN配置参数""" 25 | 26 | # 模型参数 27 | embedding_dim = 64 # 词向量维度 28 | seq_length = MAX_TEXT_LENGTH # 序列长度 29 | num_classes = 2 # 类别数 30 | vocab_size = MAX_FEATURES # 词汇表达小 31 | 32 | num_layers= 1 # 隐藏层层数 33 | hidden_dim = 256 # 隐藏层神经元 34 | rnn = 'gru' # lstm 或 gru 35 | 36 | dropout_keep_prob = 0.8 # dropout保留比例 37 | learning_rate = 1e-3 # 学习率 38 | 39 | batch_size = 256 # 每批训练大小 40 | num_epochs = 10 # 总迭代轮次 41 | 42 | print_per_batch = 100 # 每多少轮输出一次结果 43 | save_per_batch = 10 # 每多少轮存入tensorboard 44 | 45 | class TextRNN(): 46 | def __init__(self, 47 | embedding_matrix=None, 48 | config=TRNNConfig): 49 | self.config = config 50 | def lstm_cell(): # lstm核 51 | return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True) 52 | 53 | def gru_cell(): # gru核 54 | return tf.contrib.rnn.GRUCell(self.config.hidden_dim) 55 | 56 | def dropout(): # 为每一个rnn核后面加一个dropout层 57 | if (self.config.rnn == 'lstm'): 58 | cell = lstm_cell() 59 | else: 60 | cell = gru_cell() 61 | return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 62 | # Placeholders for input, output and dropout 63 | self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x') 64 | self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y') 65 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') 66 | # 词向量映射 67 | with tf.device('/cpu:0'): 68 | embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim]) 69 | embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x) 70 | 71 | with tf.name_scope("rnn"): 72 | # 多层rnn网络 73 | cells = [dropout() for _ in range(self.config.num_layers)] 74 | rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True) 75 | 76 | _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32) 77 | last = _outputs[:, -1, :] # 取最后一个时序输出作为结果 78 | 79 | with tf.name_scope("score"): 80 | # 全连接层,后面接dropout以及relu激活 81 | fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1') 82 | fc = tf.contrib.layers.dropout(fc, self.keep_prob) 83 | fc = tf.nn.relu(fc) 84 | 85 | # 
分类器 86 | self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2') 87 | self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别 88 | 89 | with tf.name_scope("optimize"): 90 | # 损失函数,交叉熵 91 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 92 | self.loss = tf.reduce_mean(cross_entropy) 93 | # 优化器 94 | self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 95 | 96 | with tf.name_scope("accuracy"): 97 | # 准确率 98 | correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls) 99 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 100 | 101 | 102 | def train(model,config) : 103 | print("Configuring TensorBoard and Saver...") 104 | # 配置 Tensorboard,重新训练时,请将tensorboard文件夹删除,不然图会覆盖 105 | tensorboard_dir = 'tensorboard/textrnn' 106 | if not os.path.exists(tensorboard_dir): 107 | os.makedirs(tensorboard_dir) 108 | 109 | tf.summary.scalar("loss", model.loss) 110 | tf.summary.scalar("accuracy", model.acc) 111 | merged_summary = tf.summary.merge_all() 112 | writer = tf.summary.FileWriter(tensorboard_dir) 113 | 114 | # 配置 Saver 115 | saver = tf.train.Saver() 116 | if not os.path.exists(save_dir): 117 | os.makedirs(save_dir) 118 | 119 | # 创建session 120 | session = tf.Session() 121 | session.run(tf.global_variables_initializer()) 122 | writer.add_graph(session.graph) 123 | 124 | print('Training and evaluating...') 125 | total_batch = 0 # 总批次 126 | best_acc_val = 0.0 # 最佳验证集准确率 127 | last_improved = 0 # 记录上一次提升批次 128 | require_improvement = 1000 # 如果超过1000轮未提升,提前结束训练 129 | 130 | flag = False 131 | for epoch in range(config.num_epochs): 132 | print('Epoch:', epoch + 1) 133 | batch_train = batch_iter(x_train, y_train, config.batch_size) 134 | for x_batch, y_batch in batch_train: 135 | feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob) 136 | if total_batch % config.save_per_batch == 0: 137 | # 每多少轮次将训练结果写入tensorboard scalar 138 | s = session.run(merged_summary, feed_dict=feed_dict) 139 | writer.add_summary(s, total_batch) 140 | if total_batch % config.print_per_batch == 0: 141 | # 每多少轮次输出在训练集和验证集上的性能 142 | feed_dict[model.keep_prob] = 1.0 143 | loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict) 144 | loss_val, acc_val = evaluate(session, x_val, y_val) # todo 145 | 146 | if acc_val > best_acc_val: 147 | # 保存最好结果 148 | best_acc_val = acc_val 149 | last_improved = total_batch 150 | saver.save(sess=session, save_path=save_path) 151 | improved_str = '*' 152 | else: 153 | improved_str = '' 154 | 155 | session.run(model.optim, feed_dict=feed_dict) # 运行优化 156 | total_batch += 1 157 | 158 | df = pd.read_csv(input_file, encoding="utf-8") 159 | 160 | question1 = df['question1'].values 161 | question2 = df['question2'].values 162 | y = df['label'].values 163 | from keras.preprocessing.sequence import pad_sequences 164 | from keras.preprocessing.text import Tokenizer 165 | 166 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 167 | tokenizer.fit_on_texts(list(question1) + list(question2)) 168 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 169 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 170 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 171 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 172 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 173 | print("nb_words",nb_words) 174 | seed = 20180426 175 | cv_folds = 10 176 | from 
sklearn.model_selection import StratifiedKFold 177 | 178 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 179 | pred_oob = np.zeros(shape=(len(y), 1)) 180 | # print(pred_oob.shape) 181 | count = 0 182 | for ind_tr, ind_te in skf.split(X_train_q1, y): 183 | x_train_q1 = X_train_q1[ind_tr] 184 | x_train_q2 = X_train_q2[ind_tr] 185 | x_val_q1 = X_train_q1[ind_te] 186 | x_val_q2 = X_train_q2[ind_te] 187 | y_train = y[ind_tr] 188 | y_val = y[ind_te] 189 | 190 | # model = get_model(embedding_matrix1,nb_words) 191 | # early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1) 192 | # bst_model_path =kernel_name+'_weight_%d.h5' % count 193 | # model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min', 194 | # save_best_only=True, verbose=1, save_weights_only=True) 195 | # hist = model.fit([x_train_q1,x_train_q2], y_train, 196 | # validation_data=([x_val_q1,x_val_q2], y_val), 197 | # epochs=6, batch_size=256, shuffle=True, 198 | # class_weight={0: 1.3233, 1: 0.4472}, 199 | # callbacks=[early_stopping, model_checkpoint]) 200 | # model.load_weights(bst_model_path) 201 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1) 202 | pred_oob[ind_te] = y_predict 203 | y_predict = (y_predict > 0.5).astype(int) 204 | recall = recall_score(y_val, y_predict) 205 | print(count, "recal", recall) 206 | precision = precision_score(y_val, y_predict) 207 | print(count, "precision", precision) 208 | accuracy = accuracy_score(y_val, y_predict) 209 | print(count, "accuracy ", accuracy) 210 | f1 = f1_score(y_val, y_predict) 211 | print(count, "f1", f1) 212 | count += 1 213 | pred_oob = (pred_oob > 0.5).astype(int) 214 | recall = recall_score(y, pred_oob) 215 | print("recal", recall) 216 | precision = precision_score(y, pred_oob) 217 | print("precision", precision) 218 | accuracy = accuracy_score(y, pred_oob) 219 | print("accuracy", accuracy) 220 | f1 = f1_score(y, pred_oob) 221 | print("f1", f1) 222 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | input_file = "./input/process.csv" 2 | w2vpath = './data/baike.128.no_truncate.glove.txt' 3 | embedding_matrix_path = './baseline/temp.npy' 4 | kernel_name="bilstm" 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 9 | import tensorflow as tf 10 | 11 | MAX_TEXT_LENGTH = 50 12 | MAX_FEATURES = 10000 13 | embedding_dims = 128 14 | dr = 0.2 15 | batch_size = 256 16 | 17 | class TRNNConfig(object): 18 | """RNN配置参数""" 19 | 20 | # 模型参数 21 | embedding_dim = 64 # 词向量维度 22 | seq_length = MAX_TEXT_LENGTH # 序列长度 23 | num_classes = 1 # 类别数 24 | vocab_size = MAX_FEATURES # 词汇表达小 25 | 26 | num_layers= 1 # 隐藏层层数 27 | hidden_dim = 256 # 隐藏层神经元 28 | rnn = 'gru' # lstm 或 gru 29 | fc_hidden_dim=64 30 | dropout_keep_prob = 0.8 # dropout保留比例 31 | learning_rate = 1e-3 # 学习率 32 | 33 | batch_size = 256 # 每批训练大小 34 | num_epochs = 50 # 总迭代轮次 35 | early_stop=5 36 | 37 | print_per_batch = 1 # 每多少轮输出一次结果 38 | save_per_batch = 10 # 每多少轮存入tensorboard 39 | 40 | num_checkpoints=5 #Number of checkpoints to store (default: 5) 41 | 42 | class_weight0=1.0 43 | class_weight1=2.3 44 | -------------------------------------------------------------------------------- /feature_engineering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detecting 
duplicate quora questions 3 | feature engineering 4 | @author: Abhishek Thakur 5 | """ 6 | input_file = "../input/process.csv" 7 | w2vpath = '../data/baike.128.no_truncate.glove.txt' 8 | # import cPickle 9 | import pandas as pd 10 | import numpy as np 11 | import gensim 12 | from fuzzywuzzy import fuzz 13 | # from nltk.corpus import stopwords 14 | from tqdm import tqdm 15 | from scipy.stats import skew, kurtosis 16 | from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis 17 | # from nltk import word_tokenize 18 | # stop_words = stopwords.words('english') 19 | stop_words=['的',',','。'] 20 | 21 | def wmd(s1, s2): 22 | s1 = s1.split() 23 | s2 = s2.split() 24 | # stop_words = stopwords.words('english') 25 | s1 = [w for w in s1 if w not in stop_words] 26 | s2 = [w for w in s2 if w not in stop_words] 27 | return model.wmdistance(s1, s2) 28 | 29 | 30 | def norm_wmd(s1, s2): 31 | s1 = s1.lower().split() 32 | s2 = s2.lower().split() 33 | # stop_words = stopwords.words('english') 34 | s1 = [w for w in s1 if w not in stop_words] 35 | s2 = [w for w in s2 if w not in stop_words] 36 | return norm_model.wmdistance(s1, s2) 37 | 38 | 39 | def sent2vec(s): 40 | words = s 41 | # words = word_tokenize(words) 42 | words = [w for w in words if not w in stop_words] 43 | words = [w for w in words if w.isalpha()] 44 | M = [] 45 | for w in words: 46 | try: 47 | M.append(model[w]) 48 | except: 49 | continue 50 | M = np.array(M) 51 | v = M.sum(axis=0) 52 | return v / np.sqrt((v ** 2).sum()) 53 | 54 | 55 | data = pd.read_csv(input_file) 56 | # data = data.drop(['id', 'qid1', 'qid2'], axis=1) 57 | 58 | 59 | data['len_q1'] = data.question1.apply(lambda x: len(str(x))) 60 | data['len_q2'] = data.question2.apply(lambda x: len(str(x))) 61 | data['diff_len'] = data.len_q1 - data.len_q2 62 | data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', ''))))) 63 | data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', ''))))) 64 | data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split())) 65 | data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split())) 66 | data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1) 67 | # data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1) 68 | data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1) 69 | data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1) 70 | data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 71 | data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1) 72 | data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 73 | data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1) 74 | 75 | 76 | model = gensim.models.KeyedVectors.load_word2vec_format('../data/vectors.txt', binary=False) 77 | data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1) 78 | 79 | 80 | norm_model = gensim.models.KeyedVectors.load_word2vec_format('../data/vectors.txt', 
binary=False) 81 | norm_model.init_sims(replace=True) 82 | data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1) 83 | 84 | question1_vectors = np.zeros((data.shape[0], 300)) 85 | error_count = 0 86 | 87 | for i, q in tqdm(enumerate(data.question1.values)): 88 | question1_vectors[i, :] = sent2vec(q) 89 | 90 | question2_vectors = np.zeros((data.shape[0], 300)) 91 | for i, q in tqdm(enumerate(data.question2.values)): 92 | question2_vectors[i, :] = sent2vec(q) 93 | 94 | data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 95 | np.nan_to_num(question2_vectors))] 96 | 97 | data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 98 | np.nan_to_num(question2_vectors))] 99 | 100 | data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 101 | np.nan_to_num(question2_vectors))] 102 | 103 | data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 104 | np.nan_to_num(question2_vectors))] 105 | 106 | data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 107 | np.nan_to_num(question2_vectors))] 108 | 109 | data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), 110 | np.nan_to_num(question2_vectors))] 111 | 112 | data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), 113 | np.nan_to_num(question2_vectors))] 114 | 115 | data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)] 116 | data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)] 117 | data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)] 118 | data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)] 119 | 120 | # cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1) 121 | # cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1) 122 | 123 | data.to_csv('data/quora_features.csv', index=False) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas 4 | scikit-learn 5 | Keras>=2.0.0 6 | tqdm 7 | 8 | -------------------------------------------------------------------------------- /tf_TextCNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from config import * 3 | 4 | filter_sizes=[2,3,8,9] 5 | num_filters=3 6 | class TextRNN(): 7 | def __init__(self, 8 | embedding_matrix=None, 9 | config=TRNNConfig()): 10 | self.config = config 11 | 12 | def lstm_cell(): # lstm核 13 | return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True) 14 | 15 | def gru_cell(): # gru核 16 | return tf.contrib.rnn.GRUCell(self.config.hidden_dim) 17 | 18 | def dropout(): # 为每一个rnn核后面加一个dropout层 19 | if (self.config.rnn == 'lstm'): 20 | cell = lstm_cell() 21 | else: 22 | cell = gru_cell() 23 | return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 24 | 25 | # Placeholders for input, output and dropout 26 | self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1') 27 | self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2') 28 | self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y') 29 | self.keep_prob = 
tf.placeholder(tf.float32, name='keep_prob') 30 | # 词向量映射 31 | with tf.device('/cpu:0'): 32 | W = tf.Variable( 33 | tf.random_uniform([config.vocab_size, config.embedding_dim], -1.0, 1.0), 34 | name="W") 35 | self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x1) 36 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 37 | # W = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True) 38 | 39 | pooled_outputs = [] 40 | for i, filter_size in enumerate(filter_sizes): 41 | with tf.name_scope("conv-maxpool-%s" % filter_size): 42 | # Convolution Layer 43 | filter_shape = [filter_size, config.embedding_dim, 1, num_filters] 44 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 45 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 46 | conv = tf.nn.conv2d( 47 | self.embedded_chars_expanded, 48 | W, 49 | strides=[1, 1, 1, 1], 50 | padding="VALID", 51 | name="conv") 52 | # Apply nonlinearity 53 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 54 | # Max-pooling over the outputs 55 | pooled = tf.nn.max_pool( 56 | h, 57 | ksize=[1, config.vocab_size - filter_size + 1, 1, 1], 58 | strides=[1, 1, 1, 1], 59 | padding='VALID', 60 | name="pool") 61 | pooled_outputs.append(pooled) 62 | 63 | 64 | # Combine all the pooled features 65 | num_filters_total = num_filters * len(filter_sizes) 66 | self.h_pool = tf.concat(3, pooled_outputs) 67 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 68 | 69 | # Add dropout 70 | with tf.name_scope("dropout"): 71 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.config.dropout_keep_prob) 72 | with tf.name_scope("output"): 73 | W = tf.Variable(tf.truncated_normal([num_filters_total, config.num_classes], stddev=0.1), name="W") 74 | b = tf.Variable(tf.constant(0.1, shape=[config.num_classes]), name="b") 75 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 76 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 77 | 78 | with tf.name_scope("optimize"): 79 | # 损失函数,交叉熵 80 | # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y) 81 | # self.loss = tf.reduce_mean(cross_entropy) 82 | 83 | # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32) 84 | # * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1) 85 | self.loss=tf.losses.mean_squared_error(logits=self.scores,labels=self.input_y) 86 | # 优化器 87 | self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 88 | 89 | with tf.name_scope("accuracy"): 90 | # 准确率 91 | correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32)) 92 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 93 | -------------------------------------------------------------------------------- /tf_TextRNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from config import * 3 | 4 | 5 | class TextRNN(): 6 | def __init__(self, 7 | embedding_matrix=None, 8 | config=TRNNConfig()): 9 | self.config = config 10 | 11 | def lstm_cell(): # lstm核 12 | cell = tf.nn.rnn_cell.BasicLSTMCell(config.hidden_dim, forget_bias=0.0, state_is_tuple=True) 13 | if config.dropout_keep_prob < 1: 14 | cell = tf.nn.rnn_cell.DropoutWrapper( 15 | cell, output_keep_prob=config.dropout_keep_prob 16 | ) 17 | return cell 18 | 19 | def gru_cell(): # gru核 20 | return 
tf.contrib.rnn.GRUCell(self.config.hidden_dim) 21 | 22 | 23 | 24 | # Placeholders for input, output and dropout 25 | self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1') 26 | self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2') 27 | self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y') 28 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') 29 | # 词向量映射 30 | with tf.device('/cpu:0'): 31 | # weW = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim]) 32 | weW = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True) 33 | embedding_inputs1 = tf.nn.embedding_lookup(weW, self.input_x1) 34 | embedding_inputs2 = tf.nn.embedding_lookup(weW, self.input_x2) 35 | print('input_x1', self.input_x1.get_shape()) 36 | with tf.name_scope("rnn"): 37 | # 多层rnn网络 38 | cells = [lstm_cell() for _ in range(self.config.num_layers)] 39 | rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True) 40 | 41 | _outputs1, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs1, dtype=tf.float32) 42 | _outputs2, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs2, dtype=tf.float32) 43 | print("_outputs2", _outputs2.get_shape()) 44 | encode1 = _outputs1[:, -1, :] 45 | encode2 = _outputs2[:, -1, :] # 取最后一个时序输出作为结果 46 | print("encode2", encode2.get_shape()) 47 | last = tf.multiply(encode1, encode2, name="last") 48 | print("multiply",last.get_shape()) 49 | with tf.name_scope("score"): 50 | # 全连接层,后面接dropout以及relu激活 51 | fc = tf.layers.dense(last, self.config.fc_hidden_dim, name='fc1',activation=tf.nn.relu) 52 | # fc = tf.contrib.layers.dropout(fc, self.keep_prob) 53 | # fc = tf.nn.relu(fc) 54 | print('fc',fc.get_shape()) 55 | # 分类器 56 | # lbW = tf.Variable(tf.truncated_normal([self.config.hidden_dim, self.config.num_classes], stddev=0.1), name="lbW") 57 | # b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b") 58 | # print('lbW',lbW.get_shape()) 59 | # print('b',b.get_shape()) 60 | self.scores = tf.layers.dense(fc,1,activation=tf.nn.sigmoid) # Softmax 61 | # self.scores = tf.nn.xw_plus_b(fc, lbW, b, name="scores") 62 | self.y_pred_cls = tf.round(self.scores, name="predictions") 63 | # self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2') 64 | # self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别 65 | with tf.name_scope("optimize"): 66 | # 损失函数,交叉熵 67 | # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y) 68 | # self.loss = tf.reduce_mean(cross_entropy) 69 | 70 | # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32) 71 | # * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1) 72 | self.loss = tf.reduce_mean(-tf.reduce_sum(self.input_y* tf.log(self.scores)*config.class_weight1 73 | +(1-self.input_y)*tf.log(1-self.scores)*config.class_weight0 74 | , reduction_indices=[1])) 75 | 76 | # self.loss = tf.losses.sigmoid_cross_entropy(logits=self.scores, multi_class_labels=self.input_y) 77 | # 优化器 78 | # self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 79 | 80 | with tf.name_scope("accuracy"): 81 | # 准确率 82 | correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32)) 83 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy") 84 | 
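Note on the loss above: the class-weighted cross-entropy in tf_TextRNN.py takes tf.log of the sigmoid output directly, which becomes NaN as soon as a prediction saturates to exactly 0 or 1. Below is a minimal sketch of the same loss with the probabilities clipped by a small epsilon; the epsilon value and the helper names (probs, weighted_ce) are illustrative assumptions, not code from this repository. When the final dense layer emits raw logits instead of sigmoid probabilities, tf.nn.weighted_cross_entropy_with_logits is a roughly equivalent built-in alternative.

eps = 1e-7  # assumed numerical floor; not taken from the original code
probs = tf.clip_by_value(self.scores, eps, 1.0 - eps)
# same class-weighted binary cross-entropy as above, but safe at 0/1 predictions
weighted_ce = -(self.input_y * tf.log(probs) * config.class_weight1
                + (1.0 - self.input_y) * tf.log(1.0 - probs) * config.class_weight0)
self.loss = tf.reduce_mean(tf.reduce_sum(weighted_ce, reduction_indices=[1]))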
-------------------------------------------------------------------------------- /tf_model/tf_train_lstm.py: -------------------------------------------------------------------------------- 1 | input_file = "./input/process.csv" 2 | w2vpath = './data/baike.128.no_truncate.glove.txt' 3 | embedding_matrix_path = './baseline/temp.npy' 4 | kernel_name="bilstm" 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 9 | import tensorflow as tf 10 | import datetime 11 | MAX_TEXT_LENGTH = 50 12 | MAX_FEATURES = 10000 13 | embedding_dims = 128 14 | dr = 0.2 15 | batch_size = 256 16 | 17 | class TRNNConfig(object): 18 | """RNN配置参数""" 19 | 20 | # 模型参数 21 | embedding_dim = 64 # 词向量维度 22 | seq_length = MAX_TEXT_LENGTH # 序列长度 23 | num_classes = 1 # 类别数 24 | vocab_size = MAX_FEATURES # 词汇表达小 25 | 26 | num_layers= 1 # 隐藏层层数 27 | hidden_dim = 256 # 隐藏层神经元 28 | rnn = 'gru' # lstm 或 gru 29 | fc_hidden_dim=64 30 | dropout_keep_prob = 0.8 # dropout保留比例 31 | learning_rate = 1e-3 # 学习率 32 | 33 | batch_size = 256 # 每批训练大小 34 | num_epochs = 50 # 总迭代轮次 35 | early_stop=5 36 | 37 | print_per_batch = 1 # 每多少轮输出一次结果 38 | save_per_batch = 10 # 每多少轮存入tensorboard 39 | 40 | num_checkpoints=5 #Number of checkpoints to store (default: 5) 41 | 42 | class_weight0=1.0 43 | class_weight1=2.3 44 | 45 | class TextRNN(): 46 | def __init__(self, 47 | embedding_matrix=None, 48 | config=TRNNConfig()): 49 | self.config = config 50 | 51 | def lstm_cell(): # lstm核 52 | cell = tf.nn.rnn_cell.BasicLSTMCell(config.hidden_dim, forget_bias=0.0, state_is_tuple=True) 53 | if config.dropout_keep_prob < 1: 54 | cell = tf.nn.rnn_cell.DropoutWrapper( 55 | cell, output_keep_prob=config.dropout_keep_prob 56 | ) 57 | return cell 58 | 59 | def gru_cell(): # gru核 60 | return tf.contrib.rnn.GRUCell(self.config.hidden_dim) 61 | 62 | 63 | 64 | # Placeholders for input, output and dropout 65 | self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1') 66 | self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2') 67 | self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y') 68 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') 69 | # 词向量映射 70 | with tf.device('/cpu:0'): 71 | # weW = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim]) 72 | weW = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True) 73 | embedding_inputs1 = tf.nn.embedding_lookup(weW, self.input_x1) 74 | embedding_inputs2 = tf.nn.embedding_lookup(weW, self.input_x2) 75 | print('input_x1', self.input_x1.get_shape()) 76 | with tf.name_scope("rnn"): 77 | # 多层rnn网络 78 | cells = [lstm_cell() for _ in range(self.config.num_layers)] 79 | rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True) 80 | 81 | _outputs1, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs1, dtype=tf.float32) 82 | _outputs2, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs2, dtype=tf.float32) 83 | print("_outputs2", _outputs2.get_shape()) 84 | encode1 = _outputs1[:, -1, :] 85 | encode2 = _outputs2[:, -1, :] # 取最后一个时序输出作为结果 86 | print("encode2", encode2.get_shape()) 87 | last = tf.multiply(encode1, encode2, name="last") 88 | print("multiply",last.get_shape()) 89 | with tf.name_scope("score"): 90 | # 全连接层,后面接dropout以及relu激活 91 | fc = tf.layers.dense(last, self.config.fc_hidden_dim, name='fc1',activation=tf.nn.relu) 
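            # Note: `last` above is the element-wise product of the final-timestep
            # encodings of question1 and question2; this ReLU projection maps that
            # interaction vector to fc_hidden_dim before the single-unit sigmoid
            # similarity score defined just below.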
92 | # fc = tf.contrib.layers.dropout(fc, self.keep_prob) 93 | # fc = tf.nn.relu(fc) 94 | print('fc',fc.get_shape()) 95 | # 分类器 96 | # lbW = tf.Variable(tf.truncated_normal([self.config.hidden_dim, self.config.num_classes], stddev=0.1), name="lbW") 97 | # b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b") 98 | # print('lbW',lbW.get_shape()) 99 | # print('b',b.get_shape()) 100 | self.scores = tf.layers.dense(fc,1,activation=tf.nn.sigmoid) # Softmax 101 | # self.scores = tf.nn.xw_plus_b(fc, lbW, b, name="scores") 102 | self.y_pred_cls = tf.round(self.scores, name="predictions") 103 | # self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2') 104 | # self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别 105 | with tf.name_scope("optimize"): 106 | # 损失函数,交叉熵 107 | # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y) 108 | # self.loss = tf.reduce_mean(cross_entropy) 109 | 110 | # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32) 111 | # * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1) 112 | self.loss = tf.reduce_mean(-tf.reduce_sum(self.input_y* tf.log(self.scores)*config.class_weight1 113 | +(1-self.input_y)*tf.log(1-self.scores)*config.class_weight0 114 | , reduction_indices=[1])) 115 | 116 | # self.loss = tf.losses.sigmoid_cross_entropy(logits=self.scores, multi_class_labels=self.input_y) 117 | # 优化器 118 | # self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 119 | 120 | with tf.name_scope("accuracy"): 121 | # 准确率 122 | correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32)) 123 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy") 124 | 125 | df = pd.read_csv(input_file, encoding="utf-8") 126 | 127 | question1 = df['question1'].values 128 | question2 = df['question2'].values 129 | y = df['label'].values 130 | y=np.array(y,dtype=np.float32) 131 | embedding_matrix1=np.load(embedding_matrix_path) 132 | def train(x_train1, x_train2, y_train, x_val1, x_val2, y_val, model=TextRNN(embedding_matrix=embedding_matrix1), config=TRNNConfig()): 133 | print("Configuring TensorBoard and Saver...") 134 | # 配置 Tensorboard,重新训练时,请将tensorboard文件夹删除,不然图会覆盖 135 | out_dir = 'textrnn' 136 | if not os.path.exists(out_dir): 137 | os.makedirs(out_dir) 138 | # Define Training procedure 139 | global_step = tf.Variable(0, name="global_step", trainable=False) 140 | # optimizer = tf.train.GradientDescentOptimizer(5e-3) 141 | optimizer = tf.train.AdamOptimizer(1e-3) 142 | train_step_ = optimizer.minimize(model.loss) 143 | grads_and_vars = optimizer.compute_gradients(model.loss) 144 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 145 | 146 | # 创建session 147 | session = tf.Session() 148 | session.run(tf.global_variables_initializer()) 149 | 150 | # Summaries for loss and accuracy 151 | loss_summary = tf.summary.scalar("loss", model.loss) 152 | acc_summary = tf.summary.scalar("accuracy", model.acc) 153 | # Keep track of gradient values and sparsity (optional) 154 | # grad_summaries = [] 155 | # for g, v in grads_and_vars: 156 | # if g is not None: 157 | # grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 158 | # sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 159 | # grad_summaries.append(grad_hist_summary) 160 | # grad_summaries.append(sparsity_summary) 161 | # grad_summaries_merged 
= tf.summary.merge(grad_summaries) 162 | # Train Summaries 163 | train_summary_op = tf.summary.merge([loss_summary, acc_summary]) 164 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 165 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph) 166 | 167 | # Dev summaries 168 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 169 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 170 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph) 171 | 172 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 173 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints123")) 174 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 175 | if not os.path.exists(checkpoint_dir): 176 | os.makedirs(checkpoint_dir) 177 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.num_checkpoints) 178 | 179 | def train_fun(x_batch1, x_batch2, y_batch): 180 | """ 181 | A single training step 182 | """ 183 | feed_dict = { 184 | model.input_x1: x_batch1, 185 | model.input_x2: x_batch2, 186 | model.input_y: y_batch, 187 | model.keep_prob: config.dropout_keep_prob 188 | } 189 | _, step, summaries, loss, accuracy = session.run( 190 | [train_op, global_step, train_summary_op, model.loss, model.acc], 191 | feed_dict) 192 | time_str = datetime.datetime.now().isoformat() 193 | # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 194 | train_summary_writer.add_summary(summaries, step) 195 | 196 | def dev_fun(x_batch1,x_batch2, y_batch, writer=None): 197 | """ 198 | Evaluates model on a dev set 199 | """ 200 | feed_dict = { 201 | model.input_x1: x_batch1, 202 | model.input_x2: x_batch2, 203 | model.input_y: y_batch, 204 | model.keep_prob: 1.0 205 | } 206 | step, summaries, loss, accuracy,predict = session.run( 207 | [global_step, dev_summary_op, model.loss, model.acc,model.scores], 208 | feed_dict) 209 | pred_label = (predict > 0.5).astype(int) 210 | print(np,sum(pred_label),np,sum(predict)) 211 | accuracy1 = accuracy_score(y_batch, pred_label) 212 | recall = recall_score(y_batch, pred_label) 213 | precision = precision_score(y_batch, pred_label) 214 | time_str = datetime.datetime.now().isoformat() 215 | print("dev {}: step {}, loss {:g}, acc {:g},acc1 {:g},recall {:g},precision {:g}".format(time_str, step, loss, accuracy,accuracy1,recall,precision)) 216 | if writer: 217 | writer.add_summary(summaries, step) 218 | return loss, accuracy,predict 219 | 220 | def batch_iter(x1, x2, y, batch_size): 221 | idx = np.arange(len(y)) 222 | batches = [idx[range(batch_size * i, min(len(y), batch_size * (i + 1)))] for i in 223 | range(len(y) // batch_size + 1)] 224 | for i in batches: 225 | yield x1[i], x2[i], y[i] 226 | 227 | best_acc_val = 0 228 | monitor_early_stop=0 229 | for epoch in range(config.num_epochs): 230 | print('Epoch:', epoch + 1) 231 | total_batch = 0 232 | for x_batch1, x_batch2, y_batch in batch_iter(x_train1, x_train2, y_train, config.batch_size): 233 | train_fun(x_batch1, x_batch2, y_batch) 234 | total_batch += 1 235 | if epoch % config.print_per_batch == 0: 236 | # 每多少轮次输出在训练集和验证集上的性能 237 | loss_val, acc_val,predict = dev_fun(x_val1, x_val2, y_val, writer=dev_summary_writer) # todo 238 | 239 | if acc_val > best_acc_val: 240 | # 保存最好结果 241 | best_acc_val = acc_val 242 | path = saver.save(sess=session, save_path=checkpoint_prefix) 243 | print("Saved model checkpoint to {}\n".format(path)) 244 | monitor_early_stop=0 245 | else: 
246 | monitor_early_stop+=1 247 | print("do not save ") 248 | if(monitor_early_stop>=config.early_stop): 249 | break 250 | loss_val, acc_val, predict = dev_fun(x_val1, x_val2, y_val) 251 | return predict 252 | 253 | 254 | from keras.preprocessing.sequence import pad_sequences 255 | from keras.preprocessing.text import Tokenizer 256 | 257 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 258 | tokenizer.fit_on_texts(list(question1) + list(question2)) 259 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 260 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 261 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 262 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 263 | seed = 20180426 264 | cv_folds = 10 265 | y=np.reshape(y,[len(y),1]) 266 | from sklearn.model_selection import StratifiedKFold 267 | 268 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 269 | pred_oob = np.zeros(shape=(len(y), 1)) 270 | # print(pred_oob.shape) 271 | count = 0 272 | for ind_tr, ind_te in skf.split(X_train_q1, y): 273 | x_train_q1 = X_train_q1[ind_tr] 274 | x_train_q2 = X_train_q2[ind_tr] 275 | y_train = y[ind_tr] 276 | 277 | x_val_q1 = X_train_q1[ind_te] 278 | x_val_q2 = X_train_q2[ind_te] 279 | y_val = y[ind_te] 280 | # mymodel = TextRNN() 281 | predict=train(x_train1= x_train_q1, x_train2= x_train_q2,y_train=y_train, 282 | x_val1= x_val_q1, x_val2= x_val_q2, y_val=y_val) 283 | pred_oob[ind_te]=predict 284 | # break 285 | pred_label = (pred_oob > 0.5).astype(int) 286 | recall = recall_score(y, pred_label) 287 | print("recal", recall) 288 | precision = precision_score(y, pred_label) 289 | print("precision", precision) 290 | accuracy = accuracy_score(y, pred_label) 291 | print("accuracy", accuracy) 292 | f1 = f1_score(y, pred_label) 293 | print("f1", f1) -------------------------------------------------------------------------------- /tf_train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from config import * 3 | 4 | from tf_TextRNN import TextRNN 5 | import datetime 6 | df = pd.read_csv(input_file, encoding="utf-8") 7 | 8 | question1 = df['question1'].values 9 | question2 = df['question2'].values 10 | y = df['label'].values 11 | y=np.array(y,dtype=np.float32) 12 | embedding_matrix1=np.load(embedding_matrix_path) 13 | def train(x_train1, x_train2, y_train, x_val1, x_val2, y_val, model=TextRNN(embedding_matrix=embedding_matrix1), config=TRNNConfig()): 14 | print("Configuring TensorBoard and Saver...") 15 | # 配置 Tensorboard,重新训练时,请将tensorboard文件夹删除,不然图会覆盖 16 | out_dir = 'textrnn' 17 | if not os.path.exists(out_dir): 18 | os.makedirs(out_dir) 19 | # Define Training procedure 20 | global_step = tf.Variable(0, name="global_step", trainable=False) 21 | # optimizer = tf.train.GradientDescentOptimizer(5e-3) 22 | optimizer = tf.train.AdamOptimizer(1e-3) 23 | train_step_ = optimizer.minimize(model.loss) 24 | grads_and_vars = optimizer.compute_gradients(model.loss) 25 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 26 | 27 | # 创建session 28 | session = tf.Session() 29 | session.run(tf.global_variables_initializer()) 30 | 31 | # Summaries for loss and accuracy 32 | loss_summary = tf.summary.scalar("loss", model.loss) 33 | acc_summary = tf.summary.scalar("accuracy", model.acc) 34 | # Keep track of gradient values and sparsity (optional) 35 | # grad_summaries = [] 36 | # for g, v in grads_and_vars: 37 | # 
if g is not None: 38 | # grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 39 | # sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 40 | # grad_summaries.append(grad_hist_summary) 41 | # grad_summaries.append(sparsity_summary) 42 | # grad_summaries_merged = tf.summary.merge(grad_summaries) 43 | # Train Summaries 44 | train_summary_op = tf.summary.merge([loss_summary, acc_summary]) 45 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 46 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph) 47 | 48 | # Dev summaries 49 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 50 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 51 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph) 52 | 53 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 54 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints123")) 55 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 56 | if not os.path.exists(checkpoint_dir): 57 | os.makedirs(checkpoint_dir) 58 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.num_checkpoints) 59 | 60 | def train_fun(x_batch1, x_batch2, y_batch): 61 | """ 62 | A single training step 63 | """ 64 | feed_dict = { 65 | model.input_x1: x_batch1, 66 | model.input_x2: x_batch2, 67 | model.input_y: y_batch, 68 | model.keep_prob: config.dropout_keep_prob 69 | } 70 | _, step, summaries, loss, accuracy = session.run( 71 | [train_op, global_step, train_summary_op, model.loss, model.acc], 72 | feed_dict) 73 | time_str = datetime.datetime.now().isoformat() 74 | # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 75 | train_summary_writer.add_summary(summaries, step) 76 | 77 | def dev_fun(x_batch1,x_batch2, y_batch, writer=None): 78 | """ 79 | Evaluates model on a dev set 80 | """ 81 | feed_dict = { 82 | model.input_x1: x_batch1, 83 | model.input_x2: x_batch2, 84 | model.input_y: y_batch, 85 | model.keep_prob: 1.0 86 | } 87 | step, summaries, loss, accuracy,predict = session.run( 88 | [global_step, dev_summary_op, model.loss, model.acc,model.scores], 89 | feed_dict) 90 | pred_label = (predict > 0.5).astype(int) 91 | print(np,sum(pred_label),np,sum(predict)) 92 | accuracy1 = accuracy_score(y_batch, pred_label) 93 | recall = recall_score(y_batch, pred_label) 94 | precision = precision_score(y_batch, pred_label) 95 | time_str = datetime.datetime.now().isoformat() 96 | print("dev {}: step {}, loss {:g}, acc {:g},acc1 {:g},recall {:g},precision {:g}".format(time_str, step, loss, accuracy,accuracy1,recall,precision)) 97 | if writer: 98 | writer.add_summary(summaries, step) 99 | return loss, accuracy,predict 100 | 101 | def batch_iter(x1, x2, y, batch_size): 102 | idx = np.arange(len(y)) 103 | batches = [idx[range(batch_size * i, min(len(y), batch_size * (i + 1)))] for i in 104 | range(len(y) // batch_size + 1)] 105 | for i in batches: 106 | yield x1[i], x2[i], y[i] 107 | 108 | best_acc_val = 0 109 | monitor_early_stop=0 110 | for epoch in range(config.num_epochs): 111 | print('Epoch:', epoch + 1) 112 | total_batch = 0 113 | for x_batch1, x_batch2, y_batch in batch_iter(x_train1, x_train2, y_train, config.batch_size): 114 | train_fun(x_batch1, x_batch2, y_batch) 115 | total_batch += 1 116 | if epoch % config.print_per_batch == 0: 117 | # 每多少轮次输出在训练集和验证集上的性能 118 | loss_val, acc_val,predict = dev_fun(x_val1, x_val2, y_val, 
writer=dev_summary_writer) # todo 119 | 120 | if acc_val > best_acc_val: 121 | # 保存最好结果 122 | best_acc_val = acc_val 123 | path = saver.save(sess=session, save_path=checkpoint_prefix) 124 | print("Saved model checkpoint to {}\n".format(path)) 125 | monitor_early_stop=0 126 | else: 127 | monitor_early_stop+=1 128 | print("do not save ") 129 | if(monitor_early_stop>=config.early_stop): 130 | break 131 | loss_val, acc_val, predict = dev_fun(x_val1, x_val2, y_val) 132 | return predict 133 | 134 | 135 | from keras.preprocessing.sequence import pad_sequences 136 | from keras.preprocessing.text import Tokenizer 137 | 138 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 139 | tokenizer.fit_on_texts(list(question1) + list(question2)) 140 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 141 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 142 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 143 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 144 | seed = 20180426 145 | cv_folds = 10 146 | y=np.reshape(y,[len(y),1]) 147 | from sklearn.model_selection import StratifiedKFold 148 | 149 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 150 | pred_oob = np.zeros(shape=(len(y), 1)) 151 | # print(pred_oob.shape) 152 | count = 0 153 | for ind_tr, ind_te in skf.split(X_train_q1, y): 154 | x_train_q1 = X_train_q1[ind_tr] 155 | x_train_q2 = X_train_q2[ind_tr] 156 | y_train = y[ind_tr] 157 | 158 | x_val_q1 = X_train_q1[ind_te] 159 | x_val_q2 = X_train_q2[ind_te] 160 | y_val = y[ind_te] 161 | # mymodel = TextRNN() 162 | predict=train(x_train1= x_train_q1, x_train2= x_train_q2,y_train=y_train, 163 | x_val1= x_val_q1, x_val2= x_val_q2, y_val=y_val) 164 | pred_oob[ind_te]=predict 165 | break 166 | pred_label = (pred_oob > 0.5).astype(int) 167 | recall = recall_score(y, pred_label) 168 | print("recal", recall) 169 | precision = precision_score(y, pred_label) 170 | print("precision", precision) 171 | accuracy = accuracy_score(y, pred_label) 172 | print("accuracy", accuracy) 173 | f1 = f1_score(y, pred_label) 174 | print("f1", f1) -------------------------------------------------------------------------------- /upload/keras_main1.py: -------------------------------------------------------------------------------- 1 | # /usr/bin/env python 2 | # coding=utf-8 3 | input_file = "./train.txt" 4 | embedding_matrix_path = './temp_no_truncate.npy' 5 | kernel_name = "bilstm" 6 | import numpy as np 7 | import keras 8 | import sys 9 | from keras.callbacks import EarlyStopping, ModelCheckpoint 10 | # from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score 11 | import jieba 12 | import codecs 13 | 14 | jieba.add_word('花呗') 15 | jieba.add_word('借呗') 16 | jieba.add_word('余额宝') 17 | 18 | MAX_TEXT_LENGTH = 50 19 | MAX_FEATURES = 10000 20 | embedding_dims = 128 21 | dr = 0.01 22 | 23 | 24 | def pandas_process(input_train): 25 | q1 = [] 26 | q2 = [] 27 | vlabel = [] 28 | df = {} 29 | fin = codecs.open(input_train, 'r', encoding='utf-8') 30 | fin.readline() 31 | for line in fin: 32 | l, sen1, sen2 = line.strip().split('\t') 33 | q1.append(sen1) 34 | q2.append(sen2) 35 | vlabel.append(int(l)) 36 | fin.close() 37 | df["question1"] = q1 38 | df["question2"] = q2 39 | df["label"] = vlabel 40 | return df 41 | 42 | 43 | def seg(text): 44 | seg_list = jieba.cut(text) 45 | return " ".join(seg_list) 46 | 47 | 48 | def get_model(): 49 | input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 
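    # Shared-weight (Siamese-style) encoder: the Embedding and Bidirectional GRU
    # layers defined below are applied to both question inputs, and the two
    # sentence vectors are merged by element-wise multiplication before the
    # dense sigmoid scorer.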
50 | input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,)) 51 | words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims, 52 | # weights=[embedding_matrix], 53 | input_length=MAX_TEXT_LENGTH, 54 | trainable=True) 55 | seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr)) 56 | seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor)) 57 | merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)]) 58 | merge_layer = keras.layers.Dropout(dr)(merge_layer) 59 | dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer) 60 | ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer) 61 | model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer) 62 | # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"]) 63 | # model.summary() 64 | return model 65 | 66 | 67 | if __name__ == '__main__': 68 | inpath = sys.argv[1] 69 | outputpath = sys.argv[2] 70 | # import pandas as pd 71 | # input_file = "../input/process.csv" 72 | # df=pd.read_csv(input_file) 73 | # question1 = df['question1'].values 74 | # question2 = df['question2'].values 75 | # y = df['label'].values 76 | df = pandas_process(input_file) 77 | question1 = df['question1'] 78 | question2 = df['question2'] 79 | y = df['label'] 80 | from keras.preprocessing.sequence import pad_sequences 81 | from keras.preprocessing.text import Tokenizer 82 | 83 | # np.savetxt('X_train_q1.out', X_train_q1, delimiter=',') 84 | # np.savetxt('X_train_q2.out', X_train_q2, delimiter=',') 85 | # inpath="test1.txt" 86 | test_data1 = [] 87 | test_data2 = [] 88 | linenos = [] 89 | fin = codecs.open(inpath, 'r', encoding='utf-8') 90 | for line in fin: 91 | lineno, sen1, sen2 = line.strip().split('\t') 92 | sen1 = seg(sen1) 93 | sen2 = seg(sen2) 94 | test_data1.append(sen1) 95 | test_data2.append(sen2) 96 | linenos.append(lineno) 97 | fin.close() 98 | 99 | tokenizer = Tokenizer(num_words=MAX_FEATURES) 100 | tokenizer.fit_on_texts(list(question1) + list(question2)) 101 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1) 102 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2) 103 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH) 104 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH) 105 | list_tokenized_question11 = tokenizer.texts_to_sequences(test_data1) 106 | list_tokenized_question22 = tokenizer.texts_to_sequences(test_data2) 107 | x_val_q1 = pad_sequences(list_tokenized_question11, maxlen=MAX_TEXT_LENGTH) 108 | x_val_q2 = pad_sequences(list_tokenized_question22, maxlen=MAX_TEXT_LENGTH) 109 | 110 | # for i in range(len(x_val_q1)): 111 | # t=np.array_equal(X_train_q1[i], x_val_q1[i]) 112 | # if not t: 113 | # print X_train_q1[i]," | ",x_val_q1[i] 114 | # print i,question1[i]," | ",test_data1[i] 115 | # t=np.array_equal(X_train_q2[i], x_val_q2[i]) 116 | # if not t: 117 | # print X_train_q2[i]," | ", x_val_q2[i] 118 | # print i,question2[i]," | ",test_data2[i] 119 | 120 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index)) 121 | # print("nb_words", nb_words) 122 | # embedding_matrix1 = np.load(embedding_matrix_path) 123 | seed = 20180426 124 | cv_folds = 10 125 | # from sklearn.model_selection import StratifiedKFold 126 | 127 | # skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False) 128 | y = y[0:len(x_val_q1)] 129 | # print x_val_q1.shape 130 | pred_oob = 
np.zeros(shape=(len(x_val_q1), 1)) 131 | # print pred_oob.shape 132 | count = 0 133 | # print "start to predict." 134 | model = get_model() 135 | for index in range(cv_folds): 136 | bst_model_path = kernel_name + '_weight_%d.h5' % index 137 | model.load_weights(bst_model_path) 138 | y_predict = model.predict([x_val_q1, x_val_q2], batch_size=1024, verbose=0) 139 | pred_oob += y_predict 140 | # print "*", 141 | # break 142 | # try: 143 | # y_predict = (y_predict > 0.5).astype(int) 144 | # recall = recall_score(y, y_predict) 145 | # print(count, "recall", recall) 146 | # precision = precision_score(y, y_predict) 147 | # print(count, "precision", precision) 148 | # accuracy = accuracy_score(y, y_predict) 149 | # print(count, "accuracy ", accuracy) 150 | # f1 = f1_score(y, y_predict) 151 | # print(count, "f1", f1) 152 | # count += 1 153 | # except: 154 | # pass 155 | # print "predict done. Saving output to %s"%outputpath 156 | pred_oob /= cv_folds 157 | pred_oob1 = (pred_oob > 0.5).astype(int) 158 | fout = codecs.open(outputpath, 'w', encoding='utf-8') 159 | for index, la in enumerate(pred_oob1): 160 | lineno = linenos[index] 161 | fout.write(lineno + '\t%d\n' % int(la)) 162 | # print "All is done." 163 | # try: 164 | # recall = recall_score(y, pred_oob1) 165 | # print("recall", recall) 166 | # precision = precision_score(y, pred_oob1) 167 | # print("precision", precision) 168 | # accuracy = accuracy_score(y, pred_oob1) 169 | # print("accuracy", accuracy) 170 | # f1 = f1_score(y, pred_oob1) 171 | # print("f1", f1) 172 | # except: 173 | # pass 174 | -------------------------------------------------------------------------------- /upload/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python keras_main1.py $1 $2 4 | --------------------------------------------------------------------------------
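Usage note: run.sh simply forwards an input path and an output path to keras_main1.py, which reads tab-separated lines of the form lineno<TAB>sentence1<TAB>sentence2, averages the sigmoid scores of the per-fold weight files, and writes lineno<TAB>label with label = 1 when the averaged score exceeds 0.5. The sketch below is illustrative only; the file names and sample sentences are made up and assume the fold weight files are already present next to the script.

# Illustrative usage sketch (not part of the repository); file names are assumptions.
import codecs
import subprocess

with codecs.open("sample_test.txt", "w", encoding="utf-8") as f:
    f.write(u"1\t花呗可以分期吗\t花呗能不能分期付款\n")
    f.write(u"2\t怎么开通借呗\t余额宝收益怎么算\n")

# Equivalent to: bash run.sh sample_test.txt sample_out.txt
subprocess.call(["python", "keras_main1.py", "sample_test.txt", "sample_out.txt"])

# sample_out.txt then contains one "lineno\tlabel" line per input pair,
# with label 1 when the fold-averaged similarity score is above 0.5.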