├── baseline
    ├── eda.py
    ├── gensim_d2v.py
    ├── ioUtil.py
    ├── keras_bgru_cnn_sim.py
    ├── keras_bilstm_sim.py
    ├── keras_cnn_sim.py
    ├── keras_lstm_sim.py
    ├── keras_seq2seq_sim.py
    ├── seg.py
    └── tf_bilstm_sim.py
├── config.py
├── feature_engineering.py
├── requirements.txt
├── tf_TextCNN.py
├── tf_TextRNN.py
├── tf_model
    └── tf_train_lstm.py
├── tf_train.py
└── upload
    ├── keras_main1.py
    ├── run.sh
    └── train.txt


/baseline/eda.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | input_file="../input/process.csv"
 5 | df = pd.read_csv(input_file,encoding="utf-8")
 6 | print('Total number of question pairs for training: {}'.format(len(df)))
 7 | 
 8 | qids = pd.Series(df['question1'].tolist() + df['question2'].tolist())
 9 | print('Total number of questions in the training data: {}'.format(len(
10 |     np.unique(qids))))
11 | 
12 | print('Number of questions that appear multiple times: {}'.format(np.sum(qids.value_counts() > 1)))
13 | 
14 | print('equal intent pairs: {}%'.format(round(df['label'].mean()*100, 2)))
15 | 
16 | plt.figure(figsize=(12, 5))
17 | plt.hist(qids.value_counts(), bins=50)
18 | plt.yscale('log', nonposy='clip')
19 | plt.title('Log-Histogram of question appearance counts')
20 | plt.xlabel('Number of occurences of question')
21 | plt.ylabel('Number of questions')
22 | plt.show()


--------------------------------------------------------------------------------
/baseline/gensim_d2v.py:
--------------------------------------------------------------------------------
 1 | #encoding=utf-8
 2 | 
 3 | import logging
 4 | import sys
 5 | import multiprocessing
 6 | import numpy as np
 7 | 
 8 | from gensim.models import Word2Vec
 9 | from gensim.models.word2vec import LineSentence
10 | from gensim.models import doc2vec
11 | 
12 | embedding_dims=128
13 | if __name__ == '__main__':
14 | 
15 |     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
16 | 
17 | 
18 | r = np.random.randint(100000,999999,size = (1,))
19 | print (r[0])
20 | sents = doc2vec.TaggedLineDocument("./fc.dat")
21 | print (sents)
22 | model = doc2vec.Doc2Vec(sents, size = embedding_dims, window = 9, min_count=1, iter=45, hs=0, negative=11, seed=r[0])
23 | model.wv.save_word2vec_format("w2v.txt", binary=False)
24 | # model.save("d2v.model")
25 | 
26 | 


--------------------------------------------------------------------------------
/baseline/ioUtil.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pandas as pd
 3 | import jieba
 4 | input_file='../input/atec_nlp_sim_train.csv'
 5 | ret=[]
 6 | jieba.add_word('花呗')
 7 | jieba.add_word('借呗')
 8 | jieba.add_word('余额宝')
 9 | jieba.add_word('***')
10 | 
11 | def seg(text):
12 |     seg_list = jieba.cut(text)
13 |     return " ".join(seg_list)
14 | 
15 | with open(input_file,encoding="utf-8") as fp:
16 |     for line in fp:
17 |         q={}
18 |         lines=line.split("\t")
19 |         if(len(lines)==3):
20 |             q['question1']=seg(lines[0].strip())
21 |             q['question2']=seg(lines[1].strip())
22 |             q['label']=lines[2].strip()
23 |         else:
24 |             print(line)
25 |         ret.append(q)
26 |     df = pd.DataFrame(ret)
27 |     df.to_csv("../input/process.csv",encoding="utf-8",index=False)
28 | 
29 | 


--------------------------------------------------------------------------------
/baseline/keras_bgru_cnn_sim.py:
--------------------------------------------------------------------------------
  1 | input_file = "../input/process.csv"
  2 | w2vpath = '../data/baike.128.no_truncate.glove.txt'
  3 | embedding_matrix_path = './temp_no_truncate.npy'
  4 | kernel_name = "bilstm"
  5 | import pandas as pd
  6 | import numpy as np
  7 | import keras
  8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
  9 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 10 | 
 11 | MAX_TEXT_LENGTH = 50
 12 | MAX_FEATURES = 10000
 13 | embedding_dims = 128
 14 | dr = 0.2
 15 | 
 16 | from keras import backend as K
 17 | 
 18 | 
 19 | def f1_score_metrics(y_true, y_pred):
 20 |     def recall(y_true, y_pred):
 21 |         """Recall metric.
 22 | 
 23 |         Only computes a batch-wise average of recall.
 24 | 
 25 |         Computes the recall, a metric for multi-label classification of
 26 |         how many relevant items are selected.
 27 |         """
 28 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 29 |         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
 30 |         recall = true_positives / (possible_positives + K.epsilon())
 31 |         return recall
 32 | 
 33 |     def precision(y_true, y_pred):
 34 |         """Precision metric.
 35 | 
 36 |         Only computes a batch-wise average of precision.
 37 | 
 38 |         Computes the precision, a metric for multi-label classification of
 39 |         how many selected items are relevant.
 40 |         """
 41 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 42 |         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 43 |         precision = true_positives / (predicted_positives + K.epsilon())
 44 |         return precision
 45 | 
 46 |     precision = precision(y_true, y_pred)
 47 |     recall = recall(y_true, y_pred)
 48 |     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
 49 | 
 50 | 
 51 | def get_model(embedding_matrix, nb_words):
 52 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 53 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 54 |     words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 55 |                                                    weights=[embedding_matrix],
 56 |                                                    input_length=MAX_TEXT_LENGTH,
 57 |                                                    trainable=True)
 58 |     seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr, return_sequences=True))
 59 |     cnn1d_layer=keras.layers.Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")
 60 | 
 61 |     x1=words_embedding_layer(input1_tensor)
 62 |     x1=seq_embedding_layer(x1)
 63 |     x1=cnn1d_layer(x1)
 64 | 
 65 |     x2 = words_embedding_layer(input2_tensor)
 66 |     x2 = seq_embedding_layer(x2)
 67 |     x2=cnn1d_layer(x2)
 68 |     # pooled_gru_cnn= lambda tensor: cnn1d_layer(seq_embedding_layer(words_embedding_layer(tensor)))
 69 |     avg_pool = keras.layers.GlobalAveragePooling1D()
 70 |     max_pool = keras.layers.GlobalMaxPooling1D()
 71 |     x1=keras.layers.concatenate([avg_pool(x1),max_pool(x1)])
 72 |     x2=keras.layers.concatenate([avg_pool(x2),max_pool(x2)])
 73 |     # seq_embedding = lambda tensor: [avg_pool(pooled_gru_cnn(tensor)),max_pool(pooled_gru_cnn(tensor))]
 74 |     merge_layer = keras.layers.multiply([x1, x2])
 75 |     merge_layer = keras.layers.Dropout(dr)(merge_layer)
 76 |     dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer)
 77 |     ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 78 |     model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer)
 79 |     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics])
 80 |     model.summary()
 81 |     return model
 82 | 
 83 | 
 84 | from tqdm import tqdm
 85 | import mmap
 86 | import os
 87 | 
 88 | 
 89 | def get_num_lines(file_path):
 90 |     fp = open(file_path, "r+")
 91 |     buf = mmap.mmap(fp.fileno(), 0)
 92 |     lines = 0
 93 |     while buf.readline():
 94 |         lines += 1
 95 |     return lines
 96 | 
 97 | 
 98 | def get_embedding_matrix(word_index, Emed_path, Embed_npy):
 99 |     if (os.path.exists(Embed_npy)):
100 |         return np.load(Embed_npy)
101 |     print('Indexing word vectors')
102 |     embeddings_index = {}
103 |     file_line = get_num_lines(Emed_path)
104 |     print('lines ', file_line)
105 |     with open(Emed_path, encoding='utf-8') as f:
106 |         for line in tqdm(f, total=file_line):
107 |             values = line.split()
108 |             if (len(values) < embedding_dims):
109 |                 print(values)
110 |                 continue
111 |             word = ' '.join(values[:-embedding_dims])
112 |             coefs = np.asarray(values[-embedding_dims:], dtype='float32')
113 |             embeddings_index[word] = coefs
114 |     f.close()
115 | 
116 |     print('Total %s word vectors.' % len(embeddings_index))
117 |     print('Preparing embedding matrix')
118 |     nb_words = MAX_FEATURES  # min(MAX_FEATURES, len(word_index))
119 |     all_embs = np.stack(embeddings_index.values())
120 |     print(all_embs.shape)
121 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
122 |     embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims))
123 | 
124 |     # embedding_matrix = np.zeros((nb_words, embedding_dims))
125 |     count = 0
126 |     for word, i in tqdm(word_index.items()):
127 |         if i >= MAX_FEATURES:
128 |             continue
129 |         embedding_vector = embeddings_index.get(word)
130 |         if embedding_vector is not None:
131 |             # words not found in embedding index will be all-zeros.
132 |             embedding_matrix[i] = embedding_vector
133 |             count += 1
134 |     np.save(Embed_npy, embedding_matrix)
135 |     print('Null word embeddings: %d' % (nb_words - count))
136 |     print('not Null word embeddings: %d' % count)
137 |     print('embedding_matrix shape', embedding_matrix.shape)
138 |     # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
139 |     return embedding_matrix
140 | 
141 | 
142 | df = pd.read_csv(input_file, encoding="utf-8")
143 | 
144 | question1 = df['question1'].values
145 | question2 = df['question2'].values
146 | y = df['label'].values
147 | from keras.preprocessing.sequence import pad_sequences
148 | from keras.preprocessing.text import Tokenizer
149 | 
150 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
151 | tokenizer.fit_on_texts(list(question1) + list(question2))
152 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
153 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
154 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
155 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
156 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
157 | print("nb_words", nb_words)
158 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path)
159 | seed = 20180426
160 | cv_folds = 10
161 | from sklearn.model_selection import StratifiedKFold
162 | 
163 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
164 | pred_oob = np.zeros(shape=(len(y), 1))
165 | # print(pred_oob.shape)
166 | count = 0
167 | for ind_tr, ind_te in skf.split(X_train_q1, y):
168 |     x_train_q1 = X_train_q1[ind_tr]
169 |     x_train_q2 = X_train_q2[ind_tr]
170 |     x_val_q1 = X_train_q1[ind_te]
171 |     x_val_q2 = X_train_q2[ind_te]
172 |     y_train = y[ind_tr]
173 |     y_val = y[ind_te]
174 |     model = get_model(embedding_matrix1, nb_words)
175 |     early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)
176 |     bst_model_path = kernel_name + '_weight_%d.h5' % count
177 |     model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
178 |                                        save_best_only=True, verbose=1, save_weights_only=True)
179 |     hist = model.fit([x_train_q1, x_train_q2], y_train,
180 |                      validation_data=([x_val_q1, x_val_q2], y_val),
181 |                      epochs=20, batch_size=256, shuffle=True,
182 |                      class_weight={0: 1.2233, 1: 0.4472},
183 |                      callbacks=[early_stopping, model_checkpoint])
184 |     model.load_weights(bst_model_path)
185 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
186 |     pred_oob[ind_te] = y_predict
187 |     y_predict = (y_predict > 0.5).astype(int)
188 |     recall = recall_score(y_val, y_predict)
189 |     print(count, "recal", recall)
190 |     precision = precision_score(y_val, y_predict)
191 |     print(count, "precision", precision)
192 |     accuracy = accuracy_score(y_val, y_predict)
193 |     print(count, "accuracy ", accuracy)
194 |     f1 = f1_score(y_val, y_predict)
195 |     print(count, "f1", f1)
196 |     count += 1
197 | pred_oob1 = (pred_oob > 0.5).astype(int)
198 | recall = recall_score(y, pred_oob1)
199 | print("recal", recall)
200 | precision = precision_score(y, pred_oob1)
201 | print("precision", precision)
202 | accuracy = accuracy_score(y, pred_oob1)
203 | print("accuracy", accuracy)
204 | f1 = f1_score(y, pred_oob1)
205 | print("f1", f1)
206 | 


--------------------------------------------------------------------------------
/baseline/keras_bilstm_sim.py:
--------------------------------------------------------------------------------
  1 | input_file = "../input/process.csv"
  2 | w2vpath = '../data/baike.128.no_truncate.glove.txt'
  3 | embedding_matrix_path = './temp_no_truncate.npy'
  4 | kernel_name = "bilstm"
  5 | word_index_path="worddict.pkl"
  6 | import pandas as pd
  7 | import numpy as np
  8 | import keras
  9 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
 10 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 11 | 
 12 | MAX_TEXT_LENGTH = 50
 13 | MAX_FEATURES = 10000
 14 | embedding_dims = 128
 15 | dr = 0.2
 16 | 
 17 | from keras import backend as K
 18 | 
 19 | 
 20 | def f1_score_metrics(y_true, y_pred):
 21 |     def recall(y_true, y_pred):
 22 |         """Recall metric.
 23 | 
 24 |         Only computes a batch-wise average of recall.
 25 | 
 26 |         Computes the recall, a metric for multi-label classification of
 27 |         how many relevant items are selected.
 28 |         """
 29 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 30 |         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
 31 |         recall = true_positives / (possible_positives + K.epsilon())
 32 |         return recall
 33 | 
 34 |     def precision(y_true, y_pred):
 35 |         """Precision metric.
 36 | 
 37 |         Only computes a batch-wise average of precision.
 38 | 
 39 |         Computes the precision, a metric for multi-label classification of
 40 |         how many selected items are relevant.
 41 |         """
 42 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 43 |         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 44 |         precision = true_positives / (predicted_positives + K.epsilon())
 45 |         return precision
 46 | 
 47 |     precision = precision(y_true, y_pred)
 48 |     recall = recall(y_true, y_pred)
 49 |     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
 50 | 
 51 | 
 52 | class F1ScoreCallback(Callback):
 53 |     def __init__(self, predict_batch_size=1024, include_on_batch=False):
 54 |         super(F1ScoreCallback, self).__init__()
 55 |         self.predict_batch_size = predict_batch_size
 56 |         self.include_on_batch = include_on_batch
 57 | 
 58 |     def on_batch_begin(self, batch, logs={}):
 59 |         pass
 60 | 
 61 |     def on_train_begin(self, logs={}):
 62 |         pass
 63 | 
 64 |     def on_batch_end(self, batch, logs={}):
 65 |         pass
 66 | 
 67 |     def on_epoch_end(self, epoch, logs={}):
 68 |         if (self.validation_data):
 69 |             y_predict = self.model.predict([self.validation_data[0], self.validation_data[1]],
 70 |                                            batch_size=self.predict_batch_size)
 71 |             y_predict = (y_predict > 0.5).astype(int)
 72 |             accuracy=accuracy_score(self.validation_data[2], y_predict)
 73 |             precision=precision_score(self.validation_data[2], y_predict)
 74 |             recall = recall_score(self.validation_data[2], y_predict)
 75 |             f1 = f1_score(self.validation_data[2], y_predict)
 76 |             print("precision %.3f recall %.3f f1_score %.3f accuracy %.3f "% (precision, recall,f1,accuracy))
 77 | 
 78 | 
 79 | def get_model(embedding_matrix, nb_words):
 80 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 81 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 82 |     words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 83 |                                                    weights=[embedding_matrix],
 84 |                                                    input_length=MAX_TEXT_LENGTH,
 85 |                                                    trainable=True)
 86 |     seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr))
 87 |     seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))
 88 |     merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])
 89 |     merge_layer = keras.layers.Dropout(dr)(merge_layer)
 90 |     dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer)
 91 |     ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 92 |     model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer)
 93 |     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics])
 94 |     model.summary()
 95 |     return model
 96 | 
 97 | 
 98 | from tqdm import tqdm
 99 | import mmap
100 | import os
101 | 
102 | 
103 | def get_num_lines(file_path):
104 |     fp = open(file_path, "r+")
105 |     buf = mmap.mmap(fp.fileno(), 0)
106 |     lines = 0
107 |     while buf.readline():
108 |         lines += 1
109 |     return lines
110 | 
111 | 
112 | def get_embedding_matrix(word_index, Emed_path, Embed_npy):
113 |     if (os.path.exists(Embed_npy)):
114 |         return np.load(Embed_npy)
115 |     print('Indexing word vectors')
116 |     embeddings_index = {}
117 |     file_line = get_num_lines(Emed_path)
118 |     print('lines ', file_line)
119 |     with open(Emed_path, encoding='utf-8') as f:
120 |         for line in tqdm(f, total=file_line):
121 |             values = line.split()
122 |             if (len(values) < embedding_dims):
123 |                 print(values)
124 |                 continue
125 |             word = ' '.join(values[:-embedding_dims])
126 |             coefs = np.asarray(values[-embedding_dims:], dtype='float32')
127 |             embeddings_index[word] = coefs
128 |     f.close()
129 | 
130 |     print('Total %s word vectors.' % len(embeddings_index))
131 |     print('Preparing embedding matrix')
132 |     nb_words = MAX_FEATURES  # min(MAX_FEATURES, len(word_index))
133 |     all_embs = np.stack(embeddings_index.values())
134 |     print(all_embs.shape)
135 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
136 |     embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims))
137 | 
138 |     # embedding_matrix = np.zeros((nb_words, embedding_dims))
139 |     count = 0
140 |     for word, i in tqdm(word_index.items()):
141 |         if i >= MAX_FEATURES:
142 |             continue
143 |         embedding_vector = embeddings_index.get(word)
144 |         if embedding_vector is not None:
145 |             # words not found in embedding index will be all-zeros.
146 |             embedding_matrix[i] = embedding_vector
147 |             count += 1
148 |     np.save(Embed_npy, embedding_matrix)
149 |     print('Null word embeddings: %d' % (nb_words - count))
150 |     print('not Null word embeddings: %d' % count)
151 |     print('embedding_matrix shape', embedding_matrix.shape)
152 |     # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
153 |     return embedding_matrix
154 | 
155 | 
156 | df = pd.read_csv(input_file, encoding="utf-8")
157 | 
158 | question1 = df['question1'].values
159 | question2 = df['question2'].values
160 | y = df['label'].values
161 | from keras.preprocessing.sequence import pad_sequences
162 | from keras.preprocessing.text import Tokenizer
163 | 
164 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
165 | tokenizer.fit_on_texts(list(question1) + list(question2))
166 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
167 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
168 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
169 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
170 | 
171 | inpath="test1.txt"
172 | test_data1 = []
173 | test_data2 = []
174 | linenos=[]
175 | import jieba
176 | jieba.add_word('花呗')
177 | jieba.add_word('借呗')
178 | jieba.add_word('余额宝')
179 | 
180 | def seg(text):
181 |     seg_list = jieba.cut(text)
182 |     return " ".join(seg_list)
183 | 
184 | with open(inpath, 'r') as fin:
185 |     for line in fin:
186 |         lineno, sen1, sen2 = line.strip().split('\t')
187 |         test_data1.append(seg(sen1))
188 |         test_data2.append(seg(sen2))
189 |         linenos.append(lineno)
190 | 
191 | list_tokenized_question1 = tokenizer.texts_to_sequences(test_data1)
192 | list_tokenized_question2 = tokenizer.texts_to_sequences(test_data2)
193 | x_val_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
194 | x_val_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
195 | 
196 | # import pickle
197 | # with open(word_index_path, 'wb') as fw:
198 | #     pickle.dumps(tokenizer,fw)
199 | 
200 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
201 | print("nb_words", nb_words)
202 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path)
203 | seed = 20180426
204 | cv_folds = 10
205 | from sklearn.model_selection import StratifiedKFold
206 | 
207 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
208 | pred_oob = np.zeros(shape=(len(y), 1))
209 | # print(pred_oob.shape)
210 | count = 0
211 | for ind_tr, ind_te in skf.split(X_train_q1, y):
212 |     x_train_q1 = X_train_q1[ind_tr]
213 |     x_train_q2 = X_train_q2[ind_tr]
214 |     x_val_q1 = X_train_q1[ind_te]
215 |     x_val_q2 = X_train_q2[ind_te]
216 |     y_train = y[ind_tr]
217 |     y_val = y[ind_te]
218 |     model = get_model(embedding_matrix1, nb_words)
219 |     early_stopping = EarlyStopping(monitor='val_f1_score_metrics', patience=5, mode='max', verbose=1)
220 |     bst_model_path = kernel_name + '_weight_%d.h5' % count
221 |     model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_f1_score_metrics', mode='max',
222 |                                        save_best_only=True, verbose=1, save_weights_only=True)
223 |     hist = model.fit([x_train_q1, x_train_q2], y_train,
224 |                      validation_data=([x_val_q1, x_val_q2], y_val),
225 |                      epochs=6, batch_size=32, shuffle=True,
226 |                      class_weight={0: 1.2233, 1: 0.4472},
227 |                      callbacks=[early_stopping, model_checkpoint,F1ScoreCallback()])
228 |     model.load_weights(bst_model_path)
229 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
230 |     # y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
231 |     pred_oob[ind_te] = y_predict
232 |     # pred_oob  += y_predict
233 |     y_predict = (y_predict > 0.5).astype(int)
234 |     recall = recall_score(y, y_predict)
235 |     print(count, "recal", recall)
236 |     precision = precision_score(y, y_predict)
237 |     print(count, "precision", precision)
238 |     accuracy = accuracy_score(y, y_predict)
239 |     print(count, "accuracy ", accuracy)
240 |     f1 = f1_score(y, y_predict)
241 |     print(count, "f1", f1)
242 |     count += 1
243 | pred_oob/=cv_folds
244 | pred_oob1 = (pred_oob > 0.5).astype(int)
245 | recall = recall_score(y, pred_oob1)
246 | print("recal", recall)
247 | precision = precision_score(y, pred_oob1)
248 | print("precision", precision)
249 | accuracy = accuracy_score(y, pred_oob1)
250 | print("accuracy", accuracy)
251 | f1 = f1_score(y, pred_oob1)
252 | print("f1", f1)
253 | 


--------------------------------------------------------------------------------
/baseline/keras_cnn_sim.py:
--------------------------------------------------------------------------------
  1 | input_file = "../input/process.csv"
  2 | w2vpath = '../data/baike.128.no_truncate.glove.txt'
  3 | embedding_matrix_path = './temp_no_truncate.npy'
  4 | kernel_name = "CNN_mutilwin"
  5 | import pandas as pd
  6 | import numpy as np
  7 | import keras
  8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
  9 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 10 | 
 11 | MAX_TEXT_LENGTH = 50
 12 | MAX_FEATURES = 10000
 13 | embedding_dims = 128
 14 | dr = 0.2
 15 | cnn_filters = 64
 16 | kernel_sizes = [2, 3, 8, 9]
 17 | from keras import backend as K
 18 | 
 19 | 
 20 | def f1_score_metrics(y_true, y_pred):
 21 |     def recall(y_true, y_pred):
 22 |         """Recall metric.
 23 | 
 24 |         Only computes a batch-wise average of recall.
 25 | 
 26 |         Computes the recall, a metric for multi-label classification of
 27 |         how many relevant items are selected.
 28 |         """
 29 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 30 |         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
 31 |         recall = true_positives / (possible_positives + K.epsilon())
 32 |         return recall
 33 | 
 34 |     def precision(y_true, y_pred):
 35 |         """Precision metric.
 36 | 
 37 |         Only computes a batch-wise average of precision.
 38 | 
 39 |         Computes the precision, a metric for multi-label classification of
 40 |         how many selected items are relevant.
 41 |         """
 42 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 43 |         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 44 |         precision = true_positives / (predicted_positives + K.epsilon())
 45 |         return precision
 46 | 
 47 |     precision = precision(y_true, y_pred)
 48 |     recall = recall(y_true, y_pred)
 49 |     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
 50 | 
 51 | 
 52 | def get_model(embedding_matrix, nb_words):
 53 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 54 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 55 |     words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 56 |                                                    weights=[embedding_matrix],
 57 |                                                    input_length=MAX_TEXT_LENGTH,
 58 |                                                    trainable=True)
 59 |     embedded_sequences1=words_embedding_layer(input1_tensor)
 60 |     x1=[]
 61 |     for win in kernel_sizes:
 62 |         xi =  keras.layers.Conv1D(filters=cnn_filters,
 63 |                     filter_length=win,
 64 |                     padding='same',
 65 |                     activation='relu'
 66 |                     )(embedded_sequences1)
 67 |         x1.append(xi)
 68 | 
 69 | 
 70 |     x1 = keras.layers.add(x1)
 71 |     x1 = keras.layers.GlobalMaxPooling1D()(x1)
 72 | 
 73 |     embedded_sequences2=words_embedding_layer(input2_tensor)
 74 |     x2 = []
 75 |     for win in kernel_sizes:
 76 |         xi = keras.layers.Conv1D(filters=cnn_filters,
 77 |                                  filter_length=win,
 78 |                                  padding='same',
 79 |                                  activation='relu'
 80 |                                  )(embedded_sequences2)
 81 |         x2.append(xi)
 82 | 
 83 |     x2 = keras.layers.add(x2)
 84 |     x2 = keras.layers.GlobalMaxPooling1D()(x2)
 85 |     merge_layer = keras.layers.multiply([x1, x2])
 86 |     merge_layer = keras.layers.Dropout(dr)(merge_layer)
 87 |     dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer)
 88 |     ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 89 |     model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer)
 90 |     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy", f1_score_metrics])
 91 |     model.summary()
 92 |     return model
 93 | 
 94 | 
 95 | from tqdm import tqdm
 96 | import mmap
 97 | import os
 98 | 
 99 | 
100 | def get_num_lines(file_path):
101 |     fp = open(file_path, "r+")
102 |     buf = mmap.mmap(fp.fileno(), 0)
103 |     lines = 0
104 |     while buf.readline():
105 |         lines += 1
106 |     return lines
107 | 
108 | 
109 | def get_embedding_matrix(word_index, Emed_path, Embed_npy):
110 |     if (os.path.exists(Embed_npy)):
111 |         return np.load(Embed_npy)
112 |     print('Indexing word vectors')
113 |     embeddings_index = {}
114 |     file_line = get_num_lines(Emed_path)
115 |     print('lines ', file_line)
116 |     with open(Emed_path, encoding='utf-8') as f:
117 |         for line in tqdm(f, total=file_line):
118 |             values = line.split()
119 |             if (len(values) < embedding_dims):
120 |                 print(values)
121 |                 continue
122 |             word = ' '.join(values[:-embedding_dims])
123 |             coefs = np.asarray(values[-embedding_dims:], dtype='float32')
124 |             embeddings_index[word] = coefs
125 |     f.close()
126 | 
127 |     print('Total %s word vectors.' % len(embeddings_index))
128 |     print('Preparing embedding matrix')
129 |     nb_words = MAX_FEATURES  # min(MAX_FEATURES, len(word_index))
130 |     all_embs = np.stack(embeddings_index.values())
131 |     print(all_embs.shape)
132 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
133 |     embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims))
134 | 
135 |     # embedding_matrix = np.zeros((nb_words, embedding_dims))
136 |     count = 0
137 |     for word, i in tqdm(word_index.items()):
138 |         if i >= MAX_FEATURES:
139 |             continue
140 |         embedding_vector = embeddings_index.get(word)
141 |         if embedding_vector is not None:
142 |             # words not found in embedding index will be all-zeros.
143 |             embedding_matrix[i] = embedding_vector
144 |             count += 1
145 |     np.save(Embed_npy, embedding_matrix)
146 |     print('Null word embeddings: %d' % (nb_words - count))
147 |     print('not Null word embeddings: %d' % count)
148 |     print('embedding_matrix shape', embedding_matrix.shape)
149 |     # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
150 |     return embedding_matrix
151 | 
152 | 
153 | df = pd.read_csv(input_file, encoding="utf-8")
154 | 
155 | question1 = df['question1'].values
156 | question2 = df['question2'].values
157 | y = df['label'].values
158 | from keras.preprocessing.sequence import pad_sequences
159 | from keras.preprocessing.text import Tokenizer
160 | 
161 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
162 | tokenizer.fit_on_texts(list(question1) + list(question2))
163 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
164 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
165 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
166 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
167 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
168 | print("nb_words", nb_words)
169 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path)
170 | seed = 20180426
171 | cv_folds = 10
172 | from sklearn.model_selection import StratifiedKFold
173 | 
174 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
175 | pred_oob = np.zeros(shape=(len(y), 1))
176 | # print(pred_oob.shape)
177 | count = 0
178 | for ind_tr, ind_te in skf.split(X_train_q1, y):
179 |     x_train_q1 = X_train_q1[ind_tr]
180 |     x_train_q2 = X_train_q2[ind_tr]
181 |     x_val_q1 = X_train_q1[ind_te]
182 |     x_val_q2 = X_train_q2[ind_te]
183 |     y_train = y[ind_tr]
184 |     y_val = y[ind_te]
185 |     model = get_model(embedding_matrix1, nb_words)
186 |     early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)
187 |     bst_model_path = kernel_name + '_weight_%d.h5' % count
188 |     model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
189 |                                        save_best_only=True, verbose=1, save_weights_only=True)
190 |     hist = model.fit([x_train_q1, x_train_q2], y_train,
191 |                      validation_data=([x_val_q1, x_val_q2], y_val),
192 |                      epochs=20, batch_size=256, shuffle=True,
193 |                      class_weight={0: 1.2233, 1: 0.4472},
194 |                      callbacks=[early_stopping, model_checkpoint])
195 |     model.load_weights(bst_model_path)
196 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
197 |     pred_oob[ind_te] = y_predict
198 |     y_predict = (y_predict > 0.5).astype(int)
199 |     recall = recall_score(y_val, y_predict)
200 |     print(count, "recal", recall)
201 |     precision = precision_score(y_val, y_predict)
202 |     print(count, "precision", precision)
203 |     accuracy = accuracy_score(y_val, y_predict)
204 |     print(count, "accuracy ", accuracy)
205 |     f1 = f1_score(y_val, y_predict)
206 |     print(count, "f1", f1)
207 |     count += 1
208 | pred_oob1 = (pred_oob > 0.5).astype(int)
209 | recall = recall_score(y, pred_oob1)
210 | print("recal", recall)
211 | precision = precision_score(y, pred_oob1)
212 | print("precision", precision)
213 | accuracy = accuracy_score(y, pred_oob1)
214 | print("accuracy", accuracy)
215 | f1 = f1_score(y, pred_oob1)
216 | print("f1", f1)
217 | 


--------------------------------------------------------------------------------
/baseline/keras_lstm_sim.py:
--------------------------------------------------------------------------------
  1 | """
  2 | pip install tensorflow
  3 | pip install keras
  4 | pip install numpy
  5 | pip install tqdm
  6 | 
  7 | """
  8 | input_file = "../input/process.csv"
  9 | w2vpath = '../data/baike.128.truncate.glove.txt'
 10 | embedding_matrix_path = './temp.npy'
 11 | kernel_name="lstm"
 12 | import pandas as pd
 13 | import numpy as np
 14 | import keras
 15 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
 16 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 17 | from keras.optimizers import Adam
 18 | MAX_TEXT_LENGTH = 50
 19 | MAX_FEATURES = 10000
 20 | embedding_dims = 128
 21 | dr = 0.2
 22 | 
 23 | 
 24 | from keras import backend as K
 25 | 
 26 | def f1_score_metrics(y_true, y_pred):
 27 |     def recall(y_true, y_pred):
 28 |         """Recall metric.
 29 | 
 30 |         Only computes a batch-wise average of recall.
 31 | 
 32 |         Computes the recall, a metric for multi-label classification of
 33 |         how many relevant items are selected.
 34 |         """
 35 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 36 |         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
 37 |         recall = true_positives / (possible_positives + K.epsilon())
 38 |         return recall
 39 | 
 40 |     def precision(y_true, y_pred):
 41 |         """Precision metric.
 42 | 
 43 |         Only computes a batch-wise average of precision.
 44 | 
 45 |         Computes the precision, a metric for multi-label classification of
 46 |         how many selected items are relevant.
 47 |         """
 48 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 49 |         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 50 |         precision = true_positives / (predicted_positives + K.epsilon())
 51 |         return precision
 52 | 
 53 |     precision = precision(y_true, y_pred)
 54 |     recall = recall(y_true, y_pred)
 55 |     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
 56 | 
 57 | def get_model(embedding_matrix,nb_words):
 58 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 59 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 60 |     words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 61 |                                                    weights=[embedding_matrix],
 62 |                                                    input_length=MAX_TEXT_LENGTH,
 63 |                                                    trainable=True)
 64 |     seq_embedding_layer = keras.layers.LSTM(256, activation='tanh',recurrent_dropout=dr)
 65 |     seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))
 66 |     merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])
 67 |     dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer)
 68 |     ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 69 |     model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer)
 70 |     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy",f1_score_metrics])
 71 |     model.summary()
 72 |     return model
 73 | 
 74 | from tqdm import tqdm
 75 | import mmap
 76 | import os
 77 | 
 78 | 
 79 | def get_num_lines(file_path):
 80 |     fp = open(file_path, "r+")
 81 |     buf = mmap.mmap(fp.fileno(), 0)
 82 |     lines = 0
 83 |     while buf.readline():
 84 |         lines += 1
 85 |     return lines
 86 | 
 87 | 
 88 | def get_embedding_matrix(word_index, Emed_path, Embed_npy):
 89 |     if (os.path.exists(Embed_npy)):
 90 |         return np.load(Embed_npy)
 91 |     print('Indexing word vectors')
 92 |     embeddings_index = {}
 93 |     file_line = get_num_lines(Emed_path)
 94 |     print('lines ', file_line)
 95 |     with open(Emed_path, encoding='utf-8') as f:
 96 |         for line in tqdm(f, total=file_line):
 97 |             values = line.split()
 98 |             if(len(values)<embedding_dims):
 99 |                 print(values)
100 |                 continue
101 |             word = ' '.join(values[:-embedding_dims])
102 |             coefs = np.asarray(values[-embedding_dims:], dtype='float32')
103 |             embeddings_index[word] = coefs
104 |     f.close()
105 | 
106 |     print('Total %s word vectors.' % len(embeddings_index))
107 |     print('Preparing embedding matrix')
108 |     nb_words = MAX_FEATURES#min(MAX_FEATURES, len(word_index))
109 |     all_embs = np.stack(embeddings_index.values())
110 |     print(all_embs.shape)
111 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
112 |     embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims))
113 | 
114 |     # embedding_matrix = np.zeros((nb_words, embedding_dims))
115 |     count=0
116 |     for word, i in tqdm(word_index.items()):
117 |         if i >= MAX_FEATURES:
118 |             continue
119 |         embedding_vector = embeddings_index.get(word)
120 |         if embedding_vector is not None:
121 |             # words not found in embedding index will be all-zeros.
122 |             embedding_matrix[i] = embedding_vector
123 |             count+=1
124 |     np.save(Embed_npy, embedding_matrix)
125 |     print('Null word embeddings: %d' % (nb_words-count))
126 |     print('not Null word embeddings: %d' % count)
127 |     print('embedding_matrix shape', embedding_matrix.shape)
128 |     # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
129 |     return embedding_matrix
130 | 
131 | 
132 | df = pd.read_csv(input_file, encoding="utf-8")
133 | 
134 | question1 = df['question1'].values
135 | question2 = df['question2'].values
136 | y = df['label'].values
137 | from keras.preprocessing.sequence import pad_sequences
138 | from keras.preprocessing.text import Tokenizer
139 | 
140 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
141 | tokenizer.fit_on_texts(list(question1) + list(question2))
142 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
143 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
144 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
145 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
146 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
147 | print("nb_words",nb_words)
148 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path)
149 | seed = 20180426
150 | cv_folds = 10
151 | from sklearn.model_selection import StratifiedKFold
152 | 
153 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
154 | pred_oob = np.zeros(shape=(len(y), 1))
155 | # print(pred_oob.shape)
156 | count = 0
157 | for ind_tr, ind_te in skf.split(X_train_q1, y):
158 |     x_train_q1 = X_train_q1[ind_tr]
159 |     x_train_q2 = X_train_q2[ind_tr]
160 |     x_val_q1 = X_train_q1[ind_te]
161 |     x_val_q2 = X_train_q2[ind_te]
162 |     y_train = y[ind_tr]
163 |     y_val = y[ind_te]
164 |     model = get_model(embedding_matrix1,nb_words)
165 |     early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)
166 |     bst_model_path =kernel_name+'_weight_%d.h5' % count
167 |     model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
168 |                                        save_best_only=True, verbose=1, save_weights_only=True)
169 |     hist = model.fit([x_train_q1,x_train_q2], y_train,
170 |                      validation_data=([x_val_q1,x_val_q2], y_val),
171 |                      epochs=5, batch_size=256, shuffle=True,
172 |                      callbacks=[early_stopping, model_checkpoint])
173 |     model.load_weights(bst_model_path)
174 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
175 |     pred_oob[ind_te] = y_predict
176 |     y_predict = (y_predict > 0.5).astype(int)
177 |     recall = recall_score(y_val, y_predict)
178 |     print(count, "recal", recall)
179 |     precision = precision_score(y_val, y_predict)
180 |     print(count, "precision", precision)
181 |     accuracy = accuracy_score(y_val, y_predict)
182 |     print(count, "accuracy ", accuracy)
183 |     f1 = f1_score(y_val, y_predict)
184 |     print(count, "f1", f1)
185 |     count += 1
186 | pred_label = (pred_oob > 0.5).astype(int)
187 | recall = recall_score(y, pred_label)
188 | print("recal", recall)
189 | precision = precision_score(y, pred_label)
190 | print("precision", precision)
191 | accuracy = accuracy_score(y, pred_label)
192 | print("accuracy", accuracy)
193 | f1 = f1_score(y, pred_label)
194 | print("f1", f1)
195 | 


--------------------------------------------------------------------------------
/baseline/keras_seq2seq_sim.py:
--------------------------------------------------------------------------------
  1 | from keras.layers import Bidirectional, GRU, Dropout, Merge
  2 | 
  3 | input_file = "../input/process.csv"
  4 | w2vpath = '../data/baike.128.truncate.glove.txt'
  5 | embedding_matrix_path = './temp.npy'
  6 | kernel_name="seq2seq"
  7 | import pandas as pd
  8 | import numpy as np
  9 | import keras
 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
 11 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 12 | 
 13 | MAX_TEXT_LENGTH = 50
 14 | MAX_FEATURES = 10000
 15 | embedding_dims = 128
 16 | dr = 0.2
 17 | lstm_size=64
 18 | from keras import backend as K
 19 | def f1_score_metrics(y_true, y_pred):
 20 |     def recall(y_true, y_pred):
 21 |         """Recall metric.
 22 | 
 23 |         Only computes a batch-wise average of recall.
 24 | 
 25 |         Computes the recall, a metric for multi-label classification of
 26 |         how many relevant items are selected.
 27 |         """
 28 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 29 |         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
 30 |         recall = true_positives / (possible_positives + K.epsilon())
 31 |         return recall
 32 | 
 33 |     def precision(y_true, y_pred):
 34 |         """Precision metric.
 35 | 
 36 |         Only computes a batch-wise average of precision.
 37 | 
 38 |         Computes the precision, a metric for multi-label classification of
 39 |         how many selected items are relevant.
 40 |         """
 41 |         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 42 |         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 43 |         precision = true_positives / (predicted_positives + K.epsilon())
 44 |         return precision
 45 | 
 46 |     precision = precision(y_true, y_pred)
 47 |     recall = recall(y_true, y_pred)
 48 |     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
 49 | 
 50 | def exponent_neg_manhattan_distance(left, right):
 51 |     return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))
 52 | def distance(left, right):
 53 |     return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))
 54 | def get_model(embedding_matrix,nb_words):
 55 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 56 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 57 |     embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 58 |                                                    weights=[embedding_matrix],
 59 |                                                    input_length=MAX_TEXT_LENGTH,
 60 |                                                    trainable=True)
 61 |     shared_encode = Bidirectional(GRU(lstm_size, return_sequences=False))
 62 |     embedded_sequences = embedding_layer(input1_tensor)
 63 |     l_lstm1 = shared_encode(embedded_sequences)
 64 |     l_lstm1 = Dropout(dr)(l_lstm1)
 65 | 
 66 |     embedded_sequences1 = embedding_layer(input2_tensor)
 67 |     l_lstm2 = shared_encode(embedded_sequences1)
 68 |     l_lstm2 = Dropout(dr)(l_lstm2)
 69 | 
 70 |     # Calculates the distance as defined by the MaLSTM model
 71 |     malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
 72 |                             output_shape=lambda x: (x[0][0], 1))([l_lstm1, l_lstm2])
 73 | 
 74 |     # dense1_layer = keras.layers.Dense(64, activation='relu')(malstm_distance)
 75 |     # ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 76 |     model = keras.models.Model([input1_tensor, input2_tensor], [malstm_distance])
 77 |     model.compile(loss='mean_squared_error', optimizer='adam', metrics=["accuracy",f1_score_metrics])
 78 |     model.summary()
 79 |     return model
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | from tqdm import tqdm
 86 | import mmap
 87 | import os
 88 | 
 89 | 
 90 | def get_num_lines(file_path):
 91 |     fp = open(file_path, "r+")
 92 |     buf = mmap.mmap(fp.fileno(), 0)
 93 |     lines = 0
 94 |     while buf.readline():
 95 |         lines += 1
 96 |     return lines
 97 | 
 98 | 
 99 | def get_embedding_matrix(word_index, Emed_path, Embed_npy):
100 |     if (os.path.exists(Embed_npy)):
101 |         return np.load(Embed_npy)
102 |     print('Indexing word vectors')
103 |     embeddings_index = {}
104 |     file_line = get_num_lines(Emed_path)
105 |     print('lines ', file_line)
106 |     with open(Emed_path, encoding='utf-8') as f:
107 |         for line in tqdm(f, total=file_line):
108 |             values = line.split()
109 |             if(len(values)<128):
110 |                 print(values)
111 |                 continue
112 |             word = ' '.join(values[:-128])
113 |             coefs = np.asarray(values[-128:], dtype='float32')
114 |             embeddings_index[word] = coefs
115 |     f.close()
116 | 
117 |     print('Total %s word vectors.' % len(embeddings_index))
118 |     print('Preparing embedding matrix')
119 |     nb_words = MAX_FEATURES#min(MAX_FEATURES, len(word_index))
120 |     all_embs = np.stack(embeddings_index.values())
121 |     print(all_embs.shape)
122 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
123 |     embedding_matrix = np.random.normal(loc=emb_mean, scale=emb_std, size=(nb_words, embedding_dims))
124 | 
125 |     # embedding_matrix = np.zeros((nb_words, embedding_dims))
126 |     count=0
127 |     for word, i in tqdm(word_index.items()):
128 |         if i >= MAX_FEATURES:
129 |             continue
130 |         embedding_vector = embeddings_index.get(word)
131 |         if embedding_vector is not None:
132 |             # words not found in embedding index will be all-zeros.
133 |             embedding_matrix[i] = embedding_vector
134 |             count+=1
135 |     np.save(Embed_npy, embedding_matrix)
136 |     print('Null word embeddings: %d' % (nb_words-count))
137 |     print('not Null word embeddings: %d' % count)
138 |     print('embedding_matrix shape', embedding_matrix.shape)
139 |     # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
140 |     return embedding_matrix
141 | 
142 | 
143 | df = pd.read_csv(input_file, encoding="utf-8")
144 | 
145 | question1 = df['question1'].values
146 | question2 = df['question2'].values
147 | y = df['label'].values
148 | from keras.preprocessing.sequence import pad_sequences
149 | from keras.preprocessing.text import Tokenizer
150 | 
151 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
152 | tokenizer.fit_on_texts(list(question1) + list(question2))
153 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
154 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
155 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
156 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
157 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
158 | print("nb_words",nb_words)
159 | embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, w2vpath, embedding_matrix_path)
160 | seed = 20180426
161 | cv_folds = 10
162 | from sklearn.model_selection import StratifiedKFold
163 | 
164 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
165 | pred_oob = np.zeros(shape=(len(y), 1))
166 | # print(pred_oob.shape)
167 | count = 0
168 | for ind_tr, ind_te in skf.split(X_train_q1, y):
169 |     x_train_q1 = X_train_q1[ind_tr]
170 |     x_train_q2 = X_train_q2[ind_tr]
171 |     x_val_q1 = X_train_q1[ind_te]
172 |     x_val_q2 = X_train_q2[ind_te]
173 |     y_train = y[ind_tr]
174 |     y_val = y[ind_te]
175 |     model = get_model(embedding_matrix1,nb_words)
176 |     early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
177 |     bst_model_path =kernel_name+'_weight_%d.h5' % count
178 |     model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
179 |                                        save_best_only=True, verbose=1, save_weights_only=True)
180 |     hist = model.fit([x_train_q1,x_train_q2], y_train,
181 |                      validation_data=([x_val_q1,x_val_q2], y_val),
182 |                      epochs=15, batch_size=32, shuffle=True,
183 |                      class_weight={0: 1.2233, 1: 0.4472},
184 |                      callbacks=[early_stopping, model_checkpoint])
185 |     model.load_weights(bst_model_path)
186 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
187 |     pred_oob[ind_te] = y_predict
188 |     y_predict = (y_predict > 0.5).astype(int)
189 |     recall = recall_score(y_val, y_predict)
190 |     print(count, "recal", recall)
191 |     precision = precision_score(y_val, y_predict)
192 |     print(count, "precision", precision)
193 |     accuracy = accuracy_score(y_val, y_predict)
194 |     print(count, "accuracy ", accuracy)
195 |     f1 = f1_score(y_val, y_predict)
196 |     print(count, "f1", f1)
197 |     count += 1
198 | pred_oob1 = (pred_oob > 0.5).astype(int)
199 | recall = recall_score(y, pred_oob1)
200 | print("recal", recall)
201 | precision = precision_score(y, pred_oob1)
202 | print("precision", precision)
203 | accuracy = accuracy_score(y, pred_oob1)
204 | print("accuracy", accuracy)
205 | f1 = f1_score(y, pred_oob1)
206 | print("f1", f1)
207 | 


--------------------------------------------------------------------------------
/baseline/seg.py:
--------------------------------------------------------------------------------
 1 | #py3
 2 | import sys
 3 | import jieba
 4 | import re
 5 | import numpy as np
 6 | jieba.add_word('花呗')
 7 | jieba.add_word('借呗')
 8 | jieba.add_word('余额宝')
 9 | 
10 | input_file="..\\data\\answers.txt"
11 | # input_file="..\\input\\atec_nlp_sim_train.csv"
12 | output_file="fc2.txt"
13 | def seg(text):
14 |     seg_list = jieba.cut(text.strip())
15 |     return " ".join(seg_list)
16 | 
17 | # dict_file_name="../data/dict.txt"
18 | # jieba.load_userdict(input_file)
19 | # # jieba.add_word('花呗')
20 | # df = pd.read_csv(input_file,encoding="utf-8")
21 | # q=df["question1"]
22 | # for s in q:
23 | #     seg_list=jieba.cut(s)
24 | #     print("/ ".join(seg_list))
25 | #     break
26 | 
27 | # a=[0.1,0.5,0.8]
28 | # l=[0,1,1]
29 | # b=np.array(a)
30 | # d =  (b>0.5).astype(int)
31 | # # d=np.stack((b,c),axis=1)
32 | # print(d)
33 | # print(d.shape)
34 | # from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score
35 | # s=f1_score(l,d)
36 | # # b=d.argmax(axis=-1)
37 | # print(s)
38 | special_character_removal = re.compile(r'[@#$%^&*,.【】[]{}；‘，。、？!? \\/"\']', re.IGNORECASE)
39 | replace_numbers = re.compile(r'\d+', re.IGNORECASE)
40 | if __name__ == '__main__':
41 | 
42 |     with open(input_file,encoding="utf-8") as fp,open(output_file,"w",encoding="utf-8") as fw:
43 |         for line in fp:
44 |             line = special_character_removal.sub('', line)
45 |             line = replace_numbers.sub('NUMBER_REPLACE', line)
46 |             lines=line.strip().split(" ++$++ ")
47 |             if(len(lines)==3):
48 |                 line=lines[1]
49 |             fw.write(seg(line))
50 |             fw.write("\n")
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/baseline/tf_bilstm_sim.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | #py2
  4 | from __future__ import  print_function
  5 | input_file = "../input/process.csv"
  6 | w2vpath = '../data/baike.128.no_truncate.glove.txt'
  7 | embedding_matrix_path = './temp_no_truncate.npy'
  8 | kernel_name="bilstm"
  9 | import pandas as pd
 10 | import numpy as np
 11 | import os
 12 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 13 | import tensorflow as tf
 14 | 
 15 | MAX_TEXT_LENGTH = 50
 16 | MAX_FEATURES = 10000
 17 | embedding_dims = 128
 18 | dr = 0.2
 19 | batch_size = 256
 20 | save_dir = 'checkpoints/textrnn'
 21 | save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径
 22 | 
 23 | class TRNNConfig(object):
 24 |     """RNN配置参数"""
 25 | 
 26 |     # 模型参数
 27 |     embedding_dim = 64      # 词向量维度
 28 |     seq_length = MAX_TEXT_LENGTH        # 序列长度
 29 |     num_classes = 2        # 类别数
 30 |     vocab_size = MAX_FEATURES       # 词汇表达小
 31 | 
 32 |     num_layers= 1           # 隐藏层层数
 33 |     hidden_dim = 256        # 隐藏层神经元
 34 |     rnn = 'gru'             # lstm 或 gru
 35 | 
 36 |     dropout_keep_prob = 0.8 # dropout保留比例
 37 |     learning_rate = 1e-3    # 学习率
 38 | 
 39 |     batch_size = 256         # 每批训练大小
 40 |     num_epochs = 10          # 总迭代轮次
 41 | 
 42 |     print_per_batch = 100    # 每多少轮输出一次结果
 43 |     save_per_batch = 10      # 每多少轮存入tensorboard
 44 | 
 45 | class TextRNN():
 46 |     def __init__(self,
 47 |                  embedding_matrix=None,
 48 |                  config=TRNNConfig):
 49 |         self.config = config
 50 |         def lstm_cell():  # lstm核
 51 |             return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)
 52 | 
 53 |         def gru_cell():  # gru核
 54 |             return tf.contrib.rnn.GRUCell(self.config.hidden_dim)
 55 | 
 56 |         def dropout():  # 为每一个rnn核后面加一个dropout层
 57 |             if (self.config.rnn == 'lstm'):
 58 |                 cell = lstm_cell()
 59 |             else:
 60 |                 cell = gru_cell()
 61 |             return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
 62 |         # Placeholders for input, output and dropout
 63 |         self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
 64 |         self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
 65 |         self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
 66 |         # 词向量映射
 67 |         with tf.device('/cpu:0'):
 68 |             embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
 69 |             embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
 70 | 
 71 |         with tf.name_scope("rnn"):
 72 |             # 多层rnn网络
 73 |             cells = [dropout() for _ in range(self.config.num_layers)]
 74 |             rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
 75 | 
 76 |             _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
 77 |             last = _outputs[:, -1, :]  # 取最后一个时序输出作为结果
 78 | 
 79 |         with tf.name_scope("score"):
 80 |             # 全连接层，后面接dropout以及relu激活
 81 |             fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
 82 |             fc = tf.contrib.layers.dropout(fc, self.keep_prob)
 83 |             fc = tf.nn.relu(fc)
 84 | 
 85 |             # 分类器
 86 |             self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
 87 |             self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别
 88 | 
 89 |         with tf.name_scope("optimize"):
 90 |             # 损失函数，交叉熵
 91 |             cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
 92 |             self.loss = tf.reduce_mean(cross_entropy)
 93 |             # 优化器
 94 |             self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
 95 | 
 96 |         with tf.name_scope("accuracy"):
 97 |             # 准确率
 98 |             correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
 99 |             self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
100 | 
101 | 
102 | def train(model,config) :
103 |     print("Configuring TensorBoard and Saver...")
104 |     # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
105 |     tensorboard_dir = 'tensorboard/textrnn'
106 |     if not os.path.exists(tensorboard_dir):
107 |         os.makedirs(tensorboard_dir)
108 | 
109 |     tf.summary.scalar("loss", model.loss)
110 |     tf.summary.scalar("accuracy", model.acc)
111 |     merged_summary = tf.summary.merge_all()
112 |     writer = tf.summary.FileWriter(tensorboard_dir)
113 | 
114 |     # 配置 Saver
115 |     saver = tf.train.Saver()
116 |     if not os.path.exists(save_dir):
117 |         os.makedirs(save_dir)
118 | 
119 |     # 创建session
120 |     session = tf.Session()
121 |     session.run(tf.global_variables_initializer())
122 |     writer.add_graph(session.graph)
123 | 
124 |     print('Training and evaluating...')
125 |     total_batch = 0  # 总批次
126 |     best_acc_val = 0.0  # 最佳验证集准确率
127 |     last_improved = 0  # 记录上一次提升批次
128 |     require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练
129 | 
130 |     flag = False
131 |     for epoch in range(config.num_epochs):
132 |         print('Epoch:', epoch + 1)
133 |         batch_train = batch_iter(x_train, y_train, config.batch_size)
134 |         for x_batch, y_batch in batch_train:
135 |             feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
136 |             if total_batch % config.save_per_batch == 0:
137 |                 # 每多少轮次将训练结果写入tensorboard scalar
138 |                 s = session.run(merged_summary, feed_dict=feed_dict)
139 |                 writer.add_summary(s, total_batch)
140 |             if total_batch % config.print_per_batch == 0:
141 |                 # 每多少轮次输出在训练集和验证集上的性能
142 |                 feed_dict[model.keep_prob] = 1.0
143 |                 loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
144 |                 loss_val, acc_val = evaluate(session, x_val, y_val)  # todo
145 | 
146 |                 if acc_val > best_acc_val:
147 |                     # 保存最好结果
148 |                     best_acc_val = acc_val
149 |                     last_improved = total_batch
150 |                     saver.save(sess=session, save_path=save_path)
151 |                     improved_str = '*'
152 |                 else:
153 |                     improved_str = ''
154 | 
155 |             session.run(model.optim, feed_dict=feed_dict)  # 运行优化
156 |             total_batch += 1
157 | 
158 | df = pd.read_csv(input_file, encoding="utf-8")
159 | 
160 | question1 = df['question1'].values
161 | question2 = df['question2'].values
162 | y = df['label'].values
163 | from keras.preprocessing.sequence import pad_sequences
164 | from keras.preprocessing.text import Tokenizer
165 | 
166 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
167 | tokenizer.fit_on_texts(list(question1) + list(question2))
168 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
169 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
170 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
171 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
172 | nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
173 | print("nb_words",nb_words)
174 | seed = 20180426
175 | cv_folds = 10
176 | from sklearn.model_selection import StratifiedKFold
177 | 
178 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
179 | pred_oob = np.zeros(shape=(len(y), 1))
180 | # print(pred_oob.shape)
181 | count = 0
182 | for ind_tr, ind_te in skf.split(X_train_q1, y):
183 |     x_train_q1 = X_train_q1[ind_tr]
184 |     x_train_q2 = X_train_q2[ind_tr]
185 |     x_val_q1 = X_train_q1[ind_te]
186 |     x_val_q2 = X_train_q2[ind_te]
187 |     y_train = y[ind_tr]
188 |     y_val = y[ind_te]
189 | 
190 |     # model = get_model(embedding_matrix1,nb_words)
191 |     # early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)
192 |     # bst_model_path =kernel_name+'_weight_%d.h5' % count
193 |     # model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss', mode='min',
194 |     #                                    save_best_only=True, verbose=1, save_weights_only=True)
195 |     # hist = model.fit([x_train_q1,x_train_q2], y_train,
196 |     #                  validation_data=([x_val_q1,x_val_q2], y_val),
197 |     #                  epochs=6, batch_size=256, shuffle=True,
198 |     #                  class_weight={0: 1.3233, 1: 0.4472},
199 |     #                  callbacks=[early_stopping, model_checkpoint])
200 |     # model.load_weights(bst_model_path)
201 |     y_predict = model.predict([x_val_q1, x_val_q2], batch_size=256, verbose=1)
202 |     pred_oob[ind_te] = y_predict
203 |     y_predict = (y_predict > 0.5).astype(int)
204 |     recall = recall_score(y_val, y_predict)
205 |     print(count, "recal", recall)
206 |     precision = precision_score(y_val, y_predict)
207 |     print(count, "precision", precision)
208 |     accuracy = accuracy_score(y_val, y_predict)
209 |     print(count, "accuracy ", accuracy)
210 |     f1 = f1_score(y_val, y_predict)
211 |     print(count, "f1", f1)
212 |     count += 1
213 | pred_oob = (pred_oob > 0.5).astype(int)
214 | recall = recall_score(y, pred_oob)
215 | print("recal", recall)
216 | precision = precision_score(y, pred_oob)
217 | print("precision", precision)
218 | accuracy = accuracy_score(y, pred_oob)
219 | print("accuracy", accuracy)
220 | f1 = f1_score(y, pred_oob)
221 | print("f1", f1)
222 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | input_file = "./input/process.csv"
 2 | w2vpath = './data/baike.128.no_truncate.glove.txt'
 3 | embedding_matrix_path = './baseline/temp.npy'
 4 | kernel_name="bilstm"
 5 | import pandas as pd
 6 | import numpy as np
 7 | import os
 8 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 9 | import tensorflow as tf
10 | 
11 | MAX_TEXT_LENGTH = 50
12 | MAX_FEATURES = 10000
13 | embedding_dims = 128
14 | dr = 0.2
15 | batch_size = 256
16 | 
17 | class TRNNConfig(object):
18 |     """RNN配置参数"""
19 | 
20 |     # 模型参数
21 |     embedding_dim = 64      # 词向量维度
22 |     seq_length = MAX_TEXT_LENGTH        # 序列长度
23 |     num_classes = 1        # 类别数
24 |     vocab_size = MAX_FEATURES       # 词汇表达小
25 | 
26 |     num_layers= 1           # 隐藏层层数
27 |     hidden_dim = 256        # 隐藏层神经元
28 |     rnn = 'gru'             # lstm 或 gru
29 |     fc_hidden_dim=64
30 |     dropout_keep_prob = 0.8 # dropout保留比例
31 |     learning_rate = 1e-3    # 学习率
32 | 
33 |     batch_size = 256         # 每批训练大小
34 |     num_epochs = 50          # 总迭代轮次
35 |     early_stop=5
36 | 
37 |     print_per_batch = 1    # 每多少轮输出一次结果
38 |     save_per_batch = 10      # 每多少轮存入tensorboard
39 | 
40 |     num_checkpoints=5  #Number of checkpoints to store (default: 5)
41 | 
42 |     class_weight0=1.0
43 |     class_weight1=2.3
44 | 


--------------------------------------------------------------------------------
/feature_engineering.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Detecting duplicate quora questions
  3 | feature engineering
  4 | @author: Abhishek Thakur
  5 | """
  6 | input_file = "../input/process.csv"
  7 | w2vpath = '../data/baike.128.no_truncate.glove.txt'
  8 | # import cPickle
  9 | import pandas as pd
 10 | import numpy as np
 11 | import gensim
 12 | from fuzzywuzzy import fuzz
 13 | # from nltk.corpus import stopwords
 14 | from tqdm import tqdm
 15 | from scipy.stats import skew, kurtosis
 16 | from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
 17 | # from nltk import word_tokenize
 18 | # stop_words = stopwords.words('english')
 19 | stop_words=['的',',','。']
 20 | 
 21 | def wmd(s1, s2):
 22 |     s1 = s1.split()
 23 |     s2 = s2.split()
 24 |     # stop_words = stopwords.words('english')
 25 |     s1 = [w for w in s1 if w not in stop_words]
 26 |     s2 = [w for w in s2 if w not in stop_words]
 27 |     return model.wmdistance(s1, s2)
 28 | 
 29 | 
 30 | def norm_wmd(s1, s2):
 31 |     s1 = s1.lower().split()
 32 |     s2 = s2.lower().split()
 33 |     # stop_words = stopwords.words('english')
 34 |     s1 = [w for w in s1 if w not in stop_words]
 35 |     s2 = [w for w in s2 if w not in stop_words]
 36 |     return norm_model.wmdistance(s1, s2)
 37 | 
 38 | 
 39 | def sent2vec(s):
 40 |     words = s
 41 |     # words = word_tokenize(words)
 42 |     words = [w for w in words if not w in stop_words]
 43 |     words = [w for w in words if w.isalpha()]
 44 |     M = []
 45 |     for w in words:
 46 |         try:
 47 |             M.append(model[w])
 48 |         except:
 49 |             continue
 50 |     M = np.array(M)
 51 |     v = M.sum(axis=0)
 52 |     return v / np.sqrt((v ** 2).sum())
 53 | 
 54 | 
 55 | data = pd.read_csv(input_file)
 56 | # data = data.drop(['id', 'qid1', 'qid2'], axis=1)
 57 | 
 58 | 
 59 | data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
 60 | data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
 61 | data['diff_len'] = data.len_q1 - data.len_q2
 62 | data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
 63 | data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
 64 | data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
 65 | data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
 66 | data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
 67 | # data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
 68 | data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
 69 | data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
 70 | data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
 71 | data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
 72 | data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
 73 | data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
 74 | 
 75 | 
 76 | model = gensim.models.KeyedVectors.load_word2vec_format('../data/vectors.txt', binary=False)
 77 | data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
 78 | 
 79 | 
 80 | norm_model = gensim.models.KeyedVectors.load_word2vec_format('../data/vectors.txt', binary=False)
 81 | norm_model.init_sims(replace=True)
 82 | data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
 83 | 
 84 | question1_vectors = np.zeros((data.shape[0], 300))
 85 | error_count = 0
 86 | 
 87 | for i, q in tqdm(enumerate(data.question1.values)):
 88 |     question1_vectors[i, :] = sent2vec(q)
 89 | 
 90 | question2_vectors  = np.zeros((data.shape[0], 300))
 91 | for i, q in tqdm(enumerate(data.question2.values)):
 92 |     question2_vectors[i, :] = sent2vec(q)
 93 | 
 94 | data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
 95 |                                                           np.nan_to_num(question2_vectors))]
 96 | 
 97 | data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
 98 |                                                           np.nan_to_num(question2_vectors))]
 99 | 
100 | data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
101 |                                                           np.nan_to_num(question2_vectors))]
102 | 
103 | data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
104 |                                                           np.nan_to_num(question2_vectors))]
105 | 
106 | data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
107 |                                                           np.nan_to_num(question2_vectors))]
108 | 
109 | data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
110 |                                                           np.nan_to_num(question2_vectors))]
111 | 
112 | data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
113 |                                                           np.nan_to_num(question2_vectors))]
114 | 
115 | data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
116 | data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
117 | data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
118 | data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
119 | 
120 | # cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
121 | # cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)
122 | 
123 | data.to_csv('data/quora_features.csv', index=False)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | pandas
4 | scikit-learn
5 | Keras>=2.0.0
6 | tqdm
7 | 
8 | 


--------------------------------------------------------------------------------
/tf_TextCNN.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from config import *
 3 | 
 4 | filter_sizes=[2,3,8,9]
 5 | num_filters=3
 6 | class TextRNN():
 7 |     def __init__(self,
 8 |                  embedding_matrix=None,
 9 |                  config=TRNNConfig()):
10 |         self.config = config
11 | 
12 |         def lstm_cell():  # lstm核
13 |             return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)
14 | 
15 |         def gru_cell():  # gru核
16 |             return tf.contrib.rnn.GRUCell(self.config.hidden_dim)
17 | 
18 |         def dropout():  # 为每一个rnn核后面加一个dropout层
19 |             if (self.config.rnn == 'lstm'):
20 |                 cell = lstm_cell()
21 |             else:
22 |                 cell = gru_cell()
23 |             return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
24 | 
25 |         # Placeholders for input, output and dropout
26 |         self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1')
27 |         self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2')
28 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y')
29 |         self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
30 |         # 词向量映射
31 |         with tf.device('/cpu:0'):
32 |             W = tf.Variable(
33 |                 tf.random_uniform([config.vocab_size, config.embedding_dim], -1.0, 1.0),
34 |                 name="W")
35 |             self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x1)
36 |             self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
37 |             # W = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True)
38 | 
39 |         pooled_outputs = []
40 |         for i, filter_size in enumerate(filter_sizes):
41 |             with tf.name_scope("conv-maxpool-%s" % filter_size):
42 |                 # Convolution Layer
43 |                 filter_shape = [filter_size, config.embedding_dim, 1, num_filters]
44 |                 W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
45 |                 b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
46 |                 conv = tf.nn.conv2d(
47 |                     self.embedded_chars_expanded,
48 |                     W,
49 |                     strides=[1, 1, 1, 1],
50 |                     padding="VALID",
51 |                     name="conv")
52 |                 # Apply nonlinearity
53 |                 h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
54 |                 # Max-pooling over the outputs
55 |                 pooled = tf.nn.max_pool(
56 |                     h,
57 |                     ksize=[1, config.vocab_size - filter_size + 1, 1, 1],
58 |                     strides=[1, 1, 1, 1],
59 |                     padding='VALID',
60 |                     name="pool")
61 |                 pooled_outputs.append(pooled)
62 | 
63 | 
64 |         # Combine all the pooled features
65 |         num_filters_total = num_filters * len(filter_sizes)
66 |         self.h_pool = tf.concat(3, pooled_outputs)
67 |         self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
68 | 
69 |         # Add dropout
70 |         with tf.name_scope("dropout"):
71 |             self.h_drop = tf.nn.dropout(self.h_pool_flat, self.config.dropout_keep_prob)
72 |         with tf.name_scope("output"):
73 |             W = tf.Variable(tf.truncated_normal([num_filters_total, config.num_classes], stddev=0.1), name="W")
74 |             b = tf.Variable(tf.constant(0.1, shape=[config.num_classes]), name="b")
75 |             self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
76 |             self.predictions = tf.argmax(self.scores, 1, name="predictions")
77 | 
78 |         with tf.name_scope("optimize"):
79 |             # 损失函数，交叉熵
80 |             # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y)
81 |             # self.loss = tf.reduce_mean(cross_entropy)
82 | 
83 |             # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32)
84 |             #                                           * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1)
85 |             self.loss=tf.losses.mean_squared_error(logits=self.scores,labels=self.input_y)
86 |             # 优化器
87 |             self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
88 | 
89 |         with tf.name_scope("accuracy"):
90 |             # 准确率
91 |             correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32))
92 |             self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
93 | 


--------------------------------------------------------------------------------
/tf_TextRNN.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from config import *
 3 | 
 4 | 
 5 | class TextRNN():
 6 |     def __init__(self,
 7 |                  embedding_matrix=None,
 8 |                  config=TRNNConfig()):
 9 |         self.config = config
10 | 
11 |         def lstm_cell():  # lstm核
12 |             cell = tf.nn.rnn_cell.BasicLSTMCell(config.hidden_dim, forget_bias=0.0, state_is_tuple=True)
13 |             if config.dropout_keep_prob < 1:
14 |                 cell = tf.nn.rnn_cell.DropoutWrapper(
15 |                     cell, output_keep_prob=config.dropout_keep_prob
16 |                 )
17 |             return cell
18 | 
19 |         def gru_cell():  # gru核
20 |             return tf.contrib.rnn.GRUCell(self.config.hidden_dim)
21 | 
22 | 
23 | 
24 |         # Placeholders for input, output and dropout
25 |         self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1')
26 |         self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2')
27 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y')
28 |         self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
29 |         # 词向量映射
30 |         with tf.device('/cpu:0'):
31 |             # weW = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
32 |             weW = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True)
33 |             embedding_inputs1 = tf.nn.embedding_lookup(weW, self.input_x1)
34 |             embedding_inputs2 = tf.nn.embedding_lookup(weW, self.input_x2)
35 |             print('input_x1', self.input_x1.get_shape())
36 |         with tf.name_scope("rnn"):
37 |             # 多层rnn网络
38 |             cells = [lstm_cell() for _ in range(self.config.num_layers)]
39 |             rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
40 | 
41 |             _outputs1, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs1, dtype=tf.float32)
42 |             _outputs2, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs2, dtype=tf.float32)
43 |             print("_outputs2", _outputs2.get_shape())
44 |             encode1 = _outputs1[:, -1, :]
45 |             encode2 = _outputs2[:, -1, :]  # 取最后一个时序输出作为结果
46 |             print("encode2", encode2.get_shape())
47 |             last = tf.multiply(encode1, encode2, name="last")
48 |             print("multiply",last.get_shape())
49 |         with tf.name_scope("score"):
50 |             # 全连接层，后面接dropout以及relu激活
51 |             fc = tf.layers.dense(last, self.config.fc_hidden_dim, name='fc1',activation=tf.nn.relu)
52 |             # fc = tf.contrib.layers.dropout(fc, self.keep_prob)
53 |             # fc = tf.nn.relu(fc)
54 |             print('fc',fc.get_shape())
55 |             # 分类器
56 |             # lbW = tf.Variable(tf.truncated_normal([self.config.hidden_dim, self.config.num_classes], stddev=0.1), name="lbW")
57 |             # b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
58 |             # print('lbW',lbW.get_shape())
59 |             # print('b',b.get_shape())
60 |             self.scores = tf.layers.dense(fc,1,activation=tf.nn.sigmoid)  # Softmax
61 |             # self.scores = tf.nn.xw_plus_b(fc, lbW, b, name="scores")
62 |             self.y_pred_cls = tf.round(self.scores, name="predictions")
63 |             # self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
64 |             # self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别
65 |         with tf.name_scope("optimize"):
66 |             # 损失函数，交叉熵
67 |             # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y)
68 |             # self.loss = tf.reduce_mean(cross_entropy)
69 | 
70 |             # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32)
71 |             #                                           * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1)
72 |             self.loss = tf.reduce_mean(-tf.reduce_sum(self.input_y* tf.log(self.scores)*config.class_weight1
73 |                                                       +(1-self.input_y)*tf.log(1-self.scores)*config.class_weight0
74 |                                                       , reduction_indices=[1]))
75 | 
76 |             # self.loss = tf.losses.sigmoid_cross_entropy(logits=self.scores, multi_class_labels=self.input_y)
77 |             # 优化器
78 |             # self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
79 | 
80 |         with tf.name_scope("accuracy"):
81 |             # 准确率
82 |             correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32))
83 |             self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")
84 | 


--------------------------------------------------------------------------------
/tf_model/tf_train_lstm.py:
--------------------------------------------------------------------------------
  1 | input_file = "./input/process.csv"
  2 | w2vpath = './data/baike.128.no_truncate.glove.txt'
  3 | embedding_matrix_path = './baseline/temp.npy'
  4 | kernel_name="bilstm"
  5 | import pandas as pd
  6 | import numpy as np
  7 | import os
  8 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
  9 | import tensorflow as tf
 10 | import datetime
 11 | MAX_TEXT_LENGTH = 50
 12 | MAX_FEATURES = 10000
 13 | embedding_dims = 128
 14 | dr = 0.2
 15 | batch_size = 256
 16 | 
 17 | class TRNNConfig(object):
 18 |     """RNN配置参数"""
 19 | 
 20 |     # 模型参数
 21 |     embedding_dim = 64      # 词向量维度
 22 |     seq_length = MAX_TEXT_LENGTH        # 序列长度
 23 |     num_classes = 1        # 类别数
 24 |     vocab_size = MAX_FEATURES       # 词汇表达小
 25 | 
 26 |     num_layers= 1           # 隐藏层层数
 27 |     hidden_dim = 256        # 隐藏层神经元
 28 |     rnn = 'gru'             # lstm 或 gru
 29 |     fc_hidden_dim=64
 30 |     dropout_keep_prob = 0.8 # dropout保留比例
 31 |     learning_rate = 1e-3    # 学习率
 32 | 
 33 |     batch_size = 256         # 每批训练大小
 34 |     num_epochs = 50          # 总迭代轮次
 35 |     early_stop=5
 36 | 
 37 |     print_per_batch = 1    # 每多少轮输出一次结果
 38 |     save_per_batch = 10      # 每多少轮存入tensorboard
 39 | 
 40 |     num_checkpoints=5  #Number of checkpoints to store (default: 5)
 41 | 
 42 |     class_weight0=1.0
 43 |     class_weight1=2.3
 44 | 
 45 | class TextRNN():
 46 |     def __init__(self,
 47 |                  embedding_matrix=None,
 48 |                  config=TRNNConfig()):
 49 |         self.config = config
 50 | 
 51 |         def lstm_cell():  # lstm核
 52 |             cell = tf.nn.rnn_cell.BasicLSTMCell(config.hidden_dim, forget_bias=0.0, state_is_tuple=True)
 53 |             if config.dropout_keep_prob < 1:
 54 |                 cell = tf.nn.rnn_cell.DropoutWrapper(
 55 |                     cell, output_keep_prob=config.dropout_keep_prob
 56 |                 )
 57 |             return cell
 58 | 
 59 |         def gru_cell():  # gru核
 60 |             return tf.contrib.rnn.GRUCell(self.config.hidden_dim)
 61 | 
 62 | 
 63 | 
 64 |         # Placeholders for input, output and dropout
 65 |         self.input_x1 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x1')
 66 |         self.input_x2 = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x2')
 67 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name='input_y')
 68 |         self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
 69 |         # 词向量映射
 70 |         with tf.device('/cpu:0'):
 71 |             # weW = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
 72 |             weW = tf.get_variable(name="W", shape=embedding_matrix.shape, initializer=tf.constant_initializer(embedding_matrix),trainable=True)
 73 |             embedding_inputs1 = tf.nn.embedding_lookup(weW, self.input_x1)
 74 |             embedding_inputs2 = tf.nn.embedding_lookup(weW, self.input_x2)
 75 |             print('input_x1', self.input_x1.get_shape())
 76 |         with tf.name_scope("rnn"):
 77 |             # 多层rnn网络
 78 |             cells = [lstm_cell() for _ in range(self.config.num_layers)]
 79 |             rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
 80 | 
 81 |             _outputs1, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs1, dtype=tf.float32)
 82 |             _outputs2, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs2, dtype=tf.float32)
 83 |             print("_outputs2", _outputs2.get_shape())
 84 |             encode1 = _outputs1[:, -1, :]
 85 |             encode2 = _outputs2[:, -1, :]  # 取最后一个时序输出作为结果
 86 |             print("encode2", encode2.get_shape())
 87 |             last = tf.multiply(encode1, encode2, name="last")
 88 |             print("multiply",last.get_shape())
 89 |         with tf.name_scope("score"):
 90 |             # 全连接层，后面接dropout以及relu激活
 91 |             fc = tf.layers.dense(last, self.config.fc_hidden_dim, name='fc1',activation=tf.nn.relu)
 92 |             # fc = tf.contrib.layers.dropout(fc, self.keep_prob)
 93 |             # fc = tf.nn.relu(fc)
 94 |             print('fc',fc.get_shape())
 95 |             # 分类器
 96 |             # lbW = tf.Variable(tf.truncated_normal([self.config.hidden_dim, self.config.num_classes], stddev=0.1), name="lbW")
 97 |             # b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
 98 |             # print('lbW',lbW.get_shape())
 99 |             # print('b',b.get_shape())
100 |             self.scores = tf.layers.dense(fc,1,activation=tf.nn.sigmoid)  # Softmax
101 |             # self.scores = tf.nn.xw_plus_b(fc, lbW, b, name="scores")
102 |             self.y_pred_cls = tf.round(self.scores, name="predictions")
103 |             # self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
104 |             # self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别
105 |         with tf.name_scope("optimize"):
106 |             # 损失函数，交叉熵
107 |             # cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y)
108 |             # self.loss = tf.reduce_mean(cross_entropy)
109 | 
110 |             # self.loss = -tf.reduce_sum(tf.cast(self.input_y, tf.float32)
111 |             #                                           * tf.log(tf.cast(self.y_pred_cls, tf.float32)), reduction_indices=1)
112 |             self.loss = tf.reduce_mean(-tf.reduce_sum(self.input_y* tf.log(self.scores)*config.class_weight1
113 |                                                       +(1-self.input_y)*tf.log(1-self.scores)*config.class_weight0
114 |                                                       , reduction_indices=[1]))
115 | 
116 |             # self.loss = tf.losses.sigmoid_cross_entropy(logits=self.scores, multi_class_labels=self.input_y)
117 |             # 优化器
118 |             # self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
119 | 
120 |         with tf.name_scope("accuracy"):
121 |             # 准确率
122 |             correct_pred = tf.equal(tf.cast(self.input_y, tf.float32), tf.cast(self.y_pred_cls, tf.float32))
123 |             self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")
124 | 
125 | df = pd.read_csv(input_file, encoding="utf-8")
126 | 
127 | question1 = df['question1'].values
128 | question2 = df['question2'].values
129 | y = df['label'].values
130 | y=np.array(y,dtype=np.float32)
131 | embedding_matrix1=np.load(embedding_matrix_path)
132 | def train(x_train1, x_train2, y_train, x_val1, x_val2, y_val, model=TextRNN(embedding_matrix=embedding_matrix1), config=TRNNConfig()):
133 |     print("Configuring TensorBoard and Saver...")
134 |     # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
135 |     out_dir = 'textrnn'
136 |     if not os.path.exists(out_dir):
137 |         os.makedirs(out_dir)
138 |     # Define Training procedure
139 |     global_step = tf.Variable(0, name="global_step", trainable=False)
140 |     # optimizer = tf.train.GradientDescentOptimizer(5e-3)
141 |     optimizer = tf.train.AdamOptimizer(1e-3)
142 |     train_step_ = optimizer.minimize(model.loss)
143 |     grads_and_vars = optimizer.compute_gradients(model.loss)
144 |     train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
145 | 
146 |     # 创建session
147 |     session = tf.Session()
148 |     session.run(tf.global_variables_initializer())
149 | 
150 |     # Summaries for loss and accuracy
151 |     loss_summary = tf.summary.scalar("loss", model.loss)
152 |     acc_summary = tf.summary.scalar("accuracy", model.acc)
153 |     # Keep track of gradient values and sparsity (optional)
154 |     # grad_summaries = []
155 |     # for g, v in grads_and_vars:
156 |     #     if g is not None:
157 |     #         grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
158 |     #         sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
159 |     #         grad_summaries.append(grad_hist_summary)
160 |     #         grad_summaries.append(sparsity_summary)
161 |     # grad_summaries_merged = tf.summary.merge(grad_summaries)
162 |     # Train Summaries
163 |     train_summary_op = tf.summary.merge([loss_summary, acc_summary])
164 |     train_summary_dir = os.path.join(out_dir, "summaries", "train")
165 |     train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
166 | 
167 |     # Dev summaries
168 |     dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
169 |     dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
170 |     dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)
171 | 
172 |     # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
173 |     checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints123"))
174 |     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
175 |     if not os.path.exists(checkpoint_dir):
176 |         os.makedirs(checkpoint_dir)
177 |     saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.num_checkpoints)
178 | 
179 |     def train_fun(x_batch1, x_batch2, y_batch):
180 |         """
181 |         A single training step
182 |         """
183 |         feed_dict = {
184 |             model.input_x1: x_batch1,
185 |             model.input_x2: x_batch2,
186 |             model.input_y: y_batch,
187 |             model.keep_prob: config.dropout_keep_prob
188 |         }
189 |         _, step, summaries, loss, accuracy = session.run(
190 |             [train_op, global_step, train_summary_op, model.loss, model.acc],
191 |             feed_dict)
192 |         time_str = datetime.datetime.now().isoformat()
193 |         # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
194 |         train_summary_writer.add_summary(summaries, step)
195 | 
196 |     def dev_fun(x_batch1,x_batch2, y_batch, writer=None):
197 |         """
198 |         Evaluates model on a dev set
199 |         """
200 |         feed_dict = {
201 |             model.input_x1: x_batch1,
202 |             model.input_x2: x_batch2,
203 |             model.input_y: y_batch,
204 |             model.keep_prob: 1.0
205 |         }
206 |         step, summaries, loss, accuracy,predict = session.run(
207 |             [global_step, dev_summary_op, model.loss, model.acc,model.scores],
208 |             feed_dict)
209 |         pred_label = (predict > 0.5).astype(int)
210 |         print(np,sum(pred_label),np,sum(predict))
211 |         accuracy1 = accuracy_score(y_batch, pred_label)
212 |         recall = recall_score(y_batch, pred_label)
213 |         precision = precision_score(y_batch, pred_label)
214 |         time_str = datetime.datetime.now().isoformat()
215 |         print("dev {}: step {}, loss {:g}, acc {:g},acc1 {:g},recall {:g},precision {:g}".format(time_str, step, loss, accuracy,accuracy1,recall,precision))
216 |         if writer:
217 |             writer.add_summary(summaries, step)
218 |         return loss, accuracy,predict
219 | 
220 |     def batch_iter(x1, x2, y, batch_size):
221 |         idx = np.arange(len(y))
222 |         batches = [idx[range(batch_size * i, min(len(y), batch_size * (i + 1)))] for i in
223 |                    range(len(y) // batch_size + 1)]
224 |         for i in batches:
225 |             yield x1[i], x2[i], y[i]
226 | 
227 |     best_acc_val = 0
228 |     monitor_early_stop=0
229 |     for epoch in range(config.num_epochs):
230 |         print('Epoch:', epoch + 1)
231 |         total_batch = 0
232 |         for x_batch1, x_batch2, y_batch in batch_iter(x_train1, x_train2, y_train, config.batch_size):
233 |             train_fun(x_batch1, x_batch2, y_batch)
234 |             total_batch += 1
235 |         if epoch % config.print_per_batch == 0:
236 |             # 每多少轮次输出在训练集和验证集上的性能
237 |             loss_val, acc_val,predict = dev_fun(x_val1, x_val2, y_val, writer=dev_summary_writer)  # todo
238 | 
239 |             if acc_val > best_acc_val:
240 |                 # 保存最好结果
241 |                 best_acc_val = acc_val
242 |                 path = saver.save(sess=session, save_path=checkpoint_prefix)
243 |                 print("Saved model checkpoint to {}\n".format(path))
244 |                 monitor_early_stop=0
245 |             else:
246 |                 monitor_early_stop+=1
247 |                 print("do not save ")
248 |                 if(monitor_early_stop>=config.early_stop):
249 |                     break
250 |     loss_val, acc_val, predict = dev_fun(x_val1, x_val2, y_val)
251 |     return predict
252 | 
253 | 
254 | from keras.preprocessing.sequence import pad_sequences
255 | from keras.preprocessing.text import Tokenizer
256 | 
257 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
258 | tokenizer.fit_on_texts(list(question1) + list(question2))
259 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
260 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
261 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
262 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
263 | seed = 20180426
264 | cv_folds = 10
265 | y=np.reshape(y,[len(y),1])
266 | from sklearn.model_selection import StratifiedKFold
267 | 
268 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
269 | pred_oob = np.zeros(shape=(len(y), 1))
270 | # print(pred_oob.shape)
271 | count = 0
272 | for ind_tr, ind_te in skf.split(X_train_q1, y):
273 |     x_train_q1 = X_train_q1[ind_tr]
274 |     x_train_q2 = X_train_q2[ind_tr]
275 |     y_train = y[ind_tr]
276 | 
277 |     x_val_q1 = X_train_q1[ind_te]
278 |     x_val_q2 = X_train_q2[ind_te]
279 |     y_val = y[ind_te]
280 |     # mymodel = TextRNN()
281 |     predict=train(x_train1= x_train_q1, x_train2= x_train_q2,y_train=y_train,
282 |           x_val1= x_val_q1, x_val2= x_val_q2, y_val=y_val)
283 |     pred_oob[ind_te]=predict
284 |     # break
285 | pred_label = (pred_oob > 0.5).astype(int)
286 | recall = recall_score(y, pred_label)
287 | print("recal", recall)
288 | precision = precision_score(y, pred_label)
289 | print("precision", precision)
290 | accuracy = accuracy_score(y, pred_label)
291 | print("accuracy", accuracy)
292 | f1 = f1_score(y, pred_label)
293 | print("f1", f1)


--------------------------------------------------------------------------------
/tf_train.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from config import *
  3 | 
  4 | from tf_TextRNN import TextRNN
  5 | import datetime
  6 | df = pd.read_csv(input_file, encoding="utf-8")
  7 | 
  8 | question1 = df['question1'].values
  9 | question2 = df['question2'].values
 10 | y = df['label'].values
 11 | y=np.array(y,dtype=np.float32)
 12 | embedding_matrix1=np.load(embedding_matrix_path)
 13 | def train(x_train1, x_train2, y_train, x_val1, x_val2, y_val, model=TextRNN(embedding_matrix=embedding_matrix1), config=TRNNConfig()):
 14 |     print("Configuring TensorBoard and Saver...")
 15 |     # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
 16 |     out_dir = 'textrnn'
 17 |     if not os.path.exists(out_dir):
 18 |         os.makedirs(out_dir)
 19 |     # Define Training procedure
 20 |     global_step = tf.Variable(0, name="global_step", trainable=False)
 21 |     # optimizer = tf.train.GradientDescentOptimizer(5e-3)
 22 |     optimizer = tf.train.AdamOptimizer(1e-3)
 23 |     train_step_ = optimizer.minimize(model.loss)
 24 |     grads_and_vars = optimizer.compute_gradients(model.loss)
 25 |     train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
 26 | 
 27 |     # 创建session
 28 |     session = tf.Session()
 29 |     session.run(tf.global_variables_initializer())
 30 | 
 31 |     # Summaries for loss and accuracy
 32 |     loss_summary = tf.summary.scalar("loss", model.loss)
 33 |     acc_summary = tf.summary.scalar("accuracy", model.acc)
 34 |     # Keep track of gradient values and sparsity (optional)
 35 |     # grad_summaries = []
 36 |     # for g, v in grads_and_vars:
 37 |     #     if g is not None:
 38 |     #         grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
 39 |     #         sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 40 |     #         grad_summaries.append(grad_hist_summary)
 41 |     #         grad_summaries.append(sparsity_summary)
 42 |     # grad_summaries_merged = tf.summary.merge(grad_summaries)
 43 |     # Train Summaries
 44 |     train_summary_op = tf.summary.merge([loss_summary, acc_summary])
 45 |     train_summary_dir = os.path.join(out_dir, "summaries", "train")
 46 |     train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
 47 | 
 48 |     # Dev summaries
 49 |     dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
 50 |     dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
 51 |     dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)
 52 | 
 53 |     # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
 54 |     checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints123"))
 55 |     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
 56 |     if not os.path.exists(checkpoint_dir):
 57 |         os.makedirs(checkpoint_dir)
 58 |     saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.num_checkpoints)
 59 | 
 60 |     def train_fun(x_batch1, x_batch2, y_batch):
 61 |         """
 62 |         A single training step
 63 |         """
 64 |         feed_dict = {
 65 |             model.input_x1: x_batch1,
 66 |             model.input_x2: x_batch2,
 67 |             model.input_y: y_batch,
 68 |             model.keep_prob: config.dropout_keep_prob
 69 |         }
 70 |         _, step, summaries, loss, accuracy = session.run(
 71 |             [train_op, global_step, train_summary_op, model.loss, model.acc],
 72 |             feed_dict)
 73 |         time_str = datetime.datetime.now().isoformat()
 74 |         # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
 75 |         train_summary_writer.add_summary(summaries, step)
 76 | 
 77 |     def dev_fun(x_batch1,x_batch2, y_batch, writer=None):
 78 |         """
 79 |         Evaluates model on a dev set
 80 |         """
 81 |         feed_dict = {
 82 |             model.input_x1: x_batch1,
 83 |             model.input_x2: x_batch2,
 84 |             model.input_y: y_batch,
 85 |             model.keep_prob: 1.0
 86 |         }
 87 |         step, summaries, loss, accuracy,predict = session.run(
 88 |             [global_step, dev_summary_op, model.loss, model.acc,model.scores],
 89 |             feed_dict)
 90 |         pred_label = (predict > 0.5).astype(int)
 91 |         print(np,sum(pred_label),np,sum(predict))
 92 |         accuracy1 = accuracy_score(y_batch, pred_label)
 93 |         recall = recall_score(y_batch, pred_label)
 94 |         precision = precision_score(y_batch, pred_label)
 95 |         time_str = datetime.datetime.now().isoformat()
 96 |         print("dev {}: step {}, loss {:g}, acc {:g},acc1 {:g},recall {:g},precision {:g}".format(time_str, step, loss, accuracy,accuracy1,recall,precision))
 97 |         if writer:
 98 |             writer.add_summary(summaries, step)
 99 |         return loss, accuracy,predict
100 | 
101 |     def batch_iter(x1, x2, y, batch_size):
102 |         idx = np.arange(len(y))
103 |         batches = [idx[range(batch_size * i, min(len(y), batch_size * (i + 1)))] for i in
104 |                    range(len(y) // batch_size + 1)]
105 |         for i in batches:
106 |             yield x1[i], x2[i], y[i]
107 | 
108 |     best_acc_val = 0
109 |     monitor_early_stop=0
110 |     for epoch in range(config.num_epochs):
111 |         print('Epoch:', epoch + 1)
112 |         total_batch = 0
113 |         for x_batch1, x_batch2, y_batch in batch_iter(x_train1, x_train2, y_train, config.batch_size):
114 |             train_fun(x_batch1, x_batch2, y_batch)
115 |             total_batch += 1
116 |         if epoch % config.print_per_batch == 0:
117 |             # 每多少轮次输出在训练集和验证集上的性能
118 |             loss_val, acc_val,predict = dev_fun(x_val1, x_val2, y_val, writer=dev_summary_writer)  # todo
119 | 
120 |             if acc_val > best_acc_val:
121 |                 # 保存最好结果
122 |                 best_acc_val = acc_val
123 |                 path = saver.save(sess=session, save_path=checkpoint_prefix)
124 |                 print("Saved model checkpoint to {}\n".format(path))
125 |                 monitor_early_stop=0
126 |             else:
127 |                 monitor_early_stop+=1
128 |                 print("do not save ")
129 |                 if(monitor_early_stop>=config.early_stop):
130 |                     break
131 |     loss_val, acc_val, predict = dev_fun(x_val1, x_val2, y_val)
132 |     return predict
133 | 
134 | 
135 | from keras.preprocessing.sequence import pad_sequences
136 | from keras.preprocessing.text import Tokenizer
137 | 
138 | tokenizer = Tokenizer(num_words=MAX_FEATURES)
139 | tokenizer.fit_on_texts(list(question1) + list(question2))
140 | list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
141 | list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
142 | X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
143 | X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
144 | seed = 20180426
145 | cv_folds = 10
146 | y=np.reshape(y,[len(y),1])
147 | from sklearn.model_selection import StratifiedKFold
148 | 
149 | skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
150 | pred_oob = np.zeros(shape=(len(y), 1))
151 | # print(pred_oob.shape)
152 | count = 0
153 | for ind_tr, ind_te in skf.split(X_train_q1, y):
154 |     x_train_q1 = X_train_q1[ind_tr]
155 |     x_train_q2 = X_train_q2[ind_tr]
156 |     y_train = y[ind_tr]
157 | 
158 |     x_val_q1 = X_train_q1[ind_te]
159 |     x_val_q2 = X_train_q2[ind_te]
160 |     y_val = y[ind_te]
161 |     # mymodel = TextRNN()
162 |     predict=train(x_train1= x_train_q1, x_train2= x_train_q2,y_train=y_train,
163 |           x_val1= x_val_q1, x_val2= x_val_q2, y_val=y_val)
164 |     pred_oob[ind_te]=predict
165 |     break
166 | pred_label = (pred_oob > 0.5).astype(int)
167 | recall = recall_score(y, pred_label)
168 | print("recal", recall)
169 | precision = precision_score(y, pred_label)
170 | print("precision", precision)
171 | accuracy = accuracy_score(y, pred_label)
172 | print("accuracy", accuracy)
173 | f1 = f1_score(y, pred_label)
174 | print("f1", f1)


--------------------------------------------------------------------------------
/upload/keras_main1.py:
--------------------------------------------------------------------------------
  1 | # /usr/bin/env python
  2 | # coding=utf-8
  3 | input_file = "./train.txt"
  4 | embedding_matrix_path = './temp_no_truncate.npy'
  5 | kernel_name = "bilstm"
  6 | import numpy as np
  7 | import keras
  8 | import sys
  9 | from keras.callbacks import EarlyStopping, ModelCheckpoint
 10 | # from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
 11 | import jieba
 12 | import codecs
 13 | 
 14 | jieba.add_word('花呗')
 15 | jieba.add_word('借呗')
 16 | jieba.add_word('余额宝')
 17 | 
 18 | MAX_TEXT_LENGTH = 50
 19 | MAX_FEATURES = 10000
 20 | embedding_dims = 128
 21 | dr = 0.01
 22 | 
 23 | 
 24 | def pandas_process(input_train):
 25 |     q1 = []
 26 |     q2 = []
 27 |     vlabel = []
 28 |     df = {}
 29 |     fin = codecs.open(input_train, 'r', encoding='utf-8')
 30 |     fin.readline()
 31 |     for line in fin:
 32 |         l, sen1, sen2 = line.strip().split('\t')
 33 |         q1.append(sen1)
 34 |         q2.append(sen2)
 35 |         vlabel.append(int(l))
 36 |     fin.close()
 37 |     df["question1"] = q1
 38 |     df["question2"] = q2
 39 |     df["label"] = vlabel
 40 |     return df
 41 | 
 42 | 
 43 | def seg(text):
 44 |     seg_list = jieba.cut(text)
 45 |     return " ".join(seg_list)
 46 | 
 47 | 
 48 | def get_model():
 49 |     input1_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 50 |     input2_tensor = keras.layers.Input(shape=(MAX_TEXT_LENGTH,))
 51 |     words_embedding_layer = keras.layers.Embedding(MAX_FEATURES, embedding_dims,
 52 |                                                    # weights=[embedding_matrix],
 53 |                                                    input_length=MAX_TEXT_LENGTH,
 54 |                                                    trainable=True)
 55 |     seq_embedding_layer = keras.layers.Bidirectional(keras.layers.GRU(256, recurrent_dropout=dr))
 56 |     seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))
 57 |     merge_layer = keras.layers.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])
 58 |     merge_layer = keras.layers.Dropout(dr)(merge_layer)
 59 |     dense1_layer = keras.layers.Dense(64, activation='relu')(merge_layer)
 60 |     ouput_layer = keras.layers.Dense(1, activation='sigmoid')(dense1_layer)
 61 |     model = keras.models.Model([input1_tensor, input2_tensor], ouput_layer)
 62 |     # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
 63 |     # model.summary()
 64 |     return model
 65 | 
 66 | 
 67 | if __name__ == '__main__':
 68 |     inpath = sys.argv[1]
 69 |     outputpath = sys.argv[2]
 70 |     # import pandas as pd
 71 |     # input_file = "../input/process.csv"
 72 |     # df=pd.read_csv(input_file)
 73 |     # question1 = df['question1'].values
 74 |     # question2 = df['question2'].values
 75 |     # y = df['label'].values
 76 |     df = pandas_process(input_file)
 77 |     question1 = df['question1']
 78 |     question2 = df['question2']
 79 |     y = df['label']
 80 |     from keras.preprocessing.sequence import pad_sequences
 81 |     from keras.preprocessing.text import Tokenizer
 82 | 
 83 |     # np.savetxt('X_train_q1.out', X_train_q1, delimiter=',')
 84 |     # np.savetxt('X_train_q2.out', X_train_q2, delimiter=',')
 85 |     # inpath="test1.txt"
 86 |     test_data1 = []
 87 |     test_data2 = []
 88 |     linenos = []
 89 |     fin = codecs.open(inpath, 'r', encoding='utf-8')
 90 |     for line in fin:
 91 |         lineno, sen1, sen2 = line.strip().split('\t')
 92 |         sen1 = seg(sen1)
 93 |         sen2 = seg(sen2)
 94 |         test_data1.append(sen1)
 95 |         test_data2.append(sen2)
 96 |         linenos.append(lineno)
 97 |     fin.close()
 98 | 
 99 |     tokenizer = Tokenizer(num_words=MAX_FEATURES)
100 |     tokenizer.fit_on_texts(list(question1) + list(question2))
101 |     list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
102 |     list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
103 |     X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
104 |     X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)
105 |     list_tokenized_question11 = tokenizer.texts_to_sequences(test_data1)
106 |     list_tokenized_question22 = tokenizer.texts_to_sequences(test_data2)
107 |     x_val_q1 = pad_sequences(list_tokenized_question11, maxlen=MAX_TEXT_LENGTH)
108 |     x_val_q2 = pad_sequences(list_tokenized_question22, maxlen=MAX_TEXT_LENGTH)
109 | 
110 |     # for i in range(len(x_val_q1)):
111 |     #     t=np.array_equal(X_train_q1[i], x_val_q1[i])
112 |     #     if not t:
113 |     #         print X_train_q1[i]," | ",x_val_q1[i]
114 |     #         print i,question1[i]," | ",test_data1[i]
115 |     #     t=np.array_equal(X_train_q2[i], x_val_q2[i])
116 |     #     if not t:
117 |     #         print X_train_q2[i]," | ", x_val_q2[i]
118 |     #         print i,question2[i]," | ",test_data2[i]
119 | 
120 |     nb_words = min(MAX_FEATURES, len(tokenizer.word_index))
121 |     # print("nb_words", nb_words)
122 |     # embedding_matrix1 = np.load(embedding_matrix_path)
123 |     seed = 20180426
124 |     cv_folds = 10
125 |     # from sklearn.model_selection import StratifiedKFold
126 | 
127 |     # skf = StratifiedKFold(n_splits=cv_folds, random_state=seed, shuffle=False)
128 |     y = y[0:len(x_val_q1)]
129 |     # print x_val_q1.shape
130 |     pred_oob = np.zeros(shape=(len(x_val_q1), 1))
131 |     # print pred_oob.shape
132 |     count = 0
133 |     # print "start to predict."
134 |     model = get_model()
135 |     for index in range(cv_folds):
136 |         bst_model_path = kernel_name + '_weight_%d.h5' % count
137 |         model.load_weights(bst_model_path)
138 |         y_predict = model.predict([x_val_q1, x_val_q2], batch_size=1024, verbose=0)
139 |         pred_oob += y_predict
140 |         # print "*",
141 |         # break
142 |         # try:
143 |         #     y_predict = (y_predict > 0.5).astype(int)
144 |         #     recall = recall_score(y, y_predict)
145 |         #     print(count, "recall", recall)
146 |         #     precision = precision_score(y, y_predict)
147 |         #     print(count, "precision", precision)
148 |         #     accuracy = accuracy_score(y, y_predict)
149 |         #     print(count, "accuracy ", accuracy)
150 |         #     f1 = f1_score(y, y_predict)
151 |         #     print(count, "f1", f1)
152 |         #     count += 1
153 |         # except:
154 |         #     pass
155 |     # print "predict done.Saving output to %s"%outputpath
156 |     pred_oob /= cv_folds
157 |     pred_oob1 = (pred_oob > 0.5).astype(int)
158 |     fout = codecs.open(outputpath, 'w', encoding='utf-8')
159 |     for index, la in enumerate(pred_oob1):
160 |         lineno = linenos[index]
161 |         fout.write(lineno + '\t%d\n' % la)
162 |     # print "All is done."
163 |             # try:
164 |             #     recall = recall_score(y, pred_oob1)
165 |             #     print("recal", recall)
166 |             #     precision = precision_score(y, pred_oob1)
167 |             #     print("precision", precision)
168 |             #     accuracy = accuracy_score(y, pred_oob1)
169 |             #     print("accuracy", accuracy)
170 |             #     f1 = f1_score(y, pred_oob1)
171 |             #     print("f1", f1)
172 |             # except:
173 |             #     pass
174 | 


--------------------------------------------------------------------------------
/upload/run.sh:
--------------------------------------------------------------------------------
1 | #/bin/bash
2 | 
3 | python keras_main1.py $1 $2
4 | 


--------------------------------------------------------------------------------