├── README.md ├── data_augmentation_with_graph_features.py ├── data_helper.py ├── post_processing_with_graph_features.py ├── siamese_lstm_with_distance_and_angle.py └── feature_engineering.py /README.md: -------------------------------------------------------------------------------- 1 | # -第三届魔镜杯 智能客服问题相似性算法设计 第12名解决方案 2 | 3 | 我们是moka_tree团队,在本次比赛中,初赛第16名, 复赛第12名。 4 | 5 | 详细解决方案请查看简书: 6 | 智能客服问题相似度算法设计——第三届魔镜杯大赛第12名解决方案 7 | https://www.jianshu.com/p/827dd447daf9 8 | 9 | 比赛数据下载: 10 | 链接:https://pan.baidu.com/s/1DgV8-iu_T_PtaH3HgEGY6g 11 | 提取码:1cbr 12 | -------------------------------------------------------------------------------- /data_augmentation_with_graph_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 利用图特征来做数据增强:获取更多的训练数据 3 | 数据增强原则: 4 | (1)如果q1,q2相似,且q1,q2在同一个连通图,则连通图的问题都相似 5 | —— 利用connected components得到 6 | —— 组合后一共3796821个问题对,太多了 7 | —— 我们取 1 1 and matrix[i][j] <= max_distance \ 49 | and (cc[i], cc[j]) not in distance.keys() \ 50 | and (cc[j], cc[i]) not in distance.keys(): 51 | distance[(cc[i], cc[j])] = matrix[i][j] 52 | 53 | return distance, matrix, cc 54 | 55 | 56 | def gen_similar_data(train_graph, connected_components, max_cc_size, max_distance): 57 | ''' 58 | 对每个连通图,计算连通图中任意两点的距离 59 | 注意: 60 | 如果连通图节点只有2个,直接break 61 | 如果连通图计算的距离为1,不存储 62 | ''' 63 | 64 | distance = {} 65 | 66 | for cc in connected_components: 67 | if len(cc) > 2 and len(cc) <= max_cc_size: 68 | cc_distance, _, _ = all_pair_Dijkstra(train_graph, cc, max_distance) 69 | distance.update(cc_distance) 70 | else: 71 | continue 72 | 73 | return distance 74 | 75 | 76 | def gen_dissimilar_data(independent_groups, max_group_size): 77 | ''' 78 | 如果q1, q2不相似,且存在连通图cc1包含q1,和cc2包含q2,则cc1和cc2的任意组合均不相似 79 | max_group_size用来控制返回的问题对数量,设为46,对应100万左右的问题对 80 | ''' 81 | dissimilar_pairs = set() 82 | for ig in independent_groups: 83 | cc1 = ig[0] 84 | cc2 = ig[1] 85 | # 限制连通图大小,不然太多了 86 | if len(cc1) < max_group_size and len(cc2) < max_group_size: 87 | for q1 in cc1: 88 | for q2 in cc2: 89 | dissimilar_pairs.add((q1, q2)) 90 | return dissimilar_pairs 91 | 92 | 93 | 94 | def data_augmentation(train, similar_data, dissimilar_data): 95 | ''' 96 | 与train数据去重,生成平衡数据集 97 | similar_data: dict,{(q1,q2): d(q1,q2)} 98 | dissimilar_data: set, {(q1,q2)} 99 | ''' 100 | 101 | #问题对转化为set格式 102 | similar_pairs = set(similar_data.keys()) 103 | train_data1 = set([(train.loc[i,'q1'], train.loc[i,'q2']) for i in train.index]) 104 | train_data2 = set([(train.loc[i,'q2'], train.loc[i,'q1']) for i in train.index]) 105 | 106 | # 查看(q1,q2)组合是否与train数据重复,如重复,则去掉 107 | similar_pairs = similar_pairs - train_data1 108 | similar_pairs = list(similar_pairs - train_data2) 109 | dissimilar_pairs = dissimilar_data - train_data1 110 | dissimilar_pairs = list(dissimilar_pairs - train_data2) 111 | 112 | 113 | # 生成新的训练数据并导出 114 | new_data = [] 115 | new_data.extend(similar_pairs) 116 | new_data.extend(dissimilar_pairs) 117 | new_data = pd.DataFrame(np.array(new_data)) 118 | new_data.columns = ['q1','q2'] 119 | new_data['label'] = 0 120 | new_data.loc[0:len(similar_pairs)-1, 'label'] = 1 121 | 122 | return new_data -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import math 5 | 6 | 7 | def sentences_to_indices(X, word_to_index, max_len): 8 | """ 9 | Converts an array of sentences (strings) into an array of indices 
corresponding to words in the sentences. 10 | The output shape should be such that it can be given to `Embedding()` 11 | 12 | Arguments: 13 | X -- array of sentences (strings), of shape (m, 1) 14 | word_to_index -- a dictionary containing the each word mapped to its index 15 | max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 16 | 17 | Returns: 18 | X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len) 19 | """ 20 | m = X.shape[0] # number of training examples 21 | # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line) 22 | X_indices = np.zeros((m, max_len)) 23 | for i in range(m): # loop over training examples 24 | # split the sentences into words 25 | sentence_words =X[i].split(' ') 26 | # Loop over the words of sentence_words 27 | for j,w in enumerate(sentence_words): 28 | if j >= max_len: 29 | break 30 | # Set the (i,j)th entry of X_indices to the index of the correct word. 31 | X_indices[i, j] = word_to_index[w] 32 | 33 | return X_indices 34 | 35 | 36 | def load_dataset(max_seq_len, embed_dim, word_level=True): 37 | ''' 38 | 读取数据,对数据进行预处理,并生成embed_matrix 39 | ''' 40 | #1、读取数据,数据预处理 41 | #数据路径 42 | question_path = os.path.join('datasets', 'question.csv') 43 | train_path = os.path.join('datasets', 'train.csv') 44 | if word_level: 45 | embed_path = os.path.join('datasets', 'word_embed.txt') 46 | else: 47 | embed_path = os.path.join('datasets', 'char_embed.txt') 48 | 49 | #读取数据 50 | question = pd.read_csv(question_path) 51 | 52 | train = pd.read_csv(train_path) 53 | # 把train里面的问题id匹配到句子 54 | train = pd.merge(train,question,left_on=['q1'],right_on=['qid'],how='left') 55 | train = pd.merge(train,question,left_on=['q2'],right_on=['qid'],how='left') 56 | 57 | if word_level: 58 | train = train[['label','words_x','words_y']] 59 | else: 60 | train = train[['label','chars_x','chars_y']] 61 | train.columns = ['label','q1','q2'] 62 | 63 | # 读取word_to_vec_map,注意这里的index是word id 64 | word_to_vec_map = pd.read_csv(embed_path, sep=' ', header=None, index_col=0) 65 | 66 | # 先定义两个字典,实现wid与(positive) index的相互转换,注意index从1开始 67 | word = word_to_vec_map.index.values 68 | word_to_index = dict([(word[i],i+1) for i in range(len(word))]) 69 | index_to_word = dict([(i+1, word[i]) for i in range(len(word))]) 70 | 71 | # 把句子转换成int indices,并zero pad the sentance to max_seq_len 72 | train_q1_indices = sentences_to_indices(train.q1.values, word_to_index, max_seq_len) 73 | train_q2_indices = sentences_to_indices(train.q2.values, word_to_index, max_seq_len) 74 | label = train.label.values 75 | 76 | #3、生成embeding_matrix, index为整数,其中index=0,对应的是np.zeros(300),0向量,对应我们padding的值 77 | vocab_len = len(word_to_index) + 1 78 | # Initialize the embedding matrix as numpy arrays of zeros 79 | embed_matrix = np.zeros((vocab_len, embed_dim)) 80 | # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary 81 | for word, index in word_to_index.items(): 82 | embed_matrix[index, :] = word_to_vec_map.loc[word].values 83 | 84 | return (train_q1_indices,train_q2_indices, label, embed_matrix) 85 | 86 | 87 | 88 | def load_test_data(max_seq_len, word_level=True): 89 | ''' 90 | 读取测试数据 91 | ''' 92 | #1、读取数据,数据预处理 93 | #数据路径 94 | question_path = os.path.join('datasets', 'question.csv') 95 | test_path = os.path.join('datasets', 'test.csv') 96 | if word_level: 97 | embed_path = os.path.join('datasets', 'word_embed.txt') 98 | else: 99 | embed_path = 
os.path.join('datasets', 'char_embed.txt') 100 | 101 | #读取数据 102 | question = pd.read_csv(question_path) 103 | test = pd.read_csv(test_path) 104 | # 把train里面的问题id匹配到句子 105 | test = pd.merge(test,question,left_on=['q1'],right_on=['qid'],how='left') 106 | test = pd.merge(test,question,left_on=['q2'],right_on=['qid'],how='left') 107 | if word_level: 108 | test = test[['words_x','words_y']] 109 | else: 110 | test = test[['chars_x','chars_y']] 111 | test.columns = ['q1','q2'] 112 | # 读取word_to_vec_map,注意这里的index是word id 113 | word_to_vec_map = pd.read_csv(embed_path, sep=' ', header=None, index_col=0) 114 | 115 | # 先定义两个字典,实现wid与(positive) index的相互转换,注意index从1开始 116 | word = word_to_vec_map.index.values 117 | word_to_index = dict([(word[i],i+1) for i in range(len(word))]) 118 | index_to_word = dict([(i+1, word[i]) for i in range(len(word))]) 119 | 120 | # 把句子转换成int indices,并zero pad the sentance to max_seq_len 121 | test_q1_indices = sentences_to_indices(test.q1.values, word_to_index, max_seq_len).astype(np.int32) 122 | test_q2_indices = sentences_to_indices(test.q2.values, word_to_index, max_seq_len).astype(np.int32) 123 | 124 | 125 | return test_q1_indices,test_q2_indices 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /post_processing_with_graph_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 后处理 3 | ''' 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from collections import deque 8 | 9 | 10 | #char_embed = pd.read_csv('datasets/char_embed.txt', sep=' ', header=None, index_col=0) 11 | #word_embed = pd.read_csv('datasets/word_embed.txt', sep=' ', header=None, index_col=0) 12 | #question = pd.read_csv('datasets/question.csv',index_col=0) 13 | #train = pd.read_csv('datasets/train.csv') 14 | #test = pd.read_csv('datasets/test.csv') 15 | 16 | 17 | 18 | #求q1,q2的图上距离(最短路径) 19 | ''' 20 | step1 : 利用训练数据生成无向图,求各个连通分量 21 | step2 : 求q1,q2的距离d 22 | (1)如果q1,q2在一个连通图上:求q1,q2的距离d 23 | (2)如果q1,q2不在一个连通图上,令d(q1,q2) = 1000 24 | ''' 25 | # 生成无向图 26 | def gen_graph(train): 27 | """ 28 | 把输入数据转化为以字典表示的无向图 29 | """ 30 | data = train[train['label']==1][['q1','q2']] 31 | graph = {} 32 | for i in range(len(data)): 33 | if data.iloc[i,0] not in graph.keys(): 34 | graph[data.iloc[i,0]] = set([data.iloc[i,1]]) 35 | else: 36 | graph[data.iloc[i,0]].add(data.iloc[i,1]) 37 | 38 | if data.iloc[i,1] not in graph.keys(): 39 | graph[data.iloc[i,1]] = set([data.iloc[i,0]]) 40 | else: 41 | graph[data.iloc[i,1]].add(data.iloc[i,0]) 42 | 43 | return graph 44 | 45 | 46 | def bfs_visited(ugraph, start_node): 47 | """ 48 | 输入无向图ugraph和一个节点start_node 49 | 返回从这个节点出发,通过广度优先搜索访问的所有节点的集合 50 | """ 51 | # initialize Q to be an empty queue 52 | que = deque() 53 | # initialize visited 54 | visited = [start_node] 55 | # enqueue(que, start_node) 56 | que.append(start_node) 57 | while len(que) > 0: 58 | current_node = que.popleft() 59 | neighbours = ugraph[current_node] 60 | for nei in neighbours: 61 | if nei not in visited: 62 | visited.append(nei) 63 | que.append(nei) 64 | return set(visited) 65 | 66 | 67 | def cc_visited(ugraph): 68 | """ 69 | 输入无向图ugraph 70 | 返回一个list,list的元素是每个连通分量的节点构成的集合 71 | """ 72 | remaining_nodes = list(ugraph.keys()) 73 | connected_components = [] 74 | while len(remaining_nodes) > 0 : 75 | # choose the first element in remaining_nodes to be the start_node 76 | start_node = remaining_nodes[0] 77 | # use bfs_visited() to get the connected 
component containing start_node 78 | con_component = bfs_visited(ugraph, start_node) 79 | # update connected_components 80 | connected_components.append(con_component) 81 | # update remaining_nodes 82 | remaining_nodes = list(set(remaining_nodes) - con_component) 83 | return connected_components 84 | 85 | 86 | # 单源最短路径 87 | def Dijkstra(ugraph, connected_component, start_node): 88 | ''' 89 | 返回start_node到connected_component所有节点的最短距离 90 | ''' 91 | # 初始化 92 | minv = start_node 93 | visited = set() 94 | 95 | # 源顶点到其余各顶点的初始路程 96 | dist = dict([(node,np.float('inf')) for node in connected_component]) 97 | dist[minv] = 0 98 | 99 | # 遍历集合V中与A直接相邻的顶点,找出当前与A距离最短的顶点 100 | while len(visited) < len(connected_component): 101 | visited.add(minv) 102 | # 确定当期顶点的距离 103 | for v in ugraph[minv]: 104 | if dist[minv] + 1 < dist[v]: # 如果从当前点扩展到某一点的距离小与已知最短距离 105 | dist[v] = dist[minv] + 1 # 对已知距离进行更新 106 | 107 | # 从剩下的未确定点中选择最小距离点作为新的扩散点 108 | new = np.float('inf') 109 | for w in connected_component - visited: 110 | if dist[w] < new: 111 | new = dist[w] 112 | minv = w 113 | return dist 114 | 115 | 116 | ## 先生成图 117 | #print('Generating Graph...') 118 | #start = time.time() 119 | #train_graph = gen_graph(train) 120 | #end = time.time() 121 | #print('Graph generated. Time used {:0.1f} mins'.format((end-start)/60)) 122 | 123 | ## 寻找各连通分项(大概7分钟) 124 | #print('Searching Connected Components...') 125 | #start = time.time() 126 | #connected_components = cc_visited(train_graph) 127 | #end = time.time() 128 | #print('Search finished. Time used {:0.1f} mins'.format((end-start)/60)) 129 | 130 | def get_graph_distance(data, train_graph, connected_components, training_data=True): 131 | ''' 132 | 1. 如果q1,q2在一个连通图上:返回q1,q2的距离d 133 | 2. 如果q1,q2不在一个连通图上: 令d(q1, q2) = 1000 134 | ''' 135 | n = data.shape[0] 136 | 137 | # 初始化 138 | record_distance = {} #用来记录已经计算过的距离 139 | result_distance = [1000 for i in range(n)] 140 | 141 | for i in range(n): 142 | q1 = data.loc[i,'q1'] 143 | q2 = data.loc[i,'q2'] 144 | 145 | # 如果是训练数据的相似问题,则dist=1 146 | if training_data and data.loc[i,'label'] == 1: 147 | result_distance[i] = 1 148 | 149 | # 如果已经计算过,直接取出计算过的值 150 | elif (q1,q2) in record_distance.keys(): 151 | result_distance[i] = record_distance[(q1,q2)] 152 | 153 | elif (q2,q1) in record_distance.keys(): 154 | result_distance[i] = record_distance[(q2,q1)] 155 | 156 | else: 157 | # check whether q1,q2 are in one connected_componets 158 | for cc in connected_components: 159 | if (q1 in cc) and (q2 in cc): 160 | # 连通图cc,q1到其它节点的距离 161 | q1_dist = Dijkstra(train_graph, cc, q1) 162 | # 把计算过的距离保存起来 163 | new_dict = dict([((q1,node),q1_dist[node]) for node in q1_dist.keys()]) 164 | record_distance.update(new_dict) 165 | result_distance[i] = q1_dist[q2] 166 | break 167 | 168 | result_distance = pd.DataFrame(np.array(result_distance), index=data.index) 169 | result_distance.columns = ['graph_distance'] 170 | 171 | return result_distance 172 | 173 | ''' 174 | 通过训练数据得到问题之间的距离,进行统计发现: 175 | label = 1 : graph_distance = 1 176 | label = 0 : graph_distance = 1000(表示不连通) 177 | 说明:不相似的问题不可能在一个连通图里 178 | 推断:q1与q2不相似,则q1与q2的连通图G(q2)的所有顶点都不相似,q2与q1的连通图G(q1)的所有顶点都不相似 179 | 另有一个不太充分的结论: 相似问题具有传递性,而且可以传递很远。 180 | 181 | 算法:区分确定的不相似和不确定的不相似 182 | input: (q1, q2) , connected component 183 | return: graph_feature(gf for short) 184 | step1: 先利用训练数据中问题对的不相似,找出相互独立的连通子图对 185 | step2: 对于测试数据中的问题对(q1, q2),如果q1存在连通图cc(q1), q2存在连通图cc(q2),且cc(q1)与cc(q2)独立,则q1,q2不相似。 186 | ''' 187 | 188 | def get_independent_groups(train, train_graph_distance, connected_components): 189 | 190 
| # 找出不相似的问题对 191 | data = train[train.label == 0] 192 | 193 | independent_groups = [] 194 | 195 | for i in data.index: 196 | q1 = data.loc[i,'q1'] 197 | q2 = data.loc[i,'q2'] 198 | 199 | if train_graph_distance.loc[i, 'graph_distance'] == 1000: 200 | # 查看它们是否有连通图 201 | cc1 = set([]) 202 | cc2 = set([]) 203 | for cc in connected_components: 204 | if q1 in cc: 205 | cc1 = cc 206 | if q2 in cc: 207 | cc2 = cc 208 | if len(cc1) > 0 and len(cc2) > 0 and (cc1,cc2) not in independent_groups and (cc2,cc1) not in independent_groups: 209 | independent_groups.append((cc1,cc2)) 210 | 211 | return independent_groups 212 | 213 | 214 | 215 | def get_graph_features(test, test_graph_distance, independent_groups): 216 | 217 | n = test.shape[0] 218 | 219 | # 初始化, 0 表示从训练集的graph无法确定是否相似, 1表示确定相似,-1表示确定不相似 220 | graph_features = [0 for i in range(n)] 221 | 222 | for i in range(n): 223 | q1 = test.loc[i,'q1'] 224 | q2 = test.loc[i,'q2'] 225 | 226 | if test_graph_distance.loc[i,'graph_distance'] < 1000: 227 | graph_features[i] = 1 228 | else: 229 | # 看看q1和q2是否在independent group里面,如果在,则q1,q2确定不相似 230 | for ig in independent_groups: 231 | if (q1 in ig[0] and q2 in ig[1]) or (q1 in ig[1] and q2 in ig[0]): 232 | graph_features[i] = -1 233 | 234 | graph_features = pd.DataFrame(np.array(graph_features), index=test.index) 235 | graph_features.columns = ['graph_features'] 236 | 237 | return graph_features 238 | 239 | -------------------------------------------------------------------------------- /siamese_lstm_with_distance_and_angle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | np.random.seed(0) 4 | 5 | from keras.models import Model 6 | from keras.layers import Dense, Input, Dropout, LSTM, Activation, BatchNormalization,concatenate,Subtract, Dot, Multiply,Bidirectional,Lambda 7 | from keras.layers.embeddings import Embedding 8 | from keras.initializers import glorot_uniform 9 | from keras.layers.noise import GaussianNoise 10 | from keras import backend as K 11 | from keras import optimizers 12 | import tensorflow as tf 13 | 14 | import keras.callbacks as kcallbacks 15 | np.random.seed(1) 16 | 17 | from data_helper import * 18 | 19 | import warnings 20 | warnings.filterwarnings('ignore') 21 | 22 | # jupyter magic commands,自动重新加载更改的模块 23 | %load_ext autoreload 24 | %autoreload 2 25 | 26 | 27 | MAX_SEQUENCE_LENGTH = 15 # 20 for character level and 15 for word level 28 | EMBEDDING_DIM = 300 29 | 30 | # 读取数据 31 | train_q1, train_q2, train_label, embed_matrix = load_dataset(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, word_level=True) 32 | print('train_q1: ',train_q1.shape) 33 | print('train_q2: ', train_q2.shape) 34 | print('train_label: ',train_label.shape) 35 | print('embed_matrix: ',embed_matrix.shape) 36 | 37 | # 加载test 数据 38 | test_q1, test_q2 = load_test_data( MAX_SEQUENCE_LENGTH, word_level=True) 39 | print('test_q1: ',test_q1.shape) 40 | print('test_q2: ', test_q2.shape) 41 | 42 | 43 | # 读取手工特征 44 | train_features = pd.read_csv('features/0714_all_train_features_17.csv') 45 | test_features = pd.read_csv('features/0714_all_test_features_17.csv') 46 | 47 | train_moka_features = pd.read_csv('features/non_nlp_features_train.csv') 48 | test_moka_features = pd.read_csv('features/non_nlp_features_test.csv') 49 | 50 | train_features = pd.merge(train_features, train_moka_features, left_index=True, right_index=True) 51 | test_features = pd.merge(test_features, test_moka_features, left_index=True, right_index=True) 52 | 53 | pick_columns = 
['adjusted_common_word_ratio', 'edit_distance','len_diff', 'pword_dside_rate', 'pword_oside_rate', 54 | 'adjusted_common_char_ratio', 'pchar_dside_rate', 'pchar_oside_rate', 55 | 'coo_max_degree_(0, 5]','coo_max_degree_(5, 30]', 'coo_max_degree_(30, 130]', 56 | 'coo_q1_q2_degree_diff','common_neighbor_ratio'] 57 | 58 | train_features = train_features[pick_columns] 59 | test_features = test_features[pick_columns] 60 | 61 | train_features.info() 62 | 63 | 64 | # 读取数据分裂index 65 | split_index = {} 66 | for i in range(10): 67 | split_index[i]= pd.read_csv('features/0714_train_split_index/vali_idx_'+str(i)+'.csv').idx.values 68 | 69 | 70 | # define model, 10-fold cv 71 | 72 | best_vali_score ={} 73 | 74 | def trainLSTM(train_q1, train_q2, train_label, embed_matrix, test_q1, test_q2, train_features, test_features, split_index): 75 | 76 | lstm_num = 75 77 | lstm_drop = 0.5 78 | BATCH_SIZE = 256 # 128 79 | 80 | for model_count in range(10): 81 | 82 | print("MODEL:", model_count) 83 | 84 | # split data into train/vali set 85 | idx_val = split_index[model_count] 86 | idx_train = [] 87 | for i in range(10): 88 | if i != model_count: 89 | idx_train.extend(list(split_index[i])) 90 | 91 | q1_train = train_q1[idx_train] 92 | q2_train = train_q2[idx_train] 93 | y_train = train_label[idx_train] 94 | f_train = train_features[idx_train] 95 | 96 | q1_val = train_q1[idx_val] 97 | q2_val = train_q2[idx_val] 98 | y_val = train_label[idx_val] 99 | f_val = train_features[idx_val] 100 | 101 | # Define the model 102 | question1 = Input(shape=(MAX_SEQUENCE_LENGTH,)) 103 | question2 = Input(shape=(MAX_SEQUENCE_LENGTH,)) 104 | 105 | embed_layer = Embedding(embed_matrix.shape[0], EMBEDDING_DIM, weights=[embed_matrix], 106 | input_length=MAX_SEQUENCE_LENGTH, trainable=False) 107 | 108 | q1_embed = embed_layer(question1) 109 | q2_embed = embed_layer(question2) 110 | 111 | shared_lstm_1 = LSTM(lstm_num, return_sequences=True) 112 | shared_lstm_2 = LSTM(lstm_num) 113 | 114 | q1 = shared_lstm_1(q1_embed) 115 | q1 = Dropout(lstm_drop)(q1) 116 | q1 = BatchNormalization()(q1) 117 | q1 = shared_lstm_2(q1) 118 | # q1 = Dropout(0.5)(q1) 119 | 120 | q2 = shared_lstm_1(q2_embed) 121 | q2 = Dropout(lstm_drop)(q2) 122 | q2 = BatchNormalization()(q2) 123 | q2 = shared_lstm_2(q2) 124 | # q2 = Dropout(0.5)(q2) # of shape (batch_size, 128) 125 | 126 | # 求distance (batch_size,1) 127 | d = Subtract()([q1, q2]) 128 | #distance = Dot(axes=1, normalize=False)([d, d]) 129 | #distance = Lambda(lambda x: K.abs(x))(d) 130 | distance = Multiply()([d, d]) 131 | # 求angle (batch_size,1) 132 | # angle = Dot(axes=1, normalize=False)([q1, q2]) 133 | angle = Multiply()([q1, q2]) 134 | # merged = concatenate([distance,angle]) 135 | 136 | # magic featurues 137 | magic_input = Input(shape=(train_features.shape[1],)) 138 | magic_dense = BatchNormalization()(magic_input) 139 | magic_dense = Dense(64, activation='relu')(magic_dense) 140 | #magic_dense = Dropout(0.3)(magic_dense) 141 | 142 | merged = concatenate([distance,angle,magic_dense]) 143 | merged = Dropout(0.3)(merged) 144 | merged = BatchNormalization()(merged) 145 | 146 | merged = Dense(256, activation='relu')(merged) # 64 147 | merged = Dropout(0.3)(merged) 148 | merged = BatchNormalization()(merged) 149 | 150 | merged = Dense(64, activation='relu')(merged) # 64 151 | merged = Dropout(0.3)(merged) 152 | merged = BatchNormalization()(merged) 153 | 154 | is_duplicate = Dense(1, activation='sigmoid')(merged) 155 | 156 | model = Model(inputs=[question1, question2, magic_input], outputs=is_duplicate) 157 | 
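        # Editor's note (added descriptive comments, not original code): `distance` above is the
        # element-wise squared difference (q1 - q2)**2 and `angle` is the element-wise product
        # q1 * q2, each of shape (batch_size, lstm_num), so the dense layers receive per-dimension
        # distance/angle information rather than a single scalar. The commented-out Dot/Lambda
        # lines appear to be earlier scalar variants; assuming Keras 2.x, a scalar cosine angle
        # could instead be obtained with the built-in Dot layer, e.g.
        # Dot(axes=1, normalize=True)([q1, q2]), which L2-normalizes both vectors before the dot
        # product and yields shape (batch_size, 1).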
158 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 159 | 160 | model.summary() 161 | 162 | # define save model 163 | best_weights_filepath = 'models/0715 lstm keras/word_lstm_with_magics_' + str(model_count) + '.hdf5' 164 | earlyStopping = kcallbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto') 165 | saveBestModel = kcallbacks.ModelCheckpoint(best_weights_filepath, monitor='val_loss', verbose=1,\ 166 | save_best_only=True, mode='auto') 167 | 168 | hist = model.fit([q1_train, q2_train, f_train], 169 | y_train, 170 | validation_data=([q1_val, q2_val, f_val], y_val), 171 | epochs=30, 172 | batch_size=BATCH_SIZE, 173 | shuffle=True, 174 | callbacks=[earlyStopping, saveBestModel], 175 | verbose=1) 176 | 177 | model.load_weights(best_weights_filepath) 178 | print(model_count, "validation loss:", min(hist.history["val_loss"])) 179 | best_vali_score[model_count] = min(hist.history["val_loss"]) 180 | 181 | # predict on the val set 182 | preds = model.predict([q1_val, q2_val, f_val], batch_size=1024, verbose=1) 183 | val_preds = pd.DataFrame({"y_pre": preds.ravel()}) 184 | val_preds['val_index'] = idx_val 185 | save_path = 'features/0715_lstm_word_with_magic/vali_' + str(model_count) + '.csv' 186 | val_preds.to_csv(save_path, index=0) 187 | print(model_count, "val preds saved.") 188 | 189 | # predict on the test set 190 | preds1 = model.predict([test_q1, test_q2, test_features], batch_size=1024, verbose=1) 191 | test_preds = pd.DataFrame({"y_pre": preds1.ravel()}) 192 | save_path1 = 'features/0715_lstm_word_with_magic/test_' + str(model_count) + '.csv' 193 | test_preds.to_csv(save_path1, index=0) 194 | print(model_count, "test preds saved.") 195 | 196 | 197 | # run the model and predict 198 | import time 199 | start = time.time() 200 | 201 | trainLSTM(train_q1, train_q2, train_label, embed_matrix, test_q1, test_q2, \ 202 | train_features.values, test_features.values, split_index) 203 | 204 | end = time.time() 205 | print('Training time {0:.3f} 分钟'.format((end-start)/60)) -------------------------------------------------------------------------------- /feature_engineering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 特征构造 3 | ''' 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from collections import deque 8 | 9 | 10 | #char_embed = pd.read_csv('datasets/char_embed.txt', sep=' ', header=None, index_col=0) 11 | #word_embed = pd.read_csv('datasets/word_embed.txt', sep=' ', header=None, index_col=0) 12 | #question = pd.read_csv('datasets/question.csv',index_col=0) 13 | #train = pd.read_csv('datasets/train.csv') 14 | #test = pd.read_csv('datasets/test.csv') 15 | 16 | 17 | ''' 18 | 求q1,q2长度差异,normalize by the max len of question pairs 19 | ''' 20 | def get_len_diff(merge_data, word_level=True): 21 | if word_level: 22 | merge = merge_data[['words_x','words_y']] 23 | else: 24 | merge = merge_data[['chars_x','chars_y']] 25 | 26 | merge.columns = ['q1','q2'] 27 | 28 | q1_len = merge.q1.apply(lambda x: len(x.split(' '))).values 29 | q2_len = merge.q2.apply(lambda x: len(x.split(' '))).values 30 | 31 | len_diff = np.abs((q1_len - q2_len) / np.max([q1_len, q2_len],axis=0)) 32 | 33 | return len_diff 34 | 35 | 36 | ''' 37 | # 取q1,q2中相同词的个数 38 | ''' 39 | def get_num_common_words(question, data): 40 | # merge data 41 | merge = pd.merge(data,question,left_on=['q1'],right_on=['qid'],how='left') 42 | merge = 
pd.merge(merge,question,left_on=['q2'],right_on=['qid'],how='left') 43 | merge = merge[['words_x','words_y']] 44 | merge.columns = ['q1','q2'] 45 | 46 | q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values 47 | q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values 48 | 49 | result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))] 50 | result = pd.DataFrame(result, index=data.index) 51 | result.columns = ['num_common_words'] 52 | return result 53 | 54 | ''' 55 | 计算共现词比例 56 | ''' 57 | def get_common_word_ratio(merge_data, data, word_level=True): 58 | 59 | if word_level: 60 | merge = merge_data[['words_x','words_y']] 61 | else: 62 | merge = merge_data[['chars_x','chars_y']] 63 | merge.columns = ['q1','q2'] 64 | 65 | q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values 66 | q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values 67 | q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values 68 | q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values 69 | 70 | result = [len(q1_word_set[i] & q2_word_set[i])/max(q1_word_len[i],q2_word_len[i]) for i in range(len(q1_word_set))] 71 | result = pd.DataFrame(result, index=data.index) 72 | result.columns = ['common_word_ratio'] 73 | return result 74 | 75 | ''' 76 | 计算tf-idf向量 77 | ''' 78 | def get_tfidf_vector(question, merge_data, word_level=True): 79 | 80 | # use the question corpus to train tf-idf vec 81 | if word_level: 82 | vectorizer = TfidfVectorizer().fit(question.words.values) #max_features=1000 83 | merge = merge_data[['words_x','words_y']] 84 | else: 85 | vectorizer = TfidfVectorizer().fit(question.chars.values) 86 | merge = merge_data[['chars_x','chars_y']] 87 | merge.columns = ['q1','q2'] 88 | 89 | q1_tfidf = vectorizer.transform(merge.q1.values) 90 | q2_tfidf = vectorizer.transform(merge.q2.values) 91 | 92 | return vectorizer.vocabulary_,q1_tfidf, q2_tfidf 93 | 94 | ''' 95 | 用tfidf作为系数,调整共现词比例 96 | ''' 97 | def common_word_ratio_adjust_with_tfidf(merge_data, word_to_index, q1_tfidf, q2_tfidf, word_level=True): 98 | 99 | if word_level: 100 | merge = merge_data[['words_x','words_y']] 101 | merge.columns = ['q1','q2'] 102 | else: 103 | merge = merge_data[['chars_x','chars_y']] 104 | merge.columns = ['q1','q2'] 105 | 106 | adjusted_common_word_ratio = [] 107 | 108 | for i in range(q1_tfidf.shape[0]): 109 | q1words = {} 110 | q2words = {} 111 | for word in merge.loc[i,'q1'].lower().split(): 112 | q1words[word] = q1words.get(word, 0) + 1 113 | for word in merge.loc[i,'q2'].lower().split(): 114 | q2words[word] = q2words.get(word, 0) + 1 115 | 116 | sum_shared_word_in_q1 = sum([q1words[w] * q1_tfidf[i,word_to_index[w]] for w in q1words if w in q2words]) 117 | sum_shared_word_in_q2 = sum([q2words[w] * q2_tfidf[i,word_to_index[w]] for w in q2words if w in q1words]) 118 | sum_tol = sum(q1words[w] * q1_tfidf[i,word_to_index[w]] for w in q1words) + sum(q2words[w] * q2_tfidf[i,word_to_index[w]] for w in q2words) 119 | if 1e-6 > sum_tol: 120 | adjusted_common_word_ratio.append(0.) 121 | else: 122 | adjusted_common_word_ratio.append(1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol) 123 | 124 | return adjusted_common_word_ratio 125 | 126 | 127 | """ 128 | 计算数据中词语的影响力,格式如下: 129 | 词语 --> [0. 出现语句对数量,1. 出现语句对比例,2. 正确语句对比例,3. 单侧语句对比例,4. 单侧语句对正确比例,5. 双侧语句对比例,6. 
双侧语句对正确比例] 130 | """ 131 | def generate_powerful_word(merge_data, word_level=True): 132 | 133 | if word_level: 134 | train_subset_data = merge_data[['label','words_x','words_y']] 135 | else: 136 | train_subset_data = merge_data[['label','chars_x','chars_y']] 137 | 138 | train_subset_data.columns = ['label','q1','q2'] 139 | 140 | words_power = {} 141 | 142 | for i in train_subset_data.index: 143 | label = int(train_subset_data.loc[i,'label']) 144 | q1_words = train_subset_data.loc[i,'q1'].lower().split() 145 | q2_words = train_subset_data.loc[i,'q2'].lower().split() 146 | all_words = set(q1_words + q2_words) 147 | q1_words = set(q1_words) 148 | q2_words = set(q2_words) 149 | for word in all_words: 150 | if word not in words_power: 151 | words_power[word] = [0. for i in range(7)] 152 | # 计算出现语句对数量 153 | words_power[word][0] += 1. 154 | words_power[word][1] += 1. 155 | 156 | if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)): 157 | # 计算单侧语句数量 158 | words_power[word][3] += 1. 159 | if 0 == label: 160 | # 计算正确语句对数量 161 | words_power[word][2] += 1. 162 | # 计算单侧语句正确比例 163 | words_power[word][4] += 1. 164 | 165 | if (word in q1_words) and (word in q2_words): 166 | # 计算双侧语句数量 167 | words_power[word][5] += 1. 168 | if 1 == label: 169 | # 计算正确语句对数量 170 | words_power[word][2] += 1. 171 | # 计算双侧语句正确比例 172 | words_power[word][6] += 1. 173 | 174 | for word in words_power: 175 | # 计算出现语句对比例 176 | words_power[word][1] /= train_subset_data.shape[0] 177 | # 计算正确语句对比例 178 | words_power[word][2] /= words_power[word][0] 179 | # 计算单侧语句对正确比例 180 | if words_power[word][3] > 1e-6: 181 | words_power[word][4] /= words_power[word][3] 182 | # 计算单侧语句对比例 183 | words_power[word][3] /= words_power[word][0] 184 | # 计算双侧语句对正确比例 185 | if words_power[word][5] > 1e-6: 186 | words_power[word][6] /= words_power[word][5] 187 | # 计算双侧语句对比例 188 | words_power[word][5] /= words_power[word][0] 189 | 190 | sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True) 191 | 192 | return sorted_words_power 193 | 194 | 195 | ''' 196 | 若问题两侧存在有预测力的powerful words,则设置标签为1,否则为0 197 | ''' 198 | def powerful_words_dside_tag(pword, merge_data,thresh_num, thresh_rate, word_level=True): 199 | #筛选powerful words (有预测力的) 200 | pword_dside = [] 201 | pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword) #保证统计可靠性 202 | pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True) 203 | pword_dside.extend(map(lambda x: x[0], filter(lambda x: x[1][6] >= thresh_rate, pword_sort))) #保证抽取到真正powerful的word 204 | 205 | if word_level: 206 | merge = merge_data[['words_x','words_y']] 207 | else: 208 | merge = merge_data[['chars_x','chars_y']] 209 | 210 | merge.columns = ['q1','q2'] 211 | 212 | pword_dside_tags = [] 213 | 214 | for i in merge_data.index: 215 | tags = [] 216 | q1_words = set(merge.loc[i,'q1'].lower().split()) 217 | q2_words = set(merge.loc[i,'q2'].lower().split()) 218 | for word in pword_dside: 219 | if (word in q1_words) and (word in q2_words): 220 | tags.append(1.0) 221 | else: 222 | tags.append(0.0) 223 | 224 | pword_dside_tags.append(tags) 225 | 226 | return pword_dside, pword_dside_tags 227 | 228 | 229 | def powerful_words_oside_tag(pword, merge_data,thresh_num, thresh_rate, word_level=True): 230 | 231 | pword_oside = [] 232 | pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword) 233 | pword_oside.extend(map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate, pword))) 234 | 235 | if word_level: 236 | merge = merge_data[['words_x','words_y']] 
237 | else: 238 | merge = merge_data[['chars_x','chars_y']] 239 | 240 | merge.columns = ['q1','q2'] 241 | 242 | pword_oside_tags = [] 243 | 244 | for i in merge_data.index: 245 | tags = [] 246 | q1_words = set(merge.loc[i,'q1'].lower().split()) 247 | q2_words = set(merge.loc[i,'q2'].lower().split()) 248 | for word in pword_oside: 249 | if (word in q1_words) and (word not in q2_words): 250 | tags.append(1.0) 251 | elif (word not in q1_words) and (word in q2_words): 252 | tags.append(1.0) 253 | else: 254 | tags.append(0.0) 255 | 256 | pword_oside_tags.append(tags) 257 | 258 | return pword_oside, pword_oside_tags 259 | 260 | 261 | def powerful_word_dside_rate(sorted_words_power, pword_dside, merge_data, word_level=True): 262 | ''' 263 | 注意rate是指label=0的可能性,question pair中两侧powerful word越多,power越大,则rate越小 264 | ''' 265 | num_least = 300 266 | 267 | if word_level: 268 | merge = merge_data[['words_x','words_y']] 269 | else: 270 | merge = merge_data[['chars_x','chars_y']] 271 | 272 | merge.columns = ['q1','q2'] 273 | 274 | words_power = dict(sorted_words_power) #转化为字典格式 275 | 276 | pword_dside_rate = [] 277 | 278 | for i in merge.index: 279 | rate = 1.0 # 指labei=0的可能性,先初始化为1 280 | q1_words = set(merge.loc[i,'q1'].lower().split()) 281 | q2_words = set(merge.loc[i,'q2'].lower().split()) 282 | share_words = list(q1_words.intersection(q2_words)) 283 | for word in share_words: 284 | if word in pword_dside: 285 | rate *= (1.0 - words_power[word][6]) #uestion pair中两侧powerful word越多,power越大,则rate越小 286 | pword_dside_rate.append(1-rate) 287 | return pword_dside_rate 288 | 289 | def powerful_word_oside_rate(sorted_words_power, pword_oside, merge_data, word_level=True): 290 | ''' 291 | 注意rate是指label=1的可能性,question pair中单侧powerful word越多,power越大,则rate越小 292 | ''' 293 | num_least = 300 294 | 295 | if word_level: 296 | merge = merge_data[['words_x','words_y']] 297 | else: 298 | merge = merge_data[['chars_x','chars_y']] 299 | 300 | merge.columns = ['q1','q2'] 301 | words_power = dict(sorted_words_power) #转化为字典格式 302 | 303 | pword_oside_rate = [] 304 | 305 | for i in merge.index: 306 | rate = 1.0 # 指labei=1的可能性,先初始化为1 307 | q1_words = set(merge.loc[i,'q1'].lower().split()) 308 | q2_words = set(merge.loc[i,'q2'].lower().split()) 309 | q1_diff = list(set(q1_words).difference(set(q2_words))) 310 | q2_diff = list(set(q2_words).difference(set(q1_words))) 311 | all_diff = set(q1_diff + q2_diff) 312 | for word in all_diff: 313 | if word in pword_oside: 314 | rate *= (1.0 - words_power[word][4]) #question pair中单侧powerful word越多,power越大,则rate越小 315 | pword_oside_rate.append(1-rate) 316 | 317 | return pword_oside_rate 318 | 319 | 320 | 321 | ''' 322 | 扩展的编辑距离(Damerau-Levenshtein Distance) 323 | 扩展的编辑距离在思想上与编辑距离一样,只是除插入、删除和替换操作外,还支持 相邻字符的交换 这样一个操作,增加这个操作的考虑是人们在计算机上输入文档时的错误情况中,因为快速敲击而前后两个字符的顺序被输错的情况很常见。 324 | ''' 325 | def edit_distance(q1, q2): 326 | 327 | str1 = q1.split(' ') 328 | str2 = q2.split(' ') 329 | matrix = [[i+j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)] 330 | 331 | for i in range(1,len(str1)+1): 332 | for j in range(1,len(str2)+1): 333 | if str1[i-1] == str2[j-1]: 334 | d = 0 335 | else: 336 | d = 1 337 | matrix[i][j] = min(matrix[i-1][j]+1,matrix[i][j-1]+1,matrix[i-1][j-1]+d) 338 | 339 | if i > 1 and j > 1 and str1[i-1] == str2[j-2] and str1[i-2] == str2[j-1]: 340 | d = 0 # d=0表示允许交换,d =1表示不允许交换 341 | matrix[i][j] = min(matrix[i][j], matrix[i-2][j-2] + d) # allow transposition 342 | 343 | return matrix[len(str1)][len(str2)] 344 | 345 | def get_edit_distance(merge_data, word_level=True): 346 
| 347 | if word_level: 348 | merge = merge_data[['words_x','words_y']] 349 | merge.columns = ['q1','q2'] 350 | else: 351 | merge = merge_data[['chars_x','chars_y']] 352 | merge.columns = ['q1','q2'] 353 | 354 | q1_len = merge['q1'].apply(lambda x: len(x.split(' '))).values 355 | q2_len = merge['q2'].apply(lambda x: len(x.split(' '))).values 356 | 357 | # normalize the edit_distance by the max(len(q1),len(q2)) 358 | dist = [edit_distance(merge.loc[i,'q1'], merge.loc[i,'q2']) / np.max([q1_len, q2_len], axis=0)[i] for i in merge.index] 359 | 360 | return dist 361 | --------------------------------------------------------------------------------
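The edit-distance helpers in feature_engineering.py are easiest to see on a toy example. Below is a minimal usage sketch (an editor's addition, not a file from the repository): it assumes feature_engineering.py is importable and that datasets/question.csv and datasets/train.csv carry the qid/words/chars and q1/q2/label columns used throughout this repo, and it reproduces the same q1/q2-to-question joins as data_helper.py. Note that, as written, the transposition branch sets d = 0, so an adjacent word swap costs nothing; the standard Damerau-Levenshtein distance would charge 1 for it (i.e. matrix[i-2][j-2] + 1).

# usage_sketch.py -- editor's illustration only; paths and toy strings are assumptions.
import pandas as pd
from feature_engineering import edit_distance, get_edit_distance, get_len_diff

# Toy examples with whitespace-separated word ids, mirroring the words/chars columns.
print(edit_distance('W1 W2 W3', 'W1 W2 W4'))  # 1 -> one substitution
print(edit_distance('W1 W2 W3', 'W1 W3 W2'))  # 0 -> adjacent swap is free here because d is set
                                              #      to 0; standard Damerau-Levenshtein gives 1

# Build the merge_data frame the feature functions expect (same joins as data_helper.py),
# which yields words_x/words_y and chars_x/chars_y columns for each question pair.
question = pd.read_csv('datasets/question.csv')
train = pd.read_csv('datasets/train.csv')
merge_data = (train.merge(question, left_on='q1', right_on='qid', how='left')
                   .merge(question, left_on='q2', right_on='qid', how='left'))

# Assemble a couple of the length/edit-distance features into one frame.
features = pd.DataFrame({
    'edit_distance': get_edit_distance(merge_data, word_level=True),
    'len_diff': get_len_diff(merge_data, word_level=True),
})
print(features.head())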