├── README.md ├── data_augmentation_with_graph_features.py ├── data_helper.py ├── post_processing_with_graph_features.py ├── siamese_lstm_with_distance_and_angle.py └── feature_engineering.py /README.md: -------------------------------------------------------------------------------- 1 | # -第三届魔镜杯 智能客服问题相似性算法设计 第12名解决方案 2 | 3 | 我们是moka_tree团队,在本次比赛中,初赛第16名, 复赛第12名。 4 | 5 | 详细解决方案请查看简书: 6 | 智能客服问题相似度算法设计——第三届魔镜杯大赛第12名解决方案 7 | https://www.jianshu.com/p/827dd447daf9 8 | 9 | 比赛数据下载: 10 | 链接:https://pan.baidu.com/s/1DgV8-iu_T_PtaH3HgEGY6g 11 | 提取码:1cbr 12 | -------------------------------------------------------------------------------- /data_augmentation_with_graph_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 利用图特征来做数据增强:获取更多的训练数据 3 | 数据增强原则: 4 | (1)如果q1,q2相似,且q1,q2在同一个连通图,则连通图的问题都相似 5 | —— 利用connected components得到 6 | —— 组合后一共3796821个问题对,太多了 7 | —— 我们取 1 1 and matrix[i][j] <= max_distance \ 49 | and (cc[i], cc[j]) not in distance.keys() \ 50 | and (cc[j], cc[i]) not in distance.keys(): 51 | distance[(cc[i], cc[j])] = matrix[i][j] 52 | 53 | return distance, matrix, cc 54 | 55 | 56 | def gen_similar_data(train_graph, connected_components, max_cc_size, max_distance): 57 | ''' 58 | 对每个连通图,计算连通图中任意两点的距离 59 | 注意: 60 | 如果连通图节点只有2个,直接break 61 | 如果连通图计算的距离为1,不存储 62 | ''' 63 | 64 | distance = {} 65 | 66 | for cc in connected_components: 67 | if len(cc) > 2 and len(cc) <= max_cc_size: 68 | cc_distance, _, _ = all_pair_Dijkstra(train_graph, cc, max_distance) 69 | distance.update(cc_distance) 70 | else: 71 | continue 72 | 73 | return distance 74 | 75 | 76 | def gen_dissimilar_data(independent_groups, max_group_size): 77 | ''' 78 | 如果q1, q2不相似,且存在连通图cc1包含q1,和cc2包含q2,则cc1和cc2的任意组合均不相似 79 | max_group_size用来控制返回的问题对数量,设为46,对应100万左右的问题对 80 | ''' 81 | dissimilar_pairs = set() 82 | for ig in independent_groups: 83 | cc1 = ig[0] 84 | cc2 = ig[1] 85 | # 限制连通图大小,不然太多了 86 | if len(cc1) < max_group_size and len(cc2) < max_group_size: 87 | for q1 in cc1: 88 | for q2 in cc2: 89 | dissimilar_pairs.add((q1, q2)) 90 | return dissimilar_pairs 91 | 92 | 93 | 94 | def data_augmentation(train, similar_data, dissimilar_data): 95 | ''' 96 | 与train数据去重,生成平衡数据集 97 | similar_data: dict,{(q1,q2): d(q1,q2)} 98 | dissimilar_data: set, {(q1,q2)} 99 | ''' 100 | 101 | #问题对转化为set格式 102 | similar_pairs = set(similar_data.keys()) 103 | train_data1 = set([(train.loc[i,'q1'], train.loc[i,'q2']) for i in train.index]) 104 | train_data2 = set([(train.loc[i,'q2'], train.loc[i,'q1']) for i in train.index]) 105 | 106 | # 查看(q1,q2)组合是否与train数据重复,如重复,则去掉 107 | similar_pairs = similar_pairs - train_data1 108 | similar_pairs = list(similar_pairs - train_data2) 109 | dissimilar_pairs = dissimilar_data - train_data1 110 | dissimilar_pairs = list(dissimilar_pairs - train_data2) 111 | 112 | 113 | # 生成新的训练数据并导出 114 | new_data = [] 115 | new_data.extend(similar_pairs) 116 | new_data.extend(dissimilar_pairs) 117 | new_data = pd.DataFrame(np.array(new_data)) 118 | new_data.columns = ['q1','q2'] 119 | new_data['label'] = 0 120 | new_data.loc[0:len(similar_pairs)-1, 'label'] = 1 121 | 122 | return new_data -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import math 5 | 6 | 7 | def sentences_to_indices(X, word_to_index, max_len): 8 | """ 9 | Converts an array of sentences (strings) into an array of indices 
corresponding to words in the sentences. 10 | The output shape should be such that it can be given to `Embedding()` 11 | 12 | Arguments: 13 | X -- array of sentences (strings), of shape (m, 1) 14 | word_to_index -- a dictionary containing the each word mapped to its index 15 | max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 16 | 17 | Returns: 18 | X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len) 19 | """ 20 | m = X.shape[0] # number of training examples 21 | # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line) 22 | X_indices = np.zeros((m, max_len)) 23 | for i in range(m): # loop over training examples 24 | # split the sentences into words 25 | sentence_words =X[i].split(' ') 26 | # Loop over the words of sentence_words 27 | for j,w in enumerate(sentence_words): 28 | if j >= max_len: 29 | break 30 | # Set the (i,j)th entry of X_indices to the index of the correct word. 31 | X_indices[i, j] = word_to_index[w] 32 | 33 | return X_indices 34 | 35 | 36 | def load_dataset(max_seq_len, embed_dim, word_level=True): 37 | ''' 38 | 读取数据,对数据进行预处理,并生成embed_matrix 39 | ''' 40 | #1、读取数据,数据预处理 41 | #数据路径 42 | question_path = os.path.join('datasets', 'question.csv') 43 | train_path = os.path.join('datasets', 'train.csv') 44 | if word_level: 45 | embed_path = os.path.join('datasets', 'word_embed.txt') 46 | else: 47 | embed_path = os.path.join('datasets', 'char_embed.txt') 48 | 49 | #读取数据 50 | question = pd.read_csv(question_path) 51 | 52 | train = pd.read_csv(train_path) 53 | # 把train里面的问题id匹配到句子 54 | train = pd.merge(train,question,left_on=['q1'],right_on=['qid'],how='left') 55 | train = pd.merge(train,question,left_on=['q2'],right_on=['qid'],how='left') 56 | 57 | if word_level: 58 | train = train[['label','words_x','words_y']] 59 | else: 60 | train = train[['label','chars_x','chars_y']] 61 | train.columns = ['label','q1','q2'] 62 | 63 | # 读取word_to_vec_map,注意这里的index是word id 64 | word_to_vec_map = pd.read_csv(embed_path, sep=' ', header=None, index_col=0) 65 | 66 | # 先定义两个字典,实现wid与(positive) index的相互转换,注意index从1开始 67 | word = word_to_vec_map.index.values 68 | word_to_index = dict([(word[i],i+1) for i in range(len(word))]) 69 | index_to_word = dict([(i+1, word[i]) for i in range(len(word))]) 70 | 71 | # 把句子转换成int indices,并zero pad the sentance to max_seq_len 72 | train_q1_indices = sentences_to_indices(train.q1.values, word_to_index, max_seq_len) 73 | train_q2_indices = sentences_to_indices(train.q2.values, word_to_index, max_seq_len) 74 | label = train.label.values 75 | 76 | #3、生成embeding_matrix, index为整数,其中index=0,对应的是np.zeros(300),0向量,对应我们padding的值 77 | vocab_len = len(word_to_index) + 1 78 | # Initialize the embedding matrix as numpy arrays of zeros 79 | embed_matrix = np.zeros((vocab_len, embed_dim)) 80 | # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary 81 | for word, index in word_to_index.items(): 82 | embed_matrix[index, :] = word_to_vec_map.loc[word].values 83 | 84 | return (train_q1_indices,train_q2_indices, label, embed_matrix) 85 | 86 | 87 | 88 | def load_test_data(max_seq_len, word_level=True): 89 | ''' 90 | 读取测试数据 91 | ''' 92 | #1、读取数据,数据预处理 93 | #数据路径 94 | question_path = os.path.join('datasets', 'question.csv') 95 | test_path = os.path.join('datasets', 'test.csv') 96 | if word_level: 97 | embed_path = os.path.join('datasets', 'word_embed.txt') 98 | else: 99 | embed_path = 
os.path.join('datasets', 'char_embed.txt') 100 | 101 | #读取数据 102 | question = pd.read_csv(question_path) 103 | test = pd.read_csv(test_path) 104 | # 把train里面的问题id匹配到句子 105 | test = pd.merge(test,question,left_on=['q1'],right_on=['qid'],how='left') 106 | test = pd.merge(test,question,left_on=['q2'],right_on=['qid'],how='left') 107 | if word_level: 108 | test = test[['words_x','words_y']] 109 | else: 110 | test = test[['chars_x','chars_y']] 111 | test.columns = ['q1','q2'] 112 | # 读取word_to_vec_map,注意这里的index是word id 113 | word_to_vec_map = pd.read_csv(embed_path, sep=' ', header=None, index_col=0) 114 | 115 | # 先定义两个字典,实现wid与(positive) index的相互转换,注意index从1开始 116 | word = word_to_vec_map.index.values 117 | word_to_index = dict([(word[i],i+1) for i in range(len(word))]) 118 | index_to_word = dict([(i+1, word[i]) for i in range(len(word))]) 119 | 120 | # 把句子转换成int indices,并zero pad the sentance to max_seq_len 121 | test_q1_indices = sentences_to_indices(test.q1.values, word_to_index, max_seq_len).astype(np.int32) 122 | test_q2_indices = sentences_to_indices(test.q2.values, word_to_index, max_seq_len).astype(np.int32) 123 | 124 | 125 | return test_q1_indices,test_q2_indices 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /post_processing_with_graph_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 后处理 3 | ''' 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from collections import deque 8 | 9 | 10 | #char_embed = pd.read_csv('datasets/char_embed.txt', sep=' ', header=None, index_col=0) 11 | #word_embed = pd.read_csv('datasets/word_embed.txt', sep=' ', header=None, index_col=0) 12 | #question = pd.read_csv('datasets/question.csv',index_col=0) 13 | #train = pd.read_csv('datasets/train.csv') 14 | #test = pd.read_csv('datasets/test.csv') 15 | 16 | 17 | 18 | #求q1,q2的图上距离(最短路径) 19 | ''' 20 | step1 : 利用训练数据生成无向图,求各个连通分量 21 | step2 : 求q1,q2的距离d 22 | (1)如果q1,q2在一个连通图上:求q1,q2的距离d 23 | (2)如果q1,q2不在一个连通图上,令d(q1,q2) = 1000 24 | ''' 25 | # 生成无向图 26 | def gen_graph(train): 27 | """ 28 | 把输入数据转化为以字典表示的无向图 29 | """ 30 | data = train[train['label']==1][['q1','q2']] 31 | graph = {} 32 | for i in range(len(data)): 33 | if data.iloc[i,0] not in graph.keys(): 34 | graph[data.iloc[i,0]] = set([data.iloc[i,1]]) 35 | else: 36 | graph[data.iloc[i,0]].add(data.iloc[i,1]) 37 | 38 | if data.iloc[i,1] not in graph.keys(): 39 | graph[data.iloc[i,1]] = set([data.iloc[i,0]]) 40 | else: 41 | graph[data.iloc[i,1]].add(data.iloc[i,0]) 42 | 43 | return graph 44 | 45 | 46 | def bfs_visited(ugraph, start_node): 47 | """ 48 | 输入无向图ugraph和一个节点start_node 49 | 返回从这个节点出发,通过广度优先搜索访问的所有节点的集合 50 | """ 51 | # initialize Q to be an empty queue 52 | que = deque() 53 | # initialize visited 54 | visited = [start_node] 55 | # enqueue(que, start_node) 56 | que.append(start_node) 57 | while len(que) > 0: 58 | current_node = que.popleft() 59 | neighbours = ugraph[current_node] 60 | for nei in neighbours: 61 | if nei not in visited: 62 | visited.append(nei) 63 | que.append(nei) 64 | return set(visited) 65 | 66 | 67 | def cc_visited(ugraph): 68 | """ 69 | 输入无向图ugraph 70 | 返回一个list,list的元素是每个连通分量的节点构成的集合 71 | """ 72 | remaining_nodes = list(ugraph.keys()) 73 | connected_components = [] 74 | while len(remaining_nodes) > 0 : 75 | # choose the first element in remaining_nodes to be the start_node 76 | start_node = remaining_nodes[0] 77 | # use bfs_visited() to get the connected 
component containing start_node 78 | con_component = bfs_visited(ugraph, start_node) 79 | # update connected_components 80 | connected_components.append(con_component) 81 | # update remaining_nodes 82 | remaining_nodes = list(set(remaining_nodes) - con_component) 83 | return connected_components 84 | 85 | 86 | # 单源最短路径 87 | def Dijkstra(ugraph, connected_component, start_node): 88 | ''' 89 | 返回start_node到connected_component所有节点的最短距离 90 | ''' 91 | # 初始化 92 | minv = start_node 93 | visited = set() 94 | 95 | # 源顶点到其余各顶点的初始路程 96 | dist = dict([(node,np.float('inf')) for node in connected_component]) 97 | dist[minv] = 0 98 | 99 | # 遍历集合V中与A直接相邻的顶点,找出当前与A距离最短的顶点 100 | while len(visited) < len(connected_component): 101 | visited.add(minv) 102 | # 确定当期顶点的距离 103 | for v in ugraph[minv]: 104 | if dist[minv] + 1 < dist[v]: # 如果从当前点扩展到某一点的距离小与已知最短距离 105 | dist[v] = dist[minv] + 1 # 对已知距离进行更新 106 | 107 | # 从剩下的未确定点中选择最小距离点作为新的扩散点 108 | new = np.float('inf') 109 | for w in connected_component - visited: 110 | if dist[w] < new: 111 | new = dist[w] 112 | minv = w 113 | return dist 114 | 115 | 116 | ## 先生成图 117 | #print('Generating Graph...') 118 | #start = time.time() 119 | #train_graph = gen_graph(train) 120 | #end = time.time() 121 | #print('Graph generated. Time used {:0.1f} mins'.format((end-start)/60)) 122 | 123 | ## 寻找各连通分项(大概7分钟) 124 | #print('Searching Connected Components...') 125 | #start = time.time() 126 | #connected_components = cc_visited(train_graph) 127 | #end = time.time() 128 | #print('Search finished. Time used {:0.1f} mins'.format((end-start)/60)) 129 | 130 | def get_graph_distance(data, train_graph, connected_components, training_data=True): 131 | ''' 132 | 1. 如果q1,q2在一个连通图上:返回q1,q2的距离d 133 | 2. 如果q1,q2不在一个连通图上: 令d(q1, q2) = 1000 134 | ''' 135 | n = data.shape[0] 136 | 137 | # 初始化 138 | record_distance = {} #用来记录已经计算过的距离 139 | result_distance = [1000 for i in range(n)] 140 | 141 | for i in range(n): 142 | q1 = data.loc[i,'q1'] 143 | q2 = data.loc[i,'q2'] 144 | 145 | # 如果是训练数据的相似问题,则dist=1 146 | if training_data and data.loc[i,'label'] == 1: 147 | result_distance[i] = 1 148 | 149 | # 如果已经计算过,直接取出计算过的值 150 | elif (q1,q2) in record_distance.keys(): 151 | result_distance[i] = record_distance[(q1,q2)] 152 | 153 | elif (q2,q1) in record_distance.keys(): 154 | result_distance[i] = record_distance[(q2,q1)] 155 | 156 | else: 157 | # check whether q1,q2 are in one connected_componets 158 | for cc in connected_components: 159 | if (q1 in cc) and (q2 in cc): 160 | # 连通图cc,q1到其它节点的距离 161 | q1_dist = Dijkstra(train_graph, cc, q1) 162 | # 把计算过的距离保存起来 163 | new_dict = dict([((q1,node),q1_dist[node]) for node in q1_dist.keys()]) 164 | record_distance.update(new_dict) 165 | result_distance[i] = q1_dist[q2] 166 | break 167 | 168 | result_distance = pd.DataFrame(np.array(result_distance), index=data.index) 169 | result_distance.columns = ['graph_distance'] 170 | 171 | return result_distance 172 | 173 | ''' 174 | 通过训练数据得到问题之间的距离,进行统计发现: 175 | label = 1 : graph_distance = 1 176 | label = 0 : graph_distance = 1000(表示不连通) 177 | 说明:不相似的问题不可能在一个连通图里 178 | 推断:q1与q2不相似,则q1与q2的连通图G(q2)的所有顶点都不相似,q2与q1的连通图G(q1)的所有顶点都不相似 179 | 另有一个不太充分的结论: 相似问题具有传递性,而且可以传递很远。 180 | 181 | 算法:区分确定的不相似和不确定的不相似 182 | input: (q1, q2) , connected component 183 | return: graph_feature(gf for short) 184 | step1: 先利用训练数据中问题对的不相似,找出相互独立的连通子图对 185 | step2: 对于测试数据中的问题对(q1, q2),如果q1存在连通图cc(q1), q2存在连通图cc(q2),且cc(q1)与cc(q2)独立,则q1,q2不相似。 186 | ''' 187 | 188 | def get_independent_groups(train, train_graph_distance, connected_components): 189 | 190 
| # 找出不相似的问题对 191 | data = train[train.label == 0] 192 | 193 | independent_groups = [] 194 | 195 | for i in data.index: 196 | q1 = data.loc[i,'q1'] 197 | q2 = data.loc[i,'q2'] 198 | 199 | if train_graph_distance.loc[i, 'graph_distance'] == 1000: 200 | # 查看它们是否有连通图 201 | cc1 = set([]) 202 | cc2 = set([]) 203 | for cc in connected_components: 204 | if q1 in cc: 205 | cc1 = cc 206 | if q2 in cc: 207 | cc2 = cc 208 | if len(cc1) > 0 and len(cc2) > 0 and (cc1,cc2) not in independent_groups and (cc2,cc1) not in independent_groups: 209 | independent_groups.append((cc1,cc2)) 210 | 211 | return independent_groups 212 | 213 | 214 | 215 | def get_graph_features(test, test_graph_distance, independent_groups): 216 | 217 | n = test.shape[0] 218 | 219 | # 初始化, 0 表示从训练集的graph无法确定是否相似, 1表示确定相似,-1表示确定不相似 220 | graph_features = [0 for i in range(n)] 221 | 222 | for i in range(n): 223 | q1 = test.loc[i,'q1'] 224 | q2 = test.loc[i,'q2'] 225 | 226 | if test_graph_distance.loc[i,'graph_distance'] < 1000: 227 | graph_features[i] = 1 228 | else: 229 | # 看看q1和q2是否在independent group里面,如果在,则q1,q2确定不相似 230 | for ig in independent_groups: 231 | if (q1 in ig[0] and q2 in ig[1]) or (q1 in ig[1] and q2 in ig[0]): 232 | graph_features[i] = -1 233 | 234 | graph_features = pd.DataFrame(np.array(graph_features), index=test.index) 235 | graph_features.columns = ['graph_features'] 236 | 237 | return graph_features 238 | 239 | -------------------------------------------------------------------------------- /siamese_lstm_with_distance_and_angle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | np.random.seed(0) 4 | 5 | from keras.models import Model 6 | from keras.layers import Dense, Input, Dropout, LSTM, Activation, BatchNormalization,concatenate,Subtract, Dot, Multiply,Bidirectional,Lambda 7 | from keras.layers.embeddings import Embedding 8 | from keras.initializers import glorot_uniform 9 | from keras.layers.noise import GaussianNoise 10 | from keras import backend as K 11 | from keras import optimizers 12 | import tensorflow as tf 13 | 14 | import keras.callbacks as kcallbacks 15 | np.random.seed(1) 16 | 17 | from data_helper import * 18 | 19 | import warnings 20 | warnings.filterwarnings('ignore') 21 | 22 | # jupyter magic commands,自动重新加载更改的模块 23 | %load_ext autoreload 24 | %autoreload 2 25 | 26 | 27 | MAX_SEQUENCE_LENGTH = 15 # 20 for character level and 15 for word level 28 | EMBEDDING_DIM = 300 29 | 30 | # 读取数据 31 | train_q1, train_q2, train_label, embed_matrix = load_dataset(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, word_level=True) 32 | print('train_q1: ',train_q1.shape) 33 | print('train_q2: ', train_q2.shape) 34 | print('train_label: ',train_label.shape) 35 | print('embed_matrix: ',embed_matrix.shape) 36 | 37 | # 加载test 数据 38 | test_q1, test_q2 = load_test_data( MAX_SEQUENCE_LENGTH, word_level=True) 39 | print('test_q1: ',test_q1.shape) 40 | print('test_q2: ', test_q2.shape) 41 | 42 | 43 | # 读取手工特征 44 | train_features = pd.read_csv('features/0714_all_train_features_17.csv') 45 | test_features = pd.read_csv('features/0714_all_test_features_17.csv') 46 | 47 | train_moka_features = pd.read_csv('features/non_nlp_features_train.csv') 48 | test_moka_features = pd.read_csv('features/non_nlp_features_test.csv') 49 | 50 | train_features = pd.merge(train_features, train_moka_features, left_index=True, right_index=True) 51 | test_features = pd.merge(test_features, test_moka_features, left_index=True, right_index=True) 52 | 53 | pick_columns = 
['adjusted_common_word_ratio', 'edit_distance','len_diff', 'pword_dside_rate', 'pword_oside_rate', 54 | 'adjusted_common_char_ratio', 'pchar_dside_rate', 'pchar_oside_rate', 55 | 'coo_max_degree_(0, 5]','coo_max_degree_(5, 30]', 'coo_max_degree_(30, 130]', 56 | 'coo_q1_q2_degree_diff','common_neighbor_ratio'] 57 | 58 | train_features = train_features[pick_columns] 59 | test_features = test_features[pick_columns] 60 | 61 | train_features.info() 62 | 63 | 64 | # 读取数据分裂index 65 | split_index = {} 66 | for i in range(10): 67 | split_index[i]= pd.read_csv('features/0714_train_split_index/vali_idx_'+str(i)+'.csv').idx.values 68 | 69 | 70 | # define model, 10-fold cv 71 | 72 | best_vali_score ={} 73 | 74 | def trainLSTM(train_q1, train_q2, train_label, embed_matrix, test_q1, test_q2, train_features, test_features, split_index): 75 | 76 | lstm_num = 75 77 | lstm_drop = 0.5 78 | BATCH_SIZE = 256 # 128 79 | 80 | for model_count in range(10): 81 | 82 | print("MODEL:", model_count) 83 | 84 | # split data into train/vali set 85 | idx_val = split_index[model_count] 86 | idx_train = [] 87 | for i in range(10): 88 | if i != model_count: 89 | idx_train.extend(list(split_index[i])) 90 | 91 | q1_train = train_q1[idx_train] 92 | q2_train = train_q2[idx_train] 93 | y_train = train_label[idx_train] 94 | f_train = train_features[idx_train] 95 | 96 | q1_val = train_q1[idx_val] 97 | q2_val = train_q2[idx_val] 98 | y_val = train_label[idx_val] 99 | f_val = train_features[idx_val] 100 | 101 | # Define the model 102 | question1 = Input(shape=(MAX_SEQUENCE_LENGTH,)) 103 | question2 = Input(shape=(MAX_SEQUENCE_LENGTH,)) 104 | 105 | embed_layer = Embedding(embed_matrix.shape[0], EMBEDDING_DIM, weights=[embed_matrix], 106 | input_length=MAX_SEQUENCE_LENGTH, trainable=False) 107 | 108 | q1_embed = embed_layer(question1) 109 | q2_embed = embed_layer(question2) 110 | 111 | shared_lstm_1 = LSTM(lstm_num, return_sequences=True) 112 | shared_lstm_2 = LSTM(lstm_num) 113 | 114 | q1 = shared_lstm_1(q1_embed) 115 | q1 = Dropout(lstm_drop)(q1) 116 | q1 = BatchNormalization()(q1) 117 | q1 = shared_lstm_2(q1) 118 | # q1 = Dropout(0.5)(q1) 119 | 120 | q2 = shared_lstm_1(q2_embed) 121 | q2 = Dropout(lstm_drop)(q2) 122 | q2 = BatchNormalization()(q2) 123 | q2 = shared_lstm_2(q2) 124 | # q2 = Dropout(0.5)(q2) # of shape (batch_size, 128) 125 | 126 | # 求distance (batch_size,1) 127 | d = Subtract()([q1, q2]) 128 | #distance = Dot(axes=1, normalize=False)([d, d]) 129 | #distance = Lambda(lambda x: K.abs(x))(d) 130 | distance = Multiply()([d, d]) 131 | # 求angle (batch_size,1) 132 | # angle = Dot(axes=1, normalize=False)([q1, q2]) 133 | angle = Multiply()([q1, q2]) 134 | # merged = concatenate([distance,angle]) 135 | 136 | # magic featurues 137 | magic_input = Input(shape=(train_features.shape[1],)) 138 | magic_dense = BatchNormalization()(magic_input) 139 | magic_dense = Dense(64, activation='relu')(magic_dense) 140 | #magic_dense = Dropout(0.3)(magic_dense) 141 | 142 | merged = concatenate([distance,angle,magic_dense]) 143 | merged = Dropout(0.3)(merged) 144 | merged = BatchNormalization()(merged) 145 | 146 | merged = Dense(256, activation='relu')(merged) # 64 147 | merged = Dropout(0.3)(merged) 148 | merged = BatchNormalization()(merged) 149 | 150 | merged = Dense(64, activation='relu')(merged) # 64 151 | merged = Dropout(0.3)(merged) 152 | merged = BatchNormalization()(merged) 153 | 154 | is_duplicate = Dense(1, activation='sigmoid')(merged) 155 | 156 | model = Model(inputs=[question1, question2, magic_input], outputs=is_duplicate) 157 | 
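        # Editor's note (added descriptive comments, not original code): `distance` above is the
        # element-wise squared difference (q1 - q2)**2 and `angle` is the element-wise product
        # q1 * q2, each of shape (batch_size, lstm_num), so the dense layers receive per-dimension
        # distance/angle information rather than a single scalar. The commented-out Dot/Lambda
        # lines appear to be earlier scalar variants; assuming Keras 2.x, a scalar cosine angle
        # could instead be obtained with the built-in Dot layer, e.g.
        # Dot(axes=1, normalize=True)([q1, q2]), which L2-normalizes both vectors before the dot
        # product and yields shape (batch_size, 1).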
158 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 159 | 160 | model.summary() 161 | 162 | # define save model 163 | best_weights_filepath = 'models/0715 lstm keras/word_lstm_with_magics_' + str(model_count) + '.hdf5' 164 | earlyStopping = kcallbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto') 165 | saveBestModel = kcallbacks.ModelCheckpoint(best_weights_filepath, monitor='val_loss', verbose=1,\ 166 | save_best_only=True, mode='auto') 167 | 168 | hist = model.fit([q1_train, q2_train, f_train], 169 | y_train, 170 | validation_data=([q1_val, q2_val, f_val], y_val), 171 | epochs=30, 172 | batch_size=BATCH_SIZE, 173 | shuffle=True, 174 | callbacks=[earlyStopping, saveBestModel], 175 | verbose=1) 176 | 177 | model.load_weights(best_weights_filepath) 178 | print(model_count, "validation loss:", min(hist.history["val_loss"])) 179 | best_vali_score[model_count] = min(hist.history["val_loss"]) 180 | 181 | # predict on the val set 182 | preds = model.predict([q1_val, q2_val, f_val], batch_size=1024, verbose=1) 183 | val_preds = pd.DataFrame({"y_pre": preds.ravel()}) 184 | val_preds['val_index'] = idx_val 185 | save_path = 'features/0715_lstm_word_with_magic/vali_' + str(model_count) + '.csv' 186 | val_preds.to_csv(save_path, index=0) 187 | print(model_count, "val preds saved.") 188 | 189 | # predict on the test set 190 | preds1 = model.predict([test_q1, test_q2, test_features], batch_size=1024, verbose=1) 191 | test_preds = pd.DataFrame({"y_pre": preds1.ravel()}) 192 | save_path1 = 'features/0715_lstm_word_with_magic/test_' + str(model_count) + '.csv' 193 | test_preds.to_csv(save_path1, index=0) 194 | print(model_count, "test preds saved.") 195 | 196 | 197 | # run the model and predict 198 | import time 199 | start = time.time() 200 | 201 | trainLSTM(train_q1, train_q2, train_label, embed_matrix, test_q1, test_q2, \ 202 | train_features.values, test_features.values, split_index) 203 | 204 | end = time.time() 205 | print('Training time {0:.3f} 分钟'.format((end-start)/60)) -------------------------------------------------------------------------------- /feature_engineering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 特征构造 3 | ''' 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from collections import deque 8 | 9 | 10 | #char_embed = pd.read_csv('datasets/char_embed.txt', sep=' ', header=None, index_col=0) 11 | #word_embed = pd.read_csv('datasets/word_embed.txt', sep=' ', header=None, index_col=0) 12 | #question = pd.read_csv('datasets/question.csv',index_col=0) 13 | #train = pd.read_csv('datasets/train.csv') 14 | #test = pd.read_csv('datasets/test.csv') 15 | 16 | 17 | ''' 18 | 求q1,q2长度差异,normalize by the max len of question pairs 19 | ''' 20 | def get_len_diff(merge_data, word_level=True): 21 | if word_level: 22 | merge = merge_data[['words_x','words_y']] 23 | else: 24 | merge = merge_data[['chars_x','chars_y']] 25 | 26 | merge.columns = ['q1','q2'] 27 | 28 | q1_len = merge.q1.apply(lambda x: len(x.split(' '))).values 29 | q2_len = merge.q2.apply(lambda x: len(x.split(' '))).values 30 | 31 | len_diff = np.abs((q1_len - q2_len) / np.max([q1_len, q2_len],axis=0)) 32 | 33 | return len_diff 34 | 35 | 36 | ''' 37 | # 取q1,q2中相同词的个数 38 | ''' 39 | def get_num_common_words(question, data): 40 | # merge data 41 | merge = pd.merge(data,question,left_on=['q1'],right_on=['qid'],how='left') 42 | merge = 
pd.merge(merge,question,left_on=['q2'],right_on=['qid'],how='left') 43 | merge = merge[['words_x','words_y']] 44 | merge.columns = ['q1','q2'] 45 | 46 | q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values 47 | q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values 48 | 49 | result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))] 50 | result = pd.DataFrame(result, index=data.index) 51 | result.columns = ['num_common_words'] 52 | return result 53 | 54 | ''' 55 | 计算共现词比例 56 | ''' 57 | def get_common_word_ratio(merge_data, data, word_level=True): 58 | 59 | if word_level: 60 | merge = merge_data[['words_x','words_y']] 61 | else: 62 | merge = merge_data[['chars_x','chars_y']] 63 | merge.columns = ['q1','q2'] 64 | 65 | q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values 66 | q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values 67 | q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values 68 | q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values 69 | 70 | result = [len(q1_word_set[i] & q2_word_set[i])/max(q1_word_len[i],q2_word_len[i]) for i in range(len(q1_word_set))] 71 | result = pd.DataFrame(result, index=data.index) 72 | result.columns = ['common_word_ratio'] 73 | return result 74 | 75 | ''' 76 | 计算tf-idf向量 77 | ''' 78 | def get_tfidf_vector(question, merge_data, word_level=True): 79 | 80 | # use the question corpus to train tf-idf vec 81 | if word_level: 82 | vectorizer = TfidfVectorizer().fit(question.words.values) #max_features=1000 83 | merge = merge_data[['words_x','words_y']] 84 | else: 85 | vectorizer = TfidfVectorizer().fit(question.chars.values) 86 | merge = merge_data[['chars_x','chars_y']] 87 | merge.columns = ['q1','q2'] 88 | 89 | q1_tfidf = vectorizer.transform(merge.q1.values) 90 | q2_tfidf = vectorizer.transform(merge.q2.values) 91 | 92 | return vectorizer.vocabulary_,q1_tfidf, q2_tfidf 93 | 94 | ''' 95 | 用tfidf作为系数,调整共现词比例 96 | ''' 97 | def common_word_ratio_adjust_with_tfidf(merge_data, word_to_index, q1_tfidf, q2_tfidf, word_level=True): 98 | 99 | if word_level: 100 | merge = merge_data[['words_x','words_y']] 101 | merge.columns = ['q1','q2'] 102 | else: 103 | merge = merge_data[['chars_x','chars_y']] 104 | merge.columns = ['q1','q2'] 105 | 106 | adjusted_common_word_ratio = [] 107 | 108 | for i in range(q1_tfidf.shape[0]): 109 | q1words = {} 110 | q2words = {} 111 | for word in merge.loc[i,'q1'].lower().split(): 112 | q1words[word] = q1words.get(word, 0) + 1 113 | for word in merge.loc[i,'q2'].lower().split(): 114 | q2words[word] = q2words.get(word, 0) + 1 115 | 116 | sum_shared_word_in_q1 = sum([q1words[w] * q1_tfidf[i,word_to_index[w]] for w in q1words if w in q2words]) 117 | sum_shared_word_in_q2 = sum([q2words[w] * q2_tfidf[i,word_to_index[w]] for w in q2words if w in q1words]) 118 | sum_tol = sum(q1words[w] * q1_tfidf[i,word_to_index[w]] for w in q1words) + sum(q2words[w] * q2_tfidf[i,word_to_index[w]] for w in q2words) 119 | if 1e-6 > sum_tol: 120 | adjusted_common_word_ratio.append(0.) 121 | else: 122 | adjusted_common_word_ratio.append(1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol) 123 | 124 | return adjusted_common_word_ratio 125 | 126 | 127 | """ 128 | 计算数据中词语的影响力,格式如下: 129 | 词语 --> [0. 出现语句对数量,1. 出现语句对比例,2. 正确语句对比例,3. 单侧语句对比例,4. 单侧语句对正确比例,5. 双侧语句对比例,6. 
双侧语句对正确比例] 130 | """ 131 | def generate_powerful_word(merge_data, word_level=True): 132 | 133 | if word_level: 134 | train_subset_data = merge_data[['label','words_x','words_y']] 135 | else: 136 | train_subset_data = merge_data[['label','chars_x','chars_y']] 137 | 138 | train_subset_data.columns = ['label','q1','q2'] 139 | 140 | words_power = {} 141 | 142 | for i in train_subset_data.index: 143 | label = int(train_subset_data.loc[i,'label']) 144 | q1_words = train_subset_data.loc[i,'q1'].lower().split() 145 | q2_words = train_subset_data.loc[i,'q2'].lower().split() 146 | all_words = set(q1_words + q2_words) 147 | q1_words = set(q1_words) 148 | q2_words = set(q2_words) 149 | for word in all_words: 150 | if word not in words_power: 151 | words_power[word] = [0. for i in range(7)] 152 | # 计算出现语句对数量 153 | words_power[word][0] += 1. 154 | words_power[word][1] += 1. 155 | 156 | if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)): 157 | # 计算单侧语句数量 158 | words_power[word][3] += 1. 159 | if 0 == label: 160 | # 计算正确语句对数量 161 | words_power[word][2] += 1. 162 | # 计算单侧语句正确比例 163 | words_power[word][4] += 1. 164 | 165 | if (word in q1_words) and (word in q2_words): 166 | # 计算双侧语句数量 167 | words_power[word][5] += 1. 168 | if 1 == label: 169 | # 计算正确语句对数量 170 | words_power[word][2] += 1. 171 | # 计算双侧语句正确比例 172 | words_power[word][6] += 1. 173 | 174 | for word in words_power: 175 | # 计算出现语句对比例 176 | words_power[word][1] /= train_subset_data.shape[0] 177 | # 计算正确语句对比例 178 | words_power[word][2] /= words_power[word][0] 179 | # 计算单侧语句对正确比例 180 | if words_power[word][3] > 1e-6: 181 | words_power[word][4] /= words_power[word][3] 182 | # 计算单侧语句对比例 183 | words_power[word][3] /= words_power[word][0] 184 | # 计算双侧语句对正确比例 185 | if words_power[word][5] > 1e-6: 186 | words_power[word][6] /= words_power[word][5] 187 | # 计算双侧语句对比例 188 | words_power[word][5] /= words_power[word][0] 189 | 190 | sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True) 191 | 192 | return sorted_words_power 193 | 194 | 195 | ''' 196 | 若问题两侧存在有预测力的powerful words,则设置标签为1,否则为0 197 | ''' 198 | def powerful_words_dside_tag(pword, merge_data,thresh_num, thresh_rate, word_level=True): 199 | #筛选powerful words (有预测力的) 200 | pword_dside = [] 201 | pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword) #保证统计可靠性 202 | pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True) 203 | pword_dside.extend(map(lambda x: x[0], filter(lambda x: x[1][6] >= thresh_rate, pword_sort))) #保证抽取到真正powerful的word 204 | 205 | if word_level: 206 | merge = merge_data[['words_x','words_y']] 207 | else: 208 | merge = merge_data[['chars_x','chars_y']] 209 | 210 | merge.columns = ['q1','q2'] 211 | 212 | pword_dside_tags = [] 213 | 214 | for i in merge_data.index: 215 | tags = [] 216 | q1_words = set(merge.loc[i,'q1'].lower().split()) 217 | q2_words = set(merge.loc[i,'q2'].lower().split()) 218 | for word in pword_dside: 219 | if (word in q1_words) and (word in q2_words): 220 | tags.append(1.0) 221 | else: 222 | tags.append(0.0) 223 | 224 | pword_dside_tags.append(tags) 225 | 226 | return pword_dside, pword_dside_tags 227 | 228 | 229 | def powerful_words_oside_tag(pword, merge_data,thresh_num, thresh_rate, word_level=True): 230 | 231 | pword_oside = [] 232 | pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword) 233 | pword_oside.extend(map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate, pword))) 234 | 235 | if word_level: 236 | merge = merge_data[['words_x','words_y']] 
237 | else: 238 | merge = merge_data[['chars_x','chars_y']] 239 | 240 | merge.columns = ['q1','q2'] 241 | 242 | pword_oside_tags = [] 243 | 244 | for i in merge_data.index: 245 | tags = [] 246 | q1_words = set(merge.loc[i,'q1'].lower().split()) 247 | q2_words = set(merge.loc[i,'q2'].lower().split()) 248 | for word in pword_oside: 249 | if (word in q1_words) and (word not in q2_words): 250 | tags.append(1.0) 251 | elif (word not in q1_words) and (word in q2_words): 252 | tags.append(1.0) 253 | else: 254 | tags.append(0.0) 255 | 256 | pword_oside_tags.append(tags) 257 | 258 | return pword_oside, pword_oside_tags 259 | 260 | 261 | def powerful_word_dside_rate(sorted_words_power, pword_dside, merge_data, word_level=True): 262 | ''' 263 | 注意rate是指label=0的可能性,question pair中两侧powerful word越多,power越大,则rate越小 264 | ''' 265 | num_least = 300 266 | 267 | if word_level: 268 | merge = merge_data[['words_x','words_y']] 269 | else: 270 | merge = merge_data[['chars_x','chars_y']] 271 | 272 | merge.columns = ['q1','q2'] 273 | 274 | words_power = dict(sorted_words_power) #转化为字典格式 275 | 276 | pword_dside_rate = [] 277 | 278 | for i in merge.index: 279 | rate = 1.0 # 指labei=0的可能性,先初始化为1 280 | q1_words = set(merge.loc[i,'q1'].lower().split()) 281 | q2_words = set(merge.loc[i,'q2'].lower().split()) 282 | share_words = list(q1_words.intersection(q2_words)) 283 | for word in share_words: 284 | if word in pword_dside: 285 | rate *= (1.0 - words_power[word][6]) #uestion pair中两侧powerful word越多,power越大,则rate越小 286 | pword_dside_rate.append(1-rate) 287 | return pword_dside_rate 288 | 289 | def powerful_word_oside_rate(sorted_words_power, pword_oside, merge_data, word_level=True): 290 | ''' 291 | 注意rate是指label=1的可能性,question pair中单侧powerful word越多,power越大,则rate越小 292 | ''' 293 | num_least = 300 294 | 295 | if word_level: 296 | merge = merge_data[['words_x','words_y']] 297 | else: 298 | merge = merge_data[['chars_x','chars_y']] 299 | 300 | merge.columns = ['q1','q2'] 301 | words_power = dict(sorted_words_power) #转化为字典格式 302 | 303 | pword_oside_rate = [] 304 | 305 | for i in merge.index: 306 | rate = 1.0 # 指labei=1的可能性,先初始化为1 307 | q1_words = set(merge.loc[i,'q1'].lower().split()) 308 | q2_words = set(merge.loc[i,'q2'].lower().split()) 309 | q1_diff = list(set(q1_words).difference(set(q2_words))) 310 | q2_diff = list(set(q2_words).difference(set(q1_words))) 311 | all_diff = set(q1_diff + q2_diff) 312 | for word in all_diff: 313 | if word in pword_oside: 314 | rate *= (1.0 - words_power[word][4]) #question pair中单侧powerful word越多,power越大,则rate越小 315 | pword_oside_rate.append(1-rate) 316 | 317 | return pword_oside_rate 318 | 319 | 320 | 321 | ''' 322 | 扩展的编辑距离(Damerau-Levenshtein Distance) 323 | 扩展的编辑距离在思想上与编辑距离一样,只是除插入、删除和替换操作外,还支持 相邻字符的交换 这样一个操作,增加这个操作的考虑是人们在计算机上输入文档时的错误情况中,因为快速敲击而前后两个字符的顺序被输错的情况很常见。 324 | ''' 325 | def edit_distance(q1, q2): 326 | 327 | str1 = q1.split(' ') 328 | str2 = q2.split(' ') 329 | matrix = [[i+j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)] 330 | 331 | for i in range(1,len(str1)+1): 332 | for j in range(1,len(str2)+1): 333 | if str1[i-1] == str2[j-1]: 334 | d = 0 335 | else: 336 | d = 1 337 | matrix[i][j] = min(matrix[i-1][j]+1,matrix[i][j-1]+1,matrix[i-1][j-1]+d) 338 | 339 | if i > 1 and j > 1 and str1[i-1] == str2[j-2] and str1[i-2] == str2[j-1]: 340 | d = 0 # d=0表示允许交换,d =1表示不允许交换 341 | matrix[i][j] = min(matrix[i][j], matrix[i-2][j-2] + d) # allow transposition 342 | 343 | return matrix[len(str1)][len(str2)] 344 | 345 | def get_edit_distance(merge_data, word_level=True): 346 
| 347 | if word_level: 348 | merge = merge_data[['words_x','words_y']] 349 | merge.columns = ['q1','q2'] 350 | else: 351 | merge = merge_data[['chars_x','chars_y']] 352 | merge.columns = ['q1','q2'] 353 | 354 | q1_len = merge['q1'].apply(lambda x: len(x.split(' '))).values 355 | q2_len = merge['q2'].apply(lambda x: len(x.split(' '))).values 356 | 357 | # normalize the edit_distance by the max(len(q1),len(q2)) 358 | dist = [edit_distance(merge.loc[i,'q1'], merge.loc[i,'q2']) / np.max([q1_len, q2_len], axis=0)[i] for i in merge.index] 359 | 360 | return dist 361 | --------------------------------------------------------------------------------
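The edit-distance helpers in feature_engineering.py are easiest to see on a toy example. Below is a minimal usage sketch (an editor's addition, not a file from the repository): it assumes feature_engineering.py is importable and that datasets/question.csv and datasets/train.csv carry the qid/words/chars and q1/q2/label columns used throughout this repo, and it reproduces the same q1/q2-to-question joins as data_helper.py. Note that, as written, the transposition branch sets d = 0, so an adjacent word swap costs nothing; the standard Damerau-Levenshtein distance would charge 1 for it (i.e. matrix[i-2][j-2] + 1).

# usage_sketch.py -- editor's illustration only; paths and toy strings are assumptions.
import pandas as pd
from feature_engineering import edit_distance, get_edit_distance, get_len_diff

# Toy examples with whitespace-separated word ids, mirroring the words/chars columns.
print(edit_distance('W1 W2 W3', 'W1 W2 W4'))  # 1 -> one substitution
print(edit_distance('W1 W2 W3', 'W1 W3 W2'))  # 0 -> adjacent swap is free here because d is set
                                              #      to 0; standard Damerau-Levenshtein gives 1

# Build the merge_data frame the feature functions expect (same joins as data_helper.py),
# which yields words_x/words_y and chars_x/chars_y columns for each question pair.
question = pd.read_csv('datasets/question.csv')
train = pd.read_csv('datasets/train.csv')
merge_data = (train.merge(question, left_on='q1', right_on='qid', how='left')
                   .merge(question, left_on='q2', right_on='qid', how='left'))

# Assemble a couple of the length/edit-distance features into one frame.
features = pd.DataFrame({
    'edit_distance': get_edit_distance(merge_data, word_level=True),
    'len_diff': get_len_diff(merge_data, word_level=True),
})
print(features.head())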