├── .gitignore ├── DSSM ├── README.md ├── __init__.py └── dssm.py ├── README.md ├── helper ├── __init__.py ├── distance.py ├── tools.py └── wordhash.py └── quora_dssm.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Personal ignore 2 | dataset/ 3 | *.pkl 4 | result/ 5 | config.py 6 | nohup.out 7 | 8 | # Compiled source # 9 | ################### 10 | *.com 11 | *.class 12 | *.dll 13 | *.exe 14 | *.o 15 | *.so 16 | 17 | # Packages # 18 | ############ 19 | # it's better to unpack these files and commit the raw source 20 | # git has its own built in compression methods 21 | *.7z 22 | *.dmg 23 | *.gz 24 | *.iso 25 | *.jar 26 | *.rar 27 | *.tar 28 | *.zip 29 | 30 | # Logs and databases # 31 | ###################### 32 | *.log 33 | *.sql 34 | *.sqlite 35 | 36 | # OS generated files # 37 | ###################### 38 | .DS_Store 39 | .DS_Store? 40 | ._* 41 | .Spotlight-V100 42 | .Trashes 43 | ehthumbs.db 44 | Thumbs.db 45 | 46 | # IDE files # 47 | ############# 48 | nbproject 49 | .~lock.* 50 | .buildpath 51 | .idea 52 | .project 53 | .settings 54 | composer.lock 55 | 56 | # Byte-compiled / optimized / DLL files 57 | __pycache__/ 58 | *.py[cod] 59 | *$py.class 60 | 61 | # C extensions 62 | *.so 63 | 64 | # Distribution / packaging 65 | .Python 66 | env/ 67 | build/ 68 | develop-eggs/ 69 | dist/ 70 | downloads/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | *.egg-info/ 80 | .installed.cfg 81 | *.egg 82 | 83 | # PyInstaller 84 | # Usually these files are written by a python script from a template 85 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 86 | *.manifest 87 | *.spec 88 | 89 | # Installer logs 90 | pip-log.txt 91 | pip-delete-this-directory.txt 92 | 93 | # Unit test / coverage reports 94 | htmlcov/ 95 | .tox/ 96 | .coverage 97 | .coverage.* 98 | .cache 99 | nosetests.xml 100 | coverage.xml 101 | *,cover 102 | .hypothesis/ 103 | 104 | # Translations 105 | *.mo 106 | *.pot 107 | 108 | # Django stuff: 109 | *.log 110 | local_settings.py 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | 122 | # PyBuilder 123 | target/ 124 | 125 | # Jupyter Notebook 126 | .ipynb_checkpoints 127 | 128 | # pyenv 129 | .python-version 130 | 131 | # celery beat schedule file 132 | celerybeat-schedule 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # dotenv 138 | .env 139 | 140 | # virtualenv 141 | .venv 142 | venv/ 143 | ENV/ 144 | 145 | # Spyder project settings 146 | .spyderproject 147 | 148 | # Rope project settings 149 | .ropeproject 150 | 151 | -------------------------------------------------------------------------------- /DSSM/README.md: -------------------------------------------------------------------------------- 1 | 2 | DSSM : word hash & DNN 3 | 4 | ## Dataset 5 | ------------- 6 | [search query log data](http://jeffhuang.com/search_query_logs.html) 7 |
https://www.quora.com/Where-can-I-find-dataset-having-search-query-logs-from-general-purpose-search-engines-Google-Yahoo-etc 8 | 9 | 10 | 11 | ## References 12 | ------------- 13 | [Microsoft/CNTK](https://github.com/Microsoft/CNTK/wiki/Train-a-DSSM-(or-a-convolutional-DSSM)-model) 14 |
https://github.com/airalcorn2/Deep-Semantic-Similarity-Model 15 | 16 | -------------------------------------------------------------------------------- /DSSM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Accagain2014/TextMatching/b22d8d705da64a34293d2079e577027c819c00d1/DSSM/__init__.py -------------------------------------------------------------------------------- /DSSM/dssm.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import sys 4 | sys.path.append('../helper/') 5 | import time 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | import tools 11 | 12 | 13 | class DSSM(object): 14 | ''' 15 | Impletement DSSM Model in the Paper: Learning Deep Structured Semantic Models for Web Search using Clickthrough Data 16 | ''' 17 | def __init__(self, hash_tokens_nums=3000, dnn_layer_nums=1, dnn_hidden_node_nums=50, feature_nums=50, 18 | batch_size=10, neg_nums=4, learning_rate=0.5, max_epochs=200, loss_kind='mcl', w_init=0.1, \ 19 | save_model_path='./', mlp_hidden_node_nums=32, mlp_layer_nums=2, input_is_sparse=False): 20 | ''' 21 | paras: 22 | hash_tokens_nums: word hash后词的个数 23 | dnn_layer_nums: dnn的层数 24 | dnn_hidden_node_nums: dnn的结点个数 25 | feature_nums: 最终输出的特征的个数 26 | batch_size: 每个batch的大小 27 | neg_nums: 负样本的个数 28 | learning_rate: 学习率 29 | max_epoch: 迭代次数 30 | loss_kind: 'mcl': maximize the condition likelihood,极大似然估计条件概率; 'log_loss':交叉熵的方式计算loss 31 | w_init: 权重初始化 32 | save_model_path: 保存验证集上最优模型的文件路劲 33 | mlp_hidden_node_nums: 学习到的隐向量连接后加mlp层的节点数 34 | mlp_layer_nums: mlp层的层数 35 | input_is_sparse: 输入是否是sparse矩阵 36 | ''' 37 | 38 | self.hash_token_nums = hash_tokens_nums 39 | self.dnn_layer_nums = dnn_layer_nums 40 | self.dnn_hidden_node_nums = dnn_hidden_node_nums 41 | self.feature_nums = feature_nums 42 | self.batch_size = batch_size 43 | self.neg_nums = neg_nums 44 | self.learning_rate = learning_rate 45 | self.max_epochs = max_epochs 46 | self.loss_kind = loss_kind 47 | self.positive_weights = 1 48 | self.w_init = w_init 49 | self.save_model_path = save_model_path 50 | self.mlp_hidden_node_nums = mlp_hidden_node_nums 51 | self.mlp_layer_nums = mlp_layer_nums 52 | self.input_is_sparse = input_is_sparse 53 | 54 | ''' 55 | query and doc 使用不同的网络结构,像论文中提到的那样 56 | ''' 57 | if not self.input_is_sparse: 58 | self.input_q = tf.placeholder(tf.float32, shape=[None, self.hash_token_nums]) # sample_nums, word_nums, hash_tokens_nums 59 | self.input_doc = tf.placeholder(tf.float32, shape=[None, self.hash_token_nums]) # sample_nums, word_nums, hash_tokens_nums 60 | else: 61 | self.input_q = tf.sparse_placeholder(tf.float32, shape=[None, self.hash_token_nums]) 62 | self.input_doc = tf.sparse_placeholder(tf.float32, shape=[None, self.hash_token_nums]) 63 | 64 | self.label = tf.placeholder(tf.float32, shape=[None]) 65 | 66 | self.predict_doc = None 67 | self.predict_query = None 68 | 69 | self.relevance = self.create_model_op() 70 | 71 | if self.loss_kind == 'mlc': 72 | self.loss = self.create_loss_max_condition_lh_op() 73 | elif self.loss_kind == 'log_loss': 74 | self.loss = self.create_log_loss_op() 75 | else: 76 | pass 77 | 78 | self.train = self.create_train_op() 79 | 80 | def set_positive_weights(self, positive_weights): 81 | self.positive_weights = positive_weights 82 | 83 | def create_model_op(self): 84 | 85 | ''' 86 | 建立整个模型,分成两端的网络,query端和doc端的 87 | ''' 88 | 89 | features = [] 90 | structures = ['query_dnn', 'doc_dnn'] 91 | input_dict = { 92 
| structures[0]: self.input_q, 93 | structures[1]: self.input_doc 94 | } 95 | 96 | ''' 97 | 尝试用一种结构试下 98 | ''' 99 | 100 | result = [0] * 2 101 | with tf.variable_scope('DNN'): 102 | now_w_init = tools.xavier_init(self.hash_token_nums, self.dnn_hidden_node_nums) 103 | w = tf.Variable( 104 | tf.random_uniform([self.hash_token_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer1') 105 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer1") 106 | 107 | result[0] = input_dict['query_dnn'] 108 | result[1] = input_dict['doc_dnn'] 109 | 110 | if self.input_is_sparse: 111 | result[0] = tf.sparse_tensor_dense_matmul(result[0], w) + b 112 | result[1] = tf.sparse_tensor_dense_matmul(result[1], w) + b 113 | 114 | else: 115 | result[0] = tf.matmul(result[0], w) + b 116 | result[1] = tf.matmul(result[1], w) + b 117 | 118 | result[0] = tf.nn.tanh(result[0]) 119 | result[1] = tf.nn.tanh(result[1]) 120 | 121 | 122 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums) 123 | w = tf.Variable( 124 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2') 125 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer2") 126 | result[0] = tf.matmul(result[0], w) + b 127 | result[0] = tf.nn.tanh(result[0]) 128 | result[1] = tf.matmul(result[1], w) + b 129 | result[1] = tf.nn.tanh(result[1]) 130 | 131 | 132 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums) 133 | w = tf.Variable( 134 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer3') 135 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer3") 136 | result[0] = tf.matmul(result[0], w) + b 137 | result[0] = tf.nn.tanh(result[0]) 138 | result[1] = tf.matmul(result[1], w) + b 139 | result[1] = tf.nn.tanh(result[1]) 140 | 141 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums) 142 | w = tf.Variable( 143 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer4') 144 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer4") 145 | result[0] = tf.matmul(result[0], w) + b 146 | result[0] = tf.nn.tanh(result[0]) 147 | result[1] = tf.matmul(result[1], w) + b 148 | result[1] = tf.nn.tanh(result[1]) 149 | ''' 150 | 151 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums) 152 | w = tf.Variable( 153 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer5') 154 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer5") 155 | result[0] = tf.matmul(result[0], w) + b 156 | result[0] = tf.nn.tanh(result[0]) 157 | result[1] = tf.matmul(result[1], w) + b 158 | result[1] = tf.nn.tanh(result[1]) 159 | ''' 160 | 161 | 162 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.feature_nums) 163 | w = tf.Variable( 164 | tf.random_uniform([self.dnn_hidden_node_nums, self.feature_nums], -now_w_init, now_w_init), name='weights_DNN_layer_last') 165 | b = tf.Variable(tf.zeros([self.feature_nums]), name="bias_DNN_layer_last") 166 | result[0] = tf.matmul(result[0], w) + b 167 | result[0] = tf.nn.tanh(result[0]) 168 | result[1] = tf.matmul(result[1], w) + b 169 | result[1] = tf.nn.tanh(result[1]) 170 | 171 | 172 | ''' 173 | i = 
tf.constant(0) 174 | sum_layer = self.dnn_layer_nums 175 | #node_nums = tf.convert_to_tensor([self.dnn_hidden_node_nums] * self.dnn_layer_nums + [self.dnn_hidden_node_nums]) 176 | node_nums = [self.dnn_hidden_node_nums] * self.dnn_layer_nums + [self.dnn_hidden_node_nums] 177 | 178 | cond = lambda x, layer, result: tf.less(x, sum_layer) 179 | layer = 0 180 | def body(i, layer, result): 181 | tmp = tf.add(i, 1) 182 | w = tf.Variable( 183 | tf.random_uniform([node_nums[layer], node_nums[layer+1]], -self.w_init, self.w_init)) 184 | b = tf.Variable(tf.zeros([node_nums[layer+1]])) 185 | 186 | result[0] = tf.matmul(result[0], w) + b 187 | result[0] = tf.nn.tanh(result[0]) 188 | result[1] = tf.matmul(result[1], w) + b 189 | result[1] = tf.nn.tanh(result[1]) 190 | 191 | return tmp, layer, result 192 | 193 | i, _, result = tf.while_loop(cond, body, [i, layer, result]) 194 | ''' 195 | 196 | features.append(result[0]) 197 | features.append(result[1]) 198 | 199 | 200 | self.predict_query = features[0] 201 | self.predict_doc = features[1] 202 | 203 | ''' 204 | 为了对学习到了两个向量进行相似度打分,加一个mlp层, 最后一层全连接 205 | 206 | ''' 207 | 208 | result = tf.concat(features, -1) 209 | 210 | with tf.variable_scope('mlp'): 211 | node_nums = tf.convert_to_tensor([self.feature_nums*2] + [self.mlp_hidden_node_nums] * self.mlp_layer_nums + [1]) 212 | sum_layer = self.mlp_hidden_node_nums + 1 213 | 214 | 215 | now_w_init = tools.xavier_init(self.feature_nums * 2, self.mlp_hidden_node_nums) 216 | w = tf.Variable( 217 | tf.random_uniform([self.feature_nums*2, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer1') 218 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer1") 219 | result = tf.matmul(result, w) + b 220 | result = tf.nn.tanh(result) 221 | 222 | ''' 223 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums) 224 | w = tf.Variable( 225 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2') 226 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer2") 227 | result = tf.matmul(result, w) + b 228 | result = tf.nn.tanh(result) 229 | 230 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums) 231 | w = tf.Variable( 232 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2') 233 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer2") 234 | result = tf.matmul(result, w) + b 235 | result = tf.nn.tanh(result) 236 | 237 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums) 238 | w = tf.Variable( 239 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer3') 240 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer3") 241 | result = tf.matmul(result, w) + b 242 | result = tf.nn.tanh(result) 243 | 244 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums) 245 | w = tf.Variable( 246 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer4') 247 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer4") 248 | result = tf.matmul(result, w) + b 249 | result = tf.nn.tanh(result) 250 | 251 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums) 252 | w = 
tf.Variable( 253 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer5') 254 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer5") 255 | result = tf.matmul(result, w) + b 256 | result = tf.nn.tanh(result) 257 | ''' 258 | 259 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, 1) 260 | w = tf.Variable( 261 | tf.random_uniform([self.mlp_hidden_node_nums, 1], -now_w_init, now_w_init), name='weights_DNN_layer_last') 262 | b = tf.Variable(tf.zeros([1]), name="bias_DNN_layer_last") 263 | result = tf.matmul(result, w) + b 264 | result = tf.nn.sigmoid(result) 265 | 266 | 267 | # norms1 = tf.sqrt(tf.reduce_sum(tf.square(features[0]), 1, keep_dims=False)) 268 | # norms2 = tf.sqrt(tf.reduce_sum(tf.square(features[1]), 1, keep_dims=False)) 269 | # relevance = tf.reduce_sum(features[0] * features[1], 1) / norms1 / norms2 270 | 271 | # w_r = tf.Variable(tf.random_uniform([1], -self.w_init, self.w_init), name="weight-of-relevance") 272 | # b_r = tf.Variable(tf.zeros([1]), name="bais-of-relevance") 273 | # relevance = relevance * w_r + b_r 274 | # relevance = tf.nn.softmax(relevance) 275 | 276 | return tf.reshape(result, [-1]) 277 | 278 | 279 | def create_loss_max_condition_lh_op(self): 280 | ''' 281 | 用极大似然的方法计算, 正例的条件概率 282 | 计算相关文档的loss, gama经验值也用来学习 283 | :return: 284 | ''' 285 | gama = tf.Variable(tf.random_uniform([1]), name="gama") 286 | ret = self.relevance * gama 287 | ret = tf.reshape(ret, [-1, self.neg_nums+1]) 288 | ret = tf.log(tf.nn.softmax(ret)) 289 | ret = tf.reduce_sum(ret, 0) # 行相加 290 | return -tf.gather(ret, 0) # 得到第一个,也即是正例的loss 291 | 292 | 293 | def create_log_loss_op(self): 294 | ''' 295 | 计算log_loss, 也就是交叉熵 296 | :return: 297 | ''' 298 | return tf.reduce_sum(tf.contrib.losses.log_loss(self.relevance, self.label)) 299 | 300 | 301 | def create_train_op(self): 302 | ''' 303 | 采用梯度下降方式学习 304 | :return: 305 | ''' 306 | return tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 307 | #return tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss) 308 | 309 | 310 | def creat_feed_dict(self, query_batch, doc_batch, label_batch): 311 | ''' 312 | :param query_batch: 查询输入 313 | :param doc_batch: 文档输入 314 | :param label_batch: 查询和文档对应的相关性label 315 | :return: 316 | ''' 317 | 318 | if self.input_is_sparse: 319 | query_coo_matrix = query_batch.tocoo() 320 | doc_coo_matrix = doc_batch.tocoo() 321 | self.feed_dict = { 322 | self.input_q : tf.SparseTensorValue(np.array([query_coo_matrix.row, query_coo_matrix.col]).T, query_batch.data, query_batch.shape), 323 | self.input_doc : tf.SparseTensorValue(np.array([doc_coo_matrix.row, doc_coo_matrix.col]).T, doc_batch.data, doc_batch.shape), 324 | self.label : label_batch 325 | } 326 | else: 327 | self.feed_dict = { 328 | self.input_q : query_batch, 329 | self.input_doc: doc_batch, 330 | self.label : label_batch 331 | } 332 | 333 | 334 | def run_epoch(self, sess, query_input, doc_input, labels, is_valid=False): 335 | ''' 336 | 计算一次迭代过程 337 | :param sess: 338 | :param query_input: 339 | :param doc_input: 340 | :param labels: 341 | :return: 342 | ''' 343 | average_loss = 0 344 | step = 0 345 | relevance = [] 346 | 347 | for step, (query, doc, label) in enumerate( 348 | tools.data_iterator(query_input, doc_input, labels, self.batch_size, shuffle=True, is_normalize=True) 349 | ): 350 | # print query[1, 1], doc[1, 1], label[1] 351 | self.creat_feed_dict(query, doc, label) 352 | #print query.shape, doc.shape, 
label.shape 353 | #print type(query),is_sparse 354 | # self.set_positive_weights(len(query)) 355 | 356 | # shape1, shape2, shape3 = sess.run([self.shape_1, self.shape_2, self.shape_3], feed_dict=self.feed_dict) 357 | # print shape1, shape2, shape3 358 | 359 | if not is_valid: 360 | # 跑这个train的时候 才更新W 361 | _, loss_value, predict_query, predict_doc, relevance = sess.run([self.train, self.loss, self.predict_query\ 362 | , self.predict_doc, self.relevance], feed_dict=self.feed_dict) 363 | else: 364 | 365 | loss_value, relevance = sess.run([self.loss, self.relevance], feed_dict=self.feed_dict) 366 | # print 'Chcek ', sklearn.metrics.log_loss(label, relevance), loss_value 367 | 368 | average_loss += loss_value 369 | #print 'step ', step, loss_value 370 | #print 'predict ', predict_query[0], predict_doc[0], relevance[0] 371 | return average_loss / (step+1), relevance 372 | 373 | 374 | def fit(self, sess, query_input, doc_input, labels, valid_q_input=None, valid_d_input=None, valid_labels=None, \ 375 | load_model=False): 376 | ''' 377 | 模型入口 378 | :param sess: 379 | :param query_input: 380 | :param doc_input: 381 | :param labels: 382 | :return: 383 | ''' 384 | losses = [] 385 | best_loss = 99999 386 | saver = tf.train.Saver() 387 | if load_model: 388 | saver.restore(sess, self.save_model_path) 389 | start_time = time.time() 390 | valid_loss, _ = self.run_epoch(sess, valid_q_input, valid_d_input, valid_labels, is_valid=True) 391 | duration = time.time() - start_time 392 | print('valid loss = %.5f (%.3f sec)' 393 | % (valid_loss, duration)) 394 | losses.append(valid_loss) 395 | return losses 396 | 397 | for epoch in range(self.max_epochs): 398 | start_time = time.time() 399 | average_loss, relevance = self.run_epoch(sess, query_input, doc_input, labels) 400 | duration = time.time() - start_time 401 | 402 | if (epoch+1) % 1 == 0: 403 | if valid_labels is None: 404 | print('Epoch %d: loss = %.5f relevance[0] = %.5f (%.3f sec)' 405 | % (epoch+1, average_loss, relevance[0], duration)) 406 | else: 407 | valid_loss, _ = self.run_epoch(sess, valid_q_input, valid_d_input, valid_labels, is_valid=True) 408 | if valid_loss < best_loss: 409 | print 'Save model' 410 | best_loss = valid_loss 411 | saver.save(sess, self.save_model_path) 412 | duration = time.time() - start_time 413 | print('Epoch %d: loss = %.5f valid loss = %.5f relevance[0] = %.5f (%.3f sec)' 414 | % (epoch+1, average_loss, valid_loss, relevance[0], duration)) 415 | sys.stdout.flush() 416 | losses.append(average_loss) 417 | 418 | if not valid_labels is None: 419 | print 'Final valid loss: ', best_loss 420 | return losses 421 | 422 | def predict(self, sess, query, doc, labels): 423 | ''' 424 | 计算预测过后的查询与文档的相关性 425 | :param sess: 426 | :param query: 427 | :param doc: 428 | :param labels: 429 | :return: 430 | ''' 431 | if not self.is_sparse: 432 | self.creat_feed_dict(query, doc, labels) 433 | predict = sess.run(self.relevance, feed_dict=self.feed_dict) 434 | else: 435 | predict = [] 436 | for step, (query_, doc_, label_) in enumerate( 437 | tools.data_iterator(query, doc, labels, self.batch_size, shuffle=True, is_normalize=True) 438 | ): 439 | self.creat_feed_dict(query, doc, labels) 440 | now_pre = sess.run(self.relevance, feed_dict=self.feed_dict) 441 | predict += now_pre 442 | 443 | return predict 444 | 445 | 446 | 447 | def test_dssm(): 448 | ''' 449 | 测试函数 450 | :return: 451 | ''' 452 | with tf.Graph().as_default(): 453 | tf.set_random_seed(1) 454 | 455 | model = DSSM(hash_tokens_nums=30000, dnn_layer_nums=2, dnn_hidden_node_nums=300, 
feature_nums=128, 456 | batch_size=10, neg_nums=4, learning_rate=0.02, max_epochs=500) 457 | sess = tf.Session() 458 | init = tf.initialize_all_variables() 459 | sess.run(init) 460 | np.random.seed(1) 461 | 462 | query = np.random.rand(500, 30000) 463 | doc = np.random.rand(500, 30000) 464 | label = np.array([1, 0, 0, 0, 0] * 100) 465 | 466 | #print query 467 | #print doc 468 | #print label 469 | 470 | losses = model.fit(sess, query, doc, label) 471 | 472 | #print losses[-1] 473 | 474 | 475 | if __name__ == '__main__': 476 | test_dssm() 477 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## For what 2 | Understanding the Methods in Text Matching Area Including Key-words based Matching Model & Latent Semantic Matching Model. 3 | Implement the Classical Methods. 4 | 5 | ## Categories 6 | - tradition model (feature based models) 7 | - Key-words based methods 8 | - tf-idf model 9 | - words common rate model 10 | - find the most important word with adding syntax information 11 | - boosting models 12 | - linear models 13 | - factorization machine 14 | - Semantic deep model 15 | - representation-based models 16 | - DSSM, CDSSM 17 | - interaction-based models 18 | 19 | 20 | ## People in these area 21 | - [Po-Sen Huang](https://posenhuang.github.io/full_publication.html) 22 | - [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/) 23 | - [Richard Socher](http://www.socher.org/index.php/Main/HomePage) 24 | - [Hang Li](http://www.hangli-hl.com/index.html) 25 | 26 | ## Survey 27 | > [深度文本匹配综述(A Survey on Deep Text Matching)](http://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CAPJLAST&filename=JSJX20160920002&uid=WEEvREcwSlJHSldRa1FhdXNXYXJvK0FZMlhXUDZsYnBMQjhHTElMeE1jRT0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4ggI8Fm4gTkoUKaID8j8gFw!!&v=MzA2OTFscVdNMENMTDdSN3FlWU9ac0ZDcmxWYnZPSTFzPUx6N0Jkckc0SDlmTXBvMUZaT3NOWXc5TXptUm42ajU3VDNm) 28 |
29 | 30 | 31 | ## Methods & Papers about Semantic Methods 32 | 33 | > [**DSSM**](./DSSM/dssm.py) 34 |
[Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://posenhuang.github.io/papers/cikm2013_DSSM_fullversion.pdf) 35 |
CIKM 2013 36 |
Bag-of-words model; representation-based structure; word hash + DNN 37 |
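For reference, the scoring and training objective from the DSSM paper (summarized here, not taken from this repo's code): the query and document are mapped to semantic vectors y_Q and y_D, relevance is their cosine similarity, and training maximizes the softmax-normalized likelihood of the clicked document against sampled negatives, with a smoothing factor gamma (learned as `gama` in `create_loss_max_condition_lh_op` of `dssm.py`):

```latex
R(Q, D) = \cos(y_Q, y_D) = \frac{y_Q^{\top} y_D}{\lVert y_Q \rVert \, \lVert y_D \rVert}, \qquad
P(D^{+} \mid Q) = \frac{\exp\left(\gamma \, R(Q, D^{+})\right)}{\sum_{D' \in \{D^{+}\} \cup \mathbf{D}^{-}} \exp\left(\gamma \, R(Q, D')\right)}, \qquad
\mathcal{L} = -\log \prod_{(Q, D^{+})} P(D^{+} \mid Q)
```

Note that `dssm.py` in this repo actually scores the concatenation of the two learned vectors with a small MLP + sigmoid; the plain cosine scorer is left commented out.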
[Detailed explanation](http://www.leiphone.com/news/201607/TRldqYnzm6nRbEnY.html) 38 |
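Here "word hash" means letter-n-gram hashing. A minimal illustrative sketch (assuming n = 3 and `#` boundary marks, as in `helper/wordhash.py`; this is not the repo's implementation):

```python
# Illustrative letter-trigram word hashing: wrap the word in boundary marks and
# decompose it into overlapping 3-letter grams; the grams index a fixed-size
# counting vector instead of a full word vocabulary.
def letter_trigrams(word, mark='#'):
    w = mark + word.lower() + mark                  # 'good' -> '#good#'
    return [w[i:i + 3] for i in range(len(w) - 2)]

print(letter_trigrams('good'))                      # ['#go', 'goo', 'ood', 'od#']
```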
[Code](./DSSM/dssm.py) 39 | ----- 40 | > [**CDSSM**]() 41 |
[Learning Semantic Representations Using Convolutional Neural Networks for Web Search](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/www2014_cdssm_p07.pdf) 42 |
WWW 2014, word hash + CNN + DNN 43 | ---- 44 | 45 | > [**CLSM**]() 46 |
[A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2014_cdssm_final.pdf) 47 |
CIKM 2014 48 |
Matching-based structure, word hash + CNN; what is the difference between CLSM and C-DSSM? 49 | ---- 50 | 51 | > [**Applications of DSSM**]() 52 | [Modeling Interestingness with Deep Neural Networks](https://www.microsoft.com/en-us/research/wp-content/uploads/2014/10/604_Paper.pdf) 53 |
EMNLP 2014 54 |
Applies DSSM to text analysis; works well on automatic highlighting and contextual entity search. 55 |
Two main contributions: 56 |
1) DSSM + CNN 57 |
2) Instead of modeling relevance directly, adds a ranker 58 | ---- 59 | 60 | > [**ARC-I/ARC-II**]() 61 | [Convolutional Neural Network Architectures 62 | for Matching Natural Language Sentences](https://papers.nips.cc/paper/5550-convolutional-neural-network-architectures-for-matching-natural-language-sentences.pdf) 63 |
NIPS 2014 64 |
Both representation-based and matching-based CNN structures; adds gating to handle sentences of unequal length 65 | ---- 66 | > [**CNTN**]() 67 |
[Convolutional Neural Tensor Network 68 | Architecture for Community-based Question Answering](https://ijcai.org/Proceedings/15/Papers/188.pdf) 69 |
IJCAI 2015 70 |
(D)CNN+MLP(tensor layer); 71 |
Representation-based structure 72 | 73 | ----- 74 | > [**DeepMatch**]() 75 |
[A Deep Architecture for Matching Short Texts](https://papers.nips.cc/paper/5019-a-deep-architecture-for-matching-short-texts.pdf) 76 |
NIPS 2013 77 |
[Reviews](https://media.nips.cc/nipsbooks/nipspapers/paper_files/nips26/reviews/697.html) 78 |
Goal: model more complex matching relations; probably the earliest matching-based structure. 79 |
Combines intrinsic localness and hierarchy, which dot-product-based networks struggle with; the main highlight is building the network with a topic model. 80 | 81 | ------ 82 | > [**DeepMatch_tree**]() 83 |
[Syntax-based Deep Matching of Short Texts](https://arxiv.org/pdf/1503.02427.pdf) 84 | 85 | ## Methods & Papers about Key Words Based Methods 86 | > [****]() 87 |
[]() 88 | 89 | ## Related talks and books 90 | * [Deep Learning for Web Search and 91 | Natural Language Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/wsdm2015.v3.pdf) 92 | * [Deep Learning for Information Retrieval(Sigir 2016 Tutorial)](http://www.hangli-hl.com/uploads/3/4/4/6/34465961/deep_learning_for_information_retrieval.pdf) 93 | * [Semantic Matching in Search (Sigir 2014 Workshop)](http://www.hangli-hl.com/uploads/3/4/4/6/34465961/semantic_matching_in_search.pdf) 94 | * [Semantic Matching in Search (Book 2014)](http://www.bigdatalab.ac.cn/~junxu/publications/SemanticMatchingInSearch_2014.pdf) 95 | * [gensim notebook](https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks) 96 | 97 | 98 | ## Downloads 99 | > [DSSM/Sent2Vec Release Version](https://www.microsoft.com/en-us/download/details.aspx?id=52365) 100 |
MSRA发布的Sent2Vec发行版 101 | 102 | ## Datasets 103 | * [Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks](http://arxiv.org/abs/1502.05698 "Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush") ([fb.ai/babi](http://fb.ai/babi)) 104 | * [Teaching Machines to Read and Comprehend](http://arxiv.org/abs/1506.03340 "Karl Moritz Hermann, Tomáš Kočiský, Edward Grefenstette, Lasse Espeholt, Will Kay, Mustafa Suleyman, Phil Blunsom") ([github.com/deepmind/rc-data](https://github.com/deepmind/rc-data)) 105 | * [One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling](http://arxiv.org/abs/1312.3005 "Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, Tony Robinson") ([github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark)) 106 | * [The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems](http://arxiv.org/abs/1506.08909 "Ryan Lowe, Nissan Pow, Iulian Serban, Joelle Pineau") ([cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0](http://cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/)) 107 | * [Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books](http://arxiv.org/abs/1506.06724 "Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler") ([BookCorpus](http://www.cs.toronto.edu/~mbweb/)) 108 | * [Every publicly available Reddit comment, for research.](https://www.reddit.com/r/datasets/comments/3bxlg7/i_have_every_publicly_available_reddit_comment/ "Stuck_In_the_Matrix") 109 | * [Stack Exchange Data Dump](https://archive.org/details/stackexchange "Stack Exchange") 110 | * [Europarl: A Parallel Corpus for Statistical Machine Translation](http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl-mtsummit05.pdf "Philipp Koehn") ([www.statmt.org/europarl/](http://www.statmt.org/europarl/)) 111 | * [RTE Knowledge Resources](http://aclweb.org/aclwiki/index.php?title=RTE_Knowledge_Resources) 112 | * [**Kaggle Quora Question Pairs**]() 113 | 114 | 115 | ## Competition 116 | * [Kaggle Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs) 117 |
[SQuAD(The Stanford Question Answering Dataset)](https://rajpurkar.github.io/SQuAD-explorer/) 118 | 119 | ## Pretrained Models 120 | * [Model Zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo "Berkeley Vision and Learning Center") 121 | * [word2vec](https://code.google.com/p/word2vec/ "Tomas Mikolov") 122 | * [GoogleNews-vectors-negative300.bin.gz](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) 123 | * [freebase-vectors-skipgram1000.bin.gz](https://docs.google.com/file/d/0B7XkCwpI5KDYaDBDQm1tZGNDRHc/edit?usp=sharing) 124 | * [GloVe](http://nlp.stanford.edu/projects/glove/ "Jeffrey Pennington, Richard Socher, Christopher D. Manning") 125 | * [SENNA](http://ronan.collobert.com/senna/ "R. Collobert, J. Weston, L. Bottou, M. Karlen, K. Kavukcuoglu, P. Kuksa") 126 | 127 | ## Important Online Courses 128 | * [Stanford CS224d Deep Learning for Natural Language Processing](http://cs224d.stanford.edu/) 129 | * [Stanford CS20SI Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) 130 | * [Stanford CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.stanford.edu/) 131 | 132 | 133 | ## References 134 | https://github.com/robertsdionne/neural-network-papers/blob/master/README.md 135 | -------------------------------------------------------------------------------- /helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Accagain2014/TextMatching/b22d8d705da64a34293d2079e577027c819c00d1/helper/__init__.py -------------------------------------------------------------------------------- /helper/distance.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | def cos_dis(x, y): 7 | ''' 8 | Calculate cosine distance about vector x and y 9 | 10 | :param x: np.array liked, a vector, one dimension 11 | :param y: np.array liked, a vector, one dimension 12 | :return: cosine distance between vector x and vector y 13 | ''' 14 | 15 | dot_mul = x * y 16 | ans = np.sum(dot_mul) / np.power(np.sum(x**2), 0.5) / np.power(np.sum(y**2), 0.5) 17 | 18 | ''' 19 | ans_sk = cosine_similarity(x, y) 20 | 21 | assert ans - ans_sk < 1e-10 22 | ''' 23 | 24 | return ans 25 | 26 | -------------------------------------------------------------------------------- /helper/tools.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import numpy as np 4 | import math 5 | from scipy import sparse as sps 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import normalize 9 | 10 | def softmax(x): 11 | if len(x.shape) > 1: 12 | tmp = np.max(x, axis=1) 13 | x -= tmp.reshape((x.shape[0], 1)) 14 | x = np.exp(x) 15 | tmp = np.sum(x, axis=1) 16 | x /= tmp.reshape((x.shape[0], 1)) 17 | else: 18 | tmp = np.max(x) 19 | x -= tmp 20 | x = np.exp(x) 21 | tmp = np.sum(x) 22 | x /= tmp 23 | 24 | return x 25 | 26 | 27 | def sigmoid(x): 28 | x = 1. 
/ (1 + np.exp(-x)) 29 | return x 30 | 31 | 32 | def sigmoid_grad(f): 33 | f = f * (1 - f) 34 | return f 35 | 36 | 37 | def xavier_init(n1, n2): 38 | return math.sqrt(6.0/(n1+n2)) 39 | 40 | 41 | def oversample(X_ot,y,p=0.165): 42 | pos_ot = X_ot[y==1] 43 | neg_ot = X_ot[y==0] 44 | 45 | scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1 46 | while scale > 1: 47 | neg_ot = sps.vstack([neg_ot, neg_ot]).tocsr() 48 | scale -=1 49 | neg_ot = sps.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]]).tocsr() 50 | ot = sps.vstack([pos_ot, neg_ot]).tocsr() 51 | y=np.zeros(ot.shape[0]) 52 | y[:pos_ot.shape[0]]=1.0 53 | print 'After oversample, the "is_duplicate" field mean: ', y.mean() 54 | return ot,y 55 | 56 | 57 | def data_iterator(orig_X, orig_y=None, orig_label=None, batch_size=10, shuffle=False, is_normalize=False): 58 | ''' 59 | 60 | :param orig_X: 61 | :param orig_y: 62 | :param orig_label: 63 | :param batch_size: 64 | :param shuffle: 65 | :return: 66 | ''' 67 | 68 | # Optionally shuffle the data before training 69 | if shuffle: 70 | indices = np.random.permutation(orig_X.shape[0]) 71 | data_X = orig_X[indices] 72 | data_y = orig_y[indices] 73 | data_label = orig_label[indices] 74 | else: 75 | data_X = orig_X 76 | data_y = orig_y 77 | data_label = orig_label 78 | ### 79 | total_processed_examples = 0 80 | total_steps = int(np.ceil(data_X.shape[0]) / float(batch_size)) 81 | for step in xrange(total_steps): 82 | # Create the batch by selecting up to batch_size elements 83 | batch_start = step * batch_size 84 | x = data_X[batch_start : batch_start + batch_size] 85 | y = data_y[batch_start : batch_start + batch_size] 86 | label = orig_label[batch_start : batch_start + batch_size] 87 | ''' 88 | if is_sparse: 89 | yield x.toarray(), y.toarray(), label 90 | else: 91 | yield x, y, label 92 | ''' 93 | if is_normalize: 94 | yield normalize(x, axis=0), normalize(y, axis=0), label 95 | else: 96 | yield x, y, label 97 | total_processed_examples += x.shape[0] 98 | # Sanity check to make sure we iterated over all the dataset as intended 99 | #assert total_processed_examples == data_X.shape[0], 'Expected {} and processed {}'.format(data_X.shape[0], total_processed_examples) 100 | -------------------------------------------------------------------------------- /helper/wordhash.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import pickle 3 | import sys 4 | 5 | import pandas as pd 6 | from scipy import sparse as sps 7 | 8 | from helper import distance 9 | 10 | 11 | class WordHash(object): 12 | 13 | ''' 14 | Implement word hash methods mentioned in the paper: Learning Deep Structured Semantic Models for Web Search using Clickthrough Data 15 | ''' 16 | 17 | def __init__(self, words, n_gram=3, marks='#', load_from_file=False, dump_to_file=False, file=None): 18 | ''' 19 | 20 | :param words: origin vacabulary 21 | :param n_gram: number of letters to make a n_gram term 22 | :param marks: the character added in the starting and ending position of a word 23 | :param load_from_file: load n_gram index map dict from file or not 24 | :param load_file: load file name 25 | ''' 26 | 27 | if load_from_file: 28 | with open(file, 'rb') as fr: 29 | self.__dict__ = pickle.load(fr).__dict__ # load an object 30 | return 31 | 32 | self.ori_words = words 33 | self.ori_len = len(self.ori_words) 34 | self.words = map(lambda x: marks+x.lower()+marks, list(set(self.ori_words))) 35 | self.ori_diff_len = len(self.words) 36 | self.n_gram = n_gram 37 | 
self.marks = marks 38 | self.hashed_words = set() 39 | self.n_gram_index_map = {} 40 | self.n_gram_size = 0 41 | 42 | print 'Sum number of origin words: ', self.ori_len 43 | print 'Sum number of origin diff words: ', self.ori_diff_len 44 | print 'Letter n-gram: ', self.n_gram 45 | 46 | self._get_hash_dict() 47 | 48 | if dump_to_file: 49 | with open(file, 'wb') as fw: 50 | pickle.dump(self, fw) 51 | 52 | 53 | def _split(self, word): 54 | ''' 55 | Split a word with lenth of self.n_gram_size 56 | :param self: 57 | :param word: word to be splited by n_gram len 58 | :return: 59 | ''' 60 | 61 | splited_ngrams = [] 62 | word_len = len(word) 63 | split_point = 0 64 | while split_point < word_len-1: # don't consider the last marks 65 | splited_ngrams.append(word[split_point : split_point+self.n_gram]) 66 | split_point += 1 67 | return splited_ngrams 68 | 69 | 70 | def _get_hash_dict(self): 71 | ''' 72 | Get n_gram terms and mapping them to indexes 73 | :return: 74 | ''' 75 | 76 | for one_word in self.words: 77 | ngram_words = self._split(one_word) 78 | self.hashed_words = self.hashed_words | set(ngram_words) 79 | 80 | word_keys = list(self.hashed_words) 81 | word_values = range(0, len(word_keys)) 82 | self.n_gram_index_map = dict(zip(word_keys, word_values)) 83 | self.n_gram_size = len(word_keys) 84 | 85 | print 'Sum numbers of n-grams: ', self.n_gram_size 86 | return self.hashed_words 87 | 88 | def get_n_gram_count(self, sentences, is_dump=False, dump_file=None): 89 | ''' 90 | Get n_gram counting term matrix 91 | :param sentences: sentences to be handled to get n_gram term counting matrix 92 | :param is_dump: whether dump the result to file or not 93 | :param dump_file: dump file name 94 | :return: n_gram term counting sparse matrix, shapes(sentences number, n_gram term size) 95 | ''' 96 | 97 | # n_gram_count = np.zeros((len(sentences), self.n_gram_size)) 98 | n_gram_count = sps.lil_matrix((len(sentences), self.n_gram_size)) 99 | sen_cnt = 0 100 | for one_sen in sentences: 101 | one_sen = one_sen.strip().split() 102 | for one_word in one_sen: 103 | one_word = one_word.strip() 104 | one_word = self.marks+one_word.lower()+self.marks 105 | splited_n_gram = self._split(one_word) 106 | n_gram_index = map(lambda x: self.n_gram_index_map[x], splited_n_gram) 107 | # n_gram_count[sen_cnt, n_gram_index] += 1 108 | for one_n_gram_index in n_gram_index: 109 | n_gram_count[sen_cnt, one_n_gram_index] += 1 110 | sen_cnt += 1 111 | if is_dump: 112 | pd.to_pickle(n_gram_count.tocsr(), dump_file) 113 | ''' 114 | with open(dump_file, 'wb') as fw: 115 | pickle.dump(n_gram_count.tocsr(), fw) 116 | ''' 117 | print 'Dump to file ', dump_file, ' done.' 
118 | sys.stdout.flush() 119 | print 'Get n_gram count matrix done, shape with: ', n_gram_count.shape 120 | return n_gram_count.tocsr() 121 | 122 | def test_WordHash(): 123 | 124 | sentence = 'Key words based text matching methods and semantic matching methods' 125 | 126 | print sentence.split() 127 | wordhash = WordHash(sentence.split(), load_from_file=False, load_file='n_gram_term_index_mapping.pkl', dump_to_file=False, dump_file='n_gram_term_index_mapping.pkl') 128 | print wordhash.n_gram_index_map 129 | n_gram_matrix = wordhash.get_n_gram_count(['key words text matching methods', 'semantic text matching methods']) 130 | 131 | print distance.cos_dis(n_gram_matrix[0].toarray().reshape([-1]), n_gram_matrix[1].toarray().reshape([-1])) 132 | 133 | if __name__ == '__main__': 134 | test_WordHash() 135 | -------------------------------------------------------------------------------- /quora_dssm.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import sys 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from sklearn.datasets import load_svmlight_file 7 | from sklearn.model_selection import KFold 8 | from sklearn.utils import shuffle 9 | 10 | import config 11 | from DSSM.dssm import DSSM 12 | from helper import tools 13 | 14 | 15 | def get_words(sentences): 16 | words = [] 17 | for one_sen in sentences: 18 | one_sen = one_sen.strip().split() 19 | one_sen = map(lambda x: x.strip(), one_sen) 20 | words += one_sen 21 | return words 22 | 23 | 24 | def quora_dssm(train_input_file, test_input_file): 25 | 26 | ''' 27 | 测试函数 28 | :return: 29 | ''' 30 | seed = 2222 31 | 32 | ''' 33 | train_ori = pd.read_csv(train_input_file) 34 | test = pd.read_csv(test_input_file)#, nrows=1001) 35 | test['is_duplicate'] = 0 36 | # train_ori = train_ori[:1000] 37 | # test = test_ori[:] 38 | 39 | print train_ori['is_duplicate'].value_counts() 40 | 41 | q = ['question1', 'question2'] 42 | words = [] 43 | for one_q in q: 44 | train_ori[one_q] = train_ori[one_q].astype(str) 45 | test[one_q] = test[one_q].astype(str) 46 | ''' 47 | #wordhash = WordHash(words, load_from_file=True, \ 48 | # dump_to_file=True, file='result/n_gram_term_index_mapping.pkl') 49 | #print 'Load n_gram_term_index_mapping.pkl done' 50 | #sys.stdout.flush() 51 | 52 | 53 | #train_ori_q1 = wordhash.get_n_gram_count(train_ori[q[0]].values, is_dump=True, dump_file='result/train_q1_ngram_counting_matrix.pkl') 54 | 55 | #train_ori_q2 = wordhash.get_n_gram_count(train_ori[q[1]].values, is_dump=True, dump_file='result/train_q2_ngram_counting_matrix.pkl') 56 | 57 | ''' 58 | with open('result/train_q1_ngram_counting_matrix.pkl', 'rb') as fr: 59 | train_ori_q1 = pickle.load(fr) 60 | with open('result/train_q2_ngram_counting_matrix.pkl', 'rb') as fr: 61 | train_ori_q2 = pickle.load(fr) 62 | print 'Get train origin sparse matrix done' 63 | sys.stdout.flush() 64 | ''' 65 | 66 | ''' 67 | y = train_ori['is_duplicate'].values[:] 68 | y_t = test['is_duplicate'].values[:] 69 | del train_ori 70 | 71 | test_q1 = pd.read_pickle('result/test_q1_ngram_counting_matrix.pkl') 72 | test_q2 = pd.read_pickle('result/test_q2_ngram_counting_matrix.pkl') 73 | 74 | #test_q1 = wordhash.get_n_gram_count(test[q[0]].values, is_dump=True, dump_file='result/test_q1_ngram_counting_matrix.pkl') 75 | #test_q2 = wordhash.get_n_gram_count(test[q[1]].values, is_dump=True, dump_file='result/test_q2_ngram_counting_matrix.pkl') 76 | del test 77 | 78 | print 'Get test origin sparse matrix done' 79 | sys.stdout.flush() 80 | ''' 81 | 82 | 
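# Note (added comment): the commented-out blocks in this function are the one-off
# preprocessing path: read the Quora CSVs, hash question1/question2 into letter
# n-gram count matrices with helper.wordhash.WordHash, hstack the two matrices,
# and dump them in svmlight format; the live code below only reloads those dumps.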
83 | ''' 84 | X = sps.hstack( 85 | [train_ori_q1, train_ori_q2] 86 | ).tocsr() 87 | 88 | X_t = sps.hstack( 89 | [test_q1, test_q2] 90 | ).tocsr() 91 | ''' 92 | 93 | ''' 94 | Get origin train and test svm format file. 95 | ''' 96 | 97 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.svm') 98 | #dump_svmlight_file(X_t, y_t, 'result/test_n_gram_counting_sparse_matrix.svm') 99 | 100 | #X, y, X_test, y_test = load_svmlight_files(['result/train_ori_n_gram_counting_sparse_matrix.svm', 'result/test_n_gram_counting_sparse_matrix.svm']) # 注意load_svmlight_file的shape是根据推断来的,可能导致不一致 101 | #X = normalize(X, axis=0) 102 | #X_test = normalize(X_test, axis=0) 103 | 104 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.norm.svm') 105 | #dump_svmlight_file(X_test, y_test, 'result/test_n_gram_counting_sparse_matrix.norm.svm') 106 | 107 | X, y = load_svmlight_file('result/train_ori_n_gram_counting_sparse_matrix.norm.mini.svm', n_features=111166) 108 | print X.shape 109 | used_as_train = X.shape[0]/10 110 | X = X[:used_as_train] 111 | y = y[:used_as_train] 112 | 113 | print y[:10] 114 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.norm.mini.svm') 115 | 116 | #print 'X_train shape: ', X.shape, ' X_test shape: ', X_test.shape 117 | print 'Load done' 118 | sys.stdout.flush() 119 | 120 | skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X) 121 | for ind_tr, ind_te in skf: 122 | X_train = X[ind_tr] 123 | y_train = y[ind_tr] 124 | 125 | X_valid = X[ind_te] 126 | y_valid = y[ind_te] 127 | break 128 | 129 | X_train, y_train = tools.oversample(X_train.tocsr(), y_train, p=0.165) 130 | X_valid, y_valid = tools.oversample(X_valid.tocsr(), y_valid, p=0.165) 131 | 132 | X_train, y_train = shuffle(X_train, y_train, random_state=seed) 133 | 134 | #dump_svmlight_file(X_train, y_train, 'result/oversample_train_n_gram_counting_sparse_matrix.svm') 135 | #dump_svmlight_file(X_valid, y_valid, 'result/oversample_valid_n_gram_counting_sparse_matrix.svm') 136 | 137 | #print 'Dump to svm format done.' 
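# Note (added comment): tools.oversample duplicates negative rows until the
# positive rate of a split drops to roughly p. p=0.165 is the commonly assumed
# positive rate of the Kaggle Quora test set, while the training data is about
# 37% positive; rebalancing both splits keeps the training/validation log loss
# comparable to the leaderboard metric.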
138 | 139 | ''' 140 | for _ in q: 141 | train_ori[_] = train_ori[_].astype(str) 142 | test[_] = test_ori[_].astype(str) 143 | words += get_words(train_ori[_].values) 144 | words += get_words(test[_].values) 145 | 146 | print 'Sum words: ', len(words), ' sum diff words: ', len(set(words)) 147 | 148 | wordhash = WordHash(words, load_from_file=True, load_file='n_gram_term_index_mapping.pkl', \ 149 | dump_to_file=True, dump_file='n_gram_term_index_mapping.pkl') 150 | 151 | 152 | split_point = int(0.7 * len(train_ori)) 153 | train = train_ori[:split_point] 154 | valid = train_ori[split_point:] 155 | 156 | train_q1 = wordhash.get_n_gram_count(train[q[0]].values, is_dump=True, dump_file='result/train_q1_ngram_counting_matrix.pkl') 157 | train_q2 = wordhash.get_n_gram_count(train[q[1]].values, is_dump=True, dump_file='result/train_q2_ngram_counting_matrix.pkl') 158 | train_label = train['is_duplicate'].values 159 | 160 | valid_q1 = wordhash.get_n_gram_count(valid[q[0]].values, is_dump=True, dump_file='result/valid_q1_ngram_counting_matrix.pkl') 161 | valid_q2 = wordhash.get_n_gram_count(valid[q[1]].values, is_dump=True, dump_file='result/valid_q2_ngram_counting_matrix.pkl') 162 | valid_label = valid['is_duplicate'].values 163 | 164 | test_q1 = wordhash.get_n_gram_count(test[q[0]].values, is_dump=True, dump_file='result/test_q1_ngram_counting_matrix.pkl') 165 | test_q2 = wordhash.get_n_gram_count(test[q[1]].values, is_dump=True, dump_file='result/test_q2_ngram_counting_matrix.pkl') 166 | test_label = test['is_duplicate'].values 167 | 168 | ''' 169 | print 'train shape: ', X_train.shape, 'valid shape: ', X_valid.shape 170 | #print 'test shape: ', X_test.shape 171 | n_gram_size = X_train.shape[1] 172 | #n_gram_size = X_test.shape[1] 173 | sys.stdout.flush() 174 | 175 | with tf.Graph().as_default(): 176 | tf.set_random_seed(1) 177 | model = DSSM(hash_tokens_nums=n_gram_size/2, dnn_layer_nums=2, dnn_hidden_node_nums=288, feature_nums=64, batch_size=X_train.shape[0], neg_nums=0, learning_rate=0.001, max_epochs=400, loss_kind='log_loss', w_init=1,save_model_path='result/save-model', mlp_hidden_node_nums=16, mlp_layer_nums=100,input_is_sparse=True) 178 | sess = tf.Session() 179 | init = tf.initialize_all_variables() 180 | sess.run(init) 181 | np.random.seed(1) 182 | 183 | # query = np.random.rand(500, 30000) 184 | # doc = np.random.rand(500, 30000) 185 | # label = np.array([1, 0, 0, 0, 0] * 100) 186 | # model.set_positive_weights([1]*500) 187 | 188 | #print query 189 | #print doc 190 | #print label 191 | X_train_q1 = X_train[:, :n_gram_size/2] 192 | X_train_q2 = X_train[:, n_gram_size/2:] 193 | 194 | X_valid_q1 = X_valid[:, :n_gram_size/2] 195 | X_valid_q2 = X_valid[:, n_gram_size/2:] 196 | 197 | #X_test_q1 = X_test[:, :n_gram_size/2] 198 | #X_test_q2 = X_test[:, n_gram_size/2:] 199 | 200 | losses = model.fit(sess, X_train_q1, X_train_q2, y_train, X_valid_q1, X_valid_q2, y_valid, load_model=False) 201 | ''' 202 | print 'Start to test. ' 203 | 204 | test['is_duplicate'] = model.predict(sess, X_test_q1, X_test_q2, y_test, is_sparse=True) 205 | test[['test_id', 'is_duplicate']].to_csv('result/out.csv', index=False) 206 | ''' 207 | if __name__ == '__main__': 208 | 209 | train_file = config.train_file 210 | test_file = config.test_file 211 | 212 | quora_dssm(train_file, test_file) 213 | --------------------------------------------------------------------------------
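A hypothetical end-to-end usage sketch (not part of the repository; assumes Python 2, the old TensorFlow API used above, and both the repo root and `helper/` on the import path, as `quora_dssm.py` itself assumes): build letter-trigram count matrices with `WordHash`, then fit the `DSSM` scorer on a toy set of sentence pairs.

```python
#coding=utf-8
# Hypothetical usage sketch assembled only from the signatures shown above.
import numpy as np
import tensorflow as tf

from DSSM.dssm import DSSM
from helper.wordhash import WordHash

q1 = ['key words text matching methods', 'semantic text matching methods']
q2 = ['text matching with key words', 'a picture of a cat']
labels = np.array([1.0, 0.0])            # 1 = the two sentences match

# One shared letter-trigram vocabulary for both sides of each pair.
wordhash = WordHash(' '.join(q1 + q2).split())
x_q = wordhash.get_n_gram_count(q1).toarray()
x_d = wordhash.get_n_gram_count(q2).toarray()

with tf.Graph().as_default():
    model = DSSM(hash_tokens_nums=wordhash.n_gram_size, batch_size=2, neg_nums=0,
                 loss_kind='log_loss', learning_rate=0.01, max_epochs=20)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    losses = model.fit(sess, x_q, x_d, labels)
```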