├── .gitignore
├── DSSM
│   ├── README.md
│   ├── __init__.py
│   └── dssm.py
├── README.md
├── helper
│   ├── __init__.py
│   ├── distance.py
│   ├── tools.py
│   └── wordhash.py
└── quora_dssm.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Personal ignore
2 | dataset/
3 | *.pkl
4 | result/
5 | config.py
6 | nohup.out
7 |
8 | # Compiled source #
9 | ###################
10 | *.com
11 | *.class
12 | *.dll
13 | *.exe
14 | *.o
15 | *.so
16 |
17 | # Packages #
18 | ############
19 | # it's better to unpack these files and commit the raw source
20 | # git has its own built in compression methods
21 | *.7z
22 | *.dmg
23 | *.gz
24 | *.iso
25 | *.jar
26 | *.rar
27 | *.tar
28 | *.zip
29 |
30 | # Logs and databases #
31 | ######################
32 | *.log
33 | *.sql
34 | *.sqlite
35 |
36 | # OS generated files #
37 | ######################
38 | .DS_Store
39 | .DS_Store?
40 | ._*
41 | .Spotlight-V100
42 | .Trashes
43 | ehthumbs.db
44 | Thumbs.db
45 |
46 | # IDE files #
47 | #############
48 | nbproject
49 | .~lock.*
50 | .buildpath
51 | .idea
52 | .project
53 | .settings
54 | composer.lock
55 |
56 | # Byte-compiled / optimized / DLL files
57 | __pycache__/
58 | *.py[cod]
59 | *$py.class
60 |
61 | # C extensions
62 | *.so
63 |
64 | # Distribution / packaging
65 | .Python
66 | env/
67 | build/
68 | develop-eggs/
69 | dist/
70 | downloads/
71 | eggs/
72 | .eggs/
73 | lib/
74 | lib64/
75 | parts/
76 | sdist/
77 | var/
78 | wheels/
79 | *.egg-info/
80 | .installed.cfg
81 | *.egg
82 |
83 | # PyInstaller
84 | # Usually these files are written by a python script from a template
85 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
86 | *.manifest
87 | *.spec
88 |
89 | # Installer logs
90 | pip-log.txt
91 | pip-delete-this-directory.txt
92 |
93 | # Unit test / coverage reports
94 | htmlcov/
95 | .tox/
96 | .coverage
97 | .coverage.*
98 | .cache
99 | nosetests.xml
100 | coverage.xml
101 | *,cover
102 | .hypothesis/
103 |
104 | # Translations
105 | *.mo
106 | *.pot
107 |
108 | # Django stuff:
109 | *.log
110 | local_settings.py
111 |
112 | # Flask stuff:
113 | instance/
114 | .webassets-cache
115 |
116 | # Scrapy stuff:
117 | .scrapy
118 |
119 | # Sphinx documentation
120 | docs/_build/
121 |
122 | # PyBuilder
123 | target/
124 |
125 | # Jupyter Notebook
126 | .ipynb_checkpoints
127 |
128 | # pyenv
129 | .python-version
130 |
131 | # celery beat schedule file
132 | celerybeat-schedule
133 |
134 | # SageMath parsed files
135 | *.sage.py
136 |
137 | # dotenv
138 | .env
139 |
140 | # virtualenv
141 | .venv
142 | venv/
143 | ENV/
144 |
145 | # Spyder project settings
146 | .spyderproject
147 |
148 | # Rope project settings
149 | .ropeproject
150 |
151 |
--------------------------------------------------------------------------------
/DSSM/README.md:
--------------------------------------------------------------------------------
1 |
2 | DSSM : word hash & DNN
3 |
4 | ## Dataset
5 | -------------
6 | [search query log data](http://jeffhuang.com/search_query_logs.html)
7 | https://www.quora.com/Where-can-I-find-dataset-having-search-query-logs-from-general-purpose-search-engines-Google-Yahoo-etc
8 |
9 |
10 |
11 | ## References
12 | -------------
13 | [Microsoft/CNTK](https://github.com/Microsoft/CNTK/wiki/Train-a-DSSM-(or-a-convolutional-DSSM)-model)
14 | https://github.com/airalcorn2/Deep-Semantic-Similarity-Model
15 |
16 |
--------------------------------------------------------------------------------
/DSSM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Accagain2014/TextMatching/b22d8d705da64a34293d2079e577027c819c00d1/DSSM/__init__.py
--------------------------------------------------------------------------------
/DSSM/dssm.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import sys
4 | sys.path.append('../helper/')
5 | import time
6 |
7 | import numpy as np
8 | import tensorflow as tf
9 |
10 | import tools
11 |
12 |
13 | class DSSM(object):
14 | '''
15 |     Implement the DSSM model from the paper: Learning Deep Structured Semantic Models for Web Search using Clickthrough Data
16 | '''
17 | def __init__(self, hash_tokens_nums=3000, dnn_layer_nums=1, dnn_hidden_node_nums=50, feature_nums=50,
18 | batch_size=10, neg_nums=4, learning_rate=0.5, max_epochs=200, loss_kind='mcl', w_init=0.1, \
19 | save_model_path='./', mlp_hidden_node_nums=32, mlp_layer_nums=2, input_is_sparse=False):
20 | '''
21 |         params:
22 |             hash_tokens_nums: vocabulary size after word hashing
23 |             dnn_layer_nums: number of DNN layers
24 |             dnn_hidden_node_nums: number of hidden nodes per DNN layer
25 |             feature_nums: dimension of the final output feature vector
26 |             batch_size: size of each mini-batch
27 |             neg_nums: number of negative samples per positive sample
28 |             learning_rate: learning rate
29 |             max_epochs: number of training epochs
30 |             loss_kind: 'mcl': maximize the conditional likelihood of the positive doc; 'log_loss': cross-entropy loss
31 |             w_init: weight initialization bound
32 |             save_model_path: path used to save the model that performs best on the validation set
33 |             mlp_hidden_node_nums: number of nodes per MLP layer applied to the concatenated learned vectors
34 |             mlp_layer_nums: number of MLP layers
35 |             input_is_sparse: whether the input is a sparse matrix
36 | '''
37 |
38 | self.hash_token_nums = hash_tokens_nums
39 | self.dnn_layer_nums = dnn_layer_nums
40 | self.dnn_hidden_node_nums = dnn_hidden_node_nums
41 | self.feature_nums = feature_nums
42 | self.batch_size = batch_size
43 | self.neg_nums = neg_nums
44 | self.learning_rate = learning_rate
45 | self.max_epochs = max_epochs
46 | self.loss_kind = loss_kind
47 | self.positive_weights = 1
48 | self.w_init = w_init
49 | self.save_model_path = save_model_path
50 | self.mlp_hidden_node_nums = mlp_hidden_node_nums
51 | self.mlp_layer_nums = mlp_layer_nums
52 | self.input_is_sparse = input_is_sparse
53 |
54 | '''
55 |         query and doc can use different network structures, as mentioned in the paper
56 | '''
57 | if not self.input_is_sparse:
58 | self.input_q = tf.placeholder(tf.float32, shape=[None, self.hash_token_nums]) # sample_nums, word_nums, hash_tokens_nums
59 | self.input_doc = tf.placeholder(tf.float32, shape=[None, self.hash_token_nums]) # sample_nums, word_nums, hash_tokens_nums
60 | else:
61 | self.input_q = tf.sparse_placeholder(tf.float32, shape=[None, self.hash_token_nums])
62 | self.input_doc = tf.sparse_placeholder(tf.float32, shape=[None, self.hash_token_nums])
63 |
64 | self.label = tf.placeholder(tf.float32, shape=[None])
65 |
66 | self.predict_doc = None
67 | self.predict_query = None
68 |
69 | self.relevance = self.create_model_op()
70 |
71 |         if self.loss_kind == 'mcl':
72 | self.loss = self.create_loss_max_condition_lh_op()
73 | elif self.loss_kind == 'log_loss':
74 | self.loss = self.create_log_loss_op()
75 | else:
76 | pass
77 |
78 | self.train = self.create_train_op()
79 |
80 | def set_positive_weights(self, positive_weights):
81 | self.positive_weights = positive_weights
82 |
83 | def create_model_op(self):
84 |
85 | '''
86 |         Build the whole model as two towers: the query side and the doc side
87 | '''
88 |
89 | features = []
90 | structures = ['query_dnn', 'doc_dnn']
91 | input_dict = {
92 | structures[0]: self.input_q,
93 | structures[1]: self.input_doc
94 | }
95 |
96 | '''
97 |         here a single shared structure is used for both sides as a first attempt
98 | '''
99 |
100 | result = [0] * 2
101 | with tf.variable_scope('DNN'):
102 | now_w_init = tools.xavier_init(self.hash_token_nums, self.dnn_hidden_node_nums)
103 | w = tf.Variable(
104 | tf.random_uniform([self.hash_token_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer1')
105 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer1")
106 |
107 | result[0] = input_dict['query_dnn']
108 | result[1] = input_dict['doc_dnn']
109 |
110 | if self.input_is_sparse:
111 | result[0] = tf.sparse_tensor_dense_matmul(result[0], w) + b
112 | result[1] = tf.sparse_tensor_dense_matmul(result[1], w) + b
113 |
114 | else:
115 | result[0] = tf.matmul(result[0], w) + b
116 | result[1] = tf.matmul(result[1], w) + b
117 |
118 | result[0] = tf.nn.tanh(result[0])
119 | result[1] = tf.nn.tanh(result[1])
120 |
121 |
122 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums)
123 | w = tf.Variable(
124 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2')
125 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer2")
126 | result[0] = tf.matmul(result[0], w) + b
127 | result[0] = tf.nn.tanh(result[0])
128 | result[1] = tf.matmul(result[1], w) + b
129 | result[1] = tf.nn.tanh(result[1])
130 |
131 |
132 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums)
133 | w = tf.Variable(
134 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer3')
135 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer3")
136 | result[0] = tf.matmul(result[0], w) + b
137 | result[0] = tf.nn.tanh(result[0])
138 | result[1] = tf.matmul(result[1], w) + b
139 | result[1] = tf.nn.tanh(result[1])
140 |
141 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums)
142 | w = tf.Variable(
143 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer4')
144 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer4")
145 | result[0] = tf.matmul(result[0], w) + b
146 | result[0] = tf.nn.tanh(result[0])
147 | result[1] = tf.matmul(result[1], w) + b
148 | result[1] = tf.nn.tanh(result[1])
149 | '''
150 |
151 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.dnn_hidden_node_nums)
152 | w = tf.Variable(
153 | tf.random_uniform([self.dnn_hidden_node_nums, self.dnn_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer5')
154 | b = tf.Variable(tf.zeros([self.dnn_hidden_node_nums]), name="bias_DNN_layer5")
155 | result[0] = tf.matmul(result[0], w) + b
156 | result[0] = tf.nn.tanh(result[0])
157 | result[1] = tf.matmul(result[1], w) + b
158 | result[1] = tf.nn.tanh(result[1])
159 | '''
160 |
161 |
162 | now_w_init = tools.xavier_init(self.dnn_hidden_node_nums, self.feature_nums)
163 | w = tf.Variable(
164 | tf.random_uniform([self.dnn_hidden_node_nums, self.feature_nums], -now_w_init, now_w_init), name='weights_DNN_layer_last')
165 | b = tf.Variable(tf.zeros([self.feature_nums]), name="bias_DNN_layer_last")
166 | result[0] = tf.matmul(result[0], w) + b
167 | result[0] = tf.nn.tanh(result[0])
168 | result[1] = tf.matmul(result[1], w) + b
169 | result[1] = tf.nn.tanh(result[1])
170 |
171 |
172 | '''
173 | i = tf.constant(0)
174 | sum_layer = self.dnn_layer_nums
175 | #node_nums = tf.convert_to_tensor([self.dnn_hidden_node_nums] * self.dnn_layer_nums + [self.dnn_hidden_node_nums])
176 | node_nums = [self.dnn_hidden_node_nums] * self.dnn_layer_nums + [self.dnn_hidden_node_nums]
177 |
178 | cond = lambda x, layer, result: tf.less(x, sum_layer)
179 | layer = 0
180 | def body(i, layer, result):
181 | tmp = tf.add(i, 1)
182 | w = tf.Variable(
183 | tf.random_uniform([node_nums[layer], node_nums[layer+1]], -self.w_init, self.w_init))
184 | b = tf.Variable(tf.zeros([node_nums[layer+1]]))
185 |
186 | result[0] = tf.matmul(result[0], w) + b
187 | result[0] = tf.nn.tanh(result[0])
188 | result[1] = tf.matmul(result[1], w) + b
189 | result[1] = tf.nn.tanh(result[1])
190 |
191 | return tmp, layer, result
192 |
193 | i, _, result = tf.while_loop(cond, body, [i, layer, result])
194 | '''
195 |
196 | features.append(result[0])
197 | features.append(result[1])
198 |
199 |
200 | self.predict_query = features[0]
201 | self.predict_doc = features[1]
202 |
203 | '''
204 |         To score the similarity of the two learned vectors, MLP layers are added on top of their concatenation, ending in a fully connected output layer
205 |
206 | '''
207 |
208 | result = tf.concat(features, -1)
209 |
210 | with tf.variable_scope('mlp'):
211 | node_nums = tf.convert_to_tensor([self.feature_nums*2] + [self.mlp_hidden_node_nums] * self.mlp_layer_nums + [1])
212 | sum_layer = self.mlp_hidden_node_nums + 1
213 |
214 |
215 | now_w_init = tools.xavier_init(self.feature_nums * 2, self.mlp_hidden_node_nums)
216 | w = tf.Variable(
217 | tf.random_uniform([self.feature_nums*2, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer1')
218 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer1")
219 | result = tf.matmul(result, w) + b
220 | result = tf.nn.tanh(result)
221 |
222 | '''
223 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums)
224 | w = tf.Variable(
225 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2')
226 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer2")
227 | result = tf.matmul(result, w) + b
228 | result = tf.nn.tanh(result)
229 |
230 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums)
231 | w = tf.Variable(
232 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer2')
233 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer2")
234 | result = tf.matmul(result, w) + b
235 | result = tf.nn.tanh(result)
236 |
237 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums)
238 | w = tf.Variable(
239 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer3')
240 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer3")
241 | result = tf.matmul(result, w) + b
242 | result = tf.nn.tanh(result)
243 |
244 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums)
245 | w = tf.Variable(
246 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer4')
247 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer4")
248 | result = tf.matmul(result, w) + b
249 | result = tf.nn.tanh(result)
250 |
251 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, self.mlp_hidden_node_nums)
252 | w = tf.Variable(
253 | tf.random_uniform([self.mlp_hidden_node_nums, self.mlp_hidden_node_nums], -now_w_init, now_w_init), name='weights_DNN_layer5')
254 | b = tf.Variable(tf.zeros([self.mlp_hidden_node_nums]), name="bias_DNN_layer5")
255 | result = tf.matmul(result, w) + b
256 | result = tf.nn.tanh(result)
257 | '''
258 |
259 | now_w_init = tools.xavier_init(self.mlp_hidden_node_nums, 1)
260 | w = tf.Variable(
261 | tf.random_uniform([self.mlp_hidden_node_nums, 1], -now_w_init, now_w_init), name='weights_DNN_layer_last')
262 | b = tf.Variable(tf.zeros([1]), name="bias_DNN_layer_last")
263 | result = tf.matmul(result, w) + b
264 | result = tf.nn.sigmoid(result)
265 |
266 |
267 | # norms1 = tf.sqrt(tf.reduce_sum(tf.square(features[0]), 1, keep_dims=False))
268 | # norms2 = tf.sqrt(tf.reduce_sum(tf.square(features[1]), 1, keep_dims=False))
269 | # relevance = tf.reduce_sum(features[0] * features[1], 1) / norms1 / norms2
270 |
271 | # w_r = tf.Variable(tf.random_uniform([1], -self.w_init, self.w_init), name="weight-of-relevance")
272 | # b_r = tf.Variable(tf.zeros([1]), name="bais-of-relevance")
273 | # relevance = relevance * w_r + b_r
274 | # relevance = tf.nn.softmax(relevance)
275 |
276 | return tf.reshape(result, [-1])
277 |
278 |
279 | def create_loss_max_condition_lh_op(self):
280 | '''
281 |         Compute the loss by maximizing the conditional likelihood of the positive (clicked) doc;
282 |         the smoothing factor gamma is learned as well instead of using a fixed empirical value
283 | :return:
284 | '''
285 | gama = tf.Variable(tf.random_uniform([1]), name="gama")
286 | ret = self.relevance * gama
287 | ret = tf.reshape(ret, [-1, self.neg_nums+1])
288 | ret = tf.log(tf.nn.softmax(ret))
289 |         ret = tf.log(tf.nn.softmax(ret))
289 |         ret = tf.reduce_sum(ret, 0) # sum over the batch dimension (rows)
290 |         return -tf.gather(ret, 0) # take the first column, i.e. the loss of the positive samples
291 |
292 |
293 | def create_log_loss_op(self):
294 | '''
295 |         Compute the log loss, i.e. the cross entropy between predicted relevance and label
296 | :return:
297 | '''
298 | return tf.reduce_sum(tf.contrib.losses.log_loss(self.relevance, self.label))
299 |
300 |
301 | def create_train_op(self):
302 | '''
303 |         Train with the Adam optimizer (plain gradient descent is kept commented out below)
304 | :return:
305 | '''
306 | return tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
307 | #return tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)
308 |
309 |
310 | def creat_feed_dict(self, query_batch, doc_batch, label_batch):
311 | '''
312 |         :param query_batch: query input batch
313 |         :param doc_batch: document input batch
314 |         :param label_batch: relevance labels for the query-document pairs
315 | :return:
316 | '''
317 |
318 | if self.input_is_sparse:
319 | query_coo_matrix = query_batch.tocoo()
320 | doc_coo_matrix = doc_batch.tocoo()
321 | self.feed_dict = {
322 | self.input_q : tf.SparseTensorValue(np.array([query_coo_matrix.row, query_coo_matrix.col]).T, query_batch.data, query_batch.shape),
323 | self.input_doc : tf.SparseTensorValue(np.array([doc_coo_matrix.row, doc_coo_matrix.col]).T, doc_batch.data, doc_batch.shape),
324 | self.label : label_batch
325 | }
326 | else:
327 | self.feed_dict = {
328 | self.input_q : query_batch,
329 | self.input_doc: doc_batch,
330 | self.label : label_batch
331 | }
332 |
333 |
334 | def run_epoch(self, sess, query_input, doc_input, labels, is_valid=False):
335 | '''
336 |         Run one full pass (epoch) over the data; weights are updated unless is_valid is True
337 | :param sess:
338 | :param query_input:
339 | :param doc_input:
340 | :param labels:
341 | :return:
342 | '''
343 | average_loss = 0
344 | step = 0
345 | relevance = []
346 |
347 | for step, (query, doc, label) in enumerate(
348 | tools.data_iterator(query_input, doc_input, labels, self.batch_size, shuffle=True, is_normalize=True)
349 | ):
350 | # print query[1, 1], doc[1, 1], label[1]
351 | self.creat_feed_dict(query, doc, label)
352 | #print query.shape, doc.shape, label.shape
353 | #print type(query),is_sparse
354 | # self.set_positive_weights(len(query))
355 |
356 | # shape1, shape2, shape3 = sess.run([self.shape_1, self.shape_2, self.shape_3], feed_dict=self.feed_dict)
357 | # print shape1, shape2, shape3
358 |
359 | if not is_valid:
360 |                 # weights are only updated when the train op is run (not during validation)
361 | _, loss_value, predict_query, predict_doc, relevance = sess.run([self.train, self.loss, self.predict_query\
362 | , self.predict_doc, self.relevance], feed_dict=self.feed_dict)
363 | else:
364 |
365 | loss_value, relevance = sess.run([self.loss, self.relevance], feed_dict=self.feed_dict)
366 | # print 'Chcek ', sklearn.metrics.log_loss(label, relevance), loss_value
367 |
368 | average_loss += loss_value
369 | #print 'step ', step, loss_value
370 | #print 'predict ', predict_query[0], predict_doc[0], relevance[0]
371 | return average_loss / (step+1), relevance
372 |
373 |
374 | def fit(self, sess, query_input, doc_input, labels, valid_q_input=None, valid_d_input=None, valid_labels=None, \
375 | load_model=False):
376 | '''
377 |         Entry point for training (and optionally validating / restoring) the model
378 | :param sess:
379 | :param query_input:
380 | :param doc_input:
381 | :param labels:
382 | :return:
383 | '''
384 | losses = []
385 | best_loss = 99999
386 | saver = tf.train.Saver()
387 | if load_model:
388 | saver.restore(sess, self.save_model_path)
389 | start_time = time.time()
390 | valid_loss, _ = self.run_epoch(sess, valid_q_input, valid_d_input, valid_labels, is_valid=True)
391 | duration = time.time() - start_time
392 | print('valid loss = %.5f (%.3f sec)'
393 | % (valid_loss, duration))
394 | losses.append(valid_loss)
395 | return losses
396 |
397 | for epoch in range(self.max_epochs):
398 | start_time = time.time()
399 | average_loss, relevance = self.run_epoch(sess, query_input, doc_input, labels)
400 | duration = time.time() - start_time
401 |
402 | if (epoch+1) % 1 == 0:
403 | if valid_labels is None:
404 | print('Epoch %d: loss = %.5f relevance[0] = %.5f (%.3f sec)'
405 | % (epoch+1, average_loss, relevance[0], duration))
406 | else:
407 | valid_loss, _ = self.run_epoch(sess, valid_q_input, valid_d_input, valid_labels, is_valid=True)
408 | if valid_loss < best_loss:
409 | print 'Save model'
410 | best_loss = valid_loss
411 | saver.save(sess, self.save_model_path)
412 | duration = time.time() - start_time
413 | print('Epoch %d: loss = %.5f valid loss = %.5f relevance[0] = %.5f (%.3f sec)'
414 | % (epoch+1, average_loss, valid_loss, relevance[0], duration))
415 | sys.stdout.flush()
416 | losses.append(average_loss)
417 |
418 | if not valid_labels is None:
419 | print 'Final valid loss: ', best_loss
420 | return losses
421 |
422 | def predict(self, sess, query, doc, labels):
423 | '''
424 |         Compute the predicted relevance between queries and documents
425 | :param sess:
426 | :param query:
427 | :param doc:
428 | :param labels:
429 | :return:
430 | '''
431 |         if not self.input_is_sparse:
432 | self.creat_feed_dict(query, doc, labels)
433 | predict = sess.run(self.relevance, feed_dict=self.feed_dict)
434 | else:
435 | predict = []
436 | for step, (query_, doc_, label_) in enumerate(
437 |                 tools.data_iterator(query, doc, labels, self.batch_size, shuffle=False, is_normalize=True)
438 |             ):
439 |                 self.creat_feed_dict(query_, doc_, label_)
440 | now_pre = sess.run(self.relevance, feed_dict=self.feed_dict)
441 | predict += now_pre
442 |
443 | return predict
444 |
445 |
446 |
447 | def test_dssm():
448 | '''
449 |     Smoke test: train the model on random data
450 | :return:
451 | '''
452 | with tf.Graph().as_default():
453 | tf.set_random_seed(1)
454 |
455 | model = DSSM(hash_tokens_nums=30000, dnn_layer_nums=2, dnn_hidden_node_nums=300, feature_nums=128,
456 | batch_size=10, neg_nums=4, learning_rate=0.02, max_epochs=500)
457 | sess = tf.Session()
458 | init = tf.initialize_all_variables()
459 | sess.run(init)
460 | np.random.seed(1)
461 |
462 | query = np.random.rand(500, 30000)
463 | doc = np.random.rand(500, 30000)
464 | label = np.array([1, 0, 0, 0, 0] * 100)
465 |
466 | #print query
467 | #print doc
468 | #print label
469 |
470 | losses = model.fit(sess, query, doc, label)
471 |
472 | #print losses[-1]
473 |
474 |
475 | if __name__ == '__main__':
476 | test_dssm()
477 |
--------------------------------------------------------------------------------
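Note: a minimal NumPy sketch of what the `mcl` loss in `create_loss_max_condition_lh_op` above computes (relevance scores scaled by gamma, grouped as 1 positive followed by `neg_nums` negatives, softmax over each group, negative log-probability of the positive summed over the batch). The scores below are made up for illustration.

```python
import numpy as np

def mcl_loss(relevance, neg_nums, gamma=1.0):
    """Negative log-likelihood of the positive doc in each (1 + neg_nums) group."""
    scores = gamma * np.asarray(relevance, dtype=np.float64)
    scores = scores.reshape(-1, neg_nums + 1)       # one row per query group, positive doc first
    scores -= scores.max(axis=1, keepdims=True)     # numerical stability
    probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return -np.sum(np.log(probs[:, 0]))             # column 0 holds the positive doc

# Toy batch: 2 query groups, each with 1 positive followed by 4 negatives.
relevance = [0.9, 0.1, 0.2, 0.0, 0.3,
             0.8, 0.4, 0.1, 0.2, 0.1]
print(mcl_loss(relevance, neg_nums=4))
```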
/README.md:
--------------------------------------------------------------------------------
1 | ## For what
2 | Understanding the methods in the text matching area, including keyword-based matching models & latent semantic matching models.
3 | Implement the classical methods.
4 |
5 | ## Categories
6 | - traditional models (feature-based models)
7 |   - key-words based methods
8 |     - tf-idf model
9 |     - common words rate model
10 |     - finding the most important words by adding syntax information
11 |   - boosting models
12 |   - linear models
13 |   - factorization machine
14 | - semantic deep models
15 |   - representation-based models
16 |     - DSSM, CDSSM
17 |   - interaction-based models
18 |
19 |
20 | ## People in this area
21 | - [Po-Sen Huang](https://posenhuang.github.io/full_publication.html)
22 | - [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/)
23 | - [Richard Socher](http://www.socher.org/index.php/Main/HomePage)
24 | - [Hang Li](http://www.hangli-hl.com/index.html)
25 |
26 | ## Survey
27 | > [深度文本匹配综述(A Survey on Deep Text Matching)](http://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CAPJLAST&filename=JSJX20160920002&uid=WEEvREcwSlJHSldRa1FhdXNXYXJvK0FZMlhXUDZsYnBMQjhHTElMeE1jRT0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4ggI8Fm4gTkoUKaID8j8gFw!!&v=MzA2OTFscVdNMENMTDdSN3FlWU9ac0ZDcmxWYnZPSTFzPUx6N0Jkckc0SDlmTXBvMUZaT3NOWXc5TXptUm42ajU3VDNm)
28 |
29 |
30 |
31 | ## Methods & Papers about Semantic Methods
32 |
33 | > [**DSSM**](./DSSM/dssm.py)
34 | [Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://posenhuang.github.io/papers/cikm2013_DSSM_fullversion.pdf)
35 | CIKM 2013
36 | Bag-of-words model, representation-based structure, word hash + DNN
37 | [Detailed explanation](http://www.leiphone.com/news/201607/TRldqYnzm6nRbEnY.html)
38 | [Code](./DSSM/dssm.py)
39 | -----
40 | > [**CDSSM**]()
41 | [Learning Semantic Representations Using Convolutional Neural Networks for Web Search](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/www2014_cdssm_p07.pdf)
42 | WWW 2014, word hash + CNN + DNN
43 | ----
44 |
45 | > [**CLSM**]()
46 | [A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2014_cdssm_final.pdf)
47 | CIKM 2014
48 | Matching-based structure, word hash + CNN; what is the difference between CLSM and C-DSSM?
49 | ----
50 |
51 | > [**Applications of DSSM**]()
52 | [Modeling Interestingness with Deep Neural Networks](https://www.microsoft.com/en-us/research/wp-content/uploads/2014/10/604_Paper.pdf)
53 | EMNLP 2014
54 | Applies DSSM to text analysis; works well on automatic highlighting and contextual entity search.
55 | Two main contributions:
56 | 1) DSSM + CNN
57 | 2) Not aimed at relevance alone; adds a ranker
58 | ----
59 |
60 | > [**ARC-I/ARC-II**]()
61 | [Convolutional Neural Network Architectures
62 | for Matching Natural Language Sentences](https://papers.nips.cc/paper/5550-convolutional-neural-network-architectures-for-matching-natural-language-sentences.pdf)
63 | NIPS 2014
64 | Both representation-based and matching-based CNN structures; adds gating to handle sentences of different lengths
65 | ----
66 | > [**CNTN**]()
67 | [Convolutional Neural Tensor Network
68 | Architecture for Community-based Question Answering](https://ijcai.org/Proceedings/15/Papers/188.pdf)
69 | IJCAI 2015
70 | (D)CNN + MLP (tensor layer);
71 | Representation-based structure
72 |
73 | -----
74 | > [**DeepMatch**]()
75 | [A Deep Architecture for Matching Short Texts](https://papers.nips.cc/paper/5019-a-deep-architecture-for-matching-short-texts.pdf)
76 | NIPS 2013
77 | [Reviews](https://media.nips.cc/nipsbooks/nipspapers/paper_files/nips26/reviews/697.html)
78 | Goal: model more complex matching relations; arguably the earliest matching-based structure.
79 | Combines localness and intrinsic hierarchy, which dot-product based networks handle poorly; the main highlight is using a topic model to construct the network.
80 |
81 | ------
82 | > [**DeepMatch_tree**]()
83 | [Syntax-based Deep Matching of Short Texts](https://arxiv.org/pdf/1503.02427.pdf)
84 |
85 | ## Methods & Papers about Key Words Based Methods
86 | > [****]()
87 |
[]()
88 |
89 | ## Related talks and books
90 | * [Deep Learning for Web Search and
91 | Natural Language Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/wsdm2015.v3.pdf)
92 | * [Deep Learning for Information Retrieval(Sigir 2016 Tutorial)](http://www.hangli-hl.com/uploads/3/4/4/6/34465961/deep_learning_for_information_retrieval.pdf)
93 | * [Semantic Matching in Search (Sigir 2014 Workshop)](http://www.hangli-hl.com/uploads/3/4/4/6/34465961/semantic_matching_in_search.pdf)
94 | * [Semantic Matching in Search (Book 2014)](http://www.bigdatalab.ac.cn/~junxu/publications/SemanticMatchingInSearch_2014.pdf)
95 | * [gensim notebook](https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks)
96 |
97 |
98 | ## Downloads
99 | > [DSSM/Sent2Vec Release Version](https://www.microsoft.com/en-us/download/details.aspx?id=52365)
100 | The Sent2Vec release published by Microsoft Research
101 |
102 | ## Datasets
103 | * [Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks](http://arxiv.org/abs/1502.05698 "Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush") ([fb.ai/babi](http://fb.ai/babi))
104 | * [Teaching Machines to Read and Comprehend](http://arxiv.org/abs/1506.03340 "Karl Moritz Hermann, Tomáš Kočiský, Edward Grefenstette, Lasse Espeholt, Will Kay, Mustafa Suleyman, Phil Blunsom") ([github.com/deepmind/rc-data](https://github.com/deepmind/rc-data))
105 | * [One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling](http://arxiv.org/abs/1312.3005 "Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, Tony Robinson") ([github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark))
106 | * [The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems](http://arxiv.org/abs/1506.08909 "Ryan Lowe, Nissan Pow, Iulian Serban, Joelle Pineau") ([cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0](http://cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/))
107 | * [Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books](http://arxiv.org/abs/1506.06724 "Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler") ([BookCorpus](http://www.cs.toronto.edu/~mbweb/))
108 | * [Every publicly available Reddit comment, for research.](https://www.reddit.com/r/datasets/comments/3bxlg7/i_have_every_publicly_available_reddit_comment/ "Stuck_In_the_Matrix")
109 | * [Stack Exchange Data Dump](https://archive.org/details/stackexchange "Stack Exchange")
110 | * [Europarl: A Parallel Corpus for Statistical Machine Translation](http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl-mtsummit05.pdf "Philipp Koehn") ([www.statmt.org/europarl/](http://www.statmt.org/europarl/))
111 | * [RTE Knowledge Resources](http://aclweb.org/aclwiki/index.php?title=RTE_Knowledge_Resources)
112 | * [**Kaggle Quora Question Pairs**]()
113 |
114 |
115 | ## Competition
116 | * [Kaggle Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs)
117 | * [SQuAD (The Stanford Question Answering Dataset)](https://rajpurkar.github.io/SQuAD-explorer/)
118 |
119 | ## Pretrained Models
120 | * [Model Zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo "Berkeley Vision and Learning Center")
121 | * [word2vec](https://code.google.com/p/word2vec/ "Tomas Mikolov")
122 | * [GoogleNews-vectors-negative300.bin.gz](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)
123 | * [freebase-vectors-skipgram1000.bin.gz](https://docs.google.com/file/d/0B7XkCwpI5KDYaDBDQm1tZGNDRHc/edit?usp=sharing)
124 | * [GloVe](http://nlp.stanford.edu/projects/glove/ "Jeffrey Pennington, Richard Socher, Christopher D. Manning")
125 | * [SENNA](http://ronan.collobert.com/senna/ "R. Collobert, J. Weston, L. Bottou, M. Karlen, K. Kavukcuoglu, P. Kuksa")
126 |
127 | ## Important Online Courses
128 | * [Stanford CS224d Deep Learning for Natural Language Processing](http://cs224d.stanford.edu/)
129 | * [Stanford CS20SI Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/)
130 | * [Stanford CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.stanford.edu/)
131 |
132 |
133 | ## References
134 | https://github.com/robertsdionne/neural-network-papers/blob/master/README.md
135 |
--------------------------------------------------------------------------------
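Note: for reference, the relevance and posterior used by DSSM in the Methods section above, as defined in the CIKM 2013 paper: query and document are mapped to semantic vectors y_Q and y_D, scored by cosine similarity, and the posterior of a clicked document is a softmax over the candidate set with smoothing factor gamma.

```latex
R(Q, D) = \cos(y_Q, y_D) = \frac{y_Q^{\top} y_D}{\lVert y_Q \rVert \, \lVert y_D \rVert},
\qquad
P(D^{+} \mid Q) = \frac{\exp\left(\gamma\, R(Q, D^{+})\right)}{\sum_{D' \in \mathbf{D}} \exp\left(\gamma\, R(Q, D')\right)}
```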
/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Accagain2014/TextMatching/b22d8d705da64a34293d2079e577027c819c00d1/helper/__init__.py
--------------------------------------------------------------------------------
/helper/distance.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import numpy as np
4 | from sklearn.metrics.pairwise import cosine_similarity
5 |
6 | def cos_dis(x, y):
7 | '''
8 |     Calculate the cosine similarity between vectors x and y
9 |
10 |     :param x: np.array-like, a one-dimensional vector
11 |     :param y: np.array-like, a one-dimensional vector
12 |     :return: cosine similarity between vector x and vector y
13 | '''
14 |
15 | dot_mul = x * y
16 | ans = np.sum(dot_mul) / np.power(np.sum(x**2), 0.5) / np.power(np.sum(y**2), 0.5)
17 |
18 | '''
19 | ans_sk = cosine_similarity(x, y)
20 |
21 | assert ans - ans_sk < 1e-10
22 | '''
23 |
24 | return ans
25 |
26 |
--------------------------------------------------------------------------------
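Note: a quick usage sketch of `cos_dis`, cross-checked against scikit-learn's `cosine_similarity` (which expects 2-D inputs); the vectors and the PYTHONPATH assumption are illustrative.

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from helper.distance import cos_dis  # assumes the repository root is on PYTHONPATH

x = np.array([1.0, 2.0, 3.0])
y = np.array([0.5, 0.0, 2.0])

ours = cos_dis(x, y)
theirs = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0, 0]
assert abs(ours - theirs) < 1e-10
print(ours)
```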
/helper/tools.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import numpy as np
4 | import math
5 | from scipy import sparse as sps
6 | import numpy as np
7 | import pandas as pd
8 | from sklearn.preprocessing import normalize
9 |
10 | def softmax(x):
11 | if len(x.shape) > 1:
12 | tmp = np.max(x, axis=1)
13 | x -= tmp.reshape((x.shape[0], 1))
14 | x = np.exp(x)
15 | tmp = np.sum(x, axis=1)
16 | x /= tmp.reshape((x.shape[0], 1))
17 | else:
18 | tmp = np.max(x)
19 | x -= tmp
20 | x = np.exp(x)
21 | tmp = np.sum(x)
22 | x /= tmp
23 |
24 | return x
25 |
26 |
27 | def sigmoid(x):
28 | x = 1. / (1 + np.exp(-x))
29 | return x
30 |
31 |
32 | def sigmoid_grad(f):
33 | f = f * (1 - f)
34 | return f
35 |
36 |
37 | def xavier_init(n1, n2):
38 | return math.sqrt(6.0/(n1+n2))
39 |
40 |
41 | def oversample(X_ot,y,p=0.165):
42 | pos_ot = X_ot[y==1]
43 | neg_ot = X_ot[y==0]
44 |
45 | scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
46 | while scale > 1:
47 | neg_ot = sps.vstack([neg_ot, neg_ot]).tocsr()
48 | scale -=1
49 | neg_ot = sps.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]]).tocsr()
50 | ot = sps.vstack([pos_ot, neg_ot]).tocsr()
51 | y=np.zeros(ot.shape[0])
52 | y[:pos_ot.shape[0]]=1.0
53 | print 'After oversample, the "is_duplicate" field mean: ', y.mean()
54 | return ot,y
55 |
56 |
57 | def data_iterator(orig_X, orig_y=None, orig_label=None, batch_size=10, shuffle=False, is_normalize=False):
58 | '''
59 |     Yield successive (x, y, label) batches.
60 |     :param orig_X: query-side input matrix
61 |     :param orig_y: doc-side input matrix
62 |     :param orig_label: labels aligned with orig_X / orig_y
63 |     :param batch_size: number of samples per batch
64 |     :param shuffle: whether to shuffle the samples before iterating
65 |     :return: generator over (x, y, label) batches; batches are column-normalized if is_normalize is True
66 | '''
67 |
68 | # Optionally shuffle the data before training
69 | if shuffle:
70 | indices = np.random.permutation(orig_X.shape[0])
71 | data_X = orig_X[indices]
72 | data_y = orig_y[indices]
73 | data_label = orig_label[indices]
74 | else:
75 | data_X = orig_X
76 | data_y = orig_y
77 | data_label = orig_label
78 | ###
79 | total_processed_examples = 0
80 |     total_steps = int(np.ceil(data_X.shape[0] / float(batch_size)))
81 | for step in xrange(total_steps):
82 | # Create the batch by selecting up to batch_size elements
83 | batch_start = step * batch_size
84 | x = data_X[batch_start : batch_start + batch_size]
85 | y = data_y[batch_start : batch_start + batch_size]
86 |         label = data_label[batch_start : batch_start + batch_size]
87 | '''
88 | if is_sparse:
89 | yield x.toarray(), y.toarray(), label
90 | else:
91 | yield x, y, label
92 | '''
93 | if is_normalize:
94 | yield normalize(x, axis=0), normalize(y, axis=0), label
95 | else:
96 | yield x, y, label
97 | total_processed_examples += x.shape[0]
98 | # Sanity check to make sure we iterated over all the dataset as intended
99 | #assert total_processed_examples == data_X.shape[0], 'Expected {} and processed {}'.format(data_X.shape[0], total_processed_examples)
100 |
--------------------------------------------------------------------------------
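Note: a small usage sketch (toy shapes, made-up data) of `xavier_init` as a uniform-initialization bound and of `data_iterator` yielding aligned (query, doc, label) batches.

```python
import numpy as np

from helper import tools  # assumes the repository root is on PYTHONPATH

# xavier_init returns the bound b = sqrt(6 / (fan_in + fan_out));
# weights are then drawn uniformly from [-b, b].
bound = tools.xavier_init(300, 128)
w = np.random.uniform(-bound, bound, size=(300, 128))

# Iterate over a toy set of 25 (query, doc, label) triples in batches of 10.
queries = np.random.rand(25, 300)
docs = np.random.rand(25, 300)
labels = np.random.randint(0, 2, size=25)

for step, (q, d, y) in enumerate(
        tools.data_iterator(queries, docs, labels, batch_size=10, shuffle=False)):
    print(step, q.shape, d.shape, y.shape)
```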
/helper/wordhash.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import pickle
3 | import sys
4 |
5 | import pandas as pd
6 | from scipy import sparse as sps
7 |
8 | from helper import distance
9 |
10 |
11 | class WordHash(object):
12 |
13 | '''
14 | Implement word hash methods mentioned in the paper: Learning Deep Structured Semantic Models for Web Search using Clickthrough Data
15 | '''
16 |
17 | def __init__(self, words, n_gram=3, marks='#', load_from_file=False, dump_to_file=False, file=None):
18 | '''
19 |
20 |         :param words: original vocabulary
21 |         :param n_gram: number of letters in an n_gram term
22 |         :param marks: the character added at the start and end of a word
23 |         :param load_from_file: whether to load the n_gram index map from file
24 |         :param file: file name used for loading (load_from_file) or dumping (dump_to_file)
25 | '''
26 |
27 | if load_from_file:
28 | with open(file, 'rb') as fr:
29 | self.__dict__ = pickle.load(fr).__dict__ # load an object
30 | return
31 |
32 | self.ori_words = words
33 | self.ori_len = len(self.ori_words)
34 | self.words = map(lambda x: marks+x.lower()+marks, list(set(self.ori_words)))
35 | self.ori_diff_len = len(self.words)
36 | self.n_gram = n_gram
37 | self.marks = marks
38 | self.hashed_words = set()
39 | self.n_gram_index_map = {}
40 | self.n_gram_size = 0
41 |
42 | print 'Sum number of origin words: ', self.ori_len
43 | print 'Sum number of origin diff words: ', self.ori_diff_len
44 | print 'Letter n-gram: ', self.n_gram
45 |
46 | self._get_hash_dict()
47 |
48 | if dump_to_file:
49 | with open(file, 'wb') as fw:
50 | pickle.dump(self, fw)
51 |
52 |
53 | def _split(self, word):
54 | '''
55 |         Split a word into overlapping letter n-grams of length self.n_gram
56 |         :param self:
57 |         :param word: word to be split into n-grams
58 | :return:
59 | '''
60 |
61 | splited_ngrams = []
62 | word_len = len(word)
63 | split_point = 0
64 |         while split_point + self.n_gram <= word_len:  # only emit full n-grams
65 | splited_ngrams.append(word[split_point : split_point+self.n_gram])
66 | split_point += 1
67 | return splited_ngrams
68 |
69 |
70 | def _get_hash_dict(self):
71 | '''
72 | Get n_gram terms and mapping them to indexes
73 | :return:
74 | '''
75 |
76 | for one_word in self.words:
77 | ngram_words = self._split(one_word)
78 | self.hashed_words = self.hashed_words | set(ngram_words)
79 |
80 | word_keys = list(self.hashed_words)
81 | word_values = range(0, len(word_keys))
82 | self.n_gram_index_map = dict(zip(word_keys, word_values))
83 | self.n_gram_size = len(word_keys)
84 |
85 | print 'Sum numbers of n-grams: ', self.n_gram_size
86 | return self.hashed_words
87 |
88 | def get_n_gram_count(self, sentences, is_dump=False, dump_file=None):
89 | '''
90 | Get n_gram counting term matrix
91 | :param sentences: sentences to be handled to get n_gram term counting matrix
92 | :param is_dump: whether dump the result to file or not
93 | :param dump_file: dump file name
94 | :return: n_gram term counting sparse matrix, shapes(sentences number, n_gram term size)
95 | '''
96 |
97 | # n_gram_count = np.zeros((len(sentences), self.n_gram_size))
98 | n_gram_count = sps.lil_matrix((len(sentences), self.n_gram_size))
99 | sen_cnt = 0
100 | for one_sen in sentences:
101 | one_sen = one_sen.strip().split()
102 | for one_word in one_sen:
103 | one_word = one_word.strip()
104 | one_word = self.marks+one_word.lower()+self.marks
105 | splited_n_gram = self._split(one_word)
106 | n_gram_index = map(lambda x: self.n_gram_index_map[x], splited_n_gram)
107 | # n_gram_count[sen_cnt, n_gram_index] += 1
108 | for one_n_gram_index in n_gram_index:
109 | n_gram_count[sen_cnt, one_n_gram_index] += 1
110 | sen_cnt += 1
111 | if is_dump:
112 | pd.to_pickle(n_gram_count.tocsr(), dump_file)
113 | '''
114 | with open(dump_file, 'wb') as fw:
115 | pickle.dump(n_gram_count.tocsr(), fw)
116 | '''
117 | print 'Dump to file ', dump_file, ' done.'
118 | sys.stdout.flush()
119 | print 'Get n_gram count matrix done, shape with: ', n_gram_count.shape
120 | return n_gram_count.tocsr()
121 |
122 | def test_WordHash():
123 |
124 | sentence = 'Key words based text matching methods and semantic matching methods'
125 |
126 | print sentence.split()
127 |     wordhash = WordHash(sentence.split(), load_from_file=False, dump_to_file=False, file='n_gram_term_index_mapping.pkl')
128 | print wordhash.n_gram_index_map
129 | n_gram_matrix = wordhash.get_n_gram_count(['key words text matching methods', 'semantic text matching methods'])
130 |
131 | print distance.cos_dis(n_gram_matrix[0].toarray().reshape([-1]), n_gram_matrix[1].toarray().reshape([-1]))
132 |
133 | if __name__ == '__main__':
134 | test_WordHash()
135 |
--------------------------------------------------------------------------------
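Note: a minimal, dependency-free sketch of the letter n-gram word hashing idea implemented by `WordHash` above: each word is padded with `#`, split into overlapping 3-letter grams, and a sentence becomes a count vector over the trigram vocabulary. The helper names here are illustrative only.

```python
def letter_trigrams(word, mark='#', n=3):
    """Return the overlapping letter n-grams of '#word#'."""
    padded = mark + word.lower() + mark
    return [padded[i:i + n] for i in range(len(padded) - n + 1)]

def trigram_counts(sentence, vocab):
    """Count vector of a sentence over a trigram-to-index vocabulary."""
    counts = [0] * len(vocab)
    for word in sentence.split():
        for gram in letter_trigrams(word):
            if gram in vocab:
                counts[vocab[gram]] += 1
    return counts

grams = letter_trigrams('good')              # ['#go', 'goo', 'ood', 'od#']
vocab = {g: i for i, g in enumerate(sorted(set(grams)))}
print(grams)
print(trigram_counts('good goods', vocab))
```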
/quora_dssm.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import sys
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 | from sklearn.datasets import load_svmlight_file
7 | from sklearn.model_selection import KFold
8 | from sklearn.utils import shuffle
9 |
10 | import config
11 | from DSSM.dssm import DSSM
12 | from helper import tools
13 |
14 |
15 | def get_words(sentences):
16 | words = []
17 | for one_sen in sentences:
18 | one_sen = one_sen.strip().split()
19 | one_sen = map(lambda x: x.strip(), one_sen)
20 | words += one_sen
21 | return words
22 |
23 |
24 | def quora_dssm(train_input_file, test_input_file):
25 |
26 | '''
27 |     Driver: train and validate a DSSM on the Quora question-pairs data
28 | :return:
29 | '''
30 | seed = 2222
31 |
32 | '''
33 | train_ori = pd.read_csv(train_input_file)
34 | test = pd.read_csv(test_input_file)#, nrows=1001)
35 | test['is_duplicate'] = 0
36 | # train_ori = train_ori[:1000]
37 | # test = test_ori[:]
38 |
39 | print train_ori['is_duplicate'].value_counts()
40 |
41 | q = ['question1', 'question2']
42 | words = []
43 | for one_q in q:
44 | train_ori[one_q] = train_ori[one_q].astype(str)
45 | test[one_q] = test[one_q].astype(str)
46 | '''
47 | #wordhash = WordHash(words, load_from_file=True, \
48 | # dump_to_file=True, file='result/n_gram_term_index_mapping.pkl')
49 | #print 'Load n_gram_term_index_mapping.pkl done'
50 | #sys.stdout.flush()
51 |
52 |
53 | #train_ori_q1 = wordhash.get_n_gram_count(train_ori[q[0]].values, is_dump=True, dump_file='result/train_q1_ngram_counting_matrix.pkl')
54 |
55 | #train_ori_q2 = wordhash.get_n_gram_count(train_ori[q[1]].values, is_dump=True, dump_file='result/train_q2_ngram_counting_matrix.pkl')
56 |
57 | '''
58 | with open('result/train_q1_ngram_counting_matrix.pkl', 'rb') as fr:
59 | train_ori_q1 = pickle.load(fr)
60 | with open('result/train_q2_ngram_counting_matrix.pkl', 'rb') as fr:
61 | train_ori_q2 = pickle.load(fr)
62 | print 'Get train origin sparse matrix done'
63 | sys.stdout.flush()
64 | '''
65 |
66 | '''
67 | y = train_ori['is_duplicate'].values[:]
68 | y_t = test['is_duplicate'].values[:]
69 | del train_ori
70 |
71 | test_q1 = pd.read_pickle('result/test_q1_ngram_counting_matrix.pkl')
72 | test_q2 = pd.read_pickle('result/test_q2_ngram_counting_matrix.pkl')
73 |
74 | #test_q1 = wordhash.get_n_gram_count(test[q[0]].values, is_dump=True, dump_file='result/test_q1_ngram_counting_matrix.pkl')
75 | #test_q2 = wordhash.get_n_gram_count(test[q[1]].values, is_dump=True, dump_file='result/test_q2_ngram_counting_matrix.pkl')
76 | del test
77 |
78 | print 'Get test origin sparse matrix done'
79 | sys.stdout.flush()
80 | '''
81 |
82 |
83 | '''
84 | X = sps.hstack(
85 | [train_ori_q1, train_ori_q2]
86 | ).tocsr()
87 |
88 | X_t = sps.hstack(
89 | [test_q1, test_q2]
90 | ).tocsr()
91 | '''
92 |
93 | '''
94 | Get origin train and test svm format file.
95 | '''
96 |
97 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.svm')
98 | #dump_svmlight_file(X_t, y_t, 'result/test_n_gram_counting_sparse_matrix.svm')
99 |
100 |     #X, y, X_test, y_test = load_svmlight_files(['result/train_ori_n_gram_counting_sparse_matrix.svm', 'result/test_n_gram_counting_sparse_matrix.svm']) # note: load_svmlight_file infers the feature dimension, which may lead to inconsistent shapes
101 | #X = normalize(X, axis=0)
102 | #X_test = normalize(X_test, axis=0)
103 |
104 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.norm.svm')
105 | #dump_svmlight_file(X_test, y_test, 'result/test_n_gram_counting_sparse_matrix.norm.svm')
106 |
107 | X, y = load_svmlight_file('result/train_ori_n_gram_counting_sparse_matrix.norm.mini.svm', n_features=111166)
108 | print X.shape
109 | used_as_train = X.shape[0]/10
110 | X = X[:used_as_train]
111 | y = y[:used_as_train]
112 |
113 | print y[:10]
114 | #dump_svmlight_file(X, y, 'result/train_ori_n_gram_counting_sparse_matrix.norm.mini.svm')
115 |
116 | #print 'X_train shape: ', X.shape, ' X_test shape: ', X_test.shape
117 | print 'Load done'
118 | sys.stdout.flush()
119 |
120 | skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X)
121 | for ind_tr, ind_te in skf:
122 | X_train = X[ind_tr]
123 | y_train = y[ind_tr]
124 |
125 | X_valid = X[ind_te]
126 | y_valid = y[ind_te]
127 | break
128 |
129 | X_train, y_train = tools.oversample(X_train.tocsr(), y_train, p=0.165)
130 | X_valid, y_valid = tools.oversample(X_valid.tocsr(), y_valid, p=0.165)
131 |
132 | X_train, y_train = shuffle(X_train, y_train, random_state=seed)
133 |
134 | #dump_svmlight_file(X_train, y_train, 'result/oversample_train_n_gram_counting_sparse_matrix.svm')
135 | #dump_svmlight_file(X_valid, y_valid, 'result/oversample_valid_n_gram_counting_sparse_matrix.svm')
136 |
137 | #print 'Dump to svm format done.'
138 |
139 | '''
140 | for _ in q:
141 | train_ori[_] = train_ori[_].astype(str)
142 | test[_] = test_ori[_].astype(str)
143 | words += get_words(train_ori[_].values)
144 | words += get_words(test[_].values)
145 |
146 | print 'Sum words: ', len(words), ' sum diff words: ', len(set(words))
147 |
148 | wordhash = WordHash(words, load_from_file=True, load_file='n_gram_term_index_mapping.pkl', \
149 | dump_to_file=True, dump_file='n_gram_term_index_mapping.pkl')
150 |
151 |
152 | split_point = int(0.7 * len(train_ori))
153 | train = train_ori[:split_point]
154 | valid = train_ori[split_point:]
155 |
156 | train_q1 = wordhash.get_n_gram_count(train[q[0]].values, is_dump=True, dump_file='result/train_q1_ngram_counting_matrix.pkl')
157 | train_q2 = wordhash.get_n_gram_count(train[q[1]].values, is_dump=True, dump_file='result/train_q2_ngram_counting_matrix.pkl')
158 | train_label = train['is_duplicate'].values
159 |
160 | valid_q1 = wordhash.get_n_gram_count(valid[q[0]].values, is_dump=True, dump_file='result/valid_q1_ngram_counting_matrix.pkl')
161 | valid_q2 = wordhash.get_n_gram_count(valid[q[1]].values, is_dump=True, dump_file='result/valid_q2_ngram_counting_matrix.pkl')
162 | valid_label = valid['is_duplicate'].values
163 |
164 | test_q1 = wordhash.get_n_gram_count(test[q[0]].values, is_dump=True, dump_file='result/test_q1_ngram_counting_matrix.pkl')
165 | test_q2 = wordhash.get_n_gram_count(test[q[1]].values, is_dump=True, dump_file='result/test_q2_ngram_counting_matrix.pkl')
166 | test_label = test['is_duplicate'].values
167 |
168 | '''
169 | print 'train shape: ', X_train.shape, 'valid shape: ', X_valid.shape
170 | #print 'test shape: ', X_test.shape
171 | n_gram_size = X_train.shape[1]
172 | #n_gram_size = X_test.shape[1]
173 | sys.stdout.flush()
174 |
175 | with tf.Graph().as_default():
176 | tf.set_random_seed(1)
177 | model = DSSM(hash_tokens_nums=n_gram_size/2, dnn_layer_nums=2, dnn_hidden_node_nums=288, feature_nums=64, batch_size=X_train.shape[0], neg_nums=0, learning_rate=0.001, max_epochs=400, loss_kind='log_loss', w_init=1,save_model_path='result/save-model', mlp_hidden_node_nums=16, mlp_layer_nums=100,input_is_sparse=True)
178 | sess = tf.Session()
179 | init = tf.initialize_all_variables()
180 | sess.run(init)
181 | np.random.seed(1)
182 |
183 | # query = np.random.rand(500, 30000)
184 | # doc = np.random.rand(500, 30000)
185 | # label = np.array([1, 0, 0, 0, 0] * 100)
186 | # model.set_positive_weights([1]*500)
187 |
188 | #print query
189 | #print doc
190 | #print label
191 | X_train_q1 = X_train[:, :n_gram_size/2]
192 | X_train_q2 = X_train[:, n_gram_size/2:]
193 |
194 | X_valid_q1 = X_valid[:, :n_gram_size/2]
195 | X_valid_q2 = X_valid[:, n_gram_size/2:]
196 |
197 | #X_test_q1 = X_test[:, :n_gram_size/2]
198 | #X_test_q2 = X_test[:, n_gram_size/2:]
199 |
200 | losses = model.fit(sess, X_train_q1, X_train_q2, y_train, X_valid_q1, X_valid_q2, y_valid, load_model=False)
201 | '''
202 | print 'Start to test. '
203 |
204 | test['is_duplicate'] = model.predict(sess, X_test_q1, X_test_q2, y_test, is_sparse=True)
205 | test[['test_id', 'is_duplicate']].to_csv('result/out.csv', index=False)
206 | '''
207 | if __name__ == '__main__':
208 |
209 | train_file = config.train_file
210 | test_file = config.test_file
211 |
212 | quora_dssm(train_file, test_file)
213 |
--------------------------------------------------------------------------------