├── .idea
│   ├── .name
│   ├── encodings.xml
│   ├── modules.xml
│   ├── SentencePairMatch_MachineLearning.iml
│   ├── misc.xml
│   └── workspace.xml
├── README.md
├── .gitignore
├── ParaPhrase_svm.py
├── ParaPhrase_lr.py
├── ParaPhrase_randomforest.py
└── ParaPhrase_gbdt.py
/.idea/.name:
--------------------------------------------------------------------------------
1 | SentencePairMatch_MachineLearning
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SentencePairMatch_MachineLearning
2 | A supervised sentence-pair matching method implemented with machine learning algorithms. The classifiers used are Logistic Regression (LR), SVM, GBDT, and Random Forest, built with scikit-learn (sklearn) version 0.17.1.
3 |
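4 | ## Pipeline
5 |
6 | Each `ParaPhrase_*.py` script follows the same steps: load the word-segmented Quora duplicate-question TSV, look up a pre-trained 100-dimensional doc2vec vector for each question id, concatenate the two question vectors into a 200-dimensional feature, shuffle the pairs, hold out the last 10,000 pairs for testing, train the classifier, and report test accuracy.
7 |
8 | ## Usage
9 |
10 | Edit `datapath` and `doc2vecpath` in `main()` of each script to point at your local copies of the data, then run a script directly, e.g. `python ParaPhrase_lr.py` (the scripts are written for Python 2).
11 |
12 | A minimal sketch of the shared pipeline is shown below; the column names and the doc2vec file format are taken from the scripts, the file locations are assumed to be local copies, and `LogisticRegression` stands in for any of the four classifiers:
13 |
14 | ```python
15 | # coding:utf-8
16 | import numpy as np
17 | import pandas as pd
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn import metrics
20 |
21 | # Question pairs: columns id, qid1, qid2, question1, question2, is_duplicate.
22 | data = pd.read_csv('quora_duplicate_questions_Chinese_seg.tsv', sep='\t', encoding='utf-8')
23 |
24 | # Pre-trained doc2vec vectors, one "<id>\t<100 space-separated floats>" line per
25 | # question; the scripts shift the file id by +1 so that it lines up with qid.
26 | embeddings = {}
27 | with open('quora_duplicate_question_doc2vec_100.vector') as f:
28 |     for line in f:
29 |         parts = line.rstrip().split('\t')
30 |         embeddings[int(parts[0]) + 1] = np.asarray(parts[1].split(), dtype='float32')
31 |
32 | # Represent each pair as the concatenation of its two 100-d sentence vectors.
33 | X = np.hstack((np.array([embeddings[q] for q in data.qid1]),
34 |                np.array([embeddings[q] for q in data.qid2])))
35 | y = data.is_duplicate.values
36 |
37 | # Shuffle, hold out the last 10,000 pairs for testing, train, and score.
38 | order = np.random.permutation(len(y))
39 | X, y = X[order], y[order]
40 | clf = LogisticRegression()
41 | clf.fit(X[:-10000], y[:-10000])
42 | print(metrics.accuracy_score(y[-10000:], clf.predict(X[-10000:])))
43 | ```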
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/ParaPhrase_svm.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import numpy as np
3 | import csv
4 | import datetime
5 | from sklearn.svm import SVC
6 | import os
7 | import pandas as pd
8 | from sklearn import metrics
9 | cwd = os.getcwd()
10 |
11 |
12 | def load_data(datapath):
13 | data_train = pd.read_csv(datapath, sep='\t', encoding='utf-8')
14 | print data_train.shape
15 |
16 | qid1 = []
17 | qid2 = []
18 | labels = []
19 | count = 0
20 | for idx in range(data_train.id.shape[0]):
21 | # for idx in range(400):
22 | # count += 1
23 | # if count == 21: break
24 | print idx
25 | q1 = data_train.qid1[idx]
26 | q2 = data_train.qid2[idx]
27 |
28 | qid1.append(q1)
29 | qid2.append(q2)
30 | labels.append(data_train.is_duplicate[idx])
31 |
32 | return qid1, qid2, labels
33 |
34 | def load_doc2vec(word2vecpath):
35 | f = open(word2vecpath)
36 | embeddings_index = {}
37 | count = 0
38 | for line in f:
39 | # count += 1
40 | # if count == 10000: break
41 | values = line.split('\t')
42 | id = values[0]
43 | print id
44 | coefs = np.asarray(values[1].split(), dtype='float32')
45 | embeddings_index[int(id)+1] = coefs
46 | f.close()
47 | print('Total %s doc2vec vectors.' % len(embeddings_index))
48 |
49 | return embeddings_index
50 |
51 | def sentence_represention(qid, embeddings_index):
52 | vectors = np.zeros((len(qid), 100))
53 | for i in range(len(qid)):
54 | print i
55 | vectors[i] = embeddings_index.get(qid[i])
56 |
57 | return vectors
58 |
59 | def main():
60 | start = datetime.datetime.now()
61 | datapath = 'D:/dataset/quora/quora_duplicate_questions_Chinese_seg.tsv'
62 | doc2vecpath = "D:/dataset/quora/vector2/quora_duplicate_question_doc2vec_100.vector"
63 | qid1, qid2, labels = load_data(datapath)
64 | embeddings_index = load_doc2vec(word2vecpath=doc2vecpath)
65 | vectors1 = sentence_represention(qid1, embeddings_index)
66 | vectors2 = sentence_represention(qid2, embeddings_index)
67 | vectors = np.hstack((vectors1, vectors2))
68 | labels = np.array(labels)
69 | VALIDATION_SPLIT = 10000
70 | VALIDATION_SPLIT0 = 1000
71 | indices = np.arange(vectors.shape[0])
72 | np.random.shuffle(indices)
73 | vectors = vectors[indices]
74 | labels = labels[indices]
75 | train_vectors = vectors[:-VALIDATION_SPLIT]
76 | train_labels = labels[:-VALIDATION_SPLIT]
77 | test_vectors = vectors[-VALIDATION_SPLIT:]
78 | test_labels = labels[-VALIDATION_SPLIT:]
79 | # train_vectors = vectors[:VALIDATION_SPLIT0]
80 | # train_labels = labels[:VALIDATION_SPLIT0]
81 | # test_vectors = vectors[-VALIDATION_SPLIT0:]
82 | # test_labels = labels[-VALIDATION_SPLIT0:]
83 |
84 | svm = SVC()
85 | print '***********************training************************'
86 | svm.fit(train_vectors, train_labels)
87 |
88 | print '***********************predict*************************'
89 | prediction = svm.predict(test_vectors)
90 | accuracy = metrics.accuracy_score(test_labels, prediction)
91 | print accuracy
92 |
93 | end = datetime.datetime.now()
94 | print end-start
95 |
96 |
97 | if __name__ == '__main__':
98 | main() # train one model on the whole dataset
--------------------------------------------------------------------------------
/ParaPhrase_lr.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import numpy as np
3 | import csv
4 | import datetime
5 | from sklearn.linear_model import LogisticRegression
6 | import os
7 | import pandas as pd
8 | from sklearn import metrics
9 | cwd = os.getcwd()
10 |
11 | def load_data(datapath):
12 | data_train = pd.read_csv(datapath, sep='\t', encoding='utf-8')
13 | print data_train.shape
14 |
15 | qid1 = []
16 | qid2 = []
17 | labels = []
18 | count = 0
19 | for idx in range(data_train.id.shape[0]):
20 | # for idx in range(400):
21 | # count += 1
22 | # if count == 21: break
23 | print idx
24 | q1 = data_train.qid1[idx]
25 | q2 = data_train.qid2[idx]
26 |
27 | qid1.append(q1)
28 | qid2.append(q2)
29 | labels.append(data_train.is_duplicate[idx])
30 |
31 | return qid1, qid2, labels
32 |
33 | def load_doc2vec(word2vecpath):
34 | f = open(word2vecpath)
35 | embeddings_index = {}
36 | count = 0
37 | for line in f:
38 | # count += 1
39 | # if count == 10000: break
40 | values = line.split('\t')
41 | id = values[0]
42 | print id
43 | coefs = np.asarray(values[1].split(), dtype='float32')
44 | embeddings_index[int(id)+1] = coefs
45 | f.close()
46 | print('Total %s doc2vec vectors.' % len(embeddings_index))
47 |
48 | return embeddings_index
49 |
50 | def sentence_represention(qid, embeddings_index):
51 | vectors = np.zeros((len(qid), 100))
52 | for i in range(len(qid)):
53 | print i
54 | vectors[i] = embeddings_index.get(qid[i])
55 |
56 | return vectors
57 |
58 | def main():
59 | start = datetime.datetime.now()
60 | datapath = 'D:/dataset/quora/quora_duplicate_questions_Chinese_seg.tsv'
61 | doc2vecpath = "D:/dataset/quora/vector2/quora_duplicate_question_doc2vec_100.vector"
62 | qid1, qid2, labels = load_data(datapath)
63 | embeddings_index = load_doc2vec(word2vecpath=doc2vecpath)
64 | vectors1 = sentence_represention(qid1, embeddings_index)
65 | vectors2 = sentence_represention(qid2, embeddings_index)
66 | vectors = np.hstack((vectors1, vectors2))
67 | labels = np.array(labels)
68 | VALIDATION_SPLIT = 10000
69 | VALIDATION_SPLIT0 = 1000
70 | indices = np.arange(vectors.shape[0])
71 | np.random.shuffle(indices)
72 | vectors = vectors[indices]
73 | labels = labels[indices]
74 | train_vectors = vectors[:-VALIDATION_SPLIT]
75 | train_labels = labels[:-VALIDATION_SPLIT]
76 | test_vectors = vectors[-VALIDATION_SPLIT:]
77 | test_labels = labels[-VALIDATION_SPLIT:]
78 | # train_vectors = vectors[:VALIDATION_SPLIT0]
79 | # train_labels = labels[:VALIDATION_SPLIT0]
80 | # test_vectors = vectors[-VALIDATION_SPLIT0:]
81 | # test_labels = labels[-VALIDATION_SPLIT0:]
82 |
83 | lr = LogisticRegression()
84 | print '***********************training************************'
85 | lr.fit(train_vectors, train_labels)
86 |
87 | print '***********************predict*************************'
88 | prediction = lr.predict(test_vectors)
89 | accuracy = metrics.accuracy_score(test_labels, prediction)
90 | print accuracy
91 | end = datetime.datetime.now()
92 | print end-start
93 |
94 |
95 | if __name__ == '__main__':
96 | main() # train one model on the whole dataset
--------------------------------------------------------------------------------
/ParaPhrase_randomforest.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import numpy as np
3 | import csv
4 | import datetime
5 | from sklearn.ensemble import RandomForestClassifier
6 | import os
7 | import pandas as pd
8 | from sklearn import metrics, feature_extraction
9 | from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
10 | cwd = os.getcwd()
11 |
12 |
13 | def load_data(datapath):
14 | data_train = pd.read_csv(datapath, sep='\t', encoding='utf-8')
15 | print data_train.shape
16 |
17 | qid1 = []
18 | qid2 = []
19 | question1 = []
20 | question2 = []
21 | labels = []
22 | count = 0
23 | for idx in range(data_train.id.shape[0]):
24 | # for idx in range(400):
25 | # count += 1
26 | # if count == 21: break
27 | print idx
28 | q1 = data_train.qid1[idx]
29 | q2 = data_train.qid2[idx]
30 |
31 | qid1.append(q1)
32 | qid2.append(q2)
33 | question1.append(data_train.question1[idx])
34 | question2.append(data_train.question2[idx])
35 | labels.append(data_train.is_duplicate[idx])
36 |
37 | return qid1, qid2, question1, question2, labels
38 |
39 | def load_doc2vec(word2vecpath):
40 | f = open(word2vecpath)
41 | embeddings_index = {}
42 | count = 0
43 | for line in f:
44 | # count += 1
45 | # if count == 10000: break
46 | values = line.split('\t')
47 | id = values[0]
48 | print id
49 | coefs = np.asarray(values[1].split(), dtype='float32')
50 | embeddings_index[int(id)+1] = coefs
51 | f.close()
52 | print('Total %s doc2vec vectors.' % len(embeddings_index))
53 |
54 | return embeddings_index
55 |
56 | def sentence_represention(qid, embeddings_index):
57 | vectors = np.zeros((len(qid), 100))
58 | for i in range(len(qid)):
59 | print i
60 | vectors[i] = embeddings_index.get(qid[i])
61 |
62 | return vectors
63 |
64 | def main():
65 | start = datetime.datetime.now()
66 | datapath = 'D:/dataset/quora/quora_duplicate_questions_Chinese_seg.tsv'
67 | doc2vecpath = "D:/dataset/quora/vector2/quora_duplicate_question_doc2vec_100.vector"
68 | qid1, qid2, question1, question2, labels = load_data(datapath)
69 |
70 | embeddings_index = load_doc2vec(word2vecpath=doc2vecpath)
71 | vectors1 = sentence_represention(qid1, embeddings_index)
72 | vectors2 = sentence_represention(qid2, embeddings_index)
73 | vectors = np.hstack((vectors1, vectors2))
74 | labels = np.array(labels)
75 | VALIDATION_SPLIT = 10000
76 | VALIDATION_SPLIT0 = 1000
77 | indices = np.arange(vectors.shape[0])
78 | np.random.shuffle(indices)
79 | vectors = vectors[indices]
80 | labels = labels[indices]
81 | train_vectors = vectors[:-VALIDATION_SPLIT]
82 | train_labels = labels[:-VALIDATION_SPLIT]
83 | test_vectors = vectors[-VALIDATION_SPLIT:]
84 | test_labels = labels[-VALIDATION_SPLIT:]
85 | # train_vectors = vectors[:VALIDATION_SPLIT0]
86 | # train_labels = labels[:VALIDATION_SPLIT0]
87 | # test_vectors = vectors[-VALIDATION_SPLIT0:]
88 | # test_labels = labels[-VALIDATION_SPLIT0:]
89 |
90 | randomforest = RandomForestClassifier()
91 | print '***********************training************************'
92 | randomforest.fit(train_vectors, train_labels)
93 |
94 | print '***********************predict*************************'
95 | prediction = randomforest.predict(test_vectors)
96 | accuracy = metrics.accuracy_score(test_labels, prediction)
97 | print accuracy
98 |
99 | end = datetime.datetime.now()
100 | print end-start
101 |
102 |
103 | if __name__ == '__main__':
104 | main() # train one model on the whole dataset
--------------------------------------------------------------------------------
/ParaPhrase_gbdt.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import numpy as np
3 | import csv
4 | import datetime
5 | from sklearn.ensemble import GradientBoostingClassifier
6 | import os
7 | import pandas as pd
8 | from sklearn import metrics
9 | cwd = os.getcwd()
10 |
11 | def load_data(datapath):
12 | data_train = pd.read_csv(datapath, sep='\t', encoding='utf-8')
13 | print data_train.shape
14 |
15 | qid1 = []
16 | qid2 = []
17 | labels = []
18 | count = 0
19 | for idx in range(data_train.id.shape[0]):
20 | # for idx in range(400):
21 | # count += 1
22 | # if count == 21: break
23 | print idx
24 | q1 = data_train.qid1[idx]
25 | q2 = data_train.qid2[idx]
26 |
27 | qid1.append(q1)
28 | qid2.append(q2)
29 | labels.append(data_train.is_duplicate[idx])
30 |
31 | return qid1, qid2, labels
32 |
33 | def load_doc2vec(word2vecpath):
34 | f = open(word2vecpath)
35 | embeddings_index = {}
36 | count = 0
37 | for line in f:
38 | # count += 1
39 | # if count == 10000: break
40 | values = line.split('\t')
41 | id = values[0]
42 | print id
43 | coefs = np.asarray(values[1].split(), dtype='float32')
44 | embeddings_index[int(id)+1] = coefs
45 | f.close()
46 | print('Total %s doc2vec vectors.' % len(embeddings_index))
47 |
48 | return embeddings_index
49 |
50 | def sentence_represention(qid, embeddings_index):
51 | vectors = np.zeros((len(qid), 100))
52 | for i in range(len(qid)):
53 | print i
54 | vectors[i] = embeddings_index.get(qid[i])
55 |
56 | return vectors
57 |
58 | def main():
59 | start = datetime.datetime.now()
60 | datapath = 'D:/dataset/quora/quora_duplicate_questions_Chinese_seg.tsv'
61 | doc2vecpath = "D:/dataset/quora/vector2/quora_duplicate_question_doc2vec_100.vector"
62 | qid1, qid2, labels = load_data(datapath)
63 | embeddings_index = load_doc2vec(word2vecpath=doc2vecpath)
64 | vectors1 = sentence_represention(qid1, embeddings_index)
65 | vectors2 = sentence_represention(qid2, embeddings_index)
66 | vectors = np.hstack((vectors1, vectors2))
67 | labels = np.array(labels)
68 | VALIDATION_SPLIT = 10000
69 | VALIDATION_SPLIT0 = 1000
70 | indices = np.arange(vectors.shape[0])
71 | np.random.shuffle(indices)
72 | vectors = vectors[indices]
73 | labels = labels[indices]
74 | train_vectors = vectors[:-VALIDATION_SPLIT]
75 | train_labels = labels[:-VALIDATION_SPLIT]
76 | test_vectors = vectors[-VALIDATION_SPLIT:]
77 | test_labels = labels[-VALIDATION_SPLIT:]
78 | # train_vectors = vectors[:VALIDATION_SPLIT0]
79 | # train_labels = labels[:VALIDATION_SPLIT0]
80 | # test_vectors = vectors[-VALIDATION_SPLIT0:]
81 | # test_labels = labels[-VALIDATION_SPLIT0:]
82 |
83 | gbdt = GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
84 | max_depth=3, max_features=None, max_leaf_nodes=None,
85 | min_samples_leaf=1, min_samples_split=2,
86 | min_weight_fraction_leaf=0.0, n_estimators=100,
87 | random_state=None, subsample=1.0, verbose=0,
88 | warm_start=False)
89 | print '***********************training************************'
90 | gbdt.fit(train_vectors, train_labels)
91 |
92 | print '***********************predict*************************'
93 | prediction = gbdt.predict(test_vectors)
94 | accuracy = metrics.accuracy_score(test_labels, prediction)
95 | acc = gbdt.score(test_vectors, test_labels)
96 | print accuracy
97 | print acc
98 |
99 | end = datetime.datetime.now()
100 | print end-start
101 |
102 |
103 | if __name__ == '__main__':
104 | main() # train one model on the whole dataset
--------------------------------------------------------------------------------