├── ABCNN
    ├── ABCNN-1.png
    ├── ABCNN-2.png
    ├── ABCNN-3.png
    ├── BCNN.png
    ├── Data_Generate.py
    ├── README.md
    ├── __pycache__
    │   ├── config.cpython-36.pyc
    │   └── model.cpython-36.pyc
    ├── config.py
    ├── data
    │   ├── idf.npy
    │   ├── label.txt
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── BIMPM
    ├── BiMPM.png
    ├── Data_Generate.py
    ├── README.md
    ├── config.py
    ├── data
    │   ├── char.txt
    │   ├── label.txt
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── BM25
    ├── BM25.png
    ├── BM25.py
    └── README.md
├── DSSM-BOW
    ├── DSSM.png
    ├── Data_Generate.py
    ├── README.md
    ├── __pycache__
    │   ├── config.cpython-36.pyc
    │   └── model.cpython-36.pyc
    ├── config.py
    ├── data
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── DSSM-CNN
    ├── DSSM-CNN.png
    ├── Data_Generate.py
    ├── README.md
    ├── config.py
    ├── data
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── DSSM-Embedding
    ├── DSSM-Embedding.png
    ├── Data_Generate.py
    ├── README.md
    ├── config.py
    ├── data
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── DSSM-RNN
    ├── DSSM-LSTM.png
    ├── Data_Generate.py
    ├── README.md
    ├── config.py
    ├── data
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── ESIM
    ├── Data_Generate.py
    ├── ESIM.png
    ├── README.md
    ├── config.py
    ├── data
    │   ├── label.txt
    │   ├── stopwords.txt
    │   └── vocab.txt
    ├── main.py
    └── model.py
├── Edit_Distance
    ├── Edit-Distance.png
    ├── README.md
    └── edit_distance.py
├── Jaccard
    ├── README.md
    └── jaccard.py
├── README.md
└── Result.png


/ABCNN/ABCNN-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/ABCNN-1.png


--------------------------------------------------------------------------------
/ABCNN/ABCNN-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/ABCNN-2.png


--------------------------------------------------------------------------------
/ABCNN/ABCNN-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/ABCNN-3.png


--------------------------------------------------------------------------------
/ABCNN/BCNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/BCNN.png


--------------------------------------------------------------------------------
/ABCNN/Data_Generate.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:19 
 4 | # @Author: Zhang Cong
 5 | 
 6 | import logging
 7 | from tqdm import tqdm
 8 | 
 9 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
10 | 
def generate_data(input_file_path, output_file_path):
    '''
    Convert a raw tab-separated dataset into the train/eval text format.

    Every data row is expected to hold at least three tab-separated columns:
    sentence 1, sentence 2 and the label. The header row (third column equal
    to the literal string "label") is dropped; each remaining row is written
    as the three columns joined by tabs, one row per line.

    :param input_file_path: path of the raw TSV file to read (UTF-8)
    :param output_file_path: path of the converted file to write (UTF-8, overwritten)
    :return: None; the converted rows are stored at output_file_path
    '''
    logging.info('Start get all sentence ...')
    # Context managers make sure both handles are flushed and closed, even if
    # a row raises half-way through the conversion.
    with open(input_file_path, encoding='utf-8') as input_file, \
            open(output_file_path, mode='w', encoding='utf-8') as output_file:
        for line in tqdm(input_file):
            fields = line.rstrip('\n').split('\t')
            # Blank or truncated rows have no label column; skip them rather
            # than failing with an IndexError.
            if len(fields) < 3:
                continue
            sentence_1, sentence_2, label = fields[:3]
            if label == 'label':
                # Header row of the original TSV.
                continue
            output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
28 | 
29 | 
def check_data(input_file_path):
    '''
    Count the rows labelled 1 in a tab-separated dataset, to check how
    balanced the 0/1 labels are.

    The header row (third column equal to the literal string "label") and
    rows with fewer than three columns are ignored.

    :param input_file_path: path of the TSV file to inspect (UTF-8)
    :return: number of rows whose third column parses to the integer 1
             (the same number is also printed)
    '''
    count = 0
    # The context manager closes the handle once counting is done.
    with open(input_file_path, encoding='utf-8') as input_file:
        for line in tqdm(input_file):
            fields = line.rstrip('\n').split('\t')
            # Skip blank/truncated rows and the header row.
            if len(fields) < 3 or fields[2] == 'label':
                continue
            if int(fields[2]) == 1:
                count += 1
    print(count)
    return count
44 | 
45 | 
if __name__ == '__main__':

    # Optional sanity check of the label distribution:
    # file_path = './data/lcqmc/lcqmc_train.tsv'
    # check_data(file_path)

    # (raw TSV, converted output, message logged once the conversion is done)
    jobs = (
        # training set
        ('./data/lcqmc/lcqmc_train.tsv', './data/train.txt', 'Success generate train.txt'),
        # validation set
        ('./data/lcqmc/lcqmc_dev.tsv', './data/dev.txt', 'Success generate dev.txt'),
        # test set (disabled; the original disabled snippet called a
        # `generate_test_data` helper that this file does not define)
        # ('./data/lcqmc/lcqmc_test.tsv', './data/test.txt', 'Success generate test.txt'),
    )
    for input_file_path, output_file_path, done_message in jobs:
        generate_data(input_file_path, output_file_path)
        logging.info(done_message)
69 | 
70 | 


--------------------------------------------------------------------------------
/ABCNN/README.md:
--------------------------------------------------------------------------------
 1 | ## ABCNN (ABCNN: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集有版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### sentence_1 \t sentence_2 \t label
10 | 
11 | 
12 | ### 文件解释
13 | * main.py —— 主文件
14 | * model.py —— 模型结构
15 | * config.py —— 配置参数
16 | * Data_Generate.py —— 数据集处理脚本
17 | * /data —— 数据存放文件夹
18 | * /save_model —— 模型存储文件夹
19 | 
20 | 
21 | ### 模型结构
22 | ![avatar](./BCNN.png)
23 | * BCNN的B代表Basic-Bi。它由四个部分组成：输入层(embedding)，卷积层(convolution)，池化层(pooling)，输出层(Logistic)
24 | 
25 | ![avatar](./ABCNN-1.png)
26 | * ABCNN-1 通过对输入句子的向量表示进行attention操作，从而影响卷积网络，也即它的attention是在卷积操作之前进行的。
27 | 
28 | ![avatar](./ABCNN-2.png)
29 | * ABCNN-2是对conv层的输出进行attention，从而对卷积层的输出结果进行加权。attention矩阵的计算方式与ABCNN-1相同，计算完attention矩阵之后，需要分别为两个句子计算它们的conv输出和attention矩阵Average Pooling(如上图中的两个虚线部分，它们中的每个元素分别代表了相应单词针对attention矩阵的行和列分别做Average Pooling的权重)的乘积。
30 | 
31 | ![avatar](./ABCNN-3.png)
32 | * 理解完ABCNN-1和ABCNN-2，ABCNN-3就容易理解了，它就是将上面的两个结构进行叠加，结构如上。
33 | 
34 | 
35 | ### 参考资料
36 | * ABCNN: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs (https://arxiv.org/abs/1512.05193)
37 | * https://zhuanlan.zhihu.com/p/50160263
38 | 
39 | 


--------------------------------------------------------------------------------
/ABCNN/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/__pycache__/config.cpython-36.pyc


--------------------------------------------------------------------------------
/ABCNN/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/__pycache__/model.cpython-36.pyc


--------------------------------------------------------------------------------
/ABCNN/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.idf_path = './data/idf.npy'
16 |         self.model_save_path = './save_model/'
17 |         self.model_type = 'BCNN'    # BCNN ABCNN1 ABCNN2 ABCNN3
18 |         self.vocab_size = 2000
19 |         self.embedding_dim = 300
20 |         self.seq_length = 20
21 |         self.feature_size = 4
22 |         self.learning_rate = 1e-5
23 |         self.l2_reg = 0.0004
24 |         self.keep_prob = 0.5
25 |         self.hidden_dim = 256
26 |         self.kernel_size = 3
27 |         self.num_classes = 2
28 |         self.num_layers = 2
29 |         self.batch_size = 32
30 |         self.epochs = 1000


--------------------------------------------------------------------------------
/ABCNN/data/idf.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ABCNN/data/idf.npy


--------------------------------------------------------------------------------
/ABCNN/data/label.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 1
3 | 


--------------------------------------------------------------------------------
/ABCNN/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/ABCNN/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/6/27 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | import numpy as np
  7 | import tensorflow as tf
  8 | from config import Config
  9 | 
 10 | class Model():
    def __init__(self):
        '''
        Build the whole TensorFlow graph: input placeholders, the shared embedding,
        one or two convolution stacks, the output layer, and the score / accuracy /
        loss / optimizer ops.
        '''
        # Hyper-parameters and file locations
        self.config = Config()
        # Query token ids, [batch_size, seq_length] (used as ids by embedding_lookup below)
        self.input_query = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-query")
        # Doc token ids, [batch_size, seq_length]
        self.input_doc = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-doc")
        # Labels, [batch_size, num_classes]; argmax over axis 1 is taken as the true class below
        self.input_label = tf.placeholder(shape=[None, self.config.num_classes], dtype=tf.int32, name="input-label")
        # Extra hand-crafted features; per the original author's note:
        # [length of text 1, length of text 2, character intersection of both texts, sum(IDF)]
        self.input_feature = tf.placeholder(tf.float32, shape=[None, self.config.feature_size], name="input-features")

        # Embedding layer (one table shared by query and doc)
        self.embedding = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding')

        # Map token ids to vectors: [batch_size, seq_length, embedding_dim]
        self.input_query_emb = tf.nn.embedding_lookup(params=self.embedding, ids=self.input_query, name='input-query-emb')
        self.input_doc_emb = tf.nn.embedding_lookup(params=self.embedding, ids=self.input_doc, name='input-doc-emb')

        # Add a trailing channel axis: [batch_size, seq_length, embedding_dim, 1]
        self.input_query_emb = tf.expand_dims(input=self.input_query_emb, axis=-1)
        self.input_doc_emb = tf.expand_dims(input=self.input_doc_emb, axis=-1)

        # Global ("all") pooling of the raw embedded inputs
        input_query_all_pool = self.all_pool(variable_scope="input-left", x=self.input_query_emb)
        input_doc_all_pool = self.all_pool(variable_scope="input-right", x=self.input_doc_emb)

        # First stack: padding + wide convolution (CNN_layer is defined elsewhere in this class)
        left_wp, left_ap, right_wp, right_ap = self.CNN_layer(variable_scope="CNN-1",
                                                              query=self.input_query_emb,
                                                              doc=self.input_doc_emb,
                                                              dim=self.config.embedding_dim)

        # Take the all-pool result of every conv stack and use the query/doc cosine as an extra feature
        sims = [self.cos_sim(input_query_all_pool, input_doc_all_pool), self.cos_sim(left_ap, right_ap)]

        # Optional second conv layer (the original paper uses at most 2 layers)
        if self.config.num_layers > 1:
            left_wp, left_ap, right_wp, right_ap = self.CNN_layer(variable_scope="CNN-2",
                                                                  query=left_wp,
                                                                  doc=right_wp,
                                                                  dim=self.config.hidden_dim)
            # Add the cosine feature produced by the second layer to the feature list
            sims.append(self.cos_sim(left_ap, right_ap))

        with tf.variable_scope("output-layer"):
            # Concatenate the extra features with the per-level similarity features.
            # Width is feature_size + len(sims), i.e. 7 with the default config (4 + 3).
            self.output_features = tf.concat([self.input_feature, tf.stack(sims, axis=1)], axis=1, name="output_features")
            # Fully connected layer producing one logit per class
            self.logits = tf.contrib.layers.fully_connected(
                inputs=self.output_features,
                num_outputs=self.config.num_classes,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=tf.contrib.layers.l2_regularizer(scale=self.config.l2_reg),
                biases_initializer=tf.constant_initializer(1e-04),
                scope="FC"
            )

            # Alternative kept by the original author: the same layer via tf.layers.dense
            # tf.layers.dense(inputs=self.output_features,
            #                 units=self.config.num_classes,
            #                 activation=None,
            #                 kernel_initializer=contrib.layers.xavier_initializer(),
            #                 kernel_regularizer=contrib.layers.l2_regularizer(scale=self.config.l2_reg),
            #                 bias_initializer=tf.constant_initializer(1e-04),
            #                 name='FC')

        # Per-class scores (softmax over the logits)
        self.score = tf.nn.softmax(self.logits, name='score')
        # Predicted class index
        self.predict = tf.argmax(self.score, axis=1, name='predict')
        # Accuracy: fraction of rows whose label argmax equals the prediction
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.input_label, axis=1), self.predict), dtype=tf.float32),name='accuracy')
        # Loss: mean softmax cross-entropy plus the sum of the collected regularization losses
        self.loss = tf.add(
            tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_label)),
            tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            name="loss")
        # Training op: Adam minimising the loss
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate, name="optimizer").minimize(self.loss)
 87 | 
 88 | 
 89 |     def pad_for_wide_conv(self, x):
 90 |         '''
 91 |         对input进行padding，为宽卷积做预处理，在文本的首尾都填充 kernel_size - 1 个0
 92 |         :param x: 输入向量[batch_size, seq_length, hidden_dim, channel)]
 93 |         :return:
 94 |         '''
 95 |         return tf.pad(tensor=x,
 96 |                       paddings=np.array([[0, 0], [self.config.kernel_size - 1, self.config.kernel_size - 1], [0, 0], [0, 0]]),
 97 |                       mode="CONSTANT",
 98 |                       name="pad_wide_conv")
 99 | 
100 | 
101 |     def cos_sim(self, v1, v2):
102 |         '''
103 |         计算cosin值
104 |         :param v1: 输入向量1 [v1, v2, v3 ...]
105 |         :param v2: 输入向量2 [v1, v2, v3 ...]
106 |         :return:
107 |         '''
108 |         norm1 = tf.sqrt(tf.reduce_sum(tf.square(v1), axis=1))
109 |         norm2 = tf.sqrt(tf.reduce_sum(tf.square(v2), axis=1))
110 |         dot_products = tf.reduce_sum(v1 * v2, axis=1, name="cos_sim")
111 | 
112 |         return dot_products / (norm1 * norm2)
113 | 
114 | 
115 |     def euclidean_score(self, v1, v2):
116 |         '''
117 |         计算attention weight
118 |         原始论文提出的计算attention的方法，在实际过程中反向传播计算梯度时 容易出现NaN的情况
119 |         :param v1: 矩阵1 [batch_size, seq_length, hidden]
120 |         :param v2: 矩阵2 [batch_size, seq_length, hidden]
121 |         :return:
122 |         '''
123 |         euclidean = tf.sqrt(tf.reduce_sum(tf.square(v1 - v2), axis=1))
124 |         return 1 / (1 + euclidean)
125 | 
126 | 
127 |     def make_attention_mat(self, x1, x2):
128 |         '''
129 |         计算attention weight
130 |         作者论文中提出计算attention的方法 在实际过程中反向传播计算梯度时 容易出现NaN的情况 这里面加以修改
131 |         :param x1: 矩阵1 [batch_size, seq_length, hidden]
132 |         :param x2: 矩阵2 [batch_size, seq_length, hidden]
133 |         :return:
134 |         '''
135 |         x2 = tf.transpose(tf.squeeze(x2, axis=-1), [0, 2, 1])
136 |         attention = tf.einsum("ijk,ikl->ijl", tf.squeeze(x1, axis=-1), x2)
137 |         return attention
138 | 
139 | 
140 |     def convolution(self, name_scope, x, dim, reuse):
141 |         '''
142 |         卷积层函数
143 |         :param name_scope: 该操作所属变量空间
144 |         :param x: 输入四维矩阵[batch_size, seq_length, hidden_dim, channel]
145 |         :param dim: 卷积核宽度（词向量大小 or 隐藏层向量大小）
146 |         :param reuse: 是否复用，与已存在的相同命名层共享参数
147 |         :return:
148 |         '''
149 |         with tf.name_scope(name_scope + "-conv"):
150 |             with tf.variable_scope("conv") as scope:
151 |                 conv = tf.contrib.layers.conv2d(
152 |                     inputs=x,
153 |                     num_outputs=self.config.hidden_dim,
154 |                     kernel_size=(self.config.kernel_size, dim),
155 |                     stride=1,
156 |                     padding="VALID",
157 |                     activation_fn=tf.nn.tanh,
158 |                     weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
159 |                     weights_regularizer=tf.contrib.layers.l2_regularizer(scale=self.config.l2_reg),
160 |                     biases_initializer=tf.constant_initializer(1e-04),
161 |                     reuse=reuse,
162 |                     trainable=True,
163 |                     scope=scope
164 |                 )
165 |                 conv = tf.transpose(conv, [0, 1, 3, 2], name="conv_trans")
166 |                 return conv
167 | 
168 | 
169 |     def w_pool(self, variable_scope, x, attention):
170 |         '''
171 |         权重池化函数
172 |         :param variable_scope: 命名空间
173 |         :param x: 输入向量 [batch, s+w-1, dim, 1]
174 |         :param attention: 注意力权重 [batch, s+w-1]
175 |         :return:
176 |         '''
177 |         model_type = self.config.model_type
178 |         with tf.variable_scope(variable_scope + "-w_pool"):
179 |             if model_type == "ABCNN2" or model_type == "ABCNN3":
180 |                 pools = []
181 |                 # 维度扩充 [batch, s+w-1] => [batch, s+w-1, 1, 1]
182 |                 attention = tf.expand_dims(tf.expand_dims(attention, -1), -1)
183 |                 # 进行加权的池化
184 |                 for i in range(self.config.seq_length):
185 |                     # [batch, w, dim, 1], [batch, w, 1, 1] => [batch, 1, dim, 1]
186 |                     kernel_size = self.config.kernel_size
187 |                     pools.append(tf.reduce_sum(x[:, i:i + kernel_size, :, :] * attention[:, i:i + kernel_size, :, :],
188 |                                                axis=1,
189 |                                                keep_dims=True))
190 | 
191 |                 # [batch, seq_length, dim, 1]
192 |                 w_ap = tf.concat(pools, axis=1, name="w_ap")
193 |             else:
194 |                 # 平均池化，[batch, seq_length, dim]
195 |                 w_ap = tf.layers.average_pooling2d(inputs=x,
196 |                                                    pool_size=(self.config.kernel_size, 1),
197 |                                                    strides=1,
198 |                                                    padding="VALID",
199 |                                                    name="w_ap")
200 |             return w_ap
201 | 
202 | 
203 |     def all_pool(self, variable_scope, x):
204 |         '''
205 |         全局池化函数
206 |         :param variable_scope: 变量空间
207 |         :param x: 输入向量 [batch_size, seq_length, hidden_dim, 1]
208 |         :return:
209 |         '''
210 |         with tf.variable_scope(variable_scope + "-all_pool"):
211 |             # 如果是对初始input进行all-pool
212 |             if variable_scope.startswith("input"):
213 |                 pool_width = self.config.seq_length     # 文本长度
214 |                 d = self.config.embedding_dim           # 词向量维度
215 | 
216 |             else:    # 如果是对中间卷积结果进行all-pool
217 |                 pool_width = self.config.seq_length + self.config.kernel_size - 1
218 |                 d = self.config.hidden_dim
219 | 
220 |             # 二维平均池化
221 |             all_ap = tf.layers.average_pooling2d(inputs=x,
222 |                                                  pool_size=(pool_width, 1),
223 |                                                  strides=1,
224 |                                                  padding='VALID',
225 |                                                  name='all_ap')
226 |             # [batch, hidden_dim]
227 |             all_ap_reshaped = tf.reshape(all_ap, [-1, d])
228 | 
229 |             return all_ap_reshaped
230 | 
231 | 
232 |     def CNN_layer(self, variable_scope, query, doc, dim):
233 |         '''
234 |         卷积层 pad + conv + pool
235 |         :param variable_scope: 变量空间
236 |         :param query: 输入的query向量
237 |         :param doc: 输入的doc向量
238 |         :param dim: 卷积核宽度（词向量大小 or 隐藏层向量大小）
239 |         :return:
240 |         '''
241 |         # x1, x2 = [batch, seq_length, embedding_dim, 1]    dim:hidden_dim
242 |         model_type = self.config.model_type
243 |         with tf.variable_scope(variable_scope):
244 |             if model_type == "ABCNN1" or model_type == "ABCNN3":
245 |                 with tf.name_scope("att_mat"):
246 |                     aW = tf.get_variable(name="aW",
247 |                                          shape=(self.config.seq_length, dim),
248 |                                          initializer=tf.contrib.layers.xavier_initializer(),
249 |                                          regularizer=tf.contrib.layers.l2_regularizer(scale=self.config.l2_reg))
250 | 
251 |                     # attention weight [batch, seq_length, seq_length]
252 |                     att_mat = self.make_attention_mat(query, doc)
253 |                     query_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", att_mat, aW), axis=-1)    # attention交互生成的新query embedding
254 |                     doc_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW), axis=-1) # attention交互生成的新doc embedding
255 |                     # [batch, d, s, 2]
256 |                     query = tf.concat([query, query_a], axis=3)     # 新embedding与旧embedding在第三维度上进行组合拼接
257 |                     doc = tf.concat([doc, doc_a], axis=3)
258 | 
259 |             # 进行pad + 宽卷积
260 |             left_conv = self.convolution(name_scope="left", x=self.pad_for_wide_conv(query), dim=dim, reuse=False)
261 |             right_conv = self.convolution(name_scope="right", x=self.pad_for_wide_conv(doc), dim=dim, reuse=True)
262 | 
263 |             left_attention, right_attention = None, None
264 | 
265 |             if model_type == "ABCNN2" or model_type == "ABCNN3":
266 |                 # [batch, s+w-1, s+w-1]
267 |                 att_mat = self.make_attention_mat(left_conv, right_conv)
268 |                 # 获取left和right的attention权重进行加权的池化操作，[batch, s+w-1], [batch, s+w-1]
269 |                 left_attention, right_attention = tf.reduce_sum(att_mat, axis=2), tf.reduce_sum(att_mat, axis=1)
270 | 
271 |             # 进行池化处理
272 |             left_wp = self.w_pool(variable_scope="left", x=left_conv, attention=left_attention)
273 |             left_ap = self.all_pool(variable_scope="left", x=left_conv)
274 |             right_wp = self.w_pool(variable_scope="right", x=right_conv, attention=right_attention)
275 |             right_ap = self.all_pool(variable_scope="right", x=right_conv)
276 | 
277 |             return left_wp, left_ap, right_wp, right_ap
278 | 
279 | 
280 | 
if __name__ == '__main__':
    # Instantiate Model once when this file is executed directly as a script.
    Model()


--------------------------------------------------------------------------------
/BIMPM/BiMPM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/BIMPM/BiMPM.png


--------------------------------------------------------------------------------
/BIMPM/Data_Generate.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:19 
 4 | # @Author: Zhang Cong
 5 | 
 6 | import logging
 7 | from tqdm import tqdm
 8 | 
 9 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
10 | 
def generate_data(input_file_path, output_file_path):
    '''
    Convert a raw tab-separated dataset file into the train/eval text format.

    Every input row is expected to look like ``sentence_1<TAB>sentence_2<TAB>label``.
    The header row (third field equal to ``label``) is dropped; every other
    row is written out again as its first three fields joined by tabs.
    Rows with fewer than three fields (e.g. blank lines) are skipped with a
    warning instead of aborting the conversion.

    :param input_file_path: path of the raw data file
    :param output_file_path: path of the converted file (overwritten if it exists)
    :return: None; the converted data is written to ``output_file_path``
    '''
    logging.info('Start get all sentence ...')
    # The input is opened first so that a missing input file does not truncate
    # an existing output file; both handles are closed (and the output flushed)
    # when the block exits, even on error.
    with open(input_file_path, encoding='utf-8') as input_file, \
            open(output_file_path, mode='w', encoding='utf-8') as output_file:
        for line_number, line in enumerate(tqdm(input_file), start=1):
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3:
                logging.warning('Skip malformed line %d: %r', line_number, line)
                continue
            if fields[2] == 'label':
                # header row
                continue
            # Fields come from split('\t'), so they cannot contain tabs themselves.
            output_file.write('\t'.join(fields[:3]) + '\n')
28 | 
29 | 
def check_data(input_file_path):
    '''
    Print how many rows of a raw dataset file carry the label 1.

    Used to check whether the 0/1 labels are balanced. The header row (third
    field equal to ``label``) is ignored, and rows with fewer than three
    fields (e.g. blank lines) are skipped with a warning. A third field that
    is not an integer still raises ``ValueError``.

    :param input_file_path: path of the tab-separated data file
    :return: None; the count of label-1 rows is printed
    '''
    count = 0
    # The context manager closes the file handle even if a row fails to parse.
    with open(input_file_path, encoding='utf-8') as input_file:
        for line_number, line in enumerate(tqdm(input_file), start=1):
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3:
                logging.warning('Skip malformed line %d: %r', line_number, line)
                continue
            if fields[2] == 'label':
                # header row
                continue
            if int(fields[2]) == 1:
                count += 1
    print(count)
44 | 
45 | 
if __name__ == '__main__':

    # Optional sanity check of the label distribution before converting:
    #   check_data('./data/lcqmc/lcqmc_train.tsv')

    # One entry per split to convert:
    #   (raw LCQMC file, converted output file, message logged on success)
    # The test split ('./data/lcqmc/lcqmc_test.tsv' -> './data/test.txt') is
    # not converted; it was disabled in the original script as well.
    conversion_jobs = (
        ('./data/lcqmc/lcqmc_train.tsv', './data/train.txt', 'Success generate train.txt'),
        ('./data/lcqmc/lcqmc_dev.tsv', './data/dev.txt', 'Success generate dev.txt'),
    )

    for input_file_path, output_file_path, success_message in conversion_jobs:
        generate_data(input_file_path, output_file_path)
        logging.info(success_message)
69 | 
70 | 


--------------------------------------------------------------------------------
/BIMPM/README.md:
--------------------------------------------------------------------------------
 1 | ## BiMPM (Bilateral Multi-Perspective Matching for Natural Language Sentences)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集有版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### sentence_1 \t sentence_2 \t label
10 | 
11 | 
12 | ### 文件解释
13 | * main.py —— 主文件
14 | * model.py —— 模型结构
15 | * config.py —— 配置参数
16 | * Data_Generate.py —— 数据集处理脚本
17 | * /data —— 数据存放文件夹
18 | * /save_model —— 模型存储文件夹
19 | 
20 | 
21 | ### 模型结构
22 | ![avatar](./BiMPM.png)
* Our model essentially belongs to the “matching-aggregation” framework. Given two sentences P and Q, our model first encodes them with a bidirectional Long Short-Term Memory Network (BiLSTM). Next, we match the two encoded sentences in two directions P→Q and P←Q. In each matching direction, let’s say P→Q, each time step of Q is matched against all time-steps of P from multiple perspectives.
* BiMPM这个模型最大的创新点在于采用了双向多角度匹配，不单单只考虑一个维度，采用了matching-aggregation的结构，把两个句子之间的单元做了四种相似度计算，最后经过全连接层与softmax层得到最终的结果，不过这也成了其缺点，慢。
25 | * 考虑局部推断和全局推断
26 | 
27 | 
28 | ### 参考资料
29 | * Bilateral Multi-Perspective Matching for Natural Language Sentences (https://arxiv.org/abs/1702.03814)
30 | * https://zhuanlan.zhihu.com/p/50184415
31 | 
32 | 


--------------------------------------------------------------------------------
/BIMPM/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/7/16 22:57
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.char_path = './data/char.txt'
15 |         self.label_path = './data/label.txt'
16 |         self.model_save_path = './save_model/'
17 |         self.rnn_type = 'lstm'
18 |         self.vocab_size = 2000
19 |         self.char_size = 2000
20 |         self.embedding_dim = 300
21 |         self.seq_length = 20
22 |         self.learning_rate = 1e-5
23 |         self.l2_reg = 0.0004
24 |         self.keep_prob = 0.5
25 |         self.hidden_dim = 256
26 |         self.fc_hidden_dim_1 = 10000
27 |         self.fc_hidden_dim_2 = 512
28 |         self.num_perspective = 12
29 |         self.num_classes = 2
30 |         self.batch_size = 32
31 |         self.epochs = 100


--------------------------------------------------------------------------------
/BIMPM/data/label.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 1
3 | 


--------------------------------------------------------------------------------
/BIMPM/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/BIMPM/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/7/16 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | import tensorflow as tf
  7 | from config import Config
  8 | import tensorflow.contrib as contrib
  9 | 
 10 | class Model():
 11 |     def __init__(self):
 12 |         self.config = Config()                                                                                                   # 读取配置参数
 13 |         self.input_query_word = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-query-word")    # 输入query，One-Hot形式
 14 |         self.input_doc_word = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-doc-word")        # 输入doc，One-Hot形式
 15 |         self.input_query_char = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-query-char")    # 输入query，One-Hot形式
 16 |         self.input_doc_char = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-doc-char")        # 输入doc，One-Hot形式
 17 |         self.input_label = tf.placeholder(shape=[None, self.config.num_classes], dtype=tf.int32, name="input-label")             # 输入 label
 18 |         self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')                                          # keep-prob
 19 | 
 20 |         # Embedding layer
 21 |         self.embedding_word = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding-word')
 22 |         self.embedding_char = tf.get_variable(shape=[self.config.char_size, self.config.embedding_dim], dtype=tf.float32, name='embedding-char')
 23 | 
 24 |         # 将词汇映射为向量形式 [batch_size, seq_length, embedding_dim]
 25 |         self.input_query_word_emb = tf.nn.embedding_lookup(params=self.embedding_word, ids=self.input_query_word, name='input-query-word-emb')
 26 |         self.input_doc_word_emb = tf.nn.embedding_lookup(params=self.embedding_word, ids=self.input_doc_word, name='input-doc-word-emb')
 27 | 
 28 |         # ----- Word Representation Layer -----
 29 |         # 将字符映射为向量形式 [batch_size, seq_length, embedding_dim]
 30 |         self.input_query_char_emb = tf.nn.embedding_lookup(params=self.embedding_char, ids=self.input_query_char, name='input-query-char-emb')
 31 |         self.input_doc_char_emb = tf.nn.embedding_lookup(params=self.embedding_char, ids=self.input_doc_char, name='input-doc-char-emb')
 32 | 
 33 |         # 将字符传入LSTM后作为char embedding
 34 |         input_query_char_emb = self.uni_directional_rnn(input_data=self.input_query_char_emb,
 35 |                                                              num_units=self.config.hidden_dim,
 36 |                                                              rnn_type=self.config.rnn_type,
 37 |                                                              scope='rnn-query-char')
 38 |         input_doc_char_emb = self.uni_directional_rnn(input_data=self.input_doc_char_emb,
 39 |                                                            num_units=self.config.hidden_dim,
 40 |                                                            rnn_type=self.config.rnn_type,
 41 |                                                            scope='rnn-doc-char')
 42 |         # 将生成的char embedding 与 word embedding进行拼接 [batch_size, seq_length, word_embediing + char_hidden_dim]
 43 |         self.query_embedding = tf.concat([input_query_char_emb, self.input_query_word_emb], axis=-1)
 44 |         self.doc_embedding = tf.concat([input_doc_char_emb, self.input_doc_word_emb], axis=-1)
 45 | 
 46 |         # dropout layer
 47 |         self.query_embedding = tf.nn.dropout(self.query_embedding, keep_prob=self.input_keep_prob)
 48 |         self.doc_embedding = tf.nn.dropout(self.doc_embedding, keep_prob=self.input_keep_prob)
 49 | 
 50 |         # ----- Context Representation Layer -----
 51 |         # 对query和doc向量进行Bi-LSTM处理
 52 |         query_fw, query_bw = self.bi_directional_rnn(input_data=self.query_embedding,
 53 |                                                      num_units=self.config.hidden_dim,
 54 |                                                      rnn_type=self.config.rnn_type,
 55 |                                                      scope='bi-rnn-query-char')
 56 |         doc_fw, doc_bw = self.bi_directional_rnn(input_data=self.doc_embedding,
 57 |                                                  num_units=self.config.hidden_dim,
 58 |                                                  rnn_type=self.config.rnn_type,
 59 |                                                  scope='bi-rnn-doc-char')
 60 | 
 61 |         # dropout layer
 62 |         query_fw = tf.nn.dropout(query_fw, keep_prob=self.input_keep_prob)
 63 |         query_bw = tf.nn.dropout(query_bw, keep_prob=self.input_keep_prob)
 64 |         doc_fw = tf.nn.dropout(doc_fw, keep_prob=self.input_keep_prob)
 65 |         doc_bw = tf.nn.dropout(doc_bw, keep_prob=self.input_keep_prob)
 66 | 
 67 |         # ----- Matching Layer -----
 68 |         # 1、Full-Matching
 69 |         # 这个匹配策略是对于query中Bi-LSTM的每个时间步与doc中Bi-LSTM的最后一个时间步计算相似度（既有前向也有后向），然后doc的每个
 70 |         # 时间步与query的最后一个时间步计算相似度
 71 |         w1 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w1')
 72 |         w2 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w2')
 73 |         query_full_fw = self.full_matching(query_fw, tf.expand_dims(doc_fw[:, -1, :], 1), w1)
 74 |         query_full_bw = self.full_matching(query_bw, tf.expand_dims(doc_bw[:, 0, :], 1), w2)
 75 |         doc_full_fw = self.full_matching(doc_fw, tf.expand_dims(query_fw[:, -1, :], 1), w1)
 76 |         doc_full_bw = self.full_matching(doc_bw, tf.expand_dims(query_bw[:, 0, :], 1), w2)
 77 | 
 78 |         # 2、Maxpooling-Matching
 79 |         # 这个匹配策略对于P中BiLSTM的每个时间步与Q中BiLSTM的每个时间步分别计算相似度，然后只返回最大的一个相似度
 80 |         w3 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w3')
 81 |         w4 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w4')
 82 |         max_fw = self.maxpool_matching(query_fw, doc_fw, w3)
 83 |         max_bw = self.maxpool_matching(query_bw, doc_bw, w4)
 84 | 
 85 |         # 3、Attentive-Matching
 86 |         # 这个匹配策略先计算P和Q中BiLSTM中每个时间步的cosine(传统的)相似度，生成一个相关性矩阵，然后用这个相关矩阵计算Q的加权求和（如果是P-->Q），
 87 |         # 最后用P的每个时间步分别于Q的加权求和计算相似度
 88 |         # 计算权重即相似度矩阵（普通Cosine）
 89 |         fw_cos = self.cosine(query_fw, doc_fw)
 90 |         bw_cos = self.cosine(query_bw, doc_bw)
 91 | 
 92 |         # 计算attentive vector 加权求和
 93 |         query_att_fw = tf.matmul(fw_cos, query_fw)
 94 |         query_att_bw = tf.matmul(bw_cos, query_bw)
 95 |         doc_att_fw = tf.matmul(fw_cos, doc_fw)
 96 |         doc_att_bw = tf.matmul(bw_cos, doc_bw)
 97 |         # 标准化，除以权重和
 98 |         query_mean_fw = tf.divide(query_att_fw, tf.reduce_sum(fw_cos, axis=2, keep_dims=True))
 99 |         query_mean_bw = tf.divide(query_att_bw, tf.reduce_sum(bw_cos, axis=2, keep_dims=True))
100 |         doc_mean_fw = tf.divide(doc_att_fw, tf.reduce_sum(fw_cos, axis=2, keep_dims=True))
101 |         doc_mean_bw = tf.divide(doc_att_bw, tf.reduce_sum(fw_cos, axis=2, keep_dims=True))
102 |         # 计算match score
103 |         w5 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w5')
104 |         w6 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w6')
105 |         query_att_mean_fw = self.full_matching(query_fw, query_mean_fw, w5)
106 |         query_att_mean_bw = self.full_matching(query_bw, query_mean_bw, w6)
107 |         doc_att_mean_fw = self.full_matching(doc_fw, doc_mean_fw, w5)
108 |         doc_att_mean_bw = self.full_matching(doc_bw, doc_mean_bw, w6)
109 | 
110 |         # 4、Max-Attentive-Matching
111 |         # 这个和上面的attentive-matching很像，只不过这里不再是加权求和了，而是直接用cosine最大的embedding作为attentive vector，
112 |         # 然后P的每个时间步分别于最大相似度的embedding求多角度cosine相似度
113 |         # 求cos最大的embedding
114 |         query_max_fw = tf.reduce_max(query_att_fw, axis=2, keep_dims=True)
115 |         query_max_bw = tf.reduce_max(query_att_bw, axis=2, keep_dims=True)
116 |         doc_max_fw = tf.reduce_max(doc_att_fw, axis=2, keep_dims=True)
117 |         doc_max_bw = tf.reduce_max(doc_att_bw, axis=2, keep_dims=True)
118 |         # 计算match score
119 |         w7 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w7')
120 |         w8 = tf.get_variable(shape=[self.config.num_perspective, self.config.hidden_dim], dtype=tf.float32, name='w8')
121 |         query_att_max_fw = self.full_matching(query_fw, query_max_fw, w7)
122 |         query_att_max_bw = self.full_matching(query_bw, query_max_bw, w8)
123 |         doc_att_max_fw = self.full_matching(doc_fw, doc_max_fw, w7)
124 |         doc_att_max_bw = self.full_matching(doc_bw, doc_max_bw, w8)
125 | 
126 |         # 将以上四种相似度计算方式得出的结果进行拼接
127 |         mv_query = tf.concat([query_full_fw, max_fw, query_att_mean_fw, query_att_max_fw, query_full_bw, max_bw, query_att_mean_bw, query_att_max_bw], axis=2)
128 |         mv_doc = tf.concat([doc_full_fw, max_fw, doc_att_mean_fw, doc_att_max_fw, doc_full_bw, max_bw, doc_att_mean_bw, doc_att_max_bw], axis=2)
129 |         # dropout layer
130 |         mv_query  = tf.nn.dropout(mv_query, keep_prob=self.input_keep_prob)
131 |         mv_doc = tf.nn.dropout(mv_doc, keep_prob=self.input_keep_prob)
132 |         # 维度转换
133 |         mv_query = tf.reshape(mv_query, [-1, mv_query.shape[1], mv_query.shape[2] * mv_query.shape[3]])
134 |         mv_doc = tf.reshape(mv_doc, [-1, mv_doc.shape[1], mv_doc.shape[2] * mv_doc.shape[3]])
135 | 
136 |         # ----- Aggregation Layer -----
137 |         # 采用Bi-LSTM对合并转换后的向量进行特征提取
138 |         query_fw_final, query_bw_final = self.bi_directional_rnn(input_data=mv_query,
139 |                                                                  num_units=self.config.hidden_dim,
140 |                                                                  rnn_type=self.config.rnn_type,
141 |                                                                  scope='bi-rnn-query-agg')
142 |         doc_fw_final, doc_bw_final = self.bi_directional_rnn(input_data=mv_doc,
143 |                                                              num_units=self.config.hidden_dim,
144 |                                                              rnn_type=self.config.rnn_type,
145 |                                                              scope='bi-rnn-doc-agg')
146 |         # 将Bi-LSTM的结果进行拼接
147 |         combine_emb = tf.concat((query_fw_final, query_bw_final, doc_fw_final, doc_bw_final), axis=2)
148 |         combine_emb = tf.reshape(combine_emb, shape=[-1, combine_emb.shape[1] * combine_emb.shape[2]])
149 |         combine_emb = tf.nn.dropout(combine_emb, keep_prob=self.input_keep_prob)
150 | 
151 |         # ----- Prediction Layer -----
152 |         # 全连接层 1
153 |         with tf.variable_scope('feed_foward_layer1'):
154 |             inputs = tf.nn.dropout(combine_emb, self.input_keep_prob)
155 |             outputs = tf.layers.dense(inputs=inputs,
156 |                                       units=self.config.fc_hidden_dim_1,
157 |                                       activation=tf.nn.relu,
158 |                                       use_bias=True,
159 |                                       kernel_initializer=tf.random_normal_initializer(0.0, 0.1))
160 |         # 全连接层 2
161 |         with tf.variable_scope('feed_foward_layer2'):
162 |             inputs = tf.nn.dropout(outputs, self.input_keep_prob)
163 |             outputs = tf.layers.dense(inputs=inputs,
164 |                                       units=self.config.fc_hidden_dim_2,
165 |                                       activation=tf.nn.relu,
166 |                                       use_bias=True,
167 |                                       kernel_initializer=tf.random_normal_initializer(0.0, 0.1))
168 |         # 全连接层 3
169 |         with tf.variable_scope('feed_foward_layer3'):
170 |             inputs = tf.nn.dropout(outputs, self.input_keep_prob)
171 |             self.logits = tf.layers.dense(inputs=inputs,
172 |                                           units=self.config.num_classes,
173 |                                           activation=tf.nn.relu,
174 |                                           use_bias=True,
175 |                                           kernel_initializer=tf.random_normal_initializer(0.0, 0.1))
176 |         # 类别score
177 |         self.score = tf.nn.softmax(self.logits, name='score')
178 |         # 预测结果
179 |         self.predict = tf.argmax(self.score, axis=1, name='predict')
180 |         # 准确率
181 |         self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.input_label, axis=1), self.predict), dtype=tf.float32),name='accuracy')
182 |         # 结构化损失函数，交叉熵+L2正则化
183 |         self.loss = tf.add(
184 |             tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_label)),
185 |             tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
186 |             name="loss")
187 |         # 优化器
188 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate, name="optimizer").minimize(self.loss)
189 | 
190 | 
191 | 
192 |     def full_matching(self, metric, vec, w):
193 |         '''
194 |         1、Full-Matching相似度计算，metric中的每个时间步与vec进行相似度计算
195 |         :param metric: 时间步矩阵 [batch_size, seq_length, hidden_dim]
196 |         :param vec: 最后一个时间步输出向量 [batch_size, 1, hidden_dim]
197 |         :param w: 权重矩阵 [num_perspective, hidden_dim]
198 |         :return:
199 |         '''
200 |         w = tf.expand_dims(tf.expand_dims(w, 0), 2)                             # 构建多角度权重矩阵 [batch_size, 1, num_perspective, hidden_dim]
201 |         metric = w * tf.stack([metric] * self.config.num_perspective, axis=1)   # 生成多角度metric向量
202 |         vec = w * tf.stack([vec] * self.config.num_perspective, axis=1)         # 生成多角度vec向量
203 |         # 进行Cosine计算
204 |         m = tf.matmul(metric, tf.transpose(vec, [0, 1, 3, 2]))                              # metric与vec进行点乘（cos分子）
205 |         n = tf.norm(metric, axis=3, keep_dims=True) * tf.norm(vec, axis=3, keep_dims=True)  # metric的L2范数与vec的L2范数相乘（cos分母）
206 |         cosine = tf.transpose(tf.divide(m, n), [0, 2, 3, 1])          # 相除得到Cosine [batch_size, seq_length, 1, num_perspective]
207 | 
208 |         return cosine
209 | 
210 | 
211 |     def maxpool_matching(self, v1, v2, w):
212 |         '''
213 |         2、Maxpooling-Matching相似度计算，v1中的每个时间步与v2中的每个时间步进行相似度计算
214 |         :param v1: 时间步矩阵 [batch_size, seq_length, hidden_dim]
215 |         :param v2: 时间步矩阵 [batch_size, seq_length, hidden_dim]
216 |         :param w: 权重矩阵 [num_perspective, hidden_dim]
217 |         :return:
218 |         '''
219 |         cosine = self.full_matching(v1, v2, w)      # full_matching相似度计算
220 |         max_value = tf.reduce_max(cosine, axis=2, keep_dims=True)   # maxpooling
221 |         return max_value
222 | 
223 | 
224 |     def cosine(self, v1, v2):
225 |         '''
226 |         计算两个矩阵每个时间步的cos值
227 |         :param v1: 时间步矩阵1
228 |         :param v2: 时间步矩阵2
229 |         :return:
230 |         '''
231 |         m = tf.matmul(v1, tf.transpose(v2, [0, 2, 1]))      # 矩阵v1和矩阵v2进行点乘（cos分子）
232 |         n = tf.norm(v1, axis=2, keep_dims=True) * tf.norm(v2, axis=2, keep_dims=True)       # v1c的L2范数与v2的L2范数相乘（cos分母）
233 |         cosine = tf.divide(m, n)                     # 相除得到Cosine
234 |         return cosine
235 | 
236 | 
237 |     def bi_directional_rnn(self, input_data, num_units, rnn_type, scope, reuse=False):
238 |         '''
239 |         构建双向RNN层，可选LSTM/GRU
240 |         :param input_data: 输入时序数据
241 |         :param rnn_type: RNN类型
242 |         :param scope: 变量空间
243 |         :param reuse: 是否重用变量
244 |         :return:
245 |         '''
246 |         with tf.variable_scope(scope, reuse=reuse):
247 |             cell_fw = self.get_rnn(rnn_type, num_units)
248 |             cell_bw = self.get_rnn(rnn_type, num_units)
249 |             outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, inputs=input_data, dtype=tf.float32)
250 |             # outputs = tf.concat(outputs, axis=2)
251 |             return outputs
252 | 
253 | 
254 |     def uni_directional_rnn(self, input_data, num_units, rnn_type, scope, reuse=False):
255 |         '''
256 |         构建单向RNN层，可选LSTM/GRU
257 |         :param input_data: 输入时序数据
258 |         :param rnn_type: RNN类型
259 |         :param scope: 变量空间
260 |         :param reuse: 是否重用变量
261 |         :return:
262 |         '''
263 |         with tf.variable_scope(scope, reuse=reuse):
264 |             cell = self.get_rnn(rnn_type, num_units)
265 |             outputs, states = tf.nn.dynamic_rnn(cell=cell, inputs=input_data, dtype=tf.float32)
266 |             return outputs
267 | 
268 | 
269 |     def get_rnn(self, rnn_type, num_units):
270 |         '''
271 |         根据rnn_type创建RNN层
272 |         :param rnn_type: RNN类型
273 |         :return:
274 |         '''
275 |         if rnn_type == 'lstm':
276 |             cell = contrib.rnn.LSTMCell(num_units=num_units)
277 |         else:
278 |             cell = contrib.rnn.GRUCell(num_units=num_units)
279 |         cell = contrib.rnn.DropoutWrapper(cell=cell, input_keep_prob=self.input_keep_prob)
280 |         return cell
281 | 
282 | 
# Script entry point: when the file is run directly, instantiate Model once
# (presumably a smoke test that the model can be constructed; confirm against
# Model.__init__).
if __name__ == '__main__':
    Model()


--------------------------------------------------------------------------------
/BM25/BM25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/BM25/BM25.png


--------------------------------------------------------------------------------
/BM25/BM25.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/7/20 21:50 
 4 | # @Author: Zhang Cong
 5 | 
 6 | import math
 7 | import jieba
 8 | 
 9 | 
class BM25(object):
    '''
    BM25 relevance scorer over a fixed, pre-tokenised document collection.

    Term statistics are computed once in the constructor; ``similarity`` then
    scores a tokenised query against every document.
    '''

    def __init__(self, docs, k1=1.5, b=0.75):
        '''
        :param docs: list of documents, each document being a list of tokens
        :param k1: term-frequency saturation parameter (default 1.5)
        :param b: document-length normalisation parameter (default 0.75)
        '''
        self.N = len(docs)  # number of documents
        # Average document length; an empty collection yields 0.0 instead of
        # raising ZeroDivisionError.
        self.avgdl = sum([len(doc) for doc in docs])*1.0 / self.N if self.N else 0.0
        self.docs = docs
        self.f = []         # per document: {word: occurrences in that document}
        self.df = {}        # word -> number of documents containing the word
        self.idf = {}       # word -> IDF value
        self.k1 = k1        # tuning parameter k1
        self.b = b          # tuning parameter b
        self.init()


    def init(self):
        '''
        Fill ``self.f`` (per-document term counts), ``self.df`` (document
        frequency of every word) and ``self.idf`` (IDF of every word).
        :return:
        '''
        for doc in self.docs:
            tmp = {}
            # Count the occurrences of every word in the current document
            for word in doc:
                tmp[word] = tmp.get(word, 0) + 1
            self.f.append(tmp)

            # Every distinct word of the document adds one to its document frequency
            for k in tmp:
                self.df[k] = self.df.get(k, 0) + 1

        # IDF = log((N - df + 0.5) / (df + 0.5)); it is negative for words that
        # occur in more than half of the documents.
        for k, v in self.df.items():
            self.idf[k] = math.log(self.N-v+0.5)-math.log(v+0.5)


    def get_score(self, query, index):
        '''
        Compute the BM25 score of a query against one document.
        :param query: tokenised query (list of words); a repeated word is counted each time
        :param index: index of the document in the collection
        :return: sum of IDF * R(q, d) over the query words present in the document
        '''
        word_freq = self.f[index]
        # An empty document cannot match any word; returning here also keeps the
        # length normalisation below from dividing by an average length of 0.
        if not word_freq:
            return 0
        dl = len(self.docs[index])  # length of the current document
        # The length-normalisation term is identical for every query word
        length_norm = self.k1 * (1 - self.b + self.b*dl/self.avgdl)
        score = 0
        for word in query:
            # Words that do not occur in the document contribute nothing
            if word not in word_freq:
                continue
            score += (self.idf[word] * word_freq[word]*(self.k1+1)
                                        / (word_freq[word] + length_norm))
        return score


    def similarity(self, query):
        '''
        Score a query against every document of the collection.
        :param query: tokenised query (list of words)
        :return: list of scores, one per document, in collection order
        '''
        return [self.get_score(query, index) for index in range(self.N)]
74 | 
75 | 
if __name__ == '__main__':
    # Demo: tokenise a small sample corpus with jieba, build the BM25 index and
    # print the statistics together with the scores of a sample query.
    text = ['自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。',
            '它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。',
            '自然语言处理是一门融语言学、计算机科学、数学于一体的科学。',
            '因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，',
            '所以它与语言学的研究有着密切的联系，但又有重要的区别。',
            '自然语言处理并不是一般地研究自然语言，',
            '而在于研制能有效地实现自然语言通信的计算机系统，',
            '特别是其中的软件系统。因而它是计算机科学的一部分。']

    # One token list per sentence
    doc = [list(jieba.cut(sentence)) for sentence in text]
    print(doc)
    s = BM25(doc)
    print(s.f)
    print(s.idf)
    print(s.similarity(['自然语言', '计算机科学', '领域', '人工智能', '领域']))


--------------------------------------------------------------------------------
/BM25/README.md:
--------------------------------------------------------------------------------
 1 | ## BM25
 2 | 
 3 | 
 4 | ### 概述
BM25算法，通常用来作搜索相关性评分。一句话概括其主要思想：对Query进行语素解析，生成语素qi；然后，对于每个搜索结果D，计算每个语素qi与D的相关性得分，最后，将qi相对于D的相关性得分进行加权求和，从而得到Query与D的相关性得分。
 6 | 
 7 | 
 8 | ### 数学公式
 9 | ![avatar](./BM25.png)
10 | 
11 | 
12 | ### 文件解释
13 | * BM25.py —— 主文件
14 | * /data —— 数据存放文件夹
15 | 
16 | 
17 | ### 参考资料
18 | * https://www.jianshu.com/p/1e498888f505
19 | 
20 | 


--------------------------------------------------------------------------------
/DSSM-BOW/DSSM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-BOW/DSSM.png


--------------------------------------------------------------------------------
/DSSM-BOW/Data_Generate.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:19 
  4 | # @Author: Zhang Cong
  5 | 
  6 | import random
  7 | import logging
  8 | from tqdm import tqdm
  9 | 
 10 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 11 | 
 12 | def generate_train_data(input_file_path, output_file_path):
 13 |     '''
 14 |     对原始csv数据进行格式转换，构建训练数据集
 15 |     :param input_file_path: 原始数据路径
 16 |     :param output_file_path: 构建完成的训练数据路径
 17 |     :return: 将数据存储至本地
 18 |     '''
 19 |     logging.info('Start get all sentence ...')
 20 |     # 获取全部句子集
 21 |     all_sentence = []
 22 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 23 |         line = line.replace('\n', '').split('\t')
 24 |         if line[2] == 'label':      # 跳过首行
 25 |             continue
 26 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 27 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 28 |         all_sentence.append(sentence_1)
 29 |         all_sentence.append(sentence_2)
 30 |     # 去重
 31 |     all_sentence = list(set(all_sentence))
 32 | 
 33 |     logging.info('Start generate dataset ...')
 34 |     # 构建训练数据集 [query, pos, neg_1, neg_2, neg_3, neg_4]
 35 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 36 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 37 |         line = line.replace('\n', '').split('\t')
 38 |         if line[2] == 'label':      # 跳过首行
 39 |             continue
 40 |         sentence_list = []
 41 |         sentence_1 = str(line[0]).replace('\t', '')
 42 |         sentence_2 = str(line[1]).replace('\t', '')
 43 |         sentence_list.append(sentence_1)    # 句子1
 44 |         sentence_list.append(sentence_2)    # 句子2
 45 |         label = line[2]                     # 标签
 46 | 
 47 |         if int(label)==1:       # 如果标签为1，则保留此句子对，并随机负采样得到4个负例
 48 |             while len(sentence_list)<6:         # [query, pos, neg_1, neg_2, neg_3, neg_4]
 49 |                 index = random.randint(0, len(all_sentence)-1)      # 随机索引
 50 |                 if all_sentence[index] not in sentence_list:        # 如果不重复，则加入
 51 |                     sentence_list.append(all_sentence[index])
 52 |             output_file.write('\t'.join(sentence_list) + '\n')
 53 |     output_file.close()
 54 |     logging.info('Finishied generate dataset ...')
 55 | 
 56 | 
 57 | def generate_test_data(input_file_path, output_file_path):
 58 |     '''
 59 |     对原始csv数据进行格式转换，构建测试数据集
 60 |     :param input_file_path: 原始数据路径
 61 |     :param output_file_path: 构建完成的训练数据路径
 62 |     :return: 将数据存储至本地
 63 |     '''
 64 |     logging.info('Start get all sentence ...')
 65 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 66 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 67 |         line = line.replace('\n', '').split('\t')
 68 |         if line[2] == 'label':      # 跳过首行
 69 |             continue
 70 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 71 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 72 |         label = line[2]                                 # 标签
 73 |         output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
 74 | 
 75 | 
 76 | def check_data(input_file_path):
 77 |     '''
 78 |     统计数据分布情况，检查数据集0/1分布是否均衡
 79 |     :param input_file_path: 数据路径
 80 |     :return:
 81 |     '''
 82 |     count = 0
 83 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 84 |         line = line.replace('\n', '').split('\t')
 85 |         if line[2] == 'label':
 86 |             continue
 87 |         if int(line[2]) == 1:
 88 |             count += 1
 89 |     print(count)
 90 | 
 91 | 
if __name__ == '__main__':

    # Print the number of positive rows in the raw training file
    file_path = './data/lcqmc/lcqmc_train.tsv'
    check_data(file_path)

    # Build the training set
    input_file_path = './data/lcqmc/lcqmc_train.tsv'
    output_file_path = './data/train.txt'
    generate_train_data(input_file_path, output_file_path)
    logging.info('Success generate train.txt')

    # Build the validation set
    input_file_path = './data/lcqmc/lcqmc_dev.tsv'
    output_file_path = './data/dev.txt'
    generate_test_data(input_file_path, output_file_path)
    logging.info('Success generate dev.txt')

    # Build the test set (currently disabled)
    # input_file_path = './data/lcqmc/lcqmc_test.tsv'
    # output_file_path = './data/test.txt'
    # generate_test_data(input_file_path, output_file_path)
    # logging.info('Success generate test.txt')
115 | 
116 | 


--------------------------------------------------------------------------------
/DSSM-BOW/README.md:
--------------------------------------------------------------------------------
 1 | ## DSSM (Learning Deep Structured Semantic Models for Web Search using Clickthrough Data)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### query \t pos \t neg_1 \t neg_2 \t neg_3 \t neg_4
10 |     其中pos为正例，neg_1、neg_2、neg_3、neg_4为随机负采样得到负例
11 | 
12 | 
13 | ### 文件解释
14 | * main.py —— 主文件
15 | * model.py —— 模型结构
16 | * config.py —— 配置参数
17 | * Data_Generate.py —— 数据集处理脚本
18 | * /data —— 数据存放文件夹
19 | * /save_model —— 模型存储文件夹
20 | 
21 | 
22 | ### 模型结构
23 | ![avatar](./DSSM.png)
* DSSM的原理很简单，通过搜索引擎里Query和Title的海量的点击曝光日志，用DNN把Query和Title表达为低维语义向量，并通过cosine距离来计算两个语义向量的距离，最终训练出语义相似度模型。该模型既可以用来预测两个句子的语义相似度，又可以获得某句子的低维语义向量表达。
25 | * DSSM 从下往上可以分为三层结构：输入层、表示层、匹配层
26 | 
27 | 
28 | ### 参考资料
29 | * Learning Deep Structured Semantic Models for Web Search using Clickthrough Data (https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/)
30 | * https://blog.csdn.net/wangqingbaidu/article/details/79286038
31 | 
32 | 


--------------------------------------------------------------------------------
/DSSM-BOW/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-BOW/__pycache__/config.cpython-36.pyc


--------------------------------------------------------------------------------
/DSSM-BOW/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-BOW/__pycache__/model.cpython-36.pyc


--------------------------------------------------------------------------------
/DSSM-BOW/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.model_save_path = './save_model/'
16 |         self.vocab_size = 2000
17 |         self.neg_doc_num = 4
18 |         self.learning_rate = 1e-5
19 |         self.keep_prob = 0.5
20 |         self.batch_size = 32
21 |         self.epochs = 100


--------------------------------------------------------------------------------
/DSSM-BOW/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/DSSM-BOW/main.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:57 
  4 | # @Author: Zhang Cong
  5 | 
  6 | import os
  7 | import re
  8 | import logging
  9 | import sklearn
 10 | import numpy as np
 11 | from tqdm import tqdm
 12 | import tensorflow as tf
 13 | from model import Model
 14 | from config import Config
 15 | from collections import Counter
 16 | 
# Emit INFO-level records with timestamp, level name and message.
logging.getLogger().setLevel(level=logging.INFO)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Project configuration object shared by every function in this module.
config = Config()
# GPU configuration
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"                  # enumerate GPU devices in PCI bus ID order, starting from 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"                        # expose only device 0 to this process
gpuConfig = tf.ConfigProto()
gpuConfig.allow_soft_placement = True                           # fall back to CPU when no GPU exists or an op cannot run on the GPU
gpuConfig.gpu_options.allow_growth = True                       # allocate GPU memory on demand instead of reserving it all up front
gpuConfig.gpu_options.per_process_gpu_memory_fraction = 0.8     # upper bound on the fraction of GPU memory this process may use
 28 | 
 29 | # 模型训练
 30 | class Train():
 31 |     def __init__(self):
 32 |         # 实例化模型结构
 33 |         self.model = Model()
 34 |         self.sess = tf.Session(config=gpuConfig)
 35 |         self.sess.run(tf.global_variables_initializer())
 36 | 
 37 |     def train(self):
 38 |         # 数据集预处理
 39 |         if not os.path.exists(config.preprocess_path):
 40 |             pre_process(config.original_data_path, config.preprocess_path)
 41 |         sentences = load_dataset(config.preprocess_path)       # 加载数据集
 42 |         # 构建词汇映射表
 43 |         if not os.path.exists(config.vocab_path):
 44 |             build_vocab(sentences, config.vocab_path)
 45 |         word_to_id = read_vocab(config.vocab_path)      # 读取词汇表及其映射关系
 46 |         # 构建训练数据集
 47 |         train_data, train_label = data_transform(sentences, word_to_id)
 48 |         # 构建验证测试集
 49 |         test_data, test_label = get_dev_dataset(config.dev_data_path, word_to_id)
 50 | 
 51 |         # 打印训练、测试数据量，数据与标签量是否相等
 52 |         logging.info('Train Data: {}'.format(np.array(train_data).shape))
 53 |         logging.info('Train Label: {}'.format(np.array(train_label).shape))
 54 |         logging.info('Test Data: {}'.format(np.array(test_data).shape))
 55 |         logging.info('Test Label: {}'.format(np.array(test_label).shape))
 56 | 
 57 |         # 数据集校验，数据和标签数量是否有误
 58 |         if (len(train_data) != len(train_label)) or (len(test_data) != len(test_label)):
 59 |             logging.info('Data number != Label number')
 60 |             exit(0)
 61 | 
 62 |         # 配置Saver
 63 |         saver = tf.train.Saver()
 64 |         if not os.path.exists(config.model_save_path):  # 如不存在相应文件夹，则创建
 65 |             os.mkdir(config.model_save_path)
 66 | 
 67 |         # 模型训练
 68 |         best_f1_score = 0  # 初始best模型的F1值
 69 |         for epoch in range(1, config.epochs + 1):
 70 |             train_accuracy_list = []    # 存储每个epoch的accuracy
 71 |             train_loss_list = []        # 存储每个epoch的loss
 72 |             train_label_list = []       # 存储数据的true label
 73 |             train_predictions = []      # 存储模型预测出的label
 74 |             # 将训练数据进行 batch_size 切分
 75 |             batch_train_data, batch_train_label = creat_batch_data(train_data, train_label, config.batch_size)
 76 |             for step, (batch_x, batch_y) in tqdm(enumerate(zip(batch_train_data, batch_train_label))):
 77 |                 feed_dict = {self.model.input_query: [x[0] for x in batch_x],
 78 |                              self.model.input_pos_doc: [x[1] for x in batch_x],
 79 |                              self.model.input_neg_doc: [x[2: ] for x in batch_x],
 80 |                              self.model.input_keep_prob: config.keep_prob}
 81 |                 train_predict, train_accuracy, train_loss, _ = self.sess.run([self.model.train_predict, self.model.accuracy_train, self.model.loss, self.model.optimizer], feed_dict=feed_dict)
 82 |                 train_accuracy_list.append(train_accuracy)
 83 |                 train_loss_list.append(train_loss)
 84 |                 train_label_list.extend([1.0]*len(train_predict))
 85 |                 train_predictions.extend(train_predict)
 86 | 
 87 |             # 获取最大score所在的index
 88 |             train_true_y = train_label_list
 89 |             # 计算模型F1 score
 90 |             train_precision = sklearn.metrics.precision_score(y_true=np.array(train_true_y), y_pred=np.array(train_predictions), average='weighted')
 91 |             train_recall = sklearn.metrics.recall_score(y_true=np.array(train_true_y), y_pred=np.array(train_predictions), average='weighted')
 92 |             train_f1 = sklearn.metrics.f1_score(y_true=np.array(train_true_y), y_pred=np.array(train_predictions), average='weighted')
 93 | 
 94 |             # 完成一个epoch的训练，输出训练数据的mean accuracy、mean loss
 95 |             logging.info('Train Epoch: %d , Loss: %.6f , Acc: %.6f , Precision: %.6f , Recall: %.6f , F1: %.6f' % (epoch,
 96 |                                                                                                           float(np.mean(np.array(train_loss_list))),
 97 |                                                                                                           float(np.mean(np.array(train_accuracy_list))),
 98 |                                                                                                           float(train_precision),
 99 |                                                                                                           float(train_recall),
100 |                                                                                                           float(train_f1)))
101 |             # 模型验证
102 |             test_accuracy_list = []  # 存储每个epoch的accuracy
103 |             test_label_list = []  # 存储数据的true label
104 |             test_predictions = []  # 存储模型预测出的label
105 |             # 将训练数据进行 batch_size 切分
106 |             batch_test_data, batch_test_label = creat_batch_data(test_data, test_label, config.batch_size)
107 |             for (batch_x, batch_y) in tqdm(zip(batch_test_data, batch_test_label)):
108 |                 feed_dict = {self.model.input_query: [x[0] for x in batch_x],
109 |                              self.model.input_pos_doc: [x[1] for x in batch_x],
110 |                              self.model.input_keep_prob: 1.0}
111 |                 test_predict, test_accuracy = self.sess.run([self.model.test_predict, self.model.accuracy_test], feed_dict=feed_dict)
112 |                 test_accuracy_list.append(test_accuracy)
113 |                 test_label_list.extend(batch_y)
114 |                 test_predictions.extend(test_predict)
115 | 
116 |             # 获取最大score所在的index
117 |             test_true_y = test_label_list
118 |             # 计算模型F1 score
119 |             test_precision = sklearn.metrics.precision_score(y_true=np.array(test_true_y), y_pred=np.array(test_predictions), average='weighted')
120 |             test_recall = sklearn.metrics.recall_score(y_true=np.array(test_true_y), y_pred=np.array(test_predictions), average='weighted')
121 |             test_f1 = sklearn.metrics.f1_score(y_true=np.array(test_true_y), y_pred=np.array(test_predictions), average='weighted')
122 | 
123 |             # 完成一个epoch的训练，输出训练数据的mean accuracy、mean loss
124 |             logging.info('Test Epoch: %d , Acc: %.6f , Precision: %.6f , Recall: %.6f , F1: %.6f' % (epoch,
125 |                                                                                                      float(np.mean(np.array(test_accuracy_list))),
126 |                                                                                                      float(test_precision),
127 |                                                                                                      float(test_recall),
128 |                                                                                                      float(test_f1)))
129 |             # 当前epoch产生的模型F1值超过最好指标时，保存当前模型
130 |             if best_f1_score < test_f1:
131 |                 best_f1_score = test_f1
132 |                 saver.save(sess=self.sess, save_path=config.model_save_path)
133 |                 logging.info('Save Model Success ...')
134 | 
135 | 
136 | # 模型预测
class Predict():
    '''
    Inference wrapper: restores the saved model and scores sentence pairs.
    '''
    def __init__(self):
        # Build the graph, then restore the trained weights into a new session.
        self.model = Model()
        self.sess = tf.Session(config=gpuConfig)
        self.saver = tf.train.Saver()
        self.saver.restore(sess=self.sess, save_path=config.model_save_path)

        # word -> id mapping used to build the bag-of-words vectors
        self.word_to_id = read_vocab(config.vocab_path)

    def pre_process(self, sentence):
        '''
        Turn a raw sentence into a bag-of-words count vector.
        :param sentence: input sentence text
        :return: list of length config.vocab_size holding per-token counts;
                 tokens missing from the vocabulary are counted under '<UNK>'
        '''
        vocab = self.word_to_id
        tokens = text_processing(sentence).split(' ')
        counts = [0] * config.vocab_size
        for token in tokens:
            # Known tokens use their own slot, everything else the <UNK> slot.
            slot = vocab[token] if token in vocab else vocab['<UNK>']
            counts[slot] += 1
        return counts

    def predict(self, sentence_1, sentence_2):
        '''
        Score the similarity of two sentences.
        :param sentence_1: first sentence (fed as the query)
        :param sentence_2: second sentence (fed as the positive document)
        :return: the model's query/pos_doc cosine value for this pair
        '''
        query_vector = self.pre_process(sentence_1)
        doc_vector = self.pre_process(sentence_2)

        feed_dict = {self.model.input_query: [query_vector],
                     self.model.input_pos_doc: [doc_vector],
                     self.model.input_keep_prob: 1.0}
        cosine = self.sess.run(self.model.query_pos_cosine, feed_dict=feed_dict)
        # The result has one row and one column for the single fed pair.
        return cosine[0][0]
188 | 
189 | 
def text_processing(text):
    '''
    Clean one sentence and split it into space-separated characters.

    Steps: remove every full-width parenthesised span "（...）", keep only the
    characters in the range U+4E00..U+9FA5, then join the kept characters with
    single spaces.  No word segmentation and no stop-word removal is performed.
    :param text: raw sentence
    :return: kept characters joined by ' ' ('' when nothing is kept)
    '''
    # Exclude both parenthesis characters inside the span so that each "（...）"
    # pair is matched on its own.  The previous class `[^（.]` excluded a literal
    # '.' instead of '）': spans containing '.' were never removed, and a stray
    # '）' later in the sentence made the greedy match swallow the text between.
    text = re.sub(r'（[^（）]*）', '', text)
    # Keep Chinese characters only.
    kept = [char for char in text if '\u4e00' <= char <= '\u9fa5']
    # Character-level tokens.
    return ' '.join(kept)
205 | 
206 | 
def pre_process(data_path, preprocess_path):
    '''
    Preprocess the raw corpus and cache the result on disk.

    Every line is split on tabs, each field is cleaned with text_processing and
    the cleaned fields are written back as one tab-separated line.
    :param data_path: path of the raw text file
    :param preprocess_path: path the preprocessed data is written to
    :return: None
    '''
    logging.info('Start Preprocess ...')
    # The input is opened first so that a missing raw file does not leave an
    # empty preprocessed file behind (callers skip preprocessing when that file
    # exists).  The context managers close both handles even on error.
    with open(data_path, encoding='UTF-8') as raw_file, \
            open(preprocess_path, mode='w', encoding='UTF-8') as preprocess_file:
        for line in tqdm(raw_file):
            sentence_list = line.strip().split('\t')                                    # strip surrounding whitespace, split into fields
            sentence_list = [text_processing(sentence) for sentence in sentence_list]   # character-level cleaning of each field
            preprocess_file.write('\t'.join(sentence_list) + '\n')
224 | 
225 | 
def load_dataset(data_path):
    '''
    Load the preprocessed dataset from disk so preprocessing is not repeated.
    :param data_path: path of the preprocessed dataset
    :return: list of samples; each sample is a list of config.neg_doc_num + 2
             sentences and each sentence is a list of tokens
    '''
    logging.info('Load Dataset ...')
    expected_fields = config.neg_doc_num + 2    # query + pos_doc + negatives
    sentences = []
    # The context manager closes the file handle, which was previously leaked.
    with open(data_path, encoding='UTF-8') as data_file:
        for line in tqdm(data_file):
            sentence_list = line.strip().split('\t')
            # Some sentences are empty after cleaning, which changes the field
            # count of the line; such samples are skipped.
            if len(sentence_list) != expected_fields:
                continue
            sentences.append([sentence.split(' ') for sentence in sentence_list])

    return sentences
243 | 
244 | 
def get_dev_dataset(data_path, word_to_id):
    '''
    Build the dev dataset: read sentence pairs with labels and convert both
    sentences of every pair to bag-of-words count vectors.
    :param data_path: path of the dev file (tab-separated: sentence, sentence, label)
    :param word_to_id: word -> id mapping
    :return: (datas, labels); datas[i] holds the two count vectors of pair i,
             labels[i] is its integer label
    '''
    logging.info('Get Dev Dataset ...')
    datas, labels = [], []
    # The context manager closes the file handle, which was previously leaked.
    with open(data_path, mode='r', encoding='UTF-8') as dev_file:
        for line in tqdm(dev_file):
            sentence_list = line.strip().split('\t')
            # Blank or truncated lines have no label column; skip them instead
            # of failing with IndexError (load_dataset skips bad lines too).
            if len(sentence_list) < 3:
                continue
            if sentence_list[2] == 'label':      # skip the header row
                continue

            sentence_id_temp = []
            for sentence in sentence_list[: 2]:
                sentence_temp = [0] * config.vocab_size     # one slot per vocabulary entry
                for word in text_processing(sentence).split(' '):
                    if word in word_to_id:      # known word: count it in its own slot
                        sentence_temp[word_to_id[word]] += 1
                    else:                       # unknown word: count it under <UNK>
                        sentence_temp[word_to_id['<UNK>']] += 1
                sentence_id_temp.append(sentence_temp)

            datas.append(sentence_id_temp)
            labels.append(int(sentence_list[2]))
    return datas, labels
274 | 
275 | 
def build_vocab(input_data, vocab_path):
    '''
    Build the vocabulary from the dataset and store it on disk, one word per line.

    The file starts with '<PAD>' and '<UNK>', followed by at most
    config.vocab_size - 2 words in descending frequency order.
    :param input_data: list of samples, each a list of tokenised sentences
    :param vocab_path: path the vocabulary file is written to
    :return: None
    '''
    logging.info('Build Vocab ...')
    all_sentence = []   # every token of every sentence
    for sentence_list in input_data:
        for sentence in sentence_list:
            all_sentence.extend(sentence)

    counter = Counter(all_sentence)                             # token frequencies
    count_pairs = counter.most_common(config.vocab_size - 2)    # most frequent first
    # Take the words directly from the pairs; unpacking zip(*count_pairs) raised
    # ValueError when the corpus was empty.
    words = [word for word, _ in count_pairs]
    words = ['<PAD>', '<UNK>'] + words      # reserved padding and out-of-vocabulary entries
    # The context manager flushes and closes the file deterministically.
    with open(vocab_path, mode='w', encoding='UTF-8') as vocab_file:
        vocab_file.write('\n'.join(words))
296 | 
297 | 
def read_vocab(vocab_path):
    """
    Read the vocabulary file and build the word -> id mapping.

    Ids follow line order, starting at 0; surrounding whitespace of every line
    is stripped.
    :param vocab_path: path of the vocabulary file (one word per line)
    :return: dict mapping each word to its id
    """
    # The context manager closes the file handle, which was previously leaked.
    with open(vocab_path, encoding='UTF-8') as vocab_file:
        words = [word.strip() for word in vocab_file]
    word_to_id = dict(zip(words, range(len(words))))

    return word_to_id
308 | 
309 | 
def data_transform(input_data, word_to_id):
    '''
    Convert tokenised training samples to bag-of-words count vectors and shuffle them.
    :param input_data: list of samples, each a list of tokenised sentences
    :param word_to_id: word -> id mapping
    :return: (datas, labels) as numpy arrays in the same random order; every
             label row is [1] followed by config.neg_doc_num zeros
    '''
    logging.info('Sentence Trans To ID ...')
    bow_samples = []
    label_rows = []
    for sample in tqdm(input_data):
        sample_vectors = []
        for tokens in sample:
            counts = [0] * config.vocab_size    # one slot per vocabulary entry
            for token in tokens:
                # Known tokens use their own slot, everything else the <UNK> slot.
                slot = word_to_id[token] if token in word_to_id else word_to_id['<UNK>']
                counts[slot] += 1
            sample_vectors.append(counts)
        bow_samples.append(sample_vectors)
        # The positive document comes first, followed by the negatives.
        label_rows.append([1] + [0] * config.neg_doc_num)

    # Shuffle data and labels with one shared permutation.
    order = np.random.permutation(np.arange(len(bow_samples)))
    return np.array(bow_samples)[order], np.array(label_rows)[order]
340 | 
341 | 
def creat_batch_data(input_data, input_label, batch_size):
    '''
    Shuffle the dataset and cut it into batches of exactly batch_size samples.

    Only full batches are produced: the trailing samples that do not fill a
    batch are dropped.
    :param input_data: list/array of samples
    :param input_label: list/array of labels, aligned with input_data
    :param batch_size: number of samples per batch
    :return: (batch_data, batch_label), two lists of numpy arrays
    '''
    total = len(input_data)
    batch_count = total // batch_size       # number of full batches
    # Shuffle data and labels with one shared permutation.
    order = np.random.permutation(np.arange(total))
    shuffled_data = np.array(input_data)[order]
    shuffled_label = np.array(input_label)[order]

    bounds = [(i * batch_size, (i + 1) * batch_size) for i in range(batch_count)]
    batch_data = [shuffled_data[begin:stop] for begin, stop in bounds]
    batch_label = [shuffled_label[begin:stop] for begin, stop in bounds]
    return batch_data, batch_label
370 | 
371 | 
372 | 
if __name__ == '__main__':
    # Train the model
    Train().train()

    # Interactive prediction (disabled; uncomment to score sentence pairs from stdin)
    # predictor = Predict()
    # while True:
    #     sentence_1 = input('Input Sentence 1：')
    #     sentence_2 = input('Input Sentence 2：')
    #     result = predictor.predict(sentence_1, sentence_2)
    #     print(result)


--------------------------------------------------------------------------------
/DSSM-BOW/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:58 
  4 | # @Author: Zhang Cong
  5 | 
  6 | from config import Config
  7 | import tensorflow as tf
  8 | 
  9 | class Model():
 10 |     def __init__(self):
 11 |         self.config = Config()                                                                                                                      # 配置参数
 12 |         self.input_query = tf.placeholder(shape=[None, self.config.vocab_size], dtype=tf.float32, name='input-query')                               # 输入query，BOW形式
 13 |         self.input_pos_doc = tf.placeholder(shape=[None, self.config.vocab_size], dtype=tf.float32, name='input-pos')                               # 输入pos_doc，BOW形式
 14 |         self.input_neg_doc = tf.placeholder(shape=[None, self.config.neg_doc_num, self.config.vocab_size], dtype=tf.float32, name='input-neg')      # 输入多个neg_doc，BOW形式
 15 |         self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')                                                             # keep-prob
 16 | 
 17 |         # 全连接层layer1 (batch_size, 2000) -> (batch_size, 300)
 18 |         L1_N = 300
 19 |         l1_range = tf.sqrt(6/(self.config.vocab_size + L1_N))       # 原论文weight、bias范围初始化方式
 20 |         weight_1 = tf.get_variable(initializer=tf.random_uniform(shape=[self.config.vocab_size, L1_N], minval=-l1_range, maxval=l1_range), name='weight-1')
 21 |         bias_1 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N], minval=-l1_range, maxval=l1_range), name='bias-1')
 22 |         # 全连接
 23 |         query_l1 = tf.matmul(self.input_query, weight_1) + bias_1
 24 |         pos_doc_l1 = tf.matmul(self.input_pos_doc, weight_1, ) + bias_1
 25 |         neg_doc_l1 = tf.matmul(tf.reshape(self.input_neg_doc, shape=[-1, self.config.vocab_size]), weight_1, ) + bias_1
 26 |         # 激活函数 activation function
 27 |         query_l1 = tf.nn.tanh(query_l1)
 28 |         pos_doc_l1 = tf.nn.tanh(pos_doc_l1)
 29 |         neg_doc_l1 = tf.nn.tanh(neg_doc_l1)
 30 | 
 31 |         # 全连接层layer2 (batch_size, 300) -> (batch_size, 300)
 32 |         L2_N = 300
 33 |         l2_range = tf.sqrt(6/(L1_N + L2_N))     # 原论文weight、bias范围初始化方式
 34 |         weight_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N, L2_N], minval=-l2_range, maxval=l2_range), name='weight-2')
 35 |         bias_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N], minval=-l2_range, maxval=l2_range), name='bias-2')
 36 |         # 全连接
 37 |         query_l2 = tf.matmul(query_l1, weight_2) + bias_2
 38 |         pos_doc_l2 = tf.matmul(pos_doc_l1, weight_2) + bias_2
 39 |         neg_doc_l2 = tf.matmul(neg_doc_l1, weight_2) + bias_2
 40 |         # 激活函数 activation function
 41 |         query_l2 = tf.nn.tanh(query_l2)
 42 |         pos_doc_l2 = tf.nn.tanh(pos_doc_l2)
 43 |         neg_doc_l2 = tf.nn.tanh(neg_doc_l2)
 44 | 
 45 |         # 全连接层layer3 (batch_size, 300) -> (batch_size, 128)
 46 |         L3_N = 128
 47 |         l3_range = tf.sqrt(6/(L2_N + L3_N))     # 原论文weight、bias范围初始化方式
 48 |         weight_3 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N, L3_N], minval=-l3_range, maxval=l3_range), name='weight-3')
 49 |         bias_3 = tf.get_variable(initializer=tf.random_uniform(shape=[L3_N], minval=-l3_range, maxval=l3_range), name='bias-3')
 50 |         # 全连接
 51 |         query_l3 = tf.matmul(query_l2, weight_3) + bias_3
 52 |         pos_doc_l3 = tf.matmul(pos_doc_l2, weight_3) + bias_3
 53 |         neg_doc_l3 = tf.matmul(neg_doc_l2, weight_3) + bias_3
 54 |         # 激活函数 activation function
 55 |         query_l3_out = tf.tanh(query_l3)
 56 |         pos_doc_l3_out = tf.tanh(pos_doc_l3)
 57 |         neg_doc_l3_out = tf.tanh(neg_doc_l3)
 58 | 
 59 |         # 维度还原 [batch_size, neg_doc_num, hidden_dim]
 60 |         neg_doc_l3_out = tf.reshape(neg_doc_l3_out, shape=[-1, self.config.neg_doc_num, L3_N])
 61 | 
 62 |         # 计算query和pos_doc的Cosine
 63 |         query_dot_pos = tf.reduce_sum(tf.multiply(query_l3_out, pos_doc_l3_out), axis=1)            # query和pos_doc进行点乘
 64 |         query_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(query_l3_out), axis=1))                       # query的L2范数
 65 |         pos_doc_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(pos_doc_l3_out), axis=1))                   # pos_doc的L2范数
 66 |         self.query_pos_cosine = tf.expand_dims(query_dot_pos/(query_l3_l2*pos_doc_l3_l2), axis=1)   # 计算query和pos_doc的余弦值
 67 | 
 68 |         # 测试结果
 69 |         self.test_predict = tf.reshape(tensor=tf.round(self.query_pos_cosine), shape=[-1])
 70 |         # 批测试准确率
 71 |         self.accuracy_test = tf.reduce_mean(tf.round(self.query_pos_cosine))
 72 | 
 73 |         # 计算query和neg_doc的Cosine
 74 |         query_l3_out_flatten = tf.expand_dims(query_l3_out, axis=1)                                 # 扩充query矩阵维度
 75 |         query_dot_neg = tf.reduce_sum(tf.multiply(query_l3_out_flatten, neg_doc_l3_out), axis=2)    # query和neg_doc进行点乘
 76 |         neg_doc_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(neg_doc_l3_out), axis=2))                   # neg_doc的L2范数
 77 |         self.query_neg_cosine = query_dot_neg/(tf.expand_dims(query_l3_l2, axis=1)*neg_doc_l3_l2)   # 计算query和neg_doc的余弦值
 78 | 
 79 |         # 将pos_doc和neg_doc的cosine进行拼接为一个整体矩阵
 80 |         doc_cosine = tf.concat([self.query_pos_cosine, self.query_neg_cosine], axis=1)
 81 |         # score归一化
 82 |         doc_cosine_softmax = tf.nn.softmax(doc_cosine, axis=1)
 83 |         # 获取query与pos_doc的相似度
 84 |         prob = tf.slice(doc_cosine_softmax, begin=[0, 0], size=[-1, 1])
 85 |         # 训练结果
 86 |         self.train_predict = tf.reshape(tensor=tf.round(prob), shape=[-1])
 87 |         # 损失函数 负对数损失函数，提升pos_doc的score，抑制neg_doc的score
 88 |         self.loss = -tf.reduce_sum(tf.log(prob))
 89 |         # 优化器
 90 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(loss=self.loss)
 91 | 
 92 |         # 构造true label
 93 |         label = [[1]+[0]*self.config.neg_doc_num]               # true label: [1, 0, 0, 0, 0]
 94 |         labels = tf.tile(label, [self.config.batch_size, 1])    # 按batch_size的数量进行复制
 95 | 
 96 |         # 正确率
 97 |         correct = tf.equal(tf.argmax(doc_cosine_softmax, axis=1), tf.argmax(labels, axis=1))
 98 |         self.accuracy_train = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
 99 | 
100 | 
101 | 
if __name__ == '__main__':
    # Instantiate the model once; its constructor builds the whole graph.
    Model()


--------------------------------------------------------------------------------
/DSSM-CNN/DSSM-CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-CNN/DSSM-CNN.png


--------------------------------------------------------------------------------
/DSSM-CNN/Data_Generate.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:19 
  4 | # @Author: Zhang Cong
  5 | 
  6 | import random
  7 | import logging
  8 | from tqdm import tqdm
  9 | 
# Emit INFO-level records with timestamp, level name and message.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 11 | 
 12 | def generate_train_data(input_file_path, output_file_path):
 13 |     '''
 14 |     对原始csv数据进行格式转换，构建训练数据集
 15 |     :param input_file_path: 原始数据路径
 16 |     :param output_file_path: 构建完成的训练数据路径
 17 |     :return: 将数据存储至本地
 18 |     '''
 19 |     logging.info('Start get all sentence ...')
 20 |     # 获取全部句子集
 21 |     all_sentence = []
 22 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 23 |         line = line.replace('\n', '').split('\t')
 24 |         if line[2] == 'label':      # 跳过首行
 25 |             continue
 26 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 27 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 28 |         all_sentence.append(sentence_1)
 29 |         all_sentence.append(sentence_2)
 30 |     # 去重
 31 |     all_sentence = list(set(all_sentence))
 32 | 
 33 |     logging.info('Start generate dataset ...')
 34 |     # 构建训练数据集 [query, pos, neg_1, neg_2, neg_3, neg_4]
 35 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 36 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 37 |         line = line.replace('\n', '').split('\t')
 38 |         if line[2] == 'label':      # 跳过首行
 39 |             continue
 40 |         sentence_list = []
 41 |         sentence_1 = str(line[0]).replace('\t', '')
 42 |         sentence_2 = str(line[1]).replace('\t', '')
 43 |         sentence_list.append(sentence_1)    # 句子1
 44 |         sentence_list.append(sentence_2)    # 句子2
 45 |         label = line[2]                     # 标签
 46 | 
 47 |         if int(label)==1:       # 如果标签为1，则保留此句子对，并随机负采样得到4个负例
 48 |             while len(sentence_list)<6:         # [query, pos, neg_1, neg_2, neg_3, neg_4]
 49 |                 index = random.randint(0, len(all_sentence)-1)      # 随机索引
 50 |                 if all_sentence[index] not in sentence_list:        # 如果不重复，则加入
 51 |                     sentence_list.append(all_sentence[index])
 52 |             output_file.write('\t'.join(sentence_list) + '\n')
 53 |     output_file.close()
 54 |     logging.info('Finishied generate dataset ...')
 55 | 
 56 | 
 57 | def generate_test_data(input_file_path, output_file_path):
 58 |     '''
 59 |     对原始csv数据进行格式转换，构建测试数据集
 60 |     :param input_file_path: 原始数据路径
 61 |     :param output_file_path: 构建完成的训练数据路径
 62 |     :return: 将数据存储至本地
 63 |     '''
 64 |     logging.info('Start get all sentence ...')
 65 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 66 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 67 |         line = line.replace('\n', '').split('\t')
 68 |         if line[2] == 'label':      # 跳过首行
 69 |             continue
 70 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 71 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 72 |         label = line[2]                                 # 标签
 73 |         output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
 74 | 
 75 | 
 76 | def check_data(input_file_path):
 77 |     '''
 78 |     统计数据分布情况，检查数据集0/1分布是否均衡
 79 |     :param input_file_path: 数据路径
 80 |     :return:
 81 |     '''
 82 |     count = 0
 83 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 84 |         line = line.replace('\n', '').split('\t')
 85 |         if line[2] == 'label':
 86 |             continue
 87 |         if int(line[2]) == 1:
 88 |             count += 1
 89 |     print(count)
 90 | 
 91 | 
 92 | if __name__ == '__main__':
 93 | 
 94 |     # 统计数据分布情况
 95 |     file_path = './data/lcqmc/lcqmc_train.tsv'
 96 |     check_data(file_path)
 97 | 
 98 |     # 构建训练数据集
 99 |     input_file_path = './data/lcqmc/lcqmc_train.tsv'
100 |     output_file_path = './data/train.txt'
101 |     generate_train_data(input_file_path, output_file_path)
102 |     logging.info('Success generate train.txt')
103 | 
104 |     # 构建验证数据集
105 |     input_file_path = './data/lcqmc/lcqmc_dev.tsv'
106 |     output_file_path = './data/dev.txt'
107 |     generate_test_data(input_file_path, output_file_path)
108 |     logging.info('Success generate dev.txt')
109 | 
110 |     # 构建测试数据集
111 |     # input_file_path = './data/lcqmc/lcqmc_test.tsv'
112 |     # output_file_path = './data/test.txt'
113 |     # generate_test_data(input_file_path, output_file_path)
114 |     # logging.info('Success generate test.txt')
115 | 
116 | 


--------------------------------------------------------------------------------
/DSSM-CNN/README.md:
--------------------------------------------------------------------------------
 1 | ## DSSM-CNN (CLSM: A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### query \t pos \t neg_1 \t neg_2 \t neg_3 \t neg_4
10 |     其中pos为正例，neg_1、neg_2、neg_3、neg_4为随机负采样得到负例
11 | 
12 | 
13 | ### 文件解释
14 | * main.py —— 主文件
15 | * model.py —— 模型结构
16 | * config.py —— 配置参数
17 | * Data_Generate.py —— 数据集处理脚本
18 | * /data —— 数据存放文件夹
19 | * /save_model —— 模型存储文件夹
20 | 
21 | 
22 | ### 模型结构
23 | ![avatar](DSSM-CNN.png)
24 | * 原始的DSSM中所有单词相当于是独立来看的，而且Query和Doc都是Bag of words，句子之间的联系就被词袋模型给忽视掉了，所以作者通过引入卷积操作，尽量地捕捉单词的相关性。
25 | * 模型的主要思想是使用CNN结构来提取语料的语义信息，卷积层的使用保留了词语的上下文信息，池化层的使用提取了对各个隐含语义贡献最大的词汇。
26 | * DSSM-CNN与DSSM的区别主要在于输入层和表示层。
27 | 
28 | 
29 | ### 参考资料
30 | * A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval (http://www.iro.umontreal.ca/~lisa/pointeurs/ir0895-he-2.pdf)
31 | * https://www.cnblogs.com/wmx24/p/10157154.html
32 | 
33 | 


--------------------------------------------------------------------------------
/DSSM-CNN/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.model_save_path = './save_model/'
16 |         self.vocab_size = 2000
17 |         self.seq_length = 20
18 |         self.embedding_dim = 300
19 |         self.neg_doc_num = 4
20 |         self.learning_rate = 1e-5
21 |         self.keep_prob = 0.5
22 |         self.hidden_dim = 512
23 |         self.kernel_size = 3
24 |         self.batch_size = 32
25 |         self.epochs = 100


--------------------------------------------------------------------------------
/DSSM-CNN/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/DSSM-CNN/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/6/12 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | from config import Config
  7 | import tensorflow as tf
  8 | 
  9 | class Model():
 10 |     def __init__(self):
 11 |         self.config = Config()                                                                                                                  # 配置参数
 12 |         self.input_query = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-query')                             # 输入query，ID形式
 13 |         self.input_pos_doc = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-pos')                             # 输入pos_doc，ID形式
 14 |         self.input_neg_doc = tf.placeholder(shape=[None, self.config.neg_doc_num, self.config.seq_length], dtype=tf.int32, name='input-neg')    # 输入多个neg_doc，ID形式
 15 |         self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')                                                         # keep-prob
 16 | 
 17 |         # Embedding layer
 18 |         embedding = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding')
 19 | 
 20 |         # 将词汇映射为向量形式 [batch_size, seq_length, embedding_dim]
 21 |         embedding_query = tf.nn.embedding_lookup(params=embedding, ids=self.input_query, name='embedding_query')
 22 |         embedding_pos_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_pos_doc, name='embedding_pos_doc')
 23 |         embedding_neg_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_neg_doc, name='embedding_neg_doc')
 24 | 
 25 |         # 创建卷积层的权重和偏置项 weight、bias （query、pos、neg共用）
 26 |         conv_w = tf.get_variable(shape=[self.config.kernel_size, self.config.embedding_dim, self.config.hidden_dim], dtype=tf.float32, name='conv_w')
 27 |         conv_b = tf.get_variable(shape=[self.config.hidden_dim], dtype=tf.float32, name='conv_b')
 28 | 
 29 |         # 一维卷积层
 30 |         embedding_query_conv = tf.nn.conv1d(value=embedding_query, filters=conv_w, stride=1, padding='VALID') + conv_b
 31 |         embedding_pos_doc_conv = tf.nn.conv1d(value=embedding_pos_doc, filters=conv_w, stride=1, padding='VALID') + conv_b
 32 |         embedding_neg_doc_conv = tf.nn.conv1d(value=tf.reshape(embedding_neg_doc, shape=[-1, self.config.seq_length, self.config.embedding_dim]),
 33 |                                               filters=conv_w, stride=1, padding='VALID') + conv_b
 34 | 
 35 |         # 最大池化层 max pooling layer
 36 |         embedding_query_pooling = tf.layers.max_pooling1d(inputs=embedding_query_conv,
 37 |                                                           pool_size=self.config.seq_length-self.config.kernel_size+1,
 38 |                                                           strides=1,
 39 |                                                           name='embedding_query_pooling')
 40 |         embedding_pos_doc_pooling = tf.layers.max_pooling1d(inputs=embedding_pos_doc_conv,
 41 |                                                             pool_size=self.config.seq_length-self.config.kernel_size+1,
 42 |                                                             strides=1,
 43 |                                                             name='embedding_pos_doc_pooling')
 44 |         embedding_neg_doc_pooling = tf.layers.max_pooling1d(inputs=embedding_neg_doc_conv,
 45 |                                                             pool_size=self.config.seq_length-self.config.kernel_size+1,
 46 |                                                             strides=1,
 47 |                                                             name='embedding_neg_doc_pooling')
 48 | 
 49 |         # 维度压缩 squeeze layer
 50 |         embedding_query_new = tf.squeeze(input=embedding_query_pooling, axis=1, name='embedding_query_new')
 51 |         embedding_pos_doc_new = tf.squeeze(input=embedding_pos_doc_pooling, axis=1, name='embedding_pos_doc_new')
 52 |         embedding_neg_doc_new = tf.squeeze(input=embedding_neg_doc_pooling, axis=1, name='embedding_neg_doc_new')
 53 | 
 54 |         # 全连接层layer1 (batch_size, 512) -> (batch_size, 300)
 55 |         L1_N = 300
 56 |         l1_range = tf.sqrt(6/(self.config.hidden_dim + L1_N))       # 原论文weight、bias范围初始化方式
 57 |         weight_1 = tf.get_variable(initializer=tf.random_uniform(shape=[self.config.hidden_dim, L1_N], minval=-l1_range, maxval=l1_range), name='weight-1')
 58 |         bias_1 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N], minval=-l1_range, maxval=l1_range), name='bias-1')
 59 |         # 全连接
 60 |         query_l1 = tf.matmul(embedding_query_new, weight_1) + bias_1
 61 |         pos_doc_l1 = tf.matmul(embedding_pos_doc_new, weight_1, ) + bias_1
 62 |         neg_doc_l1 = tf.matmul(embedding_neg_doc_new, weight_1, ) + bias_1
 63 |         # 激活函数 activation function
 64 |         query_l1 = tf.nn.tanh(query_l1)
 65 |         pos_doc_l1 = tf.nn.tanh(pos_doc_l1)
 66 |         neg_doc_l1 = tf.nn.tanh(neg_doc_l1)
 67 | 
 68 |         # 全连接层layer2 (batch_size, 300) -> (batch_size, 128)
 69 |         L2_N = 128
 70 |         l2_range = tf.sqrt(6/(L1_N + L2_N))         # 原论文weight、bias范围初始化方式
 71 |         weight_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N, L2_N], minval=-l2_range, maxval=l2_range), name='weight-2')
 72 |         bias_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N], minval=-l2_range, maxval=l2_range), name='bias-2')
 73 |         # 全连接
 74 |         query_l2 = tf.matmul(query_l1, weight_2) + bias_2
 75 |         pos_doc_l2 = tf.matmul(pos_doc_l1, weight_2) + bias_2
 76 |         neg_doc_l2 = tf.matmul(neg_doc_l1, weight_2) + bias_2
 77 |         # 激活函数 activation function
 78 |         query_l2_out = tf.tanh(query_l2)
 79 |         pos_doc_l2_out = tf.tanh(pos_doc_l2)
 80 |         neg_doc_l2_out = tf.tanh(neg_doc_l2)
 81 | 
 82 |         # 维度还原 [batch_size, neg_doc_num, hidden_dim]
 83 |         neg_doc_l2_out = tf.reshape(neg_doc_l2_out, shape=[-1, self.config.neg_doc_num, L2_N])
 84 | 
 85 |         # 计算query和pos_doc的Cosine
 86 |         query_dot_pos = tf.reduce_sum(tf.multiply(query_l2_out, pos_doc_l2_out), axis=1)                # query和pos_doc进行点乘
 87 |         query_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(query_l2_out), axis=1))                           # query的L2范数
 88 |         pos_doc_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(pos_doc_l2_out), axis=1))                       # pos_doc的L2范数
 89 |         self.query_pos_cosine = tf.expand_dims(query_dot_pos/(query_l2_L2*pos_doc_l2_L2), axis=1)       # 计算query和pos_doc的余弦值
 90 | 
 91 |         # 测试结果
 92 |         self.test_predict = tf.reshape(tensor=tf.round(self.query_pos_cosine), shape=[-1])
 93 |         # 批测试准确率
 94 |         self.accuracy_test = tf.reduce_mean(tf.round(self.query_pos_cosine))
 95 | 
 96 |         # 计算query和neg_doc的Cosine
 97 |         query_l2_out_flatten = tf.expand_dims(query_l2_out, axis=1)                                     # 扩充query矩阵维度
 98 |         query_dot_neg = tf.reduce_sum(tf.multiply(query_l2_out_flatten, neg_doc_l2_out), axis=2)        # query和neg_doc进行点乘
 99 |         neg_doc_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(neg_doc_l2_out), axis=2))                       # neg_doc的L2范数
100 |         self.query_neg_cosine = query_dot_neg/(tf.expand_dims(query_l2_L2, axis=1)*neg_doc_l2_L2)       # 计算query和neg_doc的余弦值
101 | 
102 |         # 将pos_doc和neg_doc的cosine进行拼接为一个整体矩阵
103 |         doc_cosine = tf.concat([self.query_pos_cosine, self.query_neg_cosine], axis=1)
104 |         # score归一化
105 |         doc_cosine_softmax = tf.nn.softmax(doc_cosine, axis=1)
106 |         # 获取query与pos_doc的相似度
107 |         prob = tf.slice(doc_cosine_softmax, begin=[0, 0], size=[-1, 1])
108 |         # 训练结果
109 |         self.train_predict = tf.reshape(tensor=tf.round(prob), shape=[-1])
110 |         # 损失函数 负对数损失函数，提升pos_doc的score，抑制neg_doc的score
111 |         self.loss = -tf.reduce_sum(tf.log(prob))
112 |         # 优化器
113 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(loss=self.loss)
114 | 
115 |         # 构造true label
116 |         label = [[1]+[0]*self.config.neg_doc_num]                   # true label: [1, 0, 0, 0, 0]
117 |         labels = tf.tile(label, [self.config.batch_size, 1])        # 按batch_size的数量进行复制
118 | 
119 |         # 正确率
120 |         correct = tf.equal(tf.argmax(doc_cosine_softmax, axis=1), tf.argmax(labels, axis=1))
121 |         self.accuracy_train = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
122 | 
123 | 
if __name__ == '__main__':
    # When run directly, only construct the graph once; no session is created and nothing is trained.
    Model()


--------------------------------------------------------------------------------
/DSSM-Embedding/DSSM-Embedding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-Embedding/DSSM-Embedding.png


--------------------------------------------------------------------------------
/DSSM-Embedding/Data_Generate.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:19 
  4 | # @Author: Zhang Cong
  5 | 
  6 | import random
  7 | import logging
  8 | from tqdm import tqdm
  9 | 
 10 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 11 | 
 12 | def generate_train_data(input_file_path, output_file_path):
 13 |     '''
 14 |     对原始csv数据进行格式转换，构建训练数据集
 15 |     :param input_file_path: 原始数据路径
 16 |     :param output_file_path: 构建完成的训练数据路径
 17 |     :return: 将数据存储至本地
 18 |     '''
 19 |     logging.info('Start get all sentence ...')
 20 |     # 获取全部句子集
 21 |     all_sentence = []
 22 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 23 |         line = line.replace('\n', '').split('\t')
 24 |         if line[2] == 'label':      # 跳过首行
 25 |             continue
 26 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 27 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 28 |         all_sentence.append(sentence_1)
 29 |         all_sentence.append(sentence_2)
 30 |     # 去重
 31 |     all_sentence = list(set(all_sentence))
 32 | 
 33 |     logging.info('Start generate dataset ...')
 34 |     # 构建训练数据集 [query, pos, neg_1, neg_2, neg_3, neg_4]
 35 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 36 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 37 |         line = line.replace('\n', '').split('\t')
 38 |         if line[2] == 'label':      # 跳过首行
 39 |             continue
 40 |         sentence_list = []
 41 |         sentence_1 = str(line[0]).replace('\t', '')
 42 |         sentence_2 = str(line[1]).replace('\t', '')
 43 |         sentence_list.append(sentence_1)    # 句子1
 44 |         sentence_list.append(sentence_2)    # 句子2
 45 |         label = line[2]                     # 标签
 46 | 
 47 |         if int(label)==1:       # 如果标签为1，则保留此句子对，并随机负采样得到4个负例
 48 |             while len(sentence_list)<6:         # [query, pos, neg_1, neg_2, neg_3, neg_4]
 49 |                 index = random.randint(0, len(all_sentence)-1)      # 随机索引
 50 |                 if all_sentence[index] not in sentence_list:        # 如果不重复，则加入
 51 |                     sentence_list.append(all_sentence[index])
 52 |             output_file.write('\t'.join(sentence_list) + '\n')
 53 |     output_file.close()
 54 |     logging.info('Finishied generate dataset ...')
 55 | 
 56 | 
 57 | def generate_test_data(input_file_path, output_file_path):
 58 |     '''
 59 |     对原始csv数据进行格式转换，构建测试数据集
 60 |     :param input_file_path: 原始数据路径
 61 |     :param output_file_path: 构建完成的训练数据路径
 62 |     :return: 将数据存储至本地
 63 |     '''
 64 |     logging.info('Start get all sentence ...')
 65 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 66 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 67 |         line = line.replace('\n', '').split('\t')
 68 |         if line[2] == 'label':      # 跳过首行
 69 |             continue
 70 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 71 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 72 |         label = line[2]                                 # 标签
 73 |         output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
 74 | 
 75 | 
 76 | def check_data(input_file_path):
 77 |     '''
 78 |     统计数据分布情况，检查数据集0/1分布是否均衡
 79 |     :param input_file_path: 数据路径
 80 |     :return:
 81 |     '''
 82 |     count = 0
 83 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 84 |         line = line.replace('\n', '').split('\t')
 85 |         if line[2] == 'label':
 86 |             continue
 87 |         if int(line[2]) == 1:
 88 |             count += 1
 89 |     print(count)
 90 | 
 91 | 
 92 | if __name__ == '__main__':
 93 | 
 94 |     # 统计数据分布情况
 95 |     file_path = './data/lcqmc/lcqmc_train.tsv'
 96 |     check_data(file_path)
 97 | 
 98 |     # 构建训练数据集
 99 |     input_file_path = './data/lcqmc/lcqmc_train.tsv'
100 |     output_file_path = './data/train.txt'
101 |     generate_train_data(input_file_path, output_file_path)
102 |     logging.info('Success generate train.txt')
103 | 
104 |     # 构建验证数据集
105 |     input_file_path = './data/lcqmc/lcqmc_dev.tsv'
106 |     output_file_path = './data/dev.txt'
107 |     generate_test_data(input_file_path, output_file_path)
108 |     logging.info('Success generate dev.txt')
109 | 
110 |     # 构建测试数据集
111 |     # input_file_path = './data/lcqmc/lcqmc_test.tsv'
112 |     # output_file_path = './data/test.txt'
113 |     # generate_test_data(input_file_path, output_file_path)
114 |     # logging.info('Success generate test.txt')
115 | 
116 | 


--------------------------------------------------------------------------------
/DSSM-Embedding/README.md:
--------------------------------------------------------------------------------
 1 | ## DSSM-Embedding
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### query \t pos \t neg_1 \t neg_2 \t neg_3 \t neg_4
10 |     其中pos为正例，neg_1、neg_2、neg_3、neg_4为随机负采样得到负例
11 | 
12 | 
13 | ### 文件解释
14 | * main.py —— 主文件
15 | * model.py —— 模型结构
16 | * config.py —— 配置参数
17 | * Data_Generate.py —— 数据集处理脚本
18 | * /data —— 数据存放文件夹
19 | * /save_model —— 模型存储文件夹
20 | 
21 | 
22 | ### 模型结构
23 | ![avatar](./DSSM-Embedding.png)
24 | * 本模型是对DSSM的改进，主要将BOW表示层替换为Embedding层，减小词汇表征维度。
25 | 
26 | 
27 | ### 参考资料
28 | * https://www.cnblogs.com/guoyaohua/p/9229190.html
29 | 
30 | 


--------------------------------------------------------------------------------
/DSSM-Embedding/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.model_save_path = './save_model/'
16 |         self.vocab_size = 2000
17 |         self.seq_length = 20
18 |         self.embedding_dim = 300
19 |         self.neg_doc_num = 4
20 |         self.learning_rate = 1e-5
21 |         self.keep_prob = 0.5
22 |         self.batch_size = 32
23 |         self.epochs = 30


--------------------------------------------------------------------------------
/DSSM-Embedding/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/DSSM-Embedding/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/6/12 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | from config import Config
  7 | import tensorflow as tf
  8 | 
  9 | class Model():
 10 |     def __init__(self):
 11 |         self.config = Config()                                                                                                                  # 配置参数
 12 |         self.input_query = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-query')                             # 输入query，ID形式
 13 |         self.input_pos_doc = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-pos')                             # 输入pos_doc，ID形式
 14 |         self.input_neg_doc = tf.placeholder(shape=[None, self.config.neg_doc_num, self.config.seq_length], dtype=tf.int32, name='input-neg')    # 输入多个neg_doc，ID形式
 15 |         self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')                                                         # keep-prob
 16 | 
 17 |         # Embedding layer
 18 |         embedding = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding')
 19 | 
 20 |         # 将词汇映射为向量形式 [batch_size, seq_length, embedding_dim]
 21 |         embedding_query = tf.nn.embedding_lookup(params=embedding, ids=self.input_query, name='embedding_query')
 22 |         embedding_pos_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_pos_doc, name='embedding_pos_doc')
 23 |         embedding_neg_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_neg_doc, name='embedding_neg_doc')
 24 | 
 25 |         # 对所有词向量进行叠加求平均，用来表征整个文本向量
 26 |         embedding_query = tf.reduce_mean(embedding_query, axis=1)
 27 |         embedding_pos_doc = tf.reduce_mean(embedding_pos_doc, axis=1)
 28 |         embedding_neg_doc = tf.reduce_mean(embedding_neg_doc, axis=2)
 29 | 
 30 |         # 全连接层layer1 (batch_size, 300) -> (batch_size, 300)
 31 |         L1_N = 300
 32 |         l1_range = tf.sqrt(6/(self.config.embedding_dim + L1_N))        # 原论文weight、bias范围初始化方式
 33 |         weight_1 = tf.get_variable(initializer=tf.random_uniform(shape=[self.config.embedding_dim, L1_N], minval=-l1_range, maxval=l1_range), name='weight-1')
 34 |         bias_1 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N], minval=-l1_range, maxval=l1_range), name='bias-1')
 35 |         # 全连接
 36 |         query_l1 = tf.matmul(embedding_query, weight_1) + bias_1
 37 |         pos_doc_l1 = tf.matmul(embedding_pos_doc, weight_1, ) + bias_1
 38 |         neg_doc_l1 = tf.matmul(tf.reshape(embedding_neg_doc, shape=[-1, self.config.embedding_dim]), weight_1, ) + bias_1
 39 |         # 激活函数 activation function
 40 |         query_l1 = tf.nn.tanh(query_l1)
 41 |         pos_doc_l1 = tf.nn.tanh(pos_doc_l1)
 42 |         neg_doc_l1 = tf.nn.tanh(neg_doc_l1)
 43 | 
 44 |         # 全连接层layer2 (batch_size, 300) -> (batch_size, 300)
 45 |         L2_N = 300
 46 |         l2_range = tf.sqrt(6/(L1_N + L2_N))          # 原论文weight、bias范围初始化方式
 47 |         weight_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N, L2_N], minval=-l2_range, maxval=l2_range), name='weight-2')
 48 |         bias_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N], minval=-l2_range, maxval=l2_range), name='bias-2')
 49 |         # 全连接
 50 |         query_l2 = tf.matmul(query_l1, weight_2) + bias_2
 51 |         pos_doc_l2 = tf.matmul(pos_doc_l1, weight_2) + bias_2
 52 |         neg_doc_l2 = tf.matmul(neg_doc_l1, weight_2) + bias_2
 53 |         # 激活函数 activation function
 54 |         query_l2 = tf.nn.tanh(query_l2)
 55 |         pos_doc_l2 = tf.nn.tanh(pos_doc_l2)
 56 |         neg_doc_l2 = tf.nn.tanh(neg_doc_l2)
 57 | 
 58 |         # 全连接层layer3 (batch_size, 300) -> (batch_size, 128)
 59 |         L3_N = 128
 60 |         l3_range = tf.sqrt(6/(L2_N + L3_N))             # 原论文weight、bias范围初始化方式
 61 |         weight_3 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N, L3_N], minval=-l3_range, maxval=l3_range), name='weight-3')
 62 |         bias_3 = tf.get_variable(initializer=tf.random_uniform(shape=[L3_N], minval=-l3_range, maxval=l3_range), name='bias-3')
 63 |         # 全连接
 64 |         query_l3 = tf.matmul(query_l2, weight_3) + bias_3
 65 |         pos_doc_l3 = tf.matmul(pos_doc_l2, weight_3) + bias_3
 66 |         neg_doc_l3 = tf.matmul(neg_doc_l2, weight_3) + bias_3
 67 |         # 激活函数 activation function
 68 |         query_l3_out = tf.tanh(query_l3)
 69 |         pos_doc_l3_out = tf.tanh(pos_doc_l3)
 70 |         neg_doc_l3_out = tf.tanh(neg_doc_l3)
 71 | 
 72 |         # 维度还原 [batch_size, neg_doc_num, hidden_dim]
 73 |         neg_doc_l3_out = tf.reshape(neg_doc_l3_out, shape=[-1, self.config.neg_doc_num, L3_N])
 74 | 
 75 |         # 计算query和pos_doc的Cosine
 76 |         query_dot_pos = tf.reduce_sum(tf.multiply(query_l3_out, pos_doc_l3_out), axis=1)                # query和pos_doc进行点乘
 77 |         query_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(query_l3_out), axis=1))                           # query的L2范数
 78 |         pos_doc_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(pos_doc_l3_out), axis=1))                       # pos_doc的L2范数
 79 |         self.query_pos_cosine = tf.expand_dims(query_dot_pos/(query_l3_l2*pos_doc_l3_l2), axis=1)       # 计算query和pos_doc的余弦值
 80 | 
 81 |         # 测试结果
 82 |         self.test_predict = tf.reshape(tensor=tf.round(self.query_pos_cosine), shape=[-1])
 83 |         # 批测试准确率
 84 |         self.accuracy_test = tf.reduce_mean(tf.round(self.query_pos_cosine))
 85 | 
 86 |         # 计算query和neg_doc的Cosine
 87 |         query_l3_out_flatten = tf.expand_dims(query_l3_out, axis=1)                                     # 扩充query矩阵维度
 88 |         query_dot_neg = tf.reduce_sum(tf.multiply(query_l3_out_flatten, neg_doc_l3_out), axis=2)        # query和neg_doc进行点乘
 89 |         neg_doc_l3_l2 = tf.sqrt(tf.reduce_sum(tf.square(neg_doc_l3_out), axis=2))                       # neg_doc的L2范数
 90 |         self.query_neg_cosine = query_dot_neg/(tf.expand_dims(query_l3_l2, axis=1)*neg_doc_l3_l2)       # 计算query和neg_doc的余弦值
 91 | 
 92 |         # 将pos_doc和neg_doc的cosine进行拼接为一个整体矩阵
 93 |         doc_cosine = tf.concat([self.query_pos_cosine, self.query_neg_cosine], axis=1)
 94 |         # score归一化
 95 |         doc_cosine_softmax = tf.nn.softmax(doc_cosine, axis=1)
 96 |         # 获取query与pos_doc的相似度
 97 |         prob = tf.slice(doc_cosine_softmax, begin=[0, 0], size=[-1, 1])
 98 |         # 训练结果
 99 |         self.train_predict = tf.reshape(tensor=tf.round(prob), shape=[-1])
100 |         # 损失函数 负对数损失函数，提升pos_doc的score，抑制neg_doc的score
101 |         self.loss = -tf.reduce_sum(tf.log(prob))
102 |         # 优化器
103 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(loss=self.loss)
104 | 
105 |         # 构造true label
106 |         label = [[1]+[0]*self.config.neg_doc_num]                   # true label: [1, 0, 0, 0, 0]
107 |         labels = tf.tile(label, [self.config.batch_size, 1])        # 按batch_size的数量进行复制
108 | 
109 |         # 正确率
110 |         correct = tf.equal(tf.argmax(doc_cosine_softmax, axis=1), tf.argmax(labels, axis=1))
111 |         self.accuracy_train = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
112 | 
113 | 
114 | 
if __name__ == '__main__':
    # Instantiate the model (which builds the graph) when this module is run directly
    Model()


--------------------------------------------------------------------------------
/DSSM-RNN/DSSM-LSTM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/DSSM-RNN/DSSM-LSTM.png


--------------------------------------------------------------------------------
/DSSM-RNN/Data_Generate.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/5/12 22:19 
  4 | # @Author: Zhang Cong
  5 | 
  6 | import random
  7 | import logging
  8 | from tqdm import tqdm
  9 | 
 10 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 11 | 
 12 | def generate_train_data(input_file_path, output_file_path):
 13 |     '''
 14 |     对原始csv数据进行格式转换，构建训练数据集
 15 |     :param input_file_path: 原始数据路径
 16 |     :param output_file_path: 构建完成的训练数据路径
 17 |     :return: 将数据存储至本地
 18 |     '''
 19 |     logging.info('Start get all sentence ...')
 20 |     # 获取全部句子集
 21 |     all_sentence = []
 22 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 23 |         line = line.replace('\n', '').split('\t')
 24 |         if line[2] == 'label':      # 跳过首行
 25 |             continue
 26 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 27 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 28 |         all_sentence.append(sentence_1)
 29 |         all_sentence.append(sentence_2)
 30 |     # 去重
 31 |     all_sentence = list(set(all_sentence))
 32 | 
 33 |     logging.info('Start generate dataset ...')
 34 |     # 构建训练数据集 [query, pos, neg_1, neg_2, neg_3, neg_4]
 35 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 36 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 37 |         line = line.replace('\n', '').split('\t')
 38 |         if line[2] == 'label':      # 跳过首行
 39 |             continue
 40 |         sentence_list = []
 41 |         sentence_1 = str(line[0]).replace('\t', '')
 42 |         sentence_2 = str(line[1]).replace('\t', '')
 43 |         sentence_list.append(sentence_1)    # 句子1
 44 |         sentence_list.append(sentence_2)    # 句子2
 45 |         label = line[2]                     # 标签
 46 | 
 47 |         if int(label)==1:       # 如果标签为1，则保留此句子对，并随机负采样得到4个负例
 48 |             while len(sentence_list)<6:         # [query, pos, neg_1, neg_2, neg_3, neg_4]
 49 |                 index = random.randint(0, len(all_sentence)-1)      # 随机索引
 50 |                 if all_sentence[index] not in sentence_list:        # 如果不重复，则加入
 51 |                     sentence_list.append(all_sentence[index])
 52 |             output_file.write('\t'.join(sentence_list) + '\n')
 53 |     output_file.close()
 54 |     logging.info('Finishied generate dataset ...')
 55 | 
 56 | 
 57 | def generate_test_data(input_file_path, output_file_path):
 58 |     '''
 59 |     对原始csv数据进行格式转换，构建测试数据集
 60 |     :param input_file_path: 原始数据路径
 61 |     :param output_file_path: 构建完成的训练数据路径
 62 |     :return: 将数据存储至本地
 63 |     '''
 64 |     logging.info('Start get all sentence ...')
 65 |     output_file = open(output_file_path, mode='w', encoding='utf-8')
 66 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 67 |         line = line.replace('\n', '').split('\t')
 68 |         if line[2] == 'label':      # 跳过首行
 69 |             continue
 70 |         sentence_1 = str(line[0]).replace('\t', '')     # 句子1
 71 |         sentence_2 = str(line[1]).replace('\t', '')     # 句子2
 72 |         label = line[2]                                 # 标签
 73 |         output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
 74 | 
 75 | 
 76 | def check_data(input_file_path):
 77 |     '''
 78 |     统计数据分布情况，检查数据集0/1分布是否均衡
 79 |     :param input_file_path: 数据路径
 80 |     :return:
 81 |     '''
 82 |     count = 0
 83 |     for line in tqdm(open(input_file_path, encoding='utf-8')):
 84 |         line = line.replace('\n', '').split('\t')
 85 |         if line[2] == 'label':
 86 |             continue
 87 |         if int(line[2]) == 1:
 88 |             count += 1
 89 |     print(count)
 90 | 
 91 | 
 92 | if __name__ == '__main__':
 93 | 
 94 |     # 统计数据分布情况
 95 |     file_path = './data/lcqmc/lcqmc_train.tsv'
 96 |     check_data(file_path)
 97 | 
 98 |     # 构建训练数据集
 99 |     input_file_path = './data/lcqmc/lcqmc_train.tsv'
100 |     output_file_path = './data/train.txt'
101 |     generate_train_data(input_file_path, output_file_path)
102 |     logging.info('Success generate train.txt')
103 | 
104 |     # 构建验证数据集
105 |     input_file_path = './data/lcqmc/lcqmc_dev.tsv'
106 |     output_file_path = './data/dev.txt'
107 |     generate_test_data(input_file_path, output_file_path)
108 |     logging.info('Success generate dev.txt')
109 | 
110 |     # 构建测试数据集
111 |     # input_file_path = './data/lcqmc/lcqmc_test.tsv'
112 |     # output_file_path = './data/test.txt'
113 |     # generate_test_data(input_file_path, output_file_path)
114 |     # logging.info('Success generate test.txt')
115 | 
116 | 


--------------------------------------------------------------------------------
/DSSM-RNN/README.md:
--------------------------------------------------------------------------------
 1 | ## DSSM-LSTM (Semantic modelling with long-short-term memory for information retrieval)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### query \t pos \t neg_1 \t neg_2 \t neg_3 \t neg_4
10 |     其中pos为正例，neg_1、neg_2、neg_3、neg_4为随机负采样得到负例
11 | 
12 | 
13 | ### 文件解释
14 | * main.py —— 主文件
15 | * model.py —— 模型结构
16 | * config.py —— 配置参数
17 | * Data_Generate.py —— 数据集处理脚本
18 | * /data —— 数据存放文件夹
19 | * /save_model —— 模型存储文件夹
20 | 
21 | 
22 | ### 模型结构
23 | ![avatar](./DSSM-LSTM.png)
 24 | * 本模型是在DSSM-CNN基础上的进一步改进：由于卷积无法捕捉单词之间的相关性，很自然地就会想到使用RNN来解决这个问题。
25 | 
26 | 
27 | ### 参考资料
28 | * Semantic modelling with long-short-term memory for information retrieval (https://arxiv.org/pdf/1412.6629.pdf)
29 | * https://blog.csdn.net/sxf1061926959/article/details/89366526
30 | 
31 | 


--------------------------------------------------------------------------------
/DSSM-RNN/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.model_save_path = './save_model/'
16 |         self.rnn_type = 'lstm'
17 |         self.num_layer = 2
18 |         self.vocab_size = 2000
19 |         self.seq_length = 20
20 |         self.embedding_dim = 300
21 |         self.neg_doc_num = 4
22 |         self.learning_rate = 1e-5
23 |         self.keep_prob = 0.5
24 |         self.hidden_dim = 512
25 |         self.kernel_size = 3
26 |         self.batch_size = 32
27 |         self.epochs = 100


--------------------------------------------------------------------------------
/DSSM-RNN/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果 
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/DSSM-RNN/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/6/12 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | from config import Config
  7 | import tensorflow as tf
  8 | import tensorflow.contrib as contrib
  9 | 
 10 | class Model():
 11 |     def __init__(self):
 12 |         self.config = Config()                                                                                                                  # 配置参数
 13 |         self.input_query = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-query')                             # 输入query，ID形式
 14 |         self.input_pos_doc = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name='input-pos')                             # 输入pos_doc，ID形式
 15 |         self.input_neg_doc = tf.placeholder(shape=[None, self.config.neg_doc_num, self.config.seq_length], dtype=tf.int32, name='input-neg')    # 输入多个neg_doc，ID形式
 16 |         self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')                                                         # keep-prob
 17 | 
 18 |         # Embedding layer
 19 |         embedding = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding')
 20 | 
 21 |         # 将词汇映射为向量形式 [batch_size, seq_length, embedding_dim]
 22 |         embedding_query = tf.nn.embedding_lookup(params=embedding, ids=self.input_query, name='embedding_query')
 23 |         embedding_pos_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_pos_doc, name='embedding_pos_doc')
 24 |         embedding_neg_doc = tf.nn.embedding_lookup(params=embedding, ids=self.input_neg_doc, name='embedding_neg_doc')
 25 | 
 26 |         # 创建rnn cell列表 并将多个rnn cell进行组合
 27 |         cells = [self.get_rnn(self.config.rnn_type) for _ in range(self.config.num_layer)]
 28 |         rnn_cell = contrib.rnn.MultiRNNCell(cells=cells, state_is_tuple=True)
 29 | 
 30 |         # 对query进行RNN处理，取最后一个时序的输出结果
 31 |         query_output, query_state = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_query, dtype=tf.float32)
 32 |         query_last = query_output[:, -1, :]
 33 | 
 34 |         # 对pos_doc进行RNN处理，取最后一个时序的输出结果
 35 |         pos_output, pos_state = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_pos_doc, dtype=tf.float32)
 36 |         pos_last = pos_output[:, -1, :]
 37 | 
 38 |         # 对neg_doc进行RNN处理，取最后一个时序的输出结果
 39 |         neg_output, neg_state = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=tf.reshape(embedding_neg_doc, shape=[-1, self.config.seq_length, self.config.embedding_dim]), dtype=tf.float32)
 40 |         neg_last = neg_output[:, -1, :]
 41 | 
 42 |         # 全连接层layer1 (batch_size, 512) -> (batch_size, 300)
 43 |         L1_N = 300
 44 |         l1_range = tf.sqrt(6/(self.config.hidden_dim + L1_N))             # 原论文weight、bias范围初始化方式
 45 |         weight_1 = tf.get_variable(initializer=tf.random_uniform(shape=[self.config.hidden_dim, L1_N], minval=-l1_range, maxval=l1_range), name='weight-1')
 46 |         bias_1 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N], minval=-l1_range, maxval=l1_range), name='bias-1')
 47 |         # 全连接
 48 |         query_l1 = tf.matmul(query_last, weight_1) + bias_1
 49 |         pos_doc_l1 = tf.matmul(pos_last, weight_1, ) + bias_1
 50 |         neg_doc_l1 = tf.matmul(neg_last, weight_1, ) + bias_1
 51 |         # 激活函数 activation function
 52 |         query_l1 = tf.nn.tanh(query_l1)
 53 |         pos_doc_l1 = tf.nn.tanh(pos_doc_l1)
 54 |         neg_doc_l1 = tf.nn.tanh(neg_doc_l1)
 55 | 
 56 |         # 全连接层layer2 (batch_size, 300) -> (batch_size, 128)
 57 |         L2_N = 128
 58 |         l2_range = tf.sqrt(6/(L1_N + L2_N))                    # 原论文weight、bias范围初始化方式
 59 |         weight_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L1_N, L2_N], minval=-l2_range, maxval=l2_range), name='weight-2')
 60 |         bias_2 = tf.get_variable(initializer=tf.random_uniform(shape=[L2_N], minval=-l2_range, maxval=l2_range), name='bias-2')
 61 |         # 全连接
 62 |         query_l2 = tf.matmul(query_l1, weight_2) + bias_2
 63 |         pos_doc_l2 = tf.matmul(pos_doc_l1, weight_2) + bias_2
 64 |         neg_doc_l2 = tf.matmul(neg_doc_l1, weight_2) + bias_2
 65 |         # 激活函数 activation function
 66 |         query_l2_out = tf.tanh(query_l2)
 67 |         pos_doc_l2_out = tf.tanh(pos_doc_l2)
 68 |         neg_doc_l2_out = tf.tanh(neg_doc_l2)
 69 | 
 70 |         # 维度还原 [batch_size, neg_doc_num, hidden_dim]
 71 |         neg_doc_l2_out = tf.reshape(neg_doc_l2_out, shape=[-1, self.config.neg_doc_num, L2_N])
 72 | 
 73 |         # 计算query和pos_doc的Cosine
 74 |         query_dot_pos = tf.reduce_sum(tf.multiply(query_l2_out, pos_doc_l2_out), axis=1)                # query和pos_doc进行点乘
 75 |         query_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(query_l2_out), axis=1))                           # query的L2范数
 76 |         pos_doc_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(pos_doc_l2_out), axis=1))                       # pos_doc的L2范数
 77 |         self.query_pos_cosine = tf.expand_dims(query_dot_pos/(query_l2_L2*pos_doc_l2_L2), axis=1)       # 计算query和pos_doc的余弦值
 78 | 
 79 |         # 测试结果
 80 |         self.test_predict = tf.reshape(tensor=tf.round(self.query_pos_cosine), shape=[-1])
 81 |         # 批测试准确率
 82 |         self.accuracy_test = tf.reduce_mean(tf.round(self.query_pos_cosine))
 83 | 
 84 |         # 计算query和neg_doc的Cosine
 85 |         query_l2_out_flatten = tf.expand_dims(query_l2_out, axis=1)                                     # 扩充query矩阵维度
 86 |         query_dot_neg = tf.reduce_sum(tf.multiply(query_l2_out_flatten, neg_doc_l2_out), axis=2)        # query和neg_doc进行点乘
 87 |         neg_doc_l2_L2 = tf.sqrt(tf.reduce_sum(tf.square(neg_doc_l2_out), axis=2))                       # neg_doc的L2范数
 88 |         self.query_neg_cosine = query_dot_neg/(tf.expand_dims(query_l2_L2, axis=1)*neg_doc_l2_L2)       # 计算query和neg_doc的余弦值
 89 | 
 90 |         # 将pos_doc和neg_doc的cosine进行拼接为一个整体矩阵
 91 |         doc_cosine = tf.concat([self.query_pos_cosine, self.query_neg_cosine], axis=1)
 92 |         # score归一化
 93 |         doc_cosine_softmax = tf.nn.softmax(doc_cosine, axis=1)
 94 |         # 获取query与pos_doc的相似度
 95 |         prob = tf.slice(doc_cosine_softmax, begin=[0, 0], size=[-1, 1])
 96 |         # 训练结果
 97 |         self.train_predict = tf.reshape(tensor=tf.round(prob), shape=[-1])
 98 |         # 损失函数 负对数损失函数，提升pos_doc的score，抑制neg_doc的score
 99 |         self.loss = -tf.reduce_sum(tf.log(prob))
100 |         # 优化器
101 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(loss=self.loss)
102 | 
103 |         # 构造true label acc
104 |         label = [[1]+[0]*self.config.neg_doc_num]                       # true label: [1, 0, 0, 0, 0]
105 |         labels = tf.tile(label, [self.config.batch_size, 1])            # 按batch_size的数量进行复制
106 | 
107 |         # 正确率
108 |         correct = tf.equal(tf.argmax(doc_cosine_softmax, axis=1), tf.argmax(labels, axis=1))
109 |         self.accuracy_train = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
110 | 
111 | 
112 |     def get_rnn(self, rnn_type):
113 |         '''
114 |         创建RNN层
115 |         :param rnn_type: rnn类型 LSTM/GRU
116 |         :return:
117 |         '''
118 |         if rnn_type == 'lstm':
119 |             cell = contrib.rnn.BasicLSTMCell(num_units=self.config.hidden_dim, state_is_tuple=True)
120 |         else:
121 |             cell = contrib.rnn.GRUCell(num_units=self.config.hidden_dim)
122 |         return contrib.rnn.DropoutWrapper(cell=cell, input_keep_prob=self.input_keep_prob)
123 | 
124 | 
125 | 
if __name__ == "__main__":
    # Smoke test: instantiating the model runs Model.__init__, which builds the graph.
    Model()


--------------------------------------------------------------------------------
/ESIM/Data_Generate.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:19 
 4 | # @Author: Zhang Cong
 5 | 
 6 | import logging
 7 | from tqdm import tqdm
 8 | 
 9 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
10 | 
def generate_data(input_file_path, output_file_path):
    '''
    Convert a raw tab-separated file into the train/dev data format.

    Every kept row is written as "sentence_1<TAB>sentence_2<TAB>label". The
    header row (third field equal to 'label') is dropped, and so are rows with
    fewer than three tab-separated fields (e.g. a trailing blank line), which
    previously raised IndexError.

    :param input_file_path: path of the raw data file
    :param output_file_path: path the converted data is written to
    :return: None; the result is stored on disk
    '''
    logging.info('Start get all sentence ...')
    # The input is opened first so a missing source file does not leave an
    # empty output behind; both handles are closed (and the output flushed)
    # when the block exits, even on error.
    with open(input_file_path, encoding='utf-8') as input_file, \
            open(output_file_path, mode='w', encoding='utf-8') as output_file:
        for line in tqdm(input_file):
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3 or fields[2] == 'label':
                continue
            # Splitting on tabs already guarantees the fields contain none.
            sentence_1, sentence_2, label = fields[:3]
            output_file.write(sentence_1 + '\t' + sentence_2 + '\t' + label + '\n')
28 | 
29 | 
def check_data(input_file_path):
    '''
    Count the rows labelled 1 to check whether the 0/1 classes are balanced.

    The header row (third field equal to 'label') and rows with fewer than
    three tab-separated fields are skipped. A third field that is not an
    integer still raises ValueError.

    :param input_file_path: path of the tab-separated data file
    :return: number of rows whose label is 1 (also printed)
    '''
    count = 0
    # The context manager closes the file once counting is done.
    with open(input_file_path, encoding='utf-8') as input_file:
        for line in tqdm(input_file):
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3 or fields[2] == 'label':
                continue
            if int(fields[2]) == 1:
                count += 1
    print(count)
    return count
44 | 
45 | 
if __name__ == '__main__':

    # Inspect the label distribution (disabled)
    # file_path = './data/lcqmc/lcqmc_train.tsv'
    # check_data(file_path)

    # One entry per split: (raw TSV, converted file, message logged when done).
    # The first entry builds the training set, the second the validation set.
    jobs = [
        ('./data/lcqmc/lcqmc_train.tsv', './data/train.txt', 'Success generate train.txt'),
        ('./data/lcqmc/lcqmc_dev.tsv', './data/dev.txt', 'Success generate dev.txt'),
    ]
    for input_file_path, output_file_path, done_message in jobs:
        generate_data(input_file_path, output_file_path)
        logging.info(done_message)

    # # Build the test set (disabled)
    # input_file_path = './data/lcqmc/lcqmc_test.tsv'
    # output_file_path = './data/test.txt'
    # generate_test_data(input_file_path, output_file_path)
    # logging.info('Success generate test.txt')
69 | 
70 | 


--------------------------------------------------------------------------------
/ESIM/ESIM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/ESIM/ESIM.png


--------------------------------------------------------------------------------
/ESIM/README.md:
--------------------------------------------------------------------------------
 1 | ## ESIM (Enhanced LSTM for Natural Language Inference)
 2 | 
 3 | 
 4 | ### 数据集：
 5 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集有版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
 6 | 
 7 | 
 8 | ### 数据形式：
 9 | #### sentence_1 \t sentence_2 \t label
10 | 
11 | 
12 | ### 文件解释
13 | * main.py —— 主文件
14 | * model.py —— 模型结构
15 | * config.py —— 配置参数
16 | * Data_Generate.py —— 数据集处理脚本
17 | * /data —— 数据存放文件夹
18 | * /save_model —— 模型存储文件夹
19 | 
20 | 
21 | ### 模型结构
22 | ![avatar](./ESIM.png)
23 | * Unlike the previous top models that use very complicated network architectures, we first demonstrate that carefully designing sequential inference models based on chain LSTMs can outperform all previous models. Based on this, we further show that by explicitly considering recursive architectures in both local inference modeling and inference composition, we achieve additional improvement.
24 | * ESIM主要分为三部分：input encoding，local inference modeling 和 inference composition
25 | * 精细的设计序列式的推断结构
26 | * 考虑局部推断和全局推断
27 | 
28 | 
29 | ### 参考资料
30 | * Enhanced LSTM for Natural Language Inference (https://arxiv.org/abs/1609.06038)
31 | * https://zhuanlan.zhihu.com/p/47580077
32 | 
33 | 


--------------------------------------------------------------------------------
/ESIM/config.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/5/12 22:57 
 4 | # @Author: Zhang Cong
 5 | 
 6 | # 模型配置参数
 7 | class Config():
 8 |     def __init__(self):
 9 |         self.original_data_path = './data/train.txt'
10 |         self.dev_data_path = './data/dev.txt'
11 |         self.stopwords_path = './data/stopwords.txt'
12 |         self.preprocess_path = './data/preprocessed_data.txt'
13 |         self.vocab_path = './data/vocab.txt'
14 |         self.label_path = './data/label.txt'
15 |         self.model_save_path = './save_model/'
16 |         self.rnn_type = 'lstm'
17 |         self.vocab_size = 2000
18 |         self.embedding_dim = 300
19 |         self.seq_length = 20
20 |         self.learning_rate = 1e-5
21 |         self.l2_reg = 0.0004
22 |         self.keep_prob = 0.5
23 |         self.hidden_dim = 128
24 |         self.num_classes = 2
25 |         self.batch_size = 32
26 |         self.epochs = 100


--------------------------------------------------------------------------------
/ESIM/data/label.txt:
--------------------------------------------------------------------------------
1 | 0
2 | 1
3 | 


--------------------------------------------------------------------------------
/ESIM/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | &nbsp
   2 | &nbsp;
   3 | aboard
   4 | about
   5 | above
   6 | according
   7 | according to
   8 | across
   9 | afore
  10 | after
  11 | afterwards
  12 | again
  13 | against
  14 | agin
  15 | all
  16 | almost
  17 | alone
  18 | along
  19 | alongside
  20 | already
  21 | also
  22 | although
  23 | always
  24 | am
  25 | amid
  26 | amidst
  27 | among
  28 | amongst
  29 | amoungst
  30 | amount
  31 | an
  32 | and
  33 | anent
  34 | another
  35 | any
  36 | anyhow
  37 | anyone
  38 | anything
  39 | anyway
  40 | anywhere
  41 | approximately
  42 | are
  43 | around
  44 | as
  45 | asked
  46 | aslant
  47 | astride
  48 | at
  49 | athwart
  50 | back
  51 | bar
  52 | be
  53 | became
  54 | because
  55 | because of
  56 | become
  57 | becomes
  58 | becoming
  59 | been
  60 | before
  61 | beforehand
  62 | behind
  63 | being
  64 | below
  65 | beneath
  66 | beside
  67 | besides
  68 | between
  69 | betwixt
  70 | beyond
  71 | bill
  72 | both
  73 | bottom
  74 | but
  75 | by
  76 | call
  77 | called
  78 | can
  79 | cannot
  80 | cant
  81 | circa
  82 | co
  83 | computer
  84 | con
  85 | could
  86 | couldnt
  87 | cry
  88 | currently
  89 | dare
  90 | de
  91 | describe
  92 | despite
  93 | detail
  94 | did
  95 | do
  96 | does
  97 | done
  98 | down
  99 | dr
 100 | due
 101 | due to
 102 | during
 103 | e.g.,
 104 | each
 105 | earlier
 106 | eg
 107 | eight
 108 | either
 109 | eleven
 110 | else
 111 | elsewhere
 112 | empty
 113 | enough
 114 | ere
 115 | etc
 116 | even
 117 | eventually
 118 | ever
 119 | every
 120 | everyone
 121 | everything
 122 | everywhere
 123 | except
 124 | few
 125 | fifteen
 126 | fify
 127 | fill
 128 | find
 129 | fire
 130 | first
 131 | five
 132 | for
 133 | former
 134 | formerly
 135 | forty
 136 | found
 137 | four
 138 | from
 139 | front
 140 | full
 141 | further
 142 | get
 143 | give
 144 | go
 145 | had
 146 | has
 147 | hasnt
 148 | have
 149 | he
 150 | hence
 151 | her
 152 | here
 153 | hereafter
 154 | hereby
 155 | herein
 156 | hereupon
 157 | hers
 158 | herself
 159 | him
 160 | himself
 161 | his
 162 | how
 163 | however
 164 | hundred
 165 | i
 166 | ie
 167 | if
 168 | in
 169 | inc
 170 | indeed
 171 | inside
 172 | instead
 173 | interest
 174 | into
 175 | is
 176 | it
 177 | its
 178 | itself
 179 | just
 180 | keep
 181 | last
 182 | latter
 183 | latterly
 184 | least
 185 | less
 186 | like
 187 | ltd
 188 | made
 189 | major
 190 | many
 191 | may
 192 | maybe
 193 | me
 194 | meanwhile
 195 | mid
 196 | midst
 197 | might
 198 | mill
 199 | mine
 200 | minus
 201 | more
 202 | moreover
 203 | most
 204 | mostly
 205 | move
 206 | mr
 207 | mrs
 208 | ms
 209 | much
 210 | must
 211 | my
 212 | myself
 213 | name
 214 | namely
 215 | near
 216 | need
 217 | neither
 218 | net
 219 | never
 220 | nevertheless
 221 | next
 222 | nigh
 223 | nigher
 224 | nighest
 225 | nine
 226 | no
 227 | nobody
 228 | none
 229 | noone
 230 | nor
 231 | not
 232 | nothing
 233 | notwithstanding
 234 | now
 235 | nowhere
 236 | of
 237 | off
 238 | often
 239 | on
 240 | on to
 241 | once
 242 | one
 243 | only
 244 | onto
 245 | or
 246 | other
 247 | others
 248 | otherwise
 249 | ought
 250 | our
 251 | ours
 252 | ourselves
 253 | out
 254 | out of
 255 | outside
 256 | over
 257 | own
 258 | part
 259 | partly
 260 | past
 261 | pending
 262 | per
 263 | perhaps
 264 | please
 265 | plus
 266 | prior
 267 | put
 268 | qua
 269 | rather
 270 | re
 271 | regarding
 272 | round
 273 | same
 274 | sans
 275 | save
 276 | see
 277 | seem
 278 | seemed
 279 | seeming
 280 | seems
 281 | separately
 282 | serious
 283 | seven
 284 | several
 285 | shall
 286 | she
 287 | should
 288 | show
 289 | side
 290 | similarly
 291 | since
 292 | sincere
 293 | six
 294 | sixty
 295 | so
 296 | some
 297 | somehow
 298 | someone
 299 | something
 300 | sometime
 301 | sometimes
 302 | somewhere
 303 | still
 304 | such
 305 | system
 306 | take
 307 | ten
 308 | than
 309 | that
 310 | the
 311 | their
 312 | theirs
 313 | them
 314 | themselves
 315 | then
 316 | thence
 317 | there
 318 | thereafter
 319 | thereby
 320 | therefore
 321 | therein
 322 | thereupon
 323 | these
 324 | they
 325 | thick
 326 | thin
 327 | third
 328 | this
 329 | those
 330 | though
 331 | three
 332 | through
 333 | throughout
 334 | thru
 335 | thus
 336 | till
 337 | to
 338 | together
 339 | too
 340 | top
 341 | toward
 342 | towards
 343 | twelve
 344 | twenty
 345 | two
 346 | un
 347 | under
 348 | underneath
 349 | unless
 350 | unlike
 351 | until
 352 | unto
 353 | up
 354 | upon
 355 | us
 356 | versus
 357 | very
 358 | via
 359 | vice
 360 | volume
 361 | was
 362 | we
 363 | well
 364 | were
 365 | what
 366 | whatever
 367 | whats
 368 | when
 369 | whence
 370 | whenever
 371 | where
 372 | whereafter
 373 | whereas
 374 | whereby
 375 | wherein
 376 | whereupon
 377 | wherever
 378 | whether
 379 | which
 380 | while
 381 | whither
 382 | who
 383 | whoever
 384 | whole
 385 | whom
 386 | whose
 387 | why
 388 | will
 389 | with
 390 | within
 391 | without
 392 | would
 393 | yesterday
 394 | yet
 395 | you
 396 | your
 397 | yours
 398 | yourself
 399 | yourselves
 400 | {
 401 | |
 402 | }
 403 | ~
 404 | ¡
 405 | ¦
 406 | «
 407 | ­
 408 | ¯
 409 | ´
 410 | ¸
 411 | »
 412 | ¿
 413 | ˇ
 414 | ˉ
 415 | ˊ
 416 | ˋ
 417 | ˜
 418 | ‐
 419 | —　
 420 | ―
 421 | ‖
 422 | ‘
 423 | ’
 424 | “
 425 | ”
 426 | •
 427 | …
 428 | ‹
 429 | ›
 430 | ∕
 431 | 、
 432 | 。
 433 | 〈
 434 | 〉
 435 | 《
 436 | 》
 437 | 「
 438 | 」
 439 | 『
 440 | 』
 441 | 【
 442 | 】
 443 | 〔
 444 | 〕
 445 | 〖
 446 | 〗
 447 | 〝
 448 | 〞
 449 | 一
 450 | 一些
 451 | 一何
 452 | 一切
 453 | 一则
 454 | 一方面
 455 | 一旦
 456 | 一来
 457 | 一样
 458 | 一般
 459 | 一转眼
 460 | 万一
 461 | 上
 462 | 上下
 463 | 下
 464 | 不
 465 | 不仅
 466 | 不但
 467 | 不光
 468 | 不单
 469 | 不只
 470 | 不外乎
 471 | 不如
 472 | 不妨
 473 | 不尽
 474 | 不尽然
 475 | 不得
 476 | 不怕
 477 | 不惟
 478 | 不成
 479 | 不拘
 480 | 不料
 481 | 不是
 482 | 不比
 483 | 不然
 484 | 不特
 485 | 不独
 486 | 不管
 487 | 不至于
 488 | 不若
 489 | 不论
 490 | 不过
 491 | 不问
 492 | 与
 493 | 与其
 494 | 与其说
 495 | 与否
 496 | 与此同时
 497 | 且
 498 | 且不说
 499 | 且说
 500 | 两者
 501 | 个
 502 | 个别
 503 | 临
 504 | 为
 505 | 为了
 506 | 为止
 507 | 为此
 508 | 为着
 509 | 乃
 510 | 乃至
 511 | 乃至于
 512 | 么
 513 | 之
 514 | 之一
 515 | 之所以
 516 | 之类
 517 | 乌乎
 518 | 乎
 519 | 乘
 520 | 也
 521 | 也好
 522 | 也罢
 523 | 了
 524 | 二来
 525 | 于
 526 | 于是
 527 | 于是乎
 528 | 云云
 529 | 云尔
 530 | 些
 531 | 亦
 532 | 人
 533 | 人们
 534 | 人家
 535 | 今
 536 | 介于
 537 | 仍
 538 | 仍旧
 539 | 从
 540 | 从此
 541 | 从而
 542 | 他
 543 | 他人
 544 | 他们
 545 | 以
 546 | 以上
 547 | 以为
 548 | 以便
 549 | 以免
 550 | 以及
 551 | 以故
 552 | 以期
 553 | 以来
 554 | 以至
 555 | 以至于
 556 | 以致
 557 | 们
 558 | 任
 559 | 任何
 560 | 任凭
 561 | 似的
 562 | 但
 563 | 但凡
 564 | 但是
 565 | 何
 566 | 何以
 567 | 何况
 568 | 何处
 569 | 何时
 570 | 余外
 571 | 作为
 572 | 你
 573 | 你们
 574 | 使
 575 | 使得
 576 | 例如
 577 | 依
 578 | 依据
 579 | 依照
 580 | 便于
 581 | 俺
 582 | 俺们
 583 | 倘
 584 | 倘使
 585 | 倘或
 586 | 倘然
 587 | 倘若
 588 | 借
 589 | 假使
 590 | 假如
 591 | 假若
 592 | 傥然
 593 | 像
 594 | 儿
 595 | 先不先
 596 | 光是
 597 | 全体
 598 | 全部
 599 | 兮
 600 | 关于
 601 | 其
 602 | 其一
 603 | 其中
 604 | 其二
 605 | 其他
 606 | 其余
 607 | 其它
 608 | 其次
 609 | 具体地说
 610 | 具体说来
 611 | 兼之
 612 | 内
 613 | 再其次
 614 | 再则
 615 | 再有
 616 | 再者
 617 | 再者说
 618 | 再说
 619 | 冒
 620 | 冲
 621 | 况且
 622 | 几
 623 | 几时
 624 | 凡
 625 | 凡是
 626 | 凭
 627 | 凭借
 628 | 出于
 629 | 出来
 630 | 分别
 631 | 则
 632 | 则甚
 633 | 别
 634 | 别人
 635 | 别处
 636 | 别是
 637 | 别的
 638 | 别管
 639 | 别说
 640 | 到
 641 | 前后
 642 | 前此
 643 | 前者
 644 | 加之
 645 | 加以
 646 | 即
 647 | 即令
 648 | 即使
 649 | 即便
 650 | 即如
 651 | 即或
 652 | 即若
 653 | 却
 654 | 去
 655 | 又
 656 | 又及
 657 | 及
 658 | 及其
 659 | 及至
 660 | 反之
 661 | 反而
 662 | 反过来
 663 | 反过来说
 664 | 受到
 665 | 另
 666 | 另一方面
 667 | 另外
 668 | 另悉
 669 | 只
 670 | 只当
 671 | 只怕
 672 | 只是
 673 | 只有
 674 | 只消
 675 | 只要
 676 | 只限
 677 | 叫
 678 | 叮咚
 679 | 可
 680 | 可以
 681 | 可是
 682 | 可见
 683 | 各
 684 | 各个
 685 | 各位
 686 | 各种
 687 | 各自
 688 | 同
 689 | 同时
 690 | 后
 691 | 后者
 692 | 向
 693 | 向使
 694 | 向着
 695 | 吓
 696 | 吗
 697 | 否则
 698 | 吧
 699 | 吧哒
 700 | 吱
 701 | 呀
 702 | 呃
 703 | 呕
 704 | 呗
 705 | 呜
 706 | 呜呼
 707 | 呢
 708 | 呵
 709 | 呵呵
 710 | 呸
 711 | 呼哧
 712 | 咋
 713 | 和
 714 | 咚
 715 | 咦
 716 | 咧
 717 | 咱
 718 | 咱们
 719 | 咳
 720 | 哇
 721 | 哈
 722 | 哈哈
 723 | 哉
 724 | 哎
 725 | 哎呀
 726 | 哎哟
 727 | 哗
 728 | 哟
 729 | 哦
 730 | 哩
 731 | 哪
 732 | 哪些
 733 | 哪怕
 734 | 哼
 735 | 哼唷
 736 | 唉
 737 | 唯有
 738 | 啊
 739 | 啐
 740 | 啥
 741 | 啦
 742 | 啪达
 743 | 啷当
 744 | 喂
 745 | 喏
 746 | 喔唷
 747 | 喽
 748 | 嗡
 749 | 嗡嗡
 750 | 嗬
 751 | 嗯
 752 | 嗳
 753 | 嘎
 754 | 嘎登
 755 | 嘘
 756 | 嘛
 757 | 嘻
 758 | 嘿
 759 | 嘿嘿
 760 | 因
 761 | 因为
 762 | 因了
 763 | 因此
 764 | 因着
 765 | 因而
 766 | 固然
 767 | 在
 768 | 在下
 769 | 在于
 770 | 地
 771 | 基于
 772 | 处在
 773 | 多
 774 | 多么
 775 | 多少
 776 | 大
 777 | 大家
 778 | 她
 779 | 她们
 780 | 好
 781 | 如
 782 | 如上
 783 | 如上所述
 784 | 如下
 785 | 如何
 786 | 如其
 787 | 如同
 788 | 如是
 789 | 如果
 790 | 如此
 791 | 如若
 792 | 始而
 793 | 孰料
 794 | 孰知
 795 | 宁
 796 | 宁可
 797 | 宁愿
 798 | 宁肯
 799 | 它
 800 | 它们
 801 | 对
 802 | 对于
 803 | 对待
 804 | 对方
 805 | 对比
 806 | 将
 807 | 小
 808 | 尔
 809 | 尔后
 810 | 尔尔
 811 | 尚且
 812 | 就
 813 | 就是
 814 | 就是了
 815 | 就是说
 816 | 就算
 817 | 就要
 818 | 尽
 819 | 尽管
 820 | 尽管如此
 821 | 岂但
 822 | 己
 823 | 已
 824 | 已矣
 825 | 巴
 826 | 巴巴
 827 | 并
 828 | 并且
 829 | 并非
 830 | 庶乎
 831 | 庶几
 832 | 开外
 833 | 开始
 834 | 归
 835 | 归齐
 836 | 当
 837 | 当地
 838 | 当然
 839 | 当着
 840 | 彼
 841 | 彼时
 842 | 彼此
 843 | 往
 844 | 待
 845 | 很
 846 | 得
 847 | 得了
 848 | 怎
 849 | 怎奈
 850 | 总之
 851 | 总的来看
 852 | 总的来说
 853 | 总的说来
 854 | 总而言之
 855 | 恰恰相反
 856 | 您
 857 | 惟其
 858 | 慢说
 859 | 我
 860 | 我们
 861 | 或
 862 | 或则
 863 | 或是
 864 | 或曰
 865 | 或者
 866 | 截至
 867 | 所
 868 | 所以
 869 | 所在
 870 | 所幸
 871 | 所有
 872 | 才
 873 | 才能
 874 | 打
 875 | 打从
 876 | 把
 877 | 抑或
 878 | 拿
 879 | 按
 880 | 按照
 881 | 换句话说
 882 | 换言之
 883 | 据
 884 | 据此
 885 | 接着
 886 | 故
 887 | 故此
 888 | 故而
 889 | 旁人
 890 | 无
 891 | 无宁
 892 | 无论
 893 | 既
 894 | 既往
 895 | 既是
 896 | 既然
 897 | 时候
 898 | 是
 899 | 是以
 900 | 是的
 901 | 曾
 902 | 替
 903 | 替代
 904 | 最
 905 | 有
 906 | 有些
 907 | 有关
 908 | 有及
 909 | 有时
 910 | 有的
 911 | 望
 912 | 朝
 913 | 朝着
 914 | 本
 915 | 本人
 916 | 本地
 917 | 本着
 918 | 本身
 919 | 来
 920 | 来着
 921 | 来自
 922 | 来说
 923 | 极了
 924 | 果然
 925 | 果真
 926 | 某
 927 | 某个
 928 | 某些
 929 | 某某
 930 | 根据
 931 | 欤
 932 | 正值
 933 | 正如
 934 | 正巧
 935 | 正是
 936 | 此
 937 | 此地
 938 | 此处
 939 | 此外
 940 | 此时
 941 | 此次
 942 | 此间
 943 | 毋宁
 944 | 每
 945 | 每当
 946 | 比
 947 | 比及
 948 | 比如
 949 | 比方
 950 | 没奈何
 951 | 沿
 952 | 沿着
 953 | 漫说
 954 | 焉
 955 | 然则
 956 | 然后
 957 | 然而
 958 | 照
 959 | 照着
 960 | 犹且
 961 | 犹自
 962 | 甚且
 963 | 甚么
 964 | 甚或
 965 | 甚而
 966 | 甚至
 967 | 甚至于
 968 | 用
 969 | 用来
 970 | 由
 971 | 由于
 972 | 由是
 973 | 由此
 974 | 由此可见
 975 | 的
 976 | 的确
 977 | 的话
 978 | 直到
 979 | 相对而言
 980 | 省得
 981 | 看
 982 | 眨眼
 983 | 着
 984 | 着呢
 985 | 矣
 986 | 矣乎
 987 | 矣哉
 988 | 离
 989 | 竟而
 990 | 第
 991 | 等
 992 | 等到
 993 | 等等
 994 | 简言之
 995 | 管
 996 | 类如
 997 | 紧接着
 998 | 纵
 999 | 纵令
1000 | 纵使
1001 | 纵然
1002 | 经
1003 | 经过
1004 | 结果
1005 | 给
1006 | 继之
1007 | 继后
1008 | 继而
1009 | 综上所述
1010 | 罢了
1011 | 者
1012 | 而
1013 | 而且
1014 | 而况
1015 | 而后
1016 | 而外
1017 | 而已
1018 | 而是
1019 | 而言
1020 | 能
1021 | 能否
1022 | 腾
1023 | 自
1024 | 自个儿
1025 | 自从
1026 | 自各儿
1027 | 自后
1028 | 自家
1029 | 自己
1030 | 自打
1031 | 自身
1032 | 至
1033 | 至于
1034 | 至今
1035 | 至若
1036 | 致
1037 | 般的
1038 | 若
1039 | 若夫
1040 | 若是
1041 | 若果
1042 | 若非
1043 | 莫不然
1044 | 莫如
1045 | 莫若
1046 | 虽
1047 | 虽则
1048 | 虽然
1049 | 虽说
1050 | 被
1051 | 要
1052 | 要不
1053 | 要不是
1054 | 要不然
1055 | 要么
1056 | 要是
1057 | 譬喻
1058 | 譬如
1059 | 让
1060 | 许多
1061 | 论
1062 | 设使
1063 | 设或
1064 | 设若
1065 | 诚如
1066 | 诚然
1067 | 该
1068 | 说来
1069 | 诸
1070 | 诸位
1071 | 诸如
1072 | 谁
1073 | 谁人
1074 | 谁料
1075 | 谁知
1076 | 贼死
1077 | 赖以
1078 | 赶
1079 | 起
1080 | 起见
1081 | 趁
1082 | 趁着
1083 | 越是
1084 | 距
1085 | 跟
1086 | 较
1087 | 较之
1088 | 边
1089 | 过
1090 | 还
1091 | 还是
1092 | 还有
1093 | 还要
1094 | 这
1095 | 这一来
1096 | 这个
1097 | 这么
1098 | 这么些
1099 | 这么样
1100 | 这么点儿
1101 | 这些
1102 | 这会儿
1103 | 这儿
1104 | 这就是说
1105 | 这时
1106 | 这样
1107 | 这次
1108 | 这般
1109 | 这边
1110 | 这里
1111 | 进而
1112 | 连
1113 | 连同
1114 | 逐步
1115 | 通过
1116 | 遵循
1117 | 遵照
1118 | 那
1119 | 那个
1120 | 那么
1121 | 那么些
1122 | 那么样
1123 | 那些
1124 | 那会儿
1125 | 那儿
1126 | 那时
1127 | 那样
1128 | 那般
1129 | 那边
1130 | 那里
1131 | 都
1132 | 鄙人
1133 | 鉴于
1134 | 针对
1135 | 阿
1136 | 除
1137 | 除了
1138 | 除外
1139 | 除开
1140 | 除此之外
1141 | 除非
1142 | 随
1143 | 随后
1144 | 随时
1145 | 随着
1146 | 难道说
1147 | 非但
1148 | 非徒
1149 | 非特
1150 | 非独
1151 | 靠
1152 | 顺
1153 | 顺着
1154 | 首先
1155 | ︰
1156 | ︳
1157 | ︴
1158 | ︵
1159 | ︶
1160 | ︷
1161 | ︸
1162 | ︹
1163 | ︺
1164 | ︻
1165 | ︼
1166 | ︽
1167 | ︾
1168 | ︿
1169 | ﹀
1170 | ﹁
1171 | ﹂
1172 | ﹃
1173 | ﹄
1174 | ﹉
1175 | ﹊
1176 | ﹋
1177 | ﹌
1178 | ﹍
1179 | ﹎
1180 | ﹏
1181 | ﹐
1182 | ﹑
1183 | ﹔
1184 | ﹕
1185 | ﹖
1186 | ﹝
1187 | ﹞
1188 | ﹟
1189 | ﹠
1190 | ﹡
1191 | ﹢
1192 | ﹤
1193 | ﹦
1194 | ﹨
1195 | ﹩
1196 | ﹪
1197 | ﹫
1198 | ！
1199 | ＂
1200 | ＇
1201 | （
1202 | ）
1203 | ，
1204 | ：
1205 | ；
1206 | ？
1207 | ＿
1208 | ￣
1209 | １
1210 | ２
1211 | ３
1212 | ４
1213 | ５
1214 | ６
1215 | ７
1216 | ８
1217 | ９
1218 | ０
1219 | *
1220 | 


--------------------------------------------------------------------------------
/ESIM/model.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python 
  2 | # -*- coding: UTF-8 -*- 
  3 | # @Time: 2020/6/27 22:58
  4 | # @Author: Zhang Cong
  5 | 
  6 | import numpy as np
  7 | import tensorflow as tf
  8 | import tensorflow.contrib as contrib
  9 | from config import Config
 10 | 
class Model():
    '''
    ESIM-style sentence-pair classifier built as a TensorFlow 1.x static graph.

    Constructing an instance creates the placeholders, the shared embedding,
    two Bi-RNN stages with a soft-alignment (attention) layer between them,
    a two-layer dense head, and the loss / accuracy / training ops.
    Feed ``input_query``, ``input_doc``, ``input_label`` and ``input_keep_prob``;
    fetch ``score``, ``predict``, ``accuracy``, ``loss`` or ``optimizer``.
    '''
    def __init__(self):
        '''
        Build the whole graph. All hyper-parameters are read from ``Config()``.
        '''
        self.config = Config()  # load configuration parameters
        # Query token ids, [batch_size, seq_length]. These are integer ids (not one-hot vectors):
        # they are used as `ids` for embedding_lookup below.
        self.input_query = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-query")
        # Doc token ids, same layout as input_query.
        self.input_doc = tf.placeholder(shape=[None, self.config.seq_length], dtype=tf.int32, name="input-doc")
        # Labels, [batch_size, num_classes]. Presumably one-hot, since accuracy takes argmax over axis 1
        # -- TODO confirm against the data pipeline.
        self.input_label = tf.placeholder(shape=[None, self.config.num_classes], dtype=tf.int32, name="input-label")
        # Dropout keep probability; shared by the RNN input dropout and both dense layers.
        self.input_keep_prob = tf.placeholder(dtype=tf.float32, name='input-keep-prob')

        # Embedding layer
        self.embedding = tf.get_variable(shape=[self.config.vocab_size, self.config.embedding_dim], dtype=tf.float32, name='embedding')

        # Map token ids to vectors: [batch_size, seq_length, embedding_dim]
        self.input_query_emb = tf.nn.embedding_lookup(params=self.embedding, ids=self.input_query, name='input-query-emb')
        self.input_doc_emb = tf.nn.embedding_lookup(params=self.embedding, ids=self.input_doc, name='input-doc-emb')

        # Input Encoding: bidirectional RNN. Query and doc share weights (same scope, reuse=True on the second call).
        input_query_encode = self.bi_directional_rnn(input_data=self.input_query_emb, rnn_type=self.config.rnn_type, scope='Input_Encoding/Bi-LSTM')
        input_doc_encode = self.bi_directional_rnn(input_data=self.input_doc_emb, rnn_type=self.config.rnn_type, scope='Input_Encoding/Bi-LSTM', reuse=True)

        # Local interaction (attention) between query and doc
        with tf.name_scope('Local_inference_Modeling'):
            # Dot-product similarity between every query position and every doc position
            # NOTE(review): no padding mask is applied here, so padded positions (if the inputs are
            # padded to seq_length) take part in the softmax -- confirm whether that is intended.
            with tf.name_scope('word_similarity'):
                attention_weights = tf.matmul(input_query_encode, tf.transpose(input_doc_encode, [0, 2, 1]))
                attentionsoft_a = tf.nn.softmax(attention_weights)  # normalised over doc positions (last axis)
                attentionsoft_b = tf.nn.softmax(tf.transpose(attention_weights, [0, 2, 1]))  # normalised over query positions
                query_new = tf.matmul(attentionsoft_a, input_doc_encode)        # doc-aware query representation (weighted sum of doc encodings)
                doc_new = tf.matmul(attentionsoft_b, input_query_encode)        # query-aware doc representation (weighted sum of query encodings)

            # Element-wise difference and product between the original and the aligned query
            query_diff = tf.subtract(input_query_encode, query_new)
            query_mul = tf.multiply(input_query_encode, query_new)

            # Element-wise difference and product between the original and the aligned doc
            doc_diff = tf.subtract(input_doc_encode, doc_new)
            doc_mul = tf.multiply(input_doc_encode, doc_new)

            # Concatenate [original, aligned, difference, product] along the feature axis
            self.query_feature = tf.concat([input_query_encode, query_new, query_diff, query_mul], axis=2)
            self.doc_feature = tf.concat([input_doc_encode, doc_new, doc_diff, doc_mul], axis=2)

        with tf.name_scope("Inference_Composition"):
            # Second bidirectional RNN over the enhanced features (weights shared between query and doc)
            query_final = self.bi_directional_rnn(input_data=self.query_feature, rnn_type=self.config.rnn_type, scope='Inference_Composition/biLSTM')
            doc_final = self.bi_directional_rnn(input_data=self.doc_feature, rnn_type=self.config.rnn_type, scope='Inference_Composition/biLSTM', reuse=True)

            # Average pooling over the time axis
            query_avg = tf.reduce_mean(query_final, axis=1)
            doc_avg = tf.reduce_mean(doc_final, axis=1)

            # Max pooling over the time axis
            query_max = tf.reduce_max(query_final, axis=1)
            doc_max = tf.reduce_max(doc_final, axis=1)

            # Concatenate the four pooled vectors into a single feature vector per example
            combine_emb = tf.concat([query_avg, query_max, doc_avg, doc_max], axis=1)

        # Fully connected layer 1
        with tf.variable_scope('feed_foward_layer1'):
            inputs = tf.nn.dropout(combine_emb, self.input_keep_prob)
            outputs = tf.layers.dense(inputs=inputs,
                                      units=self.config.hidden_dim,
                                      activation=tf.nn.relu,
                                      use_bias=True,
                                      kernel_initializer=tf.random_normal_initializer(0.0, 0.1))
        # Fully connected layer 2 (produces the per-class logits)
        # NOTE(review): tanh bounds the logits to [-1, 1] before softmax / cross-entropy, which caps
        # the achievable class probability -- confirm this is intentional.
        with tf.variable_scope('feed_foward_layer2'):
            outputs = tf.nn.dropout(outputs, self.input_keep_prob)
            self.logits = tf.layers.dense(inputs=outputs,
                                          units=self.config.num_classes,
                                          activation=tf.nn.tanh,
                                          use_bias=True,
                                          kernel_initializer=tf.random_normal_initializer(0.0, 0.1))
        # Per-class scores (softmax over logits)
        self.score = tf.nn.softmax(self.logits, name='score')
        # Predicted class index
        self.predict = tf.argmax(self.score, axis=1, name='predict')
        # Accuracy: fraction of examples whose predicted index equals argmax of the label row
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.input_label, axis=1), self.predict), dtype=tf.float32),name='accuracy')
        # Loss: mean softmax cross-entropy plus the sum of the REGULARIZATION_LOSSES collection.
        # NOTE(review): no layer in this class registers a regularizer, so the second term looks
        # like it contributes nothing unless something outside this class adds to the collection.
        self.loss = tf.add(
            tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_label)),
            tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            name="loss")
        # Training op: despite the attribute name, this holds the result of Adam's minimize(), not the optimizer object
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate, name="optimizer").minimize(self.loss)


    def bi_directional_rnn(self, input_data, rnn_type, scope, reuse=False):
        '''
        Build a bidirectional RNN layer (LSTM or GRU) and run it over the input.
        :param input_data: input sequence tensor, passed to bidirectional_dynamic_rnn as `inputs`
        :param rnn_type: RNN cell type; 'lstm' selects LSTM, any other value selects GRU
        :param scope: variable scope under which the layer's weights are created
        :param reuse: whether to reuse variables already created under `scope`
        :return: forward and backward outputs concatenated along axis 2
        '''
        with tf.variable_scope(scope, reuse=reuse):
            cell_fw = self.get_rnn(rnn_type)
            cell_bw = self.get_rnn(rnn_type)
            outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, inputs=input_data, dtype=tf.float32)
            # `outputs` is a (forward, backward) pair; join them on the feature axis. `states` is unused.
            outputs = tf.concat(outputs, axis=2)
            return outputs


    def get_rnn(self, rnn_type):
        '''
        Create a single RNN cell wrapped with input dropout.
        :param rnn_type: 'lstm' for an LSTMCell; any other value yields a GRUCell
        :return: the cell wrapped in a DropoutWrapper whose input_keep_prob is self.input_keep_prob
        '''
        if rnn_type == 'lstm':
            cell = contrib.rnn.LSTMCell(num_units=self.config.hidden_dim)
        else:
            cell = contrib.rnn.GRUCell(num_units=self.config.hidden_dim)
        cell = contrib.rnn.DropoutWrapper(cell=cell, input_keep_prob=self.input_keep_prob)
        return cell
128 | 
129 | 
# Smoke test: instantiating Model runs __init__, which constructs the complete graph.
if __name__ == '__main__':
    Model()


--------------------------------------------------------------------------------
/Edit_Distance/Edit-Distance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/Edit_Distance/Edit-Distance.png


--------------------------------------------------------------------------------
/Edit_Distance/README.md:
--------------------------------------------------------------------------------
 1 | ## Edit_Distance (Levenshtein Distance)
 2 | 
 3 | 
 4 | ### 概述
 5 | 编辑距离（Minimum Edit Distance，MED），由俄罗斯科学家 Vladimir Levenshtein 在1965年提出，也因此而得名 Levenshtein Distance。
 6 | 
 7 | 在信息论、语言学和计算机科学领域，Levenshtein Distance 是用来度量两个序列相似程度的指标。通俗地来讲，编辑距离指的是在两个单词之间，由其中一个单词转换为另一个单词所需要的最少单字符编辑操作次数。
 8 | 
 9 | 在这里定义的单字符编辑操作有且仅有三种：
10 | 
11 | * 插入（Insertion）
12 | * 删除（Deletion）
13 | * 替换（Substitution）
14 | 
15 | ##### 譬如，"kitten" 和 "sitting" 这两个单词，由 "kitten" 转换为 "sitting" 需要的最少单字符编辑操作有：
16 | 
17 | * kitten → sitten (substitution of "s" for "k")
18 | * sitten → sittin (substitution of "i" for "e")
19 | * sittin → sitting (insertion of "g" at the end)
20 | 
21 | 因此，"kitten" 和 "sitting" 这两个单词之间的编辑距离为 3。
22 | 
23 | 
24 | ### 数学公式
25 | ![avatar](./Edit-Distance.png)
26 | 
27 | 
28 | ### 文件解释
29 | * edit_distance.py —— 主文件
30 | * /data —— 数据存放文件夹
31 | 
32 | 
33 | ### 参考资料
34 | * https://www.jianshu.com/p/a617d20162cf
35 | 
36 | 


--------------------------------------------------------------------------------
/Edit_Distance/edit_distance.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/4/25 22:44 
 4 | # @Author: Zhang Cong
 5 | 
 6 | '''
 7 |     编辑距离
 8 | '''
 9 | def edit_distance(string_1, string_2):
10 |     if len(string_1) == 0:      # 如果string_1长度为0，则返回string_2的长度为结果
11 |         return len(string_2)
12 |     if len(string_2) == 0:      # 如果string_2长度为0，则返回string_1的长度为结果
13 |         return len(string_1)
14 | 
15 |     if string_1[0] == string_2[0]:  # 如果string_1和string_2的首字符相同，则同时去掉首字母，继续递归
16 |         return edit_distance(string_1[1: ], string_2[1: ])
17 |     else:
18 |         return min(edit_distance(string_1[1: ], string_2) + 1,          # string_1去掉首字符
19 |                    edit_distance(string_1[1: ], string_2[1: ]) + 1,     # string_1和string_2都去掉首字符
20 |                    edit_distance(string_1, string_2[1: ]) + 1)          # string_2去掉首字符
21 | 
22 | 
if __name__ == '__main__':
    # Demo: turn the raw edit distance into a similarity score by
    # normalising with the length of the longer string.
    string_1, string_2 = 'abcde', 'ac'
    longest = max(len(string_1), len(string_2))
    num_step = edit_distance(string_1, string_2)
    score = 1 - num_step / longest
    print(score)


--------------------------------------------------------------------------------
/Jaccard/README.md:
--------------------------------------------------------------------------------
 1 | ## Jaccard (Jaccard similarity coefficient)
 2 | 
 3 | 
 4 | ### 文件解释
 5 | * jaccard.py —— 主文件
 6 | * /data —— 数据存放文件夹
 7 | 
 8 | 
 9 | ### 参考资料
10 | * https://blog.csdn.net/qq_34333481/article/details/84024513
11 | 
12 | 


--------------------------------------------------------------------------------
/Jaccard/jaccard.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python 
 2 | # -*- coding: UTF-8 -*- 
 3 | # @Time: 2020/4/25 22:50 
 4 | # @Author: Zhang Cong
 5 | 
 6 | '''
 7 |     Jaccard 相似度
 8 | '''
 9 | 
10 | 
def jaccard(string_1, string_2):
    """Return the Jaccard similarity between the element sets of two sequences.

    Each input is reduced to the set of its elements (characters for a str),
    and the score is ``|A & B| / |A | B|``.

    :param string_1: first sequence (typically a str)
    :param string_2: second sequence (typically a str)
    :return: similarity as a float in [0, 1]; 1.0 when both inputs are empty
    """
    char_set_1 = set(string_1)
    char_set_2 = set(string_2)
    union = char_set_1 | char_set_2
    if not union:
        # Both inputs are empty: the ratio is 0/0. Treat two empty inputs as
        # identical instead of raising ZeroDivisionError.
        return 1.0
    intersection = char_set_1 & char_set_2
    return len(intersection) / len(union)
18 | 
19 | 
if __name__ == '__main__':
    # Demo: character-level Jaccard similarity of two sample strings.
    string_1, string_2 = 'abcdef', 'ab'
    score = jaccard(string_1, string_2)
    print('Score: {}'.format(score))
25 | 
26 |     


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Text Matching Based on LCQMC: A Large-scale Chinese Question Matching Corpus
 2 | 
 3 | 
 4 | ### 模型
 5 | * ABCNN
 6 | * BIMPM
 7 | * DSSM-BOW
 8 | * DSSM-CNN
 9 | * DSSM-RNN
10 | * DSSM-Embedding
11 | * ESIM
12 | * BM25
13 | * Edit-Distance
14 | * Jaccard
15 | 
16 | 
17 | ### 数据集：
18 | #### LCQMC (http://icrc.hitsz.edu.cn/info/1037/1146.htm) 数据集版权保护，本项目不提供，请自行下载或替换其他数据集进行试验
19 | 
20 | 
21 | ### 数据形式：
22 | #### sentence_1 \t sentence_2 \t label
23 | 
24 | 
25 | ### 依赖环境
26 |     Python 3.6
27 |     TensorFlow 1.15
28 | 
29 | 
30 | ### 模型对比：
31 | ![avatar](./Result.png)
32 | * 以上结果只是对模型做了基本测试与验证，参数也不一定是最优；如需用于具体项目，可自行调试。
33 | 
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/Result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangcong-zc/Text_Matching/1fd4228b3a4a2fb99d66f1f83dc014bbc3fcd287/Result.png


--------------------------------------------------------------------------------