├── .gitignore ├── .idea └── vcs.xml ├── README.md ├── config ├── __init__.py ├── config.py ├── hyperparams.py └── synonym.py ├── core ├── __init__.py ├── load_data.py ├── preprocessor.py ├── utils.py └── word_embedding.py ├── data ├── atec_nlp_sim_test.csv ├── atec_nlp_sim_train.csv ├── atec_nlp_sim_train_add.csv └── corpus.txt ├── logdir └── graph │ ├── match_pyramid │ └── siamese.png ├── main.py ├── model ├── __init__.py ├── cnn_siamese.py ├── match_pyramid.py ├── module │ ├── __init__.py │ ├── feature.py │ ├── modules.py │ ├── rnn.py │ └── templates.txt ├── rnn_siamese.py └── transformer_siamese.py ├── run.py └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | data/char2vec_300 9 | data/data.pkl 10 | data/vocab.pkl 11 | data/expend_atec_nlp.csv 12 | logdir/checkpoints/* 13 | logdir/checkpoints-match_pyramid/* 14 | logdir/checkpoints-rnn/* 15 | logdir/model/* 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextSimilar 2 | 短文本相似度 3 | ### 孪生网络 4 | [Learning Text Similarity with Siamese Recurrent Networks](http://www.aclweb.org/anthology/W/W16/W16-1617.pdf) 5 | loss函数 6 | 7 | --- 8 | ### match pyramid 9 | [Text Matching as Image Recognition](https://arxiv.org/abs/1602.06359) 10 | 11 | --- 12 | 数据来源于[蚂蚁金融NLP之问题相似度计算](https://dc.cloud.alipay.com/index#/topic/intro?id=8) 13 | >问题相似度计算,即给定客服里用户描述的两句话,用算法来判断是否表示了相同的语义。 14 | > 15 | >示例: 16 | > 17 | >1. “花呗如何还款” --“花呗怎么还款”:同义问句 18 | > 19 | >2. “花呗如何还款” -- “我怎么还我的花被呢”:同义问句 20 | > 21 | >3. 
“花呗分期后逾期了如何还款”-- “花呗分期后逾期了哪里还款”:非同义问句 22 | > 23 | >对于例子a,比较简单的方法就可以判定同义;对于例子b,包含了错别字、同义词、词序变换等问题,两个句子乍一看并不类似,想正确判断比较有挑战;对于例子c,两句话很类似,仅仅有一处细微的差别 “如何”和“哪里”,就导致语义不一致。 24 | 25 | 数据预处理python3 run.py, 在data目录得到data.pkl和vocab.pkl。 26 | ```python 27 | if __name__ == "__main__": 28 | preprocessor(True) 29 | network = 'rnn' # network = [rnn match_pyramid cnn] 30 | run(network) 31 | ``` 32 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /config/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-27 上午9:54 4 | # @Author : 林利芳 5 | # @File : config.py 6 | 7 | import os 8 | 9 | PATH = os.getcwd() 10 | ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train.csv') 11 | ADD_ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train_add.csv') 12 | 13 | TEST_DATA = os.path.join(PATH, 'data/atec_nlp_sim_test.csv') 14 | TEST_RESULT = os.path.join(PATH, 'data/test_result.csv') 15 | 16 | EXPEND_ATEC_NLP_DATA = os.path.join(PATH, 'data/expend_atec_nlp_{}.csv') 17 | 18 | DATA_PKL = os.path.join(PATH, 'data/data.pkl') 19 | VOCAB_PKL = os.path.join(PATH, 'data/vocab.pkl') 20 | 21 | CORPUS_DATA = os.path.join(PATH, 'data/corpus.txt') 22 | 23 | WORD2VEC_DATA = os.path.join(PATH, 'data/char2vec_300') 24 | logdir = os.path.join(PATH, 'logdir') 25 | checkpoint_dir = "logdir/checkpoints-{}" 26 | model_dir = os.path.join(logdir, "model") 27 | 28 | WordChar = "char" 29 | -------------------------------------------------------------------------------- /config/hyperparams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-25 上午10:41 4 | # @Author : 林利芳 5 | # @File : hyperparams.py 6 | 7 | 8 | class HyperParams: 9 | # training 10 | batch_size = 32 # alias = N 11 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 12 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 13 | # model 14 | max_len = 50 # Maximum number of words in a sentence. alias = T. 15 | # Feel free to increase this if you are ambitious. 16 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 17 | num_units = 512 # alias = C 18 | embedding_size = 512 19 | vocab_size = 10000 20 | num_blocks = 1 # number of encoder/decoder blocks 21 | num_epochs = 100 22 | num_heads = 8 23 | attention_size = 100 24 | clip = 5 25 | dropout_rate = 0.1 26 | eps = 1e-9 27 | margin = 0.7 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | 30 | 31 | class RnnParams: 32 | # training 33 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 34 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 35 | # model 36 | max_len = 50 # Maximum number of words in a sentence. alias = T. 37 | # Feel free to increase this if you are ambitious. 38 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 
39 | num_units = 100 # alias = C 40 | embedding_size = 100 41 | num_epochs = 40 42 | attention_size = 100 43 | clip = 5 44 | dropout_rate = 0.1 45 | eps = 1e-9 46 | margin = 0.1 47 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 48 | 49 | 50 | class CnnParams: 51 | # training 52 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 53 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 54 | # model 55 | max_len = 50 # Maximum number of words in a sentence. alias = T. 56 | # Feel free to increase this if you are ambitious. 57 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 58 | num_units = 100 # alias = C 59 | embedding_size = 100 60 | num_epochs = 40 61 | attention_size = 100 62 | clip = 5 63 | dropout_rate = 0.1 64 | eps = 1e-9 65 | margin = 0.01 66 | channel = 64 # 通道数 67 | kernel = [3, 5] # 核大小 68 | pool_size = 2 # 池化层大小 69 | dense_size = 100 # 全连接层大小 70 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 71 | 72 | 73 | class TransformerParams: 74 | # training 75 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 76 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 77 | # model 78 | max_len = 50 # Maximum number of words in a sentence. alias = T. 79 | # Feel free to increase this if you are ambitious. 80 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 81 | num_units = 512 # alias = C 82 | embedding_size = 512 83 | num_epochs = 40 84 | num_blocks = 6 # number of encoder/decoder blocks 85 | num_heads = 8 86 | attention_size = 100 87 | clip = 5 88 | dropout_rate = 0.1 89 | eps = 1e-9 90 | margin = 0.3 91 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 92 | 93 | 94 | class MatchPyramidParams: 95 | # training 96 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 97 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 98 | # model 99 | max_len = 50 # Maximum number of words in a sentence. alias = T. 100 | # Feel free to increase this if you are ambitious. 101 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 102 | num_units = 100 # alias = C 103 | embedding_size = 100 104 | num_epochs = 40 105 | attention_size = 100 106 | clip = 5 107 | dropout_rate = 0.1 108 | eps = 1e-9 109 | margin = 0.3 110 | channel = 64 # 通道数 111 | kernel = [3, 5] # 核大小 112 | pool_size = 2 # 池化层大小 113 | dense_size = 100 # 全连接层大小 114 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
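# --- Illustrative usage sketch (not part of the original repository) ---
# The classes above are plain attribute holders; each model module imports the
# one matching its architecture (cnn_siamese.py imports CnnParams as `hp`,
# match_pyramid.py imports MatchPyramidParams, main.py uses HyperParams).
# The lookup helper below is a hypothetical convenience for selecting a class
# by the network name used in run.py ("rnn", "cnn", "match_pyramid"); the
# repository itself hard-codes the import inside each model file instead.
PARAMS_BY_NETWORK = {
    "rnn": RnnParams,
    "cnn": CnnParams,
    "transformer": TransformerParams,
    "match_pyramid": MatchPyramidParams,
}


def get_params(network):
    """Return the hyper-parameter class for `network`, falling back to HyperParams."""
    return PARAMS_BY_NETWORK.get(network, HyperParams)


# Example (values taken from RnnParams above):
# hp = get_params("rnn")
# assert hp.max_len == 50 and hp.margin == 0.1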
115 | -------------------------------------------------------------------------------- /config/synonym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-24 上午11:11 4 | # @Author : 林利芳 5 | # @File : synonym.py 6 | import re 7 | 8 | SYNONYM_DICT = { 9 | "更改": ["更改", '更换'], 10 | "改为": ["更改成", "改为", "更改为"], 11 | "可以": ["可以", "能"], 12 | "下降": ['降低', "下降"], 13 | "为什么": ["为何", "为啥"], 14 | "能不能": ["能不能", "行不行", "可不可以"], 15 | "不能用": ["不能用", "用不了"], 16 | } 17 | 18 | SYNONYM_WRONG = { 19 | "零时额度": "临时额度", 20 | "花贝": '花呗', 21 | "花唄": '花呗', 22 | "花被": '花呗', 23 | "蚂蚁花贝": '花呗', 24 | "蚂蚁花唄": '花呗', 25 | "蚂蚁花被": '花呗', 26 | "蚂蚁花呗": '花呗', 27 | "蚂蚁借呗": '借呗', 28 | "届不了": '借不了', 29 | "为何": "为什么", 30 | "为啥": "为什么", 31 | "下个月": '下月', 32 | "上个月": '上月', 33 | "行不行": '能不能', 34 | "可不可以": '能不能', 35 | "用不了": "不能用", 36 | "不让": '不能', 37 | "不可以": '不能', 38 | "不行": '不能', 39 | "老有": '总有', 40 | "日息": "利息", 41 | "更改成": "改为", 42 | "更改为": "改为", 43 | "更换": "更改", 44 | "能": "可以", 45 | "降低": "下降", 46 | "受到": "收到", 47 | ',': ',', 48 | '?': '?', 49 | '!': '!', 50 | ';': ';', 51 | '***': '0', 52 | } 53 | 54 | PATTERN = [ 55 | [re.compile('\*+'), '*'], 56 | [re.compile('\?'), '?'], 57 | [re.compile('\.$'), '。'], 58 | [re.compile('!'), '!'], 59 | [re.compile(','), ','], 60 | [re.compile(';'), ';'], 61 | [re.compile('\s+'), ''], 62 | [re.compile('\ufeff'), ''], 63 | ] 64 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /core/load_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午4:08 4 | # @Author : 林利芳 5 | # @File : load_data.py 6 | from config.config import DATA_PKL, VOCAB_PKL 7 | from core.preprocessor import preprocessor, pad_sequence, trim 8 | from core.utils import load_data, read_csv 9 | import numpy as np 10 | from sklearn.metrics import recall_score, precision_score, f1_score 11 | 12 | 13 | def gen_batch_data(l_x, r_x, l_len, r_len, y, batch_size): 14 | """ 15 | 生成batch数据 16 | :param l_x: 17 | :param r_x: 18 | :param l_len: 19 | :param r_len: 20 | :param y: 21 | :param batch_size: 22 | :return: 23 | """ 24 | data_size = len(y) 25 | num_batch = data_size // batch_size + 1 26 | 27 | for ii in range(num_batch): 28 | start, end = ii * batch_size, (ii + 1) * batch_size 29 | start_batch = 0 30 | if end > data_size: 31 | start_batch = end - data_size 32 | start, end = data_size - batch_size, data_size 33 | l_x_batch = l_x[start:end] 34 | r_x_batch = r_x[start:end] 35 | l_len_batch = l_len[start:end] 36 | r_len_batch = r_len[start:end] 37 | y_batch = y[start:end] 38 | yield l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch 39 | 40 | 41 | def load_train_data(): 42 | data = load_data(DATA_PKL) 43 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = \ 44 | data['train_l_x'], data['val_l_x'], data['train_l_len'], data['val_l_len'], data['train_r_x'], data[ 45 | 'val_r_x'], data['train_r_len'], data['val_r_len'], data['train_y'], data['val_y'] 46 | train_l_x = np.array(train_l_x) 47 | val_l_x = np.array(val_l_x) 48 | 
train_l_len = np.array(train_l_len) 49 | val_l_len = np.array(val_l_len) 50 | train_r_x = np.array(train_r_x) 51 | val_r_x = np.array(val_r_x) 52 | train_r_len = np.array(train_r_len) 53 | val_r_len = np.array(val_r_len) 54 | train_y = np.array(train_y) 55 | val_y = np.array(val_y) 56 | return train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y 57 | 58 | 59 | def get_feed_dict(model, l_x, r_x, l_len, r_len, y, batch_size): 60 | """ 61 | 生成feed_dict 62 | :param model: 63 | :param l_x: 64 | :param r_x: 65 | :param l_len: 66 | :param r_len: 67 | :param y: 68 | :param batch_size: 69 | :return: 70 | """ 71 | for l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch in gen_batch_data( 72 | l_x, r_x, l_len, r_len, y, batch_size): 73 | feed_dict = { 74 | model.left_x: l_x_batch, 75 | model.right_x: r_x_batch, 76 | model.y: y_batch, 77 | model.left_seq_lens: l_len_batch, 78 | model.right_seq_lens: r_len_batch 79 | } 80 | yield feed_dict, start_batch 81 | 82 | 83 | def print_info(epoch, step, train_loss, dev_loss, y, pre_y): 84 | loss = round(float(np.mean(train_loss)), 3) 85 | val_loss = round(float(np.mean(dev_loss)), 3) 86 | f1 = round(f1_score(y, pre_y), 4) 87 | recall = round(recall_score(y, pre_y), 4) 88 | precision = round(precision_score(y, pre_y), 4) 89 | print('**************************************************') 90 | print("epoch\t{}\tstep\t{}\ttrain_loss\t{}\tdev_loss\t{}\t".format(epoch, step, loss, val_loss)) 91 | print("precision\t{}\trecall\t{}\tf1\t{}\n\n".format(precision, recall, f1)) 92 | 93 | 94 | def load_test_data(filename): 95 | vocab = load_data(VOCAB_PKL) 96 | max_len = vocab.max_len 97 | data = read_csv(filename) 98 | data = [kk[:3] for kk in data] 99 | idx, left_x, right_x = zip(*data) 100 | 101 | left_x = [trim(kk) for kk in left_x] 102 | right_x = [trim(kk) for kk in right_x] 103 | 104 | left_x, left_len = pad_sequence(left_x, vocab, max_len) 105 | right_x, right_len = pad_sequence(right_x, vocab, max_len) 106 | 107 | return idx, left_x, left_len, right_x, right_len, vocab 108 | 109 | 110 | def save_test_result(filename, idx, predicts): 111 | import codecs 112 | with codecs.open(filename, 'w', encoding='utf-8') as fp: 113 | for _id, pre in zip(idx, predicts): 114 | fp.writelines('{}\t{}\n'.format(_id, pre)) 115 | -------------------------------------------------------------------------------- /core/preprocessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午3:49 4 | # @Author : 林利芳 5 | # @File : preprocessor.py 6 | import pprint 7 | 8 | from sklearn.model_selection import train_test_split 9 | import numpy as np 10 | from config.config import DATA_PKL, VOCAB_PKL, ATEC_NLP_DATA, ADD_ATEC_NLP_DATA, CORPUS_DATA, EXPEND_ATEC_NLP_DATA, \ 11 | WordChar 12 | from core.utils import save_data, read_csv, load_data 13 | from core.word_embedding import Vocab 14 | import re 15 | import jieba 16 | import collections 17 | from config.synonym import SYNONYM_DICT, SYNONYM_WRONG, PATTERN 18 | import itertools 19 | from config.hyperparams import HyperParams as hp 20 | import sys 21 | 22 | try: 23 | reload(sys) 24 | sys.setdefaultencoding('utf8') 25 | except: 26 | pass 27 | jieba.load_userdict(CORPUS_DATA) 28 | PAD = "" 29 | UNK = "" 30 | PAD2ID = 0 31 | UNK2ID = 0 32 | 33 | 34 | def extended_corpus(data, is_training=True, filename="train"): 35 | """ 36 | 扩展语料 37 | :param data: 38 | :param is_training: 39 | :param 
filename: 40 | :return: 41 | """ 42 | print("同义词替换...\n") 43 | similar_data = [] 44 | for sub_data in data: 45 | idx, left_s, right_s, y = sub_data 46 | idx = idx.replace('\ufeff', '') 47 | left_s = trim(left_s) 48 | right_s = trim(right_s) 49 | if is_training: 50 | data = combine_data(idx, left_s, right_s, y) 51 | else: 52 | data = [[idx, ' '.join(left_s), ' '.join(right_s), y]] 53 | similar_data.extend(data) 54 | 55 | save_expend_data(similar_data, EXPEND_ATEC_NLP_DATA.format(filename)) 56 | 57 | 58 | # return similar_data 59 | 60 | 61 | def save_expend_data(data, filename): 62 | import codecs 63 | with codecs.open(filename, 'w', encoding='utf-8') as fp: 64 | for line in data: 65 | idx, left_x, right_x, y = line 66 | temp = [idx, left_x, right_x, str(y)] 67 | fp.writelines('\t'.join(temp) + '\n') 68 | 69 | 70 | def synonym_replace(sentence): 71 | """ 72 | 同义词替换 73 | :param sentence: 74 | :return: 75 | """ 76 | sentences = [] 77 | for word in sentence: 78 | words = SYNONYM_DICT.get(word, [word]) 79 | sentences.append(words) 80 | sentences = list(set(itertools.product(*sentences))) 81 | result = [] 82 | for ii, sub_data in enumerate(sentences): 83 | sub_data = list(sub_data) 84 | if sub_data == sentence: 85 | continue 86 | result.append(sub_data) 87 | return result 88 | 89 | 90 | def combine_data(idx, left_s, right_s, y): 91 | similar_data = [[idx, ' '.join(left_s), ' '.join(right_s), y]] 92 | left_sentence = synonym_replace(left_s) 93 | right_sentence = synonym_replace(right_s) 94 | left_len, right_len = len(left_sentence), len(right_sentence) 95 | max_num = max(left_len, right_len) 96 | if y == '0': 97 | max_num = 0 98 | for sub_s in left_sentence[:max_num]: 99 | temp = [idx, ' '.join(sub_s), ' '.join(right_s), y] 100 | similar_data.append(temp) 101 | for sub_s in right_sentence[:max_num]: 102 | temp = [idx, ' '.join(left_s), ' '.join(sub_s), y] 103 | similar_data.append(temp) 104 | return similar_data 105 | 106 | 107 | # if y == '1': 108 | # for sub_left_s, sub_right_s in zip(left_sentence[:3], right_sentence[:3]): 109 | # temp = [idx, sub_left_s, sub_right_s, y] 110 | # similar_data.append(temp) 111 | # 112 | # if left_len > right_len: 113 | # for sub_left_s, sub_right_s in zip(left_sentence[1:], right_sentence): 114 | # temp = [idx, sub_left_s, sub_right_s, y] 115 | # similar_data.append(temp) 116 | # elif right_len > left_len: 117 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence[1:]): 118 | # temp = [idx, sub_left_s, sub_right_s, y] 119 | # similar_data.append(temp) 120 | # else: 121 | # data = left_sentence.pop() 122 | # left_sentence.insert(0, data) 123 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence): 124 | # temp = [idx, sub_left_s, sub_right_s, y] 125 | # similar_data.append(temp) 126 | 127 | 128 | def trim(text): 129 | for rule, region in PATTERN: 130 | text = rule.sub(region, text) 131 | sentence = list(jieba.cut(text)) 132 | for ii, word in enumerate(sentence): 133 | if word in SYNONYM_WRONG: 134 | word = SYNONYM_WRONG.get(word, word) 135 | sentence[ii] = word 136 | return sentence 137 | 138 | 139 | def build_vocab(text, max_len): 140 | """ 141 | 构建词库 142 | :param text: text = [sentence] 143 | :param max_len: int 144 | :return: 145 | """ 146 | vocab = [] 147 | for sentence in text: 148 | vocab.extend(sentence) 149 | count = collections.Counter(vocab).most_common() 150 | vocab = {v: k + 2 for k, (v, _) in enumerate(count)} 151 | vocab[PAD] = PAD2ID 152 | vocab[UNK] = UNK2ID 153 | 154 | v = Vocab() 155 | v.word2idx = vocab 156 | 
v.max_len = max_len 157 | return v 158 | 159 | 160 | def process_label(y): 161 | result = [] 162 | num = 0 163 | for label in y: 164 | if label == '1': 165 | num += 1 166 | try: 167 | result.append(int(label)) 168 | except: 169 | result.append(0) 170 | print("正样本数\t{}\t负样本数\t{}".format(num, len(y) - num)) 171 | return result 172 | 173 | 174 | def preprocessor(synonym=False): 175 | """数据预处理""" 176 | if synonym: 177 | data = read_csv(ATEC_NLP_DATA) 178 | data.extend(read_csv(ADD_ATEC_NLP_DATA)) 179 | init_num = len(data) 180 | train_data, dev_data = train_test_split(data, test_size=0.1, random_state=50) 181 | extended_corpus(train_data) 182 | extended_corpus(dev_data, False, 'dev') 183 | # expand_num = len(train_data) + len(dev_data) 184 | # print("初始语料\t{}\t扩展语料\t{}\t新增语料\t{}".format(init_num, expand_num, expand_num - init_num)) 185 | # else: 186 | train_data = read_csv(EXPEND_ATEC_NLP_DATA.format('train')) 187 | dev_data = read_csv(EXPEND_ATEC_NLP_DATA.format('dev')) 188 | train_idx, train_left_x, train_right_x, train_y = zip(*train_data) 189 | dev_idx, dev_left_x, dev_right_x, dev_y = zip(*dev_data) 190 | 191 | train_left_x = split_data(train_left_x) 192 | 193 | train_right_x = split_data(train_right_x) 194 | dev_left_x = split_data(dev_left_x) 195 | dev_right_x = split_data(dev_right_x) 196 | train_y = process_label(train_y) 197 | dev_y = process_label(dev_y) 198 | max_len = max(len(kk) for kk in train_left_x + train_right_x + dev_right_x + dev_left_x) 199 | vocab = build_vocab(train_left_x + train_right_x + dev_right_x + dev_left_x, max_len) 200 | 201 | print("最大长度\t{}\t词汇量\t{}".format(max_len, len(vocab.word2idx))) 202 | 203 | train_left_x, train_left_len = pad_sequence(train_left_x, vocab, max_len) 204 | train_right_x, train_right_len = pad_sequence(train_right_x, vocab, max_len) 205 | dev_left_x, dev_left_len = pad_sequence(dev_left_x, vocab, max_len) 206 | dev_right_x, dev_right_len = pad_sequence(dev_right_x, vocab, max_len) 207 | 208 | data = { 209 | "train_l_x": train_left_x, 210 | "train_r_x": train_right_x, 211 | "train_l_len": train_left_len, 212 | "train_r_len": train_right_len, 213 | "train_y": train_y, 214 | "val_l_x": dev_left_x, 215 | "val_r_x": dev_right_x, 216 | "val_l_len": dev_left_len, 217 | "val_r_len": dev_right_len, 218 | "val_y": dev_y, 219 | } 220 | save_data(DATA_PKL, data) 221 | save_data(VOCAB_PKL, vocab) 222 | return data, vocab 223 | 224 | 225 | def split_data(data): 226 | result = [] 227 | if WordChar == 'char': 228 | for sentence in data: 229 | sentence = sentence.replace(' ', '') 230 | new_sentence = [char for char in sentence] 231 | result.append(new_sentence) 232 | else: 233 | for sentence in data: 234 | sentence = sentence.split(' ') 235 | result.append(sentence) 236 | return result 237 | 238 | 239 | def pad_sequence(data, vocab, max_len): 240 | """ 241 | 补全数据 242 | :param data: 243 | :param vocab: 244 | :param max_len: 245 | :return: 246 | """ 247 | seqs_data = [] 248 | seqs_len = [] 249 | for sentence in data: 250 | seq_len = len(sentence) 251 | seqs_len.append(len(sentence)) 252 | sentence = [vocab.word2idx.get(kk, UNK2ID) for kk in sentence] + [PAD2ID] * (max_len - seq_len) 253 | seqs_data.append(sentence[:max_len]) 254 | return seqs_data, seqs_len 255 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # @author: Linlifang 5 | # @file: utils.py 6 | # @time: 
18-6-27下午6:13 7 | import csv 8 | 9 | try: 10 | import cPickle as pickle 11 | except: 12 | import pickle 13 | try: 14 | import sys 15 | 16 | reload(sys) 17 | sys.setdefaultencoding('utf8') 18 | except: 19 | pass 20 | 21 | 22 | def read_csv(filename, delimiter='\t'): 23 | """ 24 | 读取csv 25 | :param filename: 26 | :param delimiter: 27 | :return: 28 | """ 29 | import codecs 30 | with codecs.open(filename, 'r', encoding='utf-8') as fp: 31 | data = [[ii for ii in each] for each in csv.reader(fp, delimiter=delimiter)] 32 | return data 33 | 34 | 35 | def load_text(filename): 36 | """ 37 | 加载数据 38 | :param filename: 39 | :return: 40 | """ 41 | data = [] 42 | with open(filename, 'r') as fp: 43 | for idx, line in enumerate(fp): 44 | line = line.strip('\n') 45 | tokens = line.split() 46 | data.append(tokens) 47 | return data 48 | 49 | 50 | def load_data(filename): 51 | """ 52 | 加载词汇信息 53 | :return: 54 | """ 55 | try: 56 | with open(filename, 'rb') as fp: 57 | data = pickle.load(fp) 58 | except: 59 | with open('data/vocab2.pkl', 'rb') as fp: 60 | data = pickle.load(fp) 61 | return data 62 | 63 | 64 | def save_data(filename, data): 65 | with open(filename, 'wb') as fp: 66 | pickle.dump(data, fp) 67 | -------------------------------------------------------------------------------- /core/word_embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-17 下午5:08 4 | # @Author : 林利芳 5 | # @File : word_embedding.py 6 | import numpy as np 7 | 8 | from config.config import WORD2VEC_DATA 9 | 10 | 11 | class Vocab(object): 12 | def __init__(self): 13 | self.word2vec = [] 14 | self.word2idx = {'': 0, '': 1} 15 | self.max_len = 0 16 | 17 | def add_word(self, word, vector): 18 | self.word2idx[word] = len(self.word2idx) 19 | self.word2vec.append(vector) 20 | 21 | def load_word_vectors(self): 22 | with open(WORD2VEC_DATA, 'r') as f: 23 | vocab_size, embedding_dim = [int(_) for _ in f.readline().strip().split(' ')] 24 | self.word2vec = [[0.0] * embedding_dim] 25 | self.word2vec.append(np.random.uniform(-0.25, 0.25, embedding_dim).round(6).tolist()) 26 | lines = f.readlines() 27 | for line in lines: 28 | word, vector = line.strip().split(' ', 1) 29 | self.add_word(word, [float(_) for _ in vector.split(' ')]) 30 | self.word2vec = np.array(self.word2vec).astype(np.float32) 31 | -------------------------------------------------------------------------------- /data/corpus.txt: -------------------------------------------------------------------------------- 1 | 怎么 2 | 怎样 3 | 如何 4 | 更改 5 | 更换 6 | 更新 7 | 修改 8 | 未 9 | 没有 10 | 可以 11 | 为什么 12 | 为何 13 | 为啥 14 | 零时额度 15 | 临时额度 16 | 这么久 17 | 降低 18 | 下降 19 | 日息 20 | 不能 21 | 不让 22 | 不可以 23 | 能不能 24 | 行不行 25 | 可不可以 26 | 用不了 27 | 不能用 28 | 被冻结 29 | 被封了 30 | 下月 31 | 下个月 32 | 蚂蚁借呗 33 | 借呗 34 | 花呗 35 | 花贝 36 | 花唄 37 | 花被 38 | 蚂蚁借呗 39 | 蚂蚁花呗 40 | 蚂蚁花贝 41 | 蚂蚁花唄 42 | 蚂蚁花被 43 | *** 44 | ofo 45 | 借呗 46 | 余额宝 47 | 代扣完 48 | 更改成 49 | 用了 50 | 届不了 51 | 借不了 52 | 上个月 53 | 上月 54 | 老有 55 | 总有 -------------------------------------------------------------------------------- /logdir/graph/match_pyramid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/match_pyramid -------------------------------------------------------------------------------- /logdir/graph/siamese.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/siamese.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午5:46 4 | # @Author : 林利芳 5 | # @File : main.py 6 | import os 7 | import sys 8 | 9 | from core.load_data import get_feed_dict, load_test_data, save_test_result 10 | from config.config import checkpoint_dir, TEST_DATA, TEST_RESULT 11 | from model.match_pyramid import MatchPyramidNetwork 12 | from model.rnn_siamese import RnnSiameseNetwork 13 | from config.hyperparams import HyperParams as hp 14 | import tensorflow as tf 15 | import numpy as np 16 | 17 | 18 | def test(filename=TEST_DATA, outfile=TEST_RESULT, network='rnn'): 19 | checkpoint_file = checkpoint_dir.format(network) 20 | idx, left_x, left_len, right_x, right_len, vocab = load_test_data(filename) 21 | y = np.ones_like(idx) 22 | vocab_size = len(vocab.word2idx) 23 | if network == 'rnn': 24 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False) 25 | elif network == 'match_pyramid': 26 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False) 27 | else: 28 | return 29 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=0) 30 | with sv.managed_session() as sess: 31 | predicts = [] 32 | for feed_dict, start_batch in get_feed_dict(model, left_x, right_x, left_len, right_len, y, hp.batch_size): 33 | pre_y, distince = sess.run([model.pre_y, model.distance], feed_dict=feed_dict) 34 | predicts.extend(pre_y[start_batch:]) 35 | save_test_result(outfile, idx, predicts) 36 | 37 | 38 | if __name__ == "__main__": 39 | test(sys.argv[1], sys.argv[2], 'rnn') 40 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:17 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /model/cnn_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:07 4 | # @Author : 林利芳 5 | # @File : rnn_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import CnnParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class CnnSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | key, 
value = self.siamese() 27 | self.distance, self.pre_y = self.similar(key, value) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :return: 36 | """ 37 | x = tf.concat([self.left_x, self.right_x], axis=0) 38 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 39 | # layers embedding multi_head_attention rnn 40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed") 41 | 42 | # output = self.transformer(embed, x) 43 | inputs = tf.expand_dims(embed, -1) 44 | output = self.cnn_layer(inputs, 1) 45 | output = tf.expand_dims(output, -1) 46 | output = self.cnn_layer(output, 2) 47 | output = self.attention(embed, output) 48 | key, value = tf.split(output, 2, axis=0) 49 | return key, value 50 | 51 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 52 | """ 53 | 创建双向RNN层 54 | :param inputs: 55 | :param seq_lens: 56 | :param seg: LSTM GRU F-LSTM, IndRNN 57 | :return: 58 | """ 59 | if seg == 'LSTM': 60 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 61 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 62 | 63 | elif seg == 'GRU': 64 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 65 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 66 | else: 67 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 68 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 69 | # 双向rnn 70 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 71 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 72 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 73 | output = tf.add(fw_output, bw_output) 74 | return output 75 | 76 | def cnn_layer(self, inputs, layer=1): 77 | """ 78 | 卷积层 卷积核2,3,4,5 激活层relu 池化层 size=2 79 | :param inputs: batch T * T 80 | :param layer: batch T * T 81 | :return: 82 | """ 83 | outputs = [] 84 | d_dim, channel = inputs.get_shape().as_list()[-2:] 85 | for ii, width in enumerate(hp.kernel): 86 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)): 87 | weight = tf.Variable(tf.truncated_normal([width, d_dim, channel, hp.channel], stddev=0.1, name='w')) 88 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0)) 89 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, d_dim, 1], padding='SAME') # batch T T channel 90 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC")) 91 | 92 | output = tf.reshape(output, shape=[-1, self.max_len, hp.channel]) 93 | outputs.append(output) 94 | outputs = tf.concat(outputs, axis=-1) 95 | return outputs 96 | 97 | def transformer(self, embed, value): 98 | with tf.variable_scope("Transformer_Encoder"): 99 | # Positional Encoding 100 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 101 | # Dropout 102 | output = self.multi_head_block(embed) 103 | return output 104 | 105 | def multi_head_block(self, query, causality=False): 106 | """ 107 | 多头注意力机制 108 | :param query: 109 | :param causality: 110 | :return: 111 | """ 112 | for i in range(hp.num_blocks): 113 | with tf.variable_scope("num_blocks_{}".format(i)): 114 | # multi head Attention ( self-attention) 115 | query = multihead_attention( 116 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 117 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 118 | 
scope="self_attention") 119 | # Feed Forward 120 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 121 | return query 122 | 123 | def loss_layer(self): 124 | """ 125 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 126 | :return: 127 | """ 128 | y = tf.cast(self.y, tf.float32) 129 | with tf.name_scope("output"): 130 | loss_p = tf.square(1 - self.distance) / 4 131 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 132 | loss_m = tf.square(mask * self.distance) 133 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 134 | return loss 135 | 136 | def attention(self, embed, query): 137 | """ 138 | 注意力机制 139 | :param embed: 140 | :param query: 141 | :return: 142 | """ 143 | output = tf.reduce_mean(query, axis=1) 144 | return output 145 | with tf.name_scope("attention"): 146 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 147 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 148 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 149 | value = tf.concat([embed, query], axis=-1) 150 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 151 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 152 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 153 | attention = tf.nn.softmax(attention, axis=-1) 154 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 155 | 156 | output = tf.reduce_sum(attention * query, axis=1) 157 | output = layer_normalize(output) 158 | return output 159 | 160 | @staticmethod 161 | def similar(key, value): 162 | """ 163 | cosine(key,value) = key * value/(|key|*|value|) 164 | :param key: 165 | :param value: 166 | :return: 167 | """ 168 | dot_value = tf.reduce_sum(key * value, axis=-1) 169 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 170 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps) 171 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar") 172 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 173 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 174 | return distance, pre_y 175 | 176 | def predict(self): 177 | correct_predictions = tf.equal(self.pre_y, self.y) 178 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 179 | return accuracy 180 | 181 | def optimize(self): 182 | """ 183 | 优化器 184 | :return: 185 | """ 186 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 187 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 188 | return train_op 189 | -------------------------------------------------------------------------------- /model/match_pyramid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 上午11:17 4 | # @Author : 林利芳 5 | # @File : match_pyramid.py 6 | import tensorflow as tf 7 | from config.hyperparams import MatchPyramidParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class MatchPyramidNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | 
self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | outputs = self.match_pyramid() 27 | outputs, self.pre_y = self.multi_dense_layer(outputs) 28 | self.acc = self.predict() 29 | self.loss = self.loss_layer(outputs) 30 | self.train_op = self.optimize() 31 | 32 | def match_pyramid(self): 33 | """ 34 | pyramid 35 | :return: 36 | """ 37 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, 38 | scope="left_embed") 39 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, 40 | scope="right_embed") 41 | outputs = self.match_text(left_embed, right_embed) 42 | outputs = self.cnn_layer(outputs, 1) 43 | outputs = self.cnn_layer(outputs, 2) 44 | return outputs 45 | 46 | @staticmethod 47 | def match_text(left_embed, right_embed): 48 | """ 49 | 文本匹配 cosine dot binary 50 | :param left_embed: 词嵌入 batch * T * D 51 | :param right_embed: 词嵌入 batch * T * D 52 | :return: 53 | """ 54 | with tf.variable_scope("match-text"): 55 | dot_output = tf.matmul(left_embed, tf.transpose(right_embed, [0, 2, 1])) # batch * T * T 56 | left_norm = tf.sqrt(tf.matmul(left_embed, tf.transpose(left_embed, [0, 2, 1]))+hp.eps) 57 | right_norm = tf.sqrt(tf.matmul(right_embed, tf.transpose(right_embed, [0, 2, 1]))+hp.eps) 58 | cosine_outputs = tf.div(dot_output, left_norm * right_norm) 59 | binary_outputs = tf.cast(tf.equal(cosine_outputs, 1), tf.float32) 60 | dot_output = tf.expand_dims(dot_output, axis=-1) 61 | cosine_outputs = tf.expand_dims(cosine_outputs, axis=-1) 62 | binary_outputs = tf.expand_dims(binary_outputs, axis=-1) 63 | 64 | outputs = tf.concat([dot_output, cosine_outputs, binary_outputs], axis=-1) 65 | print(outputs.get_shape().as_list()) 66 | return dot_output 67 | 68 | @staticmethod 69 | def cnn_layer(inputs, layer=1): 70 | """ 71 | 卷积层 卷积核2,3,4,5 激活层relu 池化层 size=2 72 | :param inputs: batch T * T 73 | :param layer: batch T * T 74 | :return: 75 | """ 76 | outputs = [] 77 | channel = inputs.get_shape().as_list()[-1] 78 | for ii, width in enumerate(hp.kernel): 79 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)): 80 | weight = tf.Variable(tf.truncated_normal([width, width, channel, hp.channel], stddev=0.1, name='w')) 81 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0)) 82 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, 1, 1], padding='SAME') # batch T T channel 83 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC")) 84 | pool = tf.nn.max_pool(output, ksize=[1, hp.pool_size, hp.pool_size, 1], strides=[1, 1, 1, 1], 85 | padding='VALID') 86 | outputs.append(pool) 87 | outputs = tf.concat(outputs, axis=-1) 88 | return outputs 89 | 90 | @staticmethod 91 | def multi_dense_layer(inputs): 92 | """ 93 | 多层感知机 T*T*channel -> dense_size ->2 94 | :param inputs: batch T T channel 95 | :return: 96 | """ 97 | _, width, height, channel = inputs.get_shape().as_list() 98 | size = width * height * channel 99 | inputs = tf.reshape(inputs, shape=[-1, size]) 100 | with tf.variable_scope("dense_layer"): 101 | w = tf.get_variable(name='w', 
dtype=tf.float32, shape=[size, hp.dense_size]) 102 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[hp.dense_size]) 103 | outputs = layer_normalize(tf.matmul(inputs, w) + b, ) 104 | 105 | with tf.variable_scope("logit_layer"): 106 | w = tf.get_variable(name='w', dtype=tf.float32, shape=[hp.dense_size, 2]) 107 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[2]) 108 | outputs = tf.nn.softmax(tf.matmul(outputs, w) + b, axis=-1) 109 | pre_y = tf.cast(tf.argmax(outputs, axis=-1), dtype=tf.int32) 110 | return outputs, pre_y 111 | 112 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 113 | """ 114 | 创建双向RNN层 115 | :param inputs: 116 | :param seq_lens: 117 | :param seg: LSTM GRU F-LSTM, IndRNN 118 | :return: 119 | """ 120 | if seg == 'LSTM': 121 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 122 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 123 | 124 | elif seg == 'GRU': 125 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 126 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 127 | else: 128 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 129 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 130 | # 双向rnn 131 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 132 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 133 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 134 | output = tf.add(fw_output, bw_output) 135 | return output 136 | 137 | def transformer(self, embed, value): 138 | with tf.variable_scope("Transformer_Encoder"): 139 | # Positional Encoding 140 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 141 | # Dropout 142 | output = self.multi_head_block(embed) 143 | return output 144 | 145 | def multi_head_block(self, query, causality=False): 146 | """ 147 | 多头注意力机制 148 | :param query: 149 | :param causality: 150 | :return: 151 | """ 152 | for i in range(hp.num_blocks): 153 | with tf.variable_scope("num_blocks_{}".format(i)): 154 | # multi head Attention ( self-attention) 155 | query = multihead_attention( 156 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 157 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 158 | scope="self_attention") 159 | # Feed Forward 160 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 161 | return query 162 | 163 | def loss_layer(self, inputs): 164 | """ 165 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 166 | :return: 167 | """ 168 | y = tf.cast(self.y, tf.float32) 169 | with tf.name_scope("loss_layer"): 170 | loss_p = y * tf.log(tf.clip_by_value(inputs[:, -1], hp.eps, 1.0)) 171 | loss_m = (1 - y) * tf.log(tf.clip_by_value(inputs[:, 0], hp.eps, 1.0)) 172 | loss = -tf.reduce_sum(loss_p + loss_m) 173 | return loss 174 | 175 | def attention(self, embed, query): 176 | """ 177 | 注意力机制 178 | :param embed: 179 | :param query: 180 | :return: 181 | """ 182 | with tf.name_scope("attention"): 183 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 184 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 185 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 186 | value = tf.concat([embed, query], axis=-1) 187 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 188 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 189 | attention = tf.reshape(attention, shape=[-1, 
self.max_len]) 190 | attention = tf.nn.softmax(attention, axis=-1) 191 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 192 | 193 | output = tf.reduce_sum(attention * query, axis=1) 194 | output = layer_normalize(output) 195 | return output 196 | 197 | def predict(self): 198 | correct_predictions = tf.equal(self.pre_y, self.y) 199 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 200 | return accuracy 201 | 202 | def optimize(self): 203 | """ 204 | 优化器 205 | :return: 206 | """ 207 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 208 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 209 | return train_op 210 | -------------------------------------------------------------------------------- /model/module/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /model/module/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @时间 : 18-12-11 下午5:44 4 | # @作者 : Lin lifang 5 | # @文件 : feature.py 6 | from utils.utils import read_template 7 | import numpy as np 8 | 9 | 10 | class Feature(object): 11 | def __init__(self, fd=5): 12 | self.fd = fd 13 | self.fss = None 14 | self.bf_size = 0 15 | self.uf_size = 0 16 | self.f_size = 0 17 | self.num_k = 0 18 | self.node_obs = dict() 19 | self.edge_obs = dict() 20 | self.oby_dict = dict() 21 | self.node_fs = [] 22 | self.edge_fs = [] 23 | self.tp_list = [ 24 | ['U00', ['-2', '0']], 25 | ['U01', ['-1', '0']], 26 | ['U02', ['0', '0']], 27 | ['U03', ['1', '0']], 28 | ['U04', ['2', '0']], 29 | ['U05', ['-2', '0'], ['-1', '0'], ['0', '0']], 30 | ['U06', ['-1', '0'], ['0', '0'], ['1', '0']], 31 | ['U07', ['0', '0'], ['1', '0'], ['2', '0']], 32 | ['U08', ['-1', '0'], ['0', '0']], 33 | ['U09', ['0', '0'], ['1', '0']], 34 | ['B'], ] 35 | 36 | def process_features(self, texts): 37 | """ 38 | 特征提取 39 | :param texts: 序列文本 [[['你',],['好',]],[['你',],['好',]]] 40 | :return: 41 | """ 42 | print("特征提取...") 43 | uf_obs = dict() 44 | bf_obs = dict() 45 | 46 | for text in texts: 47 | seq_uf, seq_bf = self.feature_vector(text) 48 | for loc_id, (loc_uf, loc_bf) in enumerate(zip(seq_uf, seq_bf)): 49 | for fs in loc_bf: 50 | fs_id = bf_obs.get(fs) 51 | bf_obs[fs] = fs_id + 1 if fs_id is not None else 1 52 | for fs in loc_uf: 53 | fs_id = uf_obs.get(fs) 54 | uf_obs[fs] = fs_id + 1 if fs_id is not None else 1 55 | 56 | node_fs = [key for key, v in sorted(uf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd] 57 | edge_fs = [key for key, v in sorted(bf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd] 58 | self.node_obs = {key: kk * self.num_k for kk, key in enumerate(node_fs)} 59 | self.edge_obs = {key: kk * self.num_k * self.num_k for kk, key in enumerate(edge_fs)} 60 | 61 | self.uf_size = len(node_fs) * self.num_k 62 | self.bf_size = len(edge_fs) * self.num_k * self.num_k 63 | self.f_size = self.uf_size + self.bf_size 64 | print("B 特征:\t{}\nU 特征:\t{}\n总特征:\t{}\n".format(self.bf_size, self.uf_size, self.f_size)) 65 | 66 | def feature_vector(self, text, init=True): 67 | """ 68 | 特征序列化 69 | :param text: 70 | :param init: 71 | :return: 72 | """ 73 | seq_bf = [] 74 | seq_uf = [] 
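        # Walk every position of the sequence and expand the CRF feature templates
        # there: "U" templates yield unigram (node) observations collected in seq_uf,
        # while the "B" template yields bigram (edge) observations collected in seq_bf.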
75 | for loc_id in range(len(text)): 76 | loc_uf, loc_bf = self.expand_observation(text, loc_id, init) 77 | seq_bf.append(loc_bf) 78 | seq_uf.append(loc_uf) 79 | return seq_uf, seq_bf 80 | 81 | def expand_observation(self, sentence, loc_id, init=True): 82 | """ 83 | expend the observation at loc_id for sequence 84 | :param sentence: 字符序列 85 | :param loc_id: 字符在sentence的位置序号 86 | :param init: 是否初始化 87 | :return: 88 | """ 89 | loc_uf = [] 90 | loc_bf = [] 91 | for tp in self.tp_list: 92 | fs = tp[0] 93 | for li in tp[1::]: 94 | row = loc_id + int(li[0]) 95 | col = int(li[1]) 96 | if len(sentence) > row >= 0: 97 | if len(sentence[row][col]) > col >= 0: 98 | fs += ":" + sentence[row][col] 99 | else: 100 | fs += ':B' + li[0] 101 | if fs[0] == "U": 102 | if init: 103 | loc_uf.append(fs) 104 | else: 105 | fs_id = self.node_obs.get(fs) 106 | if fs_id is not None: 107 | loc_uf.append(fs_id) 108 | if fs[0] == "B": 109 | if init: 110 | loc_bf.append(fs) 111 | else: 112 | fs_id = self.edge_obs.get(fs) 113 | if fs_id is not None: 114 | loc_bf.append(fs_id) 115 | return loc_uf, loc_bf 116 | 117 | def cal_observe_on(self, texts, init=False): 118 | """ 119 | 获取文本特征 [[['U:你','U:你:好'],['U:你','U:你:好'],[]],[],[]] =[[[145,456,566],[3455,]],[]] 120 | :param texts: 121 | :param init: 122 | :return: 123 | """ 124 | self.node_fs = [] 125 | self.edge_fs = [] 126 | for text in texts: 127 | seq_uf, seq_bf = self.feature_vector(text, init) 128 | self.node_fs.append(seq_uf) 129 | self.edge_fs.append(seq_bf) 130 | return self.node_fs, self.edge_fs 131 | 132 | def cal_fss(self, labels, y0): 133 | """ 134 | 统计特征数量 每个特征对应 num_k 个特征 135 | :param labels: 标签 136 | :param y0: 起始值0 137 | :return: 138 | """ 139 | self.fss = np.zeros((self.f_size,)) 140 | fss_b = self.fss[0:self.bf_size] 141 | fss_u = self.fss[self.bf_size:] 142 | for seq_id, label in enumerate(labels): 143 | y_p = y0 144 | for loc_id, y in enumerate(label): 145 | for fs_id in self.node_fs[seq_id][loc_id]: 146 | fss_u[fs_id + y] += 1.0 147 | for fs_id in self.edge_fs[seq_id][loc_id]: 148 | fss_b[fs_id + y_p * self.num_k + y] += 1.0 149 | y_p = y 150 | 151 | def save_feature(self): 152 | result = ['#CRF Feature Templates.\n\n'] 153 | for tp in self.tp_list: 154 | feature = tp[0] + ':' 155 | for start, end in tp[1:]: 156 | feature += '%x[' + start + ',' + end + ']' 157 | result.append(feature) 158 | result.append('\n\n#U') 159 | u_feature = list(sorted(self.node_obs.keys(), key=lambda x: x)) 160 | result.extend(u_feature) 161 | with open('feature.txt', 'w', encoding='utf-8') as fp: 162 | fp.write('\n'.join(result)) 163 | 164 | def process_state(self, labels): 165 | """ 166 | 状态预处理 167 | :param labels: 168 | :return: 169 | """ 170 | new_label = [] 171 | oby_id = 0 172 | for sentence in labels: 173 | s_label = [] 174 | for label in sentence: 175 | label_id = self.oby_dict.get(label) 176 | if label_id is None: 177 | label_id = oby_id 178 | self.oby_dict[label] = oby_id 179 | oby_id += 1 180 | s_label.append(label_id) 181 | new_label.append(s_label) 182 | self.num_k = len(self.oby_dict) 183 | return new_label 184 | 185 | def __call__(self, texts, labels, template_file, y0=0, *args, **kwargs): 186 | if template_file: 187 | self.tp_list = read_template(template_file) 188 | self.seq_lens = [len(x) for x in labels] 189 | labels = self.process_state(labels) 190 | self.process_features(texts) 191 | self.cal_observe_on(texts) 192 | self.cal_fss(labels, y0) 193 | self.save_feature() 194 | -------------------------------------------------------------------------------- 
/model/module/modules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-25 上午10:37 4 | # @Author : 林利芳 5 | # @File : modules.py 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | 12 | def layer_normalize(inputs, epsilon=1e-8, scope="ln", reuse=None): 13 | """Applies layer normalization. 14 | Args: 15 | inputs: A tensor with 2 or more dimensions, where the first dimension has 16 | `batch_size`. 17 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 18 | scope: Optional scope for `variable_scope`. 19 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 20 | Returns: 21 | A tensor with the same shape and data dtype as `inputs`. 22 | """ 23 | with tf.variable_scope(scope, reuse=reuse): 24 | inputs_shape = inputs.get_shape() 25 | params_shape = inputs_shape[-1:] 26 | 27 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 28 | beta = tf.Variable(tf.zeros(params_shape)) 29 | gamma = tf.Variable(tf.ones(params_shape)) 30 | normalized = (inputs - mean) / ((variance + epsilon) ** (.5)) 31 | outputs = gamma * normalized + beta 32 | 33 | return outputs 34 | 35 | 36 | def embedding(inputs, vocab_size, num_units, zero_pad=True, scale=True, scope="embedding", reuse=None): 37 | """Embeds a given tensor. 38 | Args: 39 | inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up in `lookup table`. 40 | vocab_size: An int. Vocabulary size. 41 | num_units: An int. Number of embedding hidden units. 42 | zero_pad: A boolean. If True, all the values of the fist row (id 0) should be constant zeros. 43 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 44 | scope: Optional scope for `variable_scope`. 45 | reuse: Boolean, whether to reuse the weights of a previous layer 46 | by the same name. 47 | 48 | Returns: 49 | A `Tensor` with one more rank than inputs's. The last dimensionality should be `num_units`. 50 | 51 | For example, 52 | 53 | ``` 54 | import tensorflow as tf 55 | 56 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 57 | outputs = embedding(inputs, 6, 2, zero_pad=True) 58 | with tf.Session() as sess: 59 | sess.run(tf.global_variables_initializer()) 60 | print sess.run(outputs) 61 | >> 62 | [[[ 0. 0. 
] 63 | [ 0.09754146 0.67385566] 64 | [ 0.37864095 -0.35689294]] 65 | 66 | [[-1.01329422 -1.09939694] 67 | [ 0.7521342 0.38203377] 68 | [-0.04973143 -0.06210355]]] 69 | ``` 70 | 71 | ``` 72 | import tensorflow as tf 73 | 74 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 75 | outputs = embedding(inputs, 6, 2, zero_pad=False) 76 | with tf.Session() as sess: 77 | sess.run(tf.global_variables_initializer()) 78 | print sess.run(outputs) 79 | >> 80 | [[[-0.19172323 -0.39159766] 81 | [-0.43212751 -0.66207761] 82 | [ 1.03452027 -0.26704335]] 83 | 84 | [[-0.11634696 -0.35983452] 85 | [ 0.50208133 0.53509563] 86 | [ 1.22204471 -0.96587461]]] 87 | ``` 88 | """ 89 | with tf.variable_scope(scope, reuse=reuse): 90 | lookup_table = tf.get_variable('lookup_table', dtype=tf.float32, shape=[vocab_size, num_units], 91 | initializer=tf.contrib.layers.xavier_initializer()) 92 | if zero_pad: 93 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0) 94 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 95 | 96 | if scale: 97 | outputs = outputs * (num_units ** 0.5) 98 | 99 | return outputs 100 | 101 | 102 | def positional_encoding(inputs, num_units, zero_pad=True, scale=True, scope="positional_encoding", reuse=None): 103 | """Sinusoidal Positional_Encoding. 104 | 105 | Args: 106 | inputs: A 2d Tensor with shape of (N, T). 107 | num_units: Output dimensionality 108 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 109 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 110 | scope: Optional scope for `variable_scope`. 111 | reuse: Boolean, whether to reuse the weights of a previous layer 112 | by the same name. 113 | 114 | Returns: 115 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 116 | """ 117 | 118 | N, T = inputs.get_shape().as_list() 119 | with tf.variable_scope(scope, reuse=reuse): 120 | position_ind = tf.ones_like(inputs) * tf.range(T) 121 | 122 | # First part of the PE function: sin and cos argument 123 | position_enc = np.array([ 124 | [pos / np.power(10000, 2. * i / num_units) for i in range(num_units)] for pos in range(T)], dtype=np.float32) 125 | 126 | # Second part, apply the cosine to even columns and sin to odds. 127 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 128 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 129 | 130 | # Convert to a tensor 131 | lookup_table = tf.convert_to_tensor(position_enc) 132 | 133 | if zero_pad: 134 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0) 135 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 136 | 137 | if scale: 138 | outputs = outputs * num_units ** 0.5 139 | 140 | return outputs 141 | 142 | 143 | def multihead_attention( 144 | queries, keys, num_units=None, num_heads=8, dropout_rate=0, is_training=True, causality=False, 145 | scope="multihead_attention", reuse=None): 146 | """Applies multihead attention. 147 | Args: 148 | queries: A 3d tensor with shape of [N, T_q, C_q]. 149 | keys: A 3d tensor with shape of [N, T_k, C_k]. 150 | num_units: A scalar. Attention size. 151 | dropout_rate: A floating point number. 152 | is_training: Boolean. Controller of mechanism for dropout. 153 | causality: Boolean. If true, units that reference the future are masked. 154 | num_heads: An int. Number of heads. 155 | scope: Optional scope for `variable_scope`. 
156 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 157 | Returns: 158 | A 3d tensor with shape of (N, T_q, C) 159 | """ 160 | with tf.variable_scope(scope, reuse=reuse): 161 | # Set the fall back option for num_units 162 | if num_units is None: 163 | num_units = queries.get_shape().as_list()[-1] 164 | 165 | # Linear projections 166 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 167 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 168 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 169 | 170 | # Split and concat 171 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 172 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 173 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 174 | 175 | # Multiplication 176 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 177 | 178 | # Scale 179 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 180 | 181 | # Key Masking 182 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 183 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 184 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 185 | 186 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) 187 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 188 | 189 | # Causality = Future blinding 190 | if causality: 191 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 192 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k) 193 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 194 | 195 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1) 196 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 197 | 198 | # Activation 199 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 200 | 201 | # Query Masking 202 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 203 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 204 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 205 | outputs *= query_masks # broadcasting. (N, T_q, C) 206 | 207 | # Dropouts 208 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 209 | 210 | # Weighted sum 211 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 212 | 213 | # Restore shape 214 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C) 215 | 216 | # Residual connection 残差 217 | outputs += queries 218 | 219 | # Normalize 层归一化 220 | outputs = layer_normalize(outputs) # (N, T_q, C) 221 | 222 | return outputs 223 | 224 | 225 | def feedforward(inputs, num_units=[2048, 512], scope="multihead_attention", reuse=None): 226 | """Point-wise feed forward net. 227 | Args: 228 | inputs: A 3d tensor with shape of [N, T, C]. 229 | num_units: A list of two integers. 230 | scope: Optional scope for `variable_scope`. 231 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
232 | Returns: 233 | A 3d tensor with the same shape and dtype as inputs 234 | """ 235 | with tf.variable_scope(scope, reuse=reuse): 236 | # Inner layer 237 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, "activation": tf.nn.relu, 238 | "use_bias": True,"reuse":False} 239 | outputs = tf.layers.conv1d(**params) 240 | 241 | # Readout layer 242 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, "activation": None, "use_bias": True,"reuse":False} 243 | outputs = tf.layers.conv1d(**params) 244 | 245 | # Residual connection 246 | outputs += inputs 247 | 248 | # Normalize 249 | outputs = layer_normalize(outputs) 250 | 251 | return outputs 252 | 253 | 254 | def label_smoothing(inputs, epsilon=0.1): 255 | """Applies label smoothing. See https://arxiv.org/abs/1512.00567. 256 | 257 | Args: 258 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 259 | epsilon: Smoothing rate. 260 | 261 | For example, 262 | 263 | ``` 264 | import tensorflow as tf 265 | inputs = tf.convert_to_tensor([[[0, 0, 1], [0, 1, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0], [0, 1, 0]]], tf.float32) 266 | 267 | outputs = label_smoothing(inputs) 268 | 269 | with tf.Session() as sess: 270 | print(sess.run([outputs])) 271 | 272 | >> 273 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 274 | [ 0.03333334, 0.93333334, 0.03333334], 275 | [ 0.93333334, 0.03333334, 0.03333334]], 276 | 277 | [[ 0.93333334, 0.03333334, 0.03333334], 278 | [ 0.93333334, 0.03333334, 0.03333334], 279 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 280 | ``` 281 | """ 282 | K = inputs.get_shape().as_list()[-1] # number of channels 283 | return ((1 - epsilon) * inputs) + (epsilon / K) 284 | -------------------------------------------------------------------------------- /model/module/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-28 上午10:54 4 | # @Author : 林利芳 5 | # @File : rnn.py 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from tensorflow.python.framework import constant_op 11 | from tensorflow.python.framework import dtypes 12 | from tensorflow.python.layers import base as base_layer 13 | from tensorflow.python.ops import array_ops, clip_ops 14 | from tensorflow.python.ops import init_ops 15 | from tensorflow.python.ops import math_ops 16 | from tensorflow.python.ops import nn_ops 17 | from tensorflow.python.platform import tf_logging as logging 18 | from tensorflow.python.ops.rnn_cell_impl import LayerRNNCell, LSTMStateTuple 19 | 20 | _BIAS_VARIABLE_NAME = "bias" 21 | _WEIGHTS_VARIABLE_NAME = "kernel" 22 | 23 | 24 | class ForgetLSTMCell(LayerRNNCell): 25 | """Basic LSTM recurrent network cell. 26 | 27 | The implementation is based on: http://arxiv.org/abs/1409.2329. 28 | 29 | We add forget_bias (default: 1) to the biases of the forget gate in order to 30 | reduce the scale of forgetting in the beginning of the training. 31 | 32 | It does not allow cell clipping, a projection layer, and does not 33 | use peep-hole connections: it is the basic baseline. 34 | 35 | For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell} 36 | that follows. 37 | """ 38 | 39 | def __init__(self, num_units, forget_bias=1.0, 40 | state_is_tuple=True, activation=None, reuse=None, name=None): 41 | """Initialize the basic LSTM cell. 
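Unlike the standard BasicLSTMCell, this variant computes only a candidate and a forget gate: the input gate is tied to the forget gate (i = 1 - f) and the cell state is returned directly as the hidden state (see `call` below).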
42 | 43 | Args: 44 | num_units: int, The number of units in the LSTM cell. 45 | forget_bias: float, The bias added to forget gates (see above). 46 | Must set to `0.0` manually when restoring from CudnnLSTM-trained 47 | checkpoints. 48 | state_is_tuple: If True, accepted and returned states are 2-tuples of 49 | the `c_state` and `m_state`. If False, they are concatenated 50 | along the column axis. The latter behavior will soon be deprecated. 51 | activation: Activation function of the inner states. Default: `tanh`. 52 | reuse: (optional) Python boolean describing whether to reuse variables 53 | in an existing scope. If not `True`, and the existing scope already has 54 | the given variables, an error is raised. 55 | name: String, the name of the layer. Layers with the same name will 56 | share weights, but to avoid mistakes we require reuse=True in such 57 | cases. 58 | 59 | When restoring from CudnnLSTM-trained checkpoints, must use 60 | `CudnnCompatibleLSTMCell` instead. 61 | """ 62 | super(ForgetLSTMCell, self).__init__(_reuse=reuse, name=name) 63 | if not state_is_tuple: 64 | logging.warn("%s: Using a concatenated state is slower and will soon be " 65 | "deprecated. Use state_is_tuple=True.", self) 66 | 67 | # Inputs must be 2-dimensional. 68 | self.input_spec = base_layer.InputSpec(ndim=2) 69 | 70 | self._num_units = num_units 71 | self._forget_bias = forget_bias 72 | self._state_is_tuple = state_is_tuple 73 | self._activation = activation or math_ops.tanh 74 | 75 | @property 76 | def state_size(self): 77 | return (LSTMStateTuple(self._num_units, self._num_units) 78 | if self._state_is_tuple else 2 * self._num_units) 79 | 80 | @property 81 | def output_size(self): 82 | return self._num_units 83 | 84 | def build(self, inputs_shape): 85 | if inputs_shape[1].value is None: 86 | raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" 87 | % inputs_shape) 88 | 89 | input_depth = inputs_shape[1].value 90 | h_depth = self._num_units 91 | self._kernel = self.add_variable( 92 | _WEIGHTS_VARIABLE_NAME, 93 | shape=[input_depth + h_depth, 2 * self._num_units]) 94 | self._bias = self.add_variable( 95 | _BIAS_VARIABLE_NAME, 96 | shape=[2 * self._num_units], 97 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 98 | 99 | self.built = True 100 | 101 | def call(self, inputs, state): 102 | """Long short-term memory cell (LSTM). 103 | 104 | Args: 105 | inputs: `2-D` tensor with shape `[batch_size, input_size]`. 106 | state: An `LSTMStateTuple` of state tensors, each shaped 107 | `[batch_size, self.state_size]`, if `state_is_tuple` has been set to 108 | `True`. Otherwise, a `Tensor` shaped 109 | `[batch_size, 2 * self.state_size]`. 110 | 111 | Returns: 112 | A pair containing the new hidden state, and the new state (either a 113 | `LSTMStateTuple` or a concatenated state, depending on 114 | `state_is_tuple`). 115 | """ 116 | sigmoid = math_ops.sigmoid 117 | one = constant_op.constant(1, dtype=dtypes.int32) 118 | # Parameters of gates are concatenated into one multiply for efficiency. 
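# Unlike a standard LSTM, the kernel created in `build()` has shape [input_depth + h_depth, 2 * num_units],
# so only the candidate j and the forget gate f are produced; the input gate is the complement i = 1 - f
# and there is no output gate, which is why new_h below is simply new_c.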
119 | if self._state_is_tuple: 120 | c, h = state 121 | else: 122 | c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one) 123 | 124 | gate_inputs = math_ops.matmul( 125 | array_ops.concat([inputs, h], 1), self._kernel) 126 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias) 127 | 128 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 129 | j, f = array_ops.split( 130 | value=gate_inputs, num_or_size_splits=2, axis=one) 131 | i = 1 - f 132 | forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype) 133 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 134 | # performance improvement. So using those at the cost of readability. 135 | add = math_ops.add 136 | multiply = math_ops.multiply 137 | new_c = add(multiply(c, sigmoid(add(f, forget_bias_tensor))), multiply(sigmoid(i), self._activation(j))) 138 | new_h = new_c 139 | 140 | if self._state_is_tuple: 141 | new_state = LSTMStateTuple(new_c, new_h) 142 | else: 143 | new_state = array_ops.concat([new_c, new_h], 1) 144 | return new_h, new_state 145 | 146 | 147 | class IndRNNCell(LayerRNNCell): # 继承 LayerRNNCell 148 | 149 | def __init__(self, 150 | num_units, 151 | recurrent_min_abs=0, 152 | recurrent_max_abs=None, 153 | recurrent_kernel_initializer=None, 154 | input_kernel_initializer=None, 155 | activation=None, 156 | reuse=None, 157 | name=None): 158 | super(IndRNNCell, self).__init__(_reuse=reuse, name=name) 159 | 160 | self.input_spec = base_layer.InputSpec(ndim=2) 161 | 162 | # initialization 163 | self._num_units = num_units 164 | self._recurrent_min_abs = recurrent_min_abs 165 | 166 | self._recurrent_max_abs = recurrent_max_abs 167 | self._recurrent_recurrent_kernel_initializer = recurrent_kernel_initializer 168 | self._input_kernel_initializer = input_kernel_initializer 169 | self._activation = activation or nn_ops.relu 170 | 171 | @property 172 | def state_size(self): 173 | return self._num_units 174 | 175 | @property 176 | def output_size(self): 177 | return self._num_units 178 | 179 | def build(self, inputs_shape): 180 | '''construct the IndRNN Cell''' 181 | if inputs_shape[1].value is None: 182 | raise ValueError("Expected input shape[1] is known") 183 | 184 | input_depth = inputs_shape[1] 185 | if self._input_kernel_initializer is None: 186 | self._input_kernel_initializer = init_ops.random_normal_initializer(mean=0, 187 | stddev=1e-3) 188 | # matrix W 189 | self._input_kernel = self.add_variable( 190 | "input_kernel", 191 | shape=[input_depth, self._num_units], 192 | initializer=self._input_kernel_initializer 193 | ) 194 | 195 | if self._recurrent_recurrent_kernel_initializer is None: 196 | self._recurrent_recurrent_kernel_initializer = init_ops.constant_initializer(1.) 
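# The recurrent kernel below is a vector of length num_units rather than a full matrix, so the
# recurrence in `call()` is element-wise: h_t = activation(W x_t + u * h_{t-1} + b).
# The optional recurrent_min_abs / recurrent_max_abs bounds clip |u| to keep that recurrence stable.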
197 | 198 | # matrix U 199 | self._recurrent_kernel = self.add_variable( 200 | "recurrent_kernel", 201 | shape=[self._num_units], 202 | initializer=self._recurrent_recurrent_kernel_initializer 203 | ) 204 | 205 | # Clip the U to min - max 206 | if self._recurrent_min_abs: 207 | abs_kernel = math_ops.abs(self._recurrent_kernel) 208 | min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs) 209 | self._recurrent_kernel = math_ops.multiply( 210 | math_ops.sign(self._recurrent_kernel), 211 | min_abs_kernel 212 | ) 213 | if self._recurrent_max_abs: 214 | self._recurrent_kernel = clip_ops.clip_by_value( 215 | self._recurrent_kernel, 216 | -self._recurrent_max_abs, 217 | self._recurrent_max_abs 218 | ) 219 | 220 | self._bias = self.add_variable( 221 | "bias", 222 | shape=[self._num_units], 223 | initializer=init_ops.zeros_initializer(dtype=self.dtype) 224 | ) 225 | # built finished 226 | self.built = True 227 | 228 | def call(self, inputs, state): 229 | '''output = new state = activation(W * x + U (*) h_t-1 + b)''' 230 | 231 | gate_inputs = math_ops.matmul(inputs, self._input_kernel) 232 | # (*) 233 | state_update = math_ops.multiply(state, self._recurrent_kernel) 234 | gate_inputs = math_ops.add(gate_inputs, state_update) 235 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias) 236 | output = self._activation(gate_inputs) 237 | return output, output 238 | -------------------------------------------------------------------------------- /model/module/templates.txt: -------------------------------------------------------------------------------- 1 | # Unigram 2 | 3 | U00:%x[-2,0] 4 | U01:%x[-1,0] 5 | U02:%x[0,0] 6 | U03:%x[1,0] 7 | U04:%x[2,0] 8 | U05:%x[-2,0]/%x[-1,0]/%x[0,0] 9 | U06:%x[-1,0]/%x[0,0]/%x[1,0] 10 | U07:%x[0,0]/%x[1,0]/%x[2,0] 11 | U08:%x[-1,0]/%x[0,0] 12 | U09:%x[0,0]/%x[1,0] 13 | 14 | # Bigram 15 | B -------------------------------------------------------------------------------- /model/rnn_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:07 4 | # @Author : 林利芳 5 | # @File : rnn_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import RnnParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class RnnSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | key, value = self.siamese() 27 | self.distance, self.pre_y = self.similar(key, value) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :return: 36 | """ 37 | x = tf.concat([self.left_x, self.right_x], axis=0) 38 | seq_lens = 
tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 39 | # layers embedding multi_head_attention rnn 40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed") 41 | 42 | # output = self.transformer(embed, x) 43 | output = self.rnn_layer(embed, seq_lens) 44 | output = self.attention(embed, output) 45 | key, value = tf.split(output, 2, axis=0) 46 | return key, value 47 | 48 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 49 | """ 50 | 创建双向RNN层 51 | :param inputs: 52 | :param seq_lens: 53 | :param seg: LSTM GRU F-LSTM, IndRNN 54 | :return: 55 | """ 56 | if seg == 'LSTM': 57 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 58 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 59 | 60 | elif seg == 'GRU': 61 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 62 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 63 | else: 64 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 65 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 66 | # 双向rnn 67 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 68 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 69 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 70 | output = tf.add(fw_output, bw_output) 71 | return output 72 | 73 | def transformer(self, embed, value): 74 | with tf.variable_scope("Transformer_Encoder"): 75 | # Positional Encoding 76 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 77 | # Dropout 78 | output = self.multi_head_block(embed) 79 | return output 80 | 81 | def multi_head_block(self, query, causality=False): 82 | """ 83 | 多头注意力机制 84 | :param query: 85 | :param causality: 86 | :return: 87 | """ 88 | for i in range(hp.num_blocks): 89 | with tf.variable_scope("num_blocks_{}".format(i)): 90 | # multi head Attention ( self-attention) 91 | query = multihead_attention( 92 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 93 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 94 | scope="self_attention") 95 | # Feed Forward 96 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 97 | return query 98 | 99 | def loss_layer(self): 100 | """ 101 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 102 | :return: 103 | """ 104 | y = tf.cast(self.y, tf.float32) 105 | with tf.name_scope("output"): 106 | loss_p = tf.square(1 - self.distance) / 4 107 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 108 | loss_m = tf.square(mask * self.distance) 109 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 110 | return loss 111 | 112 | def attention(self, embed, query): 113 | """ 114 | 注意力机制 115 | :param embed: 116 | :param query: 117 | :return: 118 | """ 119 | with tf.name_scope("attention"): 120 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 121 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 122 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 123 | value = tf.concat([embed, query], axis=-1) 124 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 125 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 126 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 127 | attention = tf.nn.softmax(attention, axis=-1) 128 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, 
hp.num_units]) 129 | 130 | output = tf.reduce_sum(attention * query, axis=1) 131 | output = layer_normalize(output) 132 | return output 133 | 134 | @staticmethod 135 | def similar(key, value): 136 | """ 137 | cosine(key,value) = key * value/(|key|*|value|) 138 | :param key: 139 | :param value: 140 | :return: 141 | """ 142 | dot_value = tf.reduce_sum(key * value, axis=-1) 143 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 144 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps) 145 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar") 146 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 147 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 148 | return distance, pre_y 149 | 150 | def predict(self): 151 | correct_predictions = tf.equal(self.pre_y, self.y) 152 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 153 | return accuracy 154 | 155 | def optimize(self): 156 | """ 157 | 优化器 158 | :return: 159 | """ 160 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 161 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 162 | return train_op 163 | -------------------------------------------------------------------------------- /model/transformer_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-22 上午10:48 4 | # @Author : 林利芳 5 | # @File : transformer_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import HyperParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class TransformerSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True, seg='LSTM'): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | query, key = self.siamese(seg) 27 | self.distance, self.pre_y = self.similar(query, key) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self, seg): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :param seg: 36 | :return: 37 | """ 38 | x = tf.concat([self.left_x, self.right_x], axis=0) 39 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 40 | # layers embedding multi_head_attention rnn 41 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True, 42 | scope="lembed") 43 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True, 44 | scope="rembed") 45 | 46 | query, key = self.transformer(left_embed, right_embed) 47 | # output = self.rnn_layer(embed, seq_lens, seg) 48 | query = self.attention(query, query) 49 | key = self.attention(key, key) 50 | return query, key 51 | 52 | 
def rnn_layer(self, inputs, seq_lens, seg): 53 | """ 54 | 创建双向RNN层 55 | :param inputs: 56 | :param seq_lens: 57 | :param seg: LSTM GRU F-LSTM, IndRNN 58 | :return: 59 | """ 60 | if seg == 'LSTM': 61 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 62 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 63 | 64 | elif seg == 'GRU': 65 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 66 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 67 | else: 68 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 69 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 70 | # 双向rnn 71 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 72 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 73 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 74 | output = tf.add(fw_output, bw_output) 75 | return output 76 | 77 | def transformer(self, query, key): 78 | with tf.variable_scope("Transformer_Encoder"): 79 | # Positional Encoding 80 | query += positional_encoding(self.left_x, num_units=hp.num_units, zero_pad=False, scale=False) 81 | key += positional_encoding(self.right_x, num_units=hp.num_units, zero_pad=False, scale=False) 82 | # Dropout 83 | output = self.multi_head_block(query, key) 84 | return output 85 | 86 | def multi_head_block(self, query, key, causality=False): 87 | """ 88 | 多头注意力机制 89 | :param query: 90 | :param key: 91 | :param causality: 92 | :return: 93 | """ 94 | for i in range(hp.num_blocks): 95 | with tf.variable_scope("num_blocks_{}".format(i)): 96 | # multi head Attention ( self-attention) 97 | query = self.multihead_attention(query, query, name="query_attention", causality=causality) 98 | key = self.multihead_attention(key, key, name="key_attention", causality=causality) 99 | query = self.multihead_attention(query, key, name="query_key_attention") 100 | key = self.multihead_attention(key, query, name="query_key_attention") 101 | return query, key 102 | 103 | def multihead_attention(self, query, key, name="key_attention", causality=False): 104 | value = multihead_attention( 105 | queries=query, keys=key, num_units=hp.num_units, num_heads=hp.num_heads, 106 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 107 | scope=name) 108 | # Feed Forward 109 | value = feedforward(value, num_units=[4 * hp.num_units, hp.num_units]) 110 | return value 111 | 112 | def loss_layer(self): 113 | """ 114 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 115 | :return: 116 | """ 117 | y = tf.cast(self.y, tf.float32) 118 | with tf.name_scope("output"): 119 | loss_p = tf.square(1 - self.distance) / 4 120 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 121 | loss_m = tf.square(mask * self.distance) 122 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 123 | return loss 124 | 125 | def attention(self, embed, query): 126 | """ 127 | 注意力机制 128 | :param embed: 129 | :param query: 130 | :return: 131 | """ 132 | with tf.name_scope("attention"): 133 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 134 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 135 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 136 | value = tf.concat([embed, query], axis=-1) 137 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 138 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 139 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 140 | attention = 
tf.nn.softmax(attention, axis=-1) 141 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 142 | 143 | output = tf.reduce_sum(attention * query, axis=1) 144 | output = layer_normalize(output) 145 | return output 146 | 147 | @staticmethod 148 | def similar(query, key): 149 | """ 150 | cosine(key,value) = key * value/(|key|*|value|) 151 | :param key: 152 | :param value: 153 | :return: 154 | """ 155 | dot_value = tf.reduce_sum(query * key, axis=-1) 156 | query_sqrt = tf.sqrt(tf.reduce_sum(tf.square(query), axis=-1) + hp.eps) 157 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 158 | distance = tf.div(dot_value, key_sqrt * query_sqrt, name="similar") 159 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 160 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 161 | return distance, pre_y 162 | 163 | def predict(self): 164 | correct_predictions = tf.equal(self.pre_y, self.y) 165 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 166 | return accuracy 167 | 168 | def optimize(self): 169 | """ 170 | 优化器 171 | :return: 172 | """ 173 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 174 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 175 | return train_op 176 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:06 4 | # @Author : 林利芳 5 | # @File : run.py 6 | import os 7 | from core.load_data import load_train_data, get_feed_dict, print_info, preprocessor 8 | from config.config import checkpoint_dir, VOCAB_PKL 9 | from core.utils import load_data 10 | from model.rnn_siamese import RnnSiameseNetwork 11 | from model.match_pyramid import MatchPyramidNetwork 12 | from model.cnn_siamese import CnnSiameseNetwork 13 | from model.transformer_siamese import TransformerSiameseNetwork 14 | from config.hyperparams import HyperParams as hp 15 | import tensorflow as tf 16 | 17 | 18 | def run(network='rnn'): 19 | checkpoint_file = checkpoint_dir.format(network) 20 | if not os.path.exists(checkpoint_file): 21 | os.mkdir(checkpoint_file) 22 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = load_train_data() 23 | vocab = load_data(VOCAB_PKL) 24 | vocab_size = len(vocab.word2idx) 25 | 26 | batch_size = hp.batch_size 27 | if network == 'rnn': 28 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 29 | elif network == 'match_pyramid': 30 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 31 | elif network == 'cnn': 32 | model = CnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 33 | elif network == "transformer": 34 | model = TransformerSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 35 | else: 36 | return 37 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=150) 38 | with sv.managed_session() as sess: 39 | print("start training...\n") 40 | for epoch in range(1, hp.num_epochs + 1): 41 | if sv.should_stop(): 42 | break 43 | train_loss = [] 44 | 45 | for feed_dict, _ in get_feed_dict(model, train_l_x, train_r_x, train_l_len, train_r_len, train_y, 46 | batch_size): 47 | loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict) 48 
| train_loss.append(loss) 49 | dev_loss = [] 50 | predicts = [] 51 | for feed_dict, start in get_feed_dict(model, val_l_x, val_r_x, val_l_len, val_r_len, val_y, batch_size): 52 | loss, gs, pre_y = sess.run([model.loss, model.global_step, model.pre_y], feed_dict=feed_dict) 53 | dev_loss.append(loss) 54 | predicts.extend(pre_y[start:]) 55 | print_info(epoch, gs, train_loss, dev_loss, val_y, predicts) 56 | 57 | 58 | if __name__ == "__main__": 59 | # preprocessor(True) 60 | network = 'transformer' # network = [rnn match_pyramid cnn transformer] 61 | run(network) 62 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python main.py $1 $2 --------------------------------------------------------------------------------
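Note on the training objective: `rnn_siamese.py` and `transformer_siamese.py` above share the same cosine-similarity head and contrastive loss (L+ = (1 - Ew)^2 / 4 for matching pairs, L- = Ew^2 only when Ew exceeds the margin). The sketch below is a minimal NumPy re-implementation for checking the arithmetic outside the TensorFlow graph; it is not part of the repository, and the helper names, toy inputs, and default `margin` / `eps` values are illustrative assumptions mirroring `similar()` and `loss_layer()`.

```python
import numpy as np


def cosine_similarity(key, value, eps=1e-9):
    """Row-wise cosine similarity, mirroring the graph's `similar()` method."""
    dot = np.sum(key * value, axis=-1)
    key_norm = np.sqrt(np.sum(np.square(key), axis=-1) + eps)
    value_norm = np.sqrt(np.sum(np.square(value), axis=-1) + eps)
    return dot / (key_norm * value_norm)


def contrastive_loss(distance, y, margin=0.7):
    """y=1: pull cosine toward 1; y=0: penalize only pairs whose cosine exceeds the margin."""
    loss_p = np.square(1.0 - distance) / 4.0          # similar pairs: (1 - Ew)^2 / 4
    mask = (distance > margin).astype(np.float32)     # dissimilar pairs inside the margin cost nothing
    loss_m = np.square(mask * distance)
    return np.sum(y * loss_p + (1.0 - y) * loss_m)


# Toy usage: four sentence pairs with 8-dimensional encodings.
key = np.random.randn(4, 8).astype(np.float32)
value = np.random.randn(4, 8).astype(np.float32)
y = np.array([1, 0, 1, 0], dtype=np.float32)          # 1 = same meaning, 0 = different
d = cosine_similarity(key, value)
pred = (d > 0.7).astype(np.int32)                     # same decision rule as `pre_y`
print(contrastive_loss(d, y), pred)
```

With this decision rule, a pair is predicted as a paraphrase only when its cosine similarity exceeds the margin, matching the `pre_y` computation in both models.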