├── .gitignore
├── .idea
│   └── vcs.xml
├── README.md
├── config
│   ├── __init__.py
│   ├── config.py
│   ├── hyperparams.py
│   └── synonym.py
├── core
│   ├── __init__.py
│   ├── load_data.py
│   ├── preprocessor.py
│   ├── utils.py
│   └── word_embedding.py
├── data
│   ├── atec_nlp_sim_test.csv
│   ├── atec_nlp_sim_train.csv
│   ├── atec_nlp_sim_train_add.csv
│   └── corpus.txt
├── logdir
│   └── graph
│       ├── match_pyramid
│       └── siamese.png
├── main.py
├── model
│   ├── __init__.py
│   ├── cnn_siamese.py
│   ├── match_pyramid.py
│   ├── module
│   │   ├── __init__.py
│   │   ├── feature.py
│   │   ├── modules.py
│   │   ├── rnn.py
│   │   └── templates.txt
│   ├── rnn_siamese.py
│   └── transformer_siamese.py
├── run.py
└── run.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 | data/char2vec_300
9 | data/data.pkl
10 | data/vocab.pkl
11 | data/expend_atec_nlp.csv
12 | logdir/checkpoints/*
13 | logdir/checkpoints-match_pyramid/*
14 | logdir/checkpoints-rnn/*
15 | logdir/model/*
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TextSimilar
2 | Short-text similarity
3 | ### Siamese network
4 | [Learning Text Similarity with Siamese Recurrent Networks](http://www.aclweb.org/anthology/W/W16/W16-1617.pdf)
5 | Loss function (see the sketch after this file)
6 |
7 | ---
8 | ### Match pyramid
9 | [Text Matching as Image Recognition](https://arxiv.org/abs/1602.06359)
10 |
11 | ---
12 | The data come from the Ant Financial question-similarity competition, [蚂蚁金融NLP之问题相似度计算](https://dc.cloud.alipay.com/index#/topic/intro?id=8):
13 | > Question similarity: given two sentences from customer-service dialogues, use an algorithm to decide whether they express the same meaning.
14 | >
15 | > Examples:
16 | >
17 | > 1. “花呗如何还款” -- “花呗怎么还款”: same meaning
18 | >
19 | > 2. “花呗如何还款” -- “我怎么还我的花被呢”: same meaning
20 | >
21 | > 3. “花呗分期后逾期了如何还款” -- “花呗分期后逾期了哪里还款”: different meaning
22 | >
23 | > Example 1 can be judged synonymous with fairly simple methods. Example 2 involves a misspelling, synonyms and word-order changes, so the two sentences look quite different at first glance and are harder to judge correctly. In example 3 the two sentences are nearly identical; a single small difference, “如何” (how) versus “哪里” (where), changes the meaning.
24 |
25 | Data preprocessing: run `python3 run.py`; `data.pkl` and `vocab.pkl` are written to the `data` directory.
26 | ```python
27 | if __name__ == "__main__":
28 | preprocessor(True)
29 | network = 'rnn'  # one of: rnn, match_pyramid, cnn
30 | run(network)
31 | ```
32 |
--------------------------------------------------------------------------------
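
The "Loss function" line above refers to the contrastive loss of the Siamese models. A sketch of the loss as implemented in `loss_layer()` of `model/cnn_siamese.py` (the same form is assumed for `model/rnn_siamese.py`, which is not shown in this excerpt), with E_w the cosine similarity of the two sentence encodings, m the margin and y = 1 for synonymous pairs:

```latex
L = \sum_i \left[ y_i \,\frac{\bigl(1 - E_w^{(i)}\bigr)^2}{4}
      \;+\; \bigl(1 - y_i\bigr)\,\Bigl(\mathbb{1}\bigl[E_w^{(i)} > m\bigr]\, E_w^{(i)}\Bigr)^2 \right]
```

Similar pairs are pushed towards cosine similarity 1, while dissimilar pairs are only penalised once their similarity exceeds the margin.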
/config/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-25 下午6:02
4 | # @Author : 林利芳
5 | # @File : __init__.py
6 |
--------------------------------------------------------------------------------
/config/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-27 上午9:54
4 | # @Author : 林利芳
5 | # @File : config.py
6 |
7 | import os
8 |
9 | PATH = os.getcwd()
10 | ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train.csv')
11 | ADD_ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train_add.csv')
12 |
13 | TEST_DATA = os.path.join(PATH, 'data/atec_nlp_sim_test.csv')
14 | TEST_RESULT = os.path.join(PATH, 'data/test_result.csv')
15 |
16 | EXPEND_ATEC_NLP_DATA = os.path.join(PATH, 'data/expend_atec_nlp_{}.csv')
17 |
18 | DATA_PKL = os.path.join(PATH, 'data/data.pkl')
19 | VOCAB_PKL = os.path.join(PATH, 'data/vocab.pkl')
20 |
21 | CORPUS_DATA = os.path.join(PATH, 'data/corpus.txt')
22 |
23 | WORD2VEC_DATA = os.path.join(PATH, 'data/char2vec_300')
24 | logdir = os.path.join(PATH, 'logdir')
25 | checkpoint_dir = "logdir/checkpoints-{}"
26 | model_dir = os.path.join(logdir, "model")
27 |
28 | WordChar = "char"
29 |
--------------------------------------------------------------------------------
/config/hyperparams.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-25 上午10:41
4 | # @Author : 林利芳
5 | # @File : hyperparams.py
6 |
7 |
8 | class HyperParams:
9 | # training
10 | batch_size = 32 # alias = N
11 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
12 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM]
13 | # model
14 | max_len = 50 # Maximum number of words in a sentence. alias = T.
15 | # Feel free to increase this if you are ambitious.
16 | min_cnt = 20  # words that occur fewer than min_cnt times are encoded as <UNK>.
17 | num_units = 512 # alias = C
18 | embedding_size = 512
19 | vocab_size = 10000
20 | num_blocks = 1 # number of encoder/decoder blocks
21 | num_epochs = 100
22 | num_heads = 8
23 | attention_size = 100
24 | clip = 5
25 | dropout_rate = 0.1
26 | eps = 1e-9
27 | margin = 0.7
28 | sinusoid = False # If True, use sinusoid. If false, positional embedding.
29 |
30 |
31 | class RnnParams:
32 | # training
33 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
34 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM]
35 | # model
36 | max_len = 50 # Maximum number of words in a sentence. alias = T.
37 | # Feel free to increase this if you are ambitious.
38 | min_cnt = 20  # words that occur fewer than min_cnt times are encoded as <UNK>.
39 | num_units = 100 # alias = C
40 | embedding_size = 100
41 | num_epochs = 40
42 | attention_size = 100
43 | clip = 5
44 | dropout_rate = 0.1
45 | eps = 1e-9
46 | margin = 0.1
47 | sinusoid = False # If True, use sinusoid. If false, positional embedding.
48 |
49 |
50 | class CnnParams:
51 | # training
52 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
53 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM]
54 | # model
55 | max_len = 50 # Maximum number of words in a sentence. alias = T.
56 | # Feel free to increase this if you are ambitious.
57 | min_cnt = 20  # words that occur fewer than min_cnt times are encoded as <UNK>.
58 | num_units = 100 # alias = C
59 | embedding_size = 100
60 | num_epochs = 40
61 | attention_size = 100
62 | clip = 5
63 | dropout_rate = 0.1
64 | eps = 1e-9
65 | margin = 0.01
66 | channel = 64  # number of channels
67 | kernel = [3, 5]  # kernel widths
68 | pool_size = 2  # pooling window size
69 | dense_size = 100  # fully connected layer size
70 | sinusoid = False # If True, use sinusoid. If false, positional embedding.
71 |
72 |
73 | class TransformerParams:
74 | # training
75 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
76 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM]
77 | # model
78 | max_len = 50 # Maximum number of words in a sentence. alias = T.
79 | # Feel free to increase this if you are ambitious.
80 | min_cnt = 20  # words that occur fewer than min_cnt times are encoded as <UNK>.
81 | num_units = 512 # alias = C
82 | embedding_size = 512
83 | num_epochs = 40
84 | num_blocks = 6 # number of encoder/decoder blocks
85 | num_heads = 8
86 | attention_size = 100
87 | clip = 5
88 | dropout_rate = 0.1
89 | eps = 1e-9
90 | margin = 0.3
91 | sinusoid = False # If True, use sinusoid. If false, positional embedding.
92 |
93 |
94 | class MatchPyramidParams:
95 | # training
96 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
97 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM]
98 | # model
99 | max_len = 50 # Maximum number of words in a sentence. alias = T.
100 | # Feel free to increase this if you are ambitious.
101 | min_cnt = 20  # words that occur fewer than min_cnt times are encoded as <UNK>.
102 | num_units = 100 # alias = C
103 | embedding_size = 100
104 | num_epochs = 40
105 | attention_size = 100
106 | clip = 5
107 | dropout_rate = 0.1
108 | eps = 1e-9
109 | margin = 0.3
110 | channel = 64  # number of channels
111 | kernel = [3, 5]  # kernel widths
112 | pool_size = 2  # pooling window size
113 | dense_size = 100  # fully connected layer size
114 | sinusoid = False # If True, use sinusoid. If false, positional embedding.
115 |
--------------------------------------------------------------------------------
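
Each network reads its own parameter class above. A minimal sketch of how a parameter class might be selected by network name (an assumption for illustration only; the actual dispatch lives in `run.py`, which is not shown in this excerpt):

```python
from config.hyperparams import RnnParams, CnnParams, MatchPyramidParams, TransformerParams

# hypothetical name -> params mapping, mirroring the network names used elsewhere in the repo
PARAMS = {
    "rnn": RnnParams,
    "cnn": CnnParams,
    "match_pyramid": MatchPyramidParams,
    "transformer": TransformerParams,
}


def get_params(network="rnn"):
    """Return the hyper-parameter class for the given network name."""
    return PARAMS.get(network, RnnParams)


if __name__ == "__main__":
    hp = get_params("match_pyramid")
    print(hp.margin, hp.kernel, hp.channel)  # 0.3 [3, 5] 64
```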
/config/synonym.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-24 上午11:11
4 | # @Author : 林利芳
5 | # @File : synonym.py
6 | import re
7 |
8 | SYNONYM_DICT = {
9 | "更改": ["更改", '更换'],
10 | "改为": ["更改成", "改为", "更改为"],
11 | "可以": ["可以", "能"],
12 | "下降": ['降低', "下降"],
13 | "为什么": ["为何", "为啥"],
14 | "能不能": ["能不能", "行不行", "可不可以"],
15 | "不能用": ["不能用", "用不了"],
16 | }
17 |
18 | SYNONYM_WRONG = {
19 | "零时额度": "临时额度",
20 | "花贝": '花呗',
21 | "花唄": '花呗',
22 | "花被": '花呗',
23 | "蚂蚁花贝": '花呗',
24 | "蚂蚁花唄": '花呗',
25 | "蚂蚁花被": '花呗',
26 | "蚂蚁花呗": '花呗',
27 | "蚂蚁借呗": '借呗',
28 | "届不了": '借不了',
29 | "为何": "为什么",
30 | "为啥": "为什么",
31 | "下个月": '下月',
32 | "上个月": '上月',
33 | "行不行": '能不能',
34 | "可不可以": '能不能',
35 | "用不了": "不能用",
36 | "不让": '不能',
37 | "不可以": '不能',
38 | "不行": '不能',
39 | "老有": '总有',
40 | "日息": "利息",
41 | "更改成": "改为",
42 | "更改为": "改为",
43 | "更换": "更改",
44 | "能": "可以",
45 | "降低": "下降",
46 | "受到": "收到",
47 | ',': ',',
48 | '?': '?',
49 | '!': '!',
50 | ';': ';',
51 | '***': '0',
52 | }
53 |
54 | PATTERN = [
55 | [re.compile('\*+'), '*'],
56 | [re.compile('\?'), '?'],
57 | [re.compile('\.$'), '。'],
58 | [re.compile('!'), '!'],
59 | [re.compile(','), ','],
60 | [re.compile(';'), ';'],
61 | [re.compile('\s+'), ''],
62 | [re.compile('\ufeff'), ''],
63 | ]
64 |
--------------------------------------------------------------------------------
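
A minimal sketch of how these tables are applied (in the repo, `trim()` in `core/preprocessor.py` runs the regex rewrites and then maps jieba tokens through `SYNONYM_WRONG`; the per-token mapping is only hinted at here). Run from the repository root:

```python
from config.synonym import PATTERN, SYNONYM_WRONG


def normalize(text):
    """Apply the regex rewrites, then map a known misspelling/synonym to its canonical form."""
    for rule, replacement in PATTERN:
        text = rule.sub(replacement, text)
    # the real pipeline segments with jieba first and maps token by token
    return SYNONYM_WRONG.get(text, text)


if __name__ == "__main__":
    print(normalize("花 被"))    # whitespace removed by PATTERN, then mapped to 花呗
    print(normalize("零时额度"))  # corrected to 临时额度
```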
/core/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-25 下午6:02
4 | # @Author : 林利芳
5 | # @File : __init__.py
6 |
--------------------------------------------------------------------------------
/core/load_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-18 下午4:08
4 | # @Author : 林利芳
5 | # @File : load_data.py
6 | from config.config import DATA_PKL, VOCAB_PKL
7 | from core.preprocessor import preprocessor, pad_sequence, trim
8 | from core.utils import load_data, read_csv
9 | import numpy as np
10 | from sklearn.metrics import recall_score, precision_score, f1_score
11 |
12 |
13 | def gen_batch_data(l_x, r_x, l_len, r_len, y, batch_size):
14 | """
15 | Generate batches of data.
16 | :param l_x:
17 | :param r_x:
18 | :param l_len:
19 | :param r_len:
20 | :param y:
21 | :param batch_size:
22 | :return:
23 | """
24 | data_size = len(y)
25 | num_batch = data_size // batch_size + 1
26 |
27 | for ii in range(num_batch):
28 | start, end = ii * batch_size, (ii + 1) * batch_size
29 | start_batch = 0
30 | if end > data_size:
31 | start_batch = end - data_size
32 | start, end = data_size - batch_size, data_size
33 | l_x_batch = l_x[start:end]
34 | r_x_batch = r_x[start:end]
35 | l_len_batch = l_len[start:end]
36 | r_len_batch = r_len[start:end]
37 | y_batch = y[start:end]
38 | yield l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch
39 |
40 |
41 | def load_train_data():
42 | data = load_data(DATA_PKL)
43 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = \
44 | data['train_l_x'], data['val_l_x'], data['train_l_len'], data['val_l_len'], data['train_r_x'], data[
45 | 'val_r_x'], data['train_r_len'], data['val_r_len'], data['train_y'], data['val_y']
46 | train_l_x = np.array(train_l_x)
47 | val_l_x = np.array(val_l_x)
48 | train_l_len = np.array(train_l_len)
49 | val_l_len = np.array(val_l_len)
50 | train_r_x = np.array(train_r_x)
51 | val_r_x = np.array(val_r_x)
52 | train_r_len = np.array(train_r_len)
53 | val_r_len = np.array(val_r_len)
54 | train_y = np.array(train_y)
55 | val_y = np.array(val_y)
56 | return train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y
57 |
58 |
59 | def get_feed_dict(model, l_x, r_x, l_len, r_len, y, batch_size):
60 | """
61 | Build feed_dict batches for the model.
62 | :param model:
63 | :param l_x:
64 | :param r_x:
65 | :param l_len:
66 | :param r_len:
67 | :param y:
68 | :param batch_size:
69 | :return:
70 | """
71 | for l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch in gen_batch_data(
72 | l_x, r_x, l_len, r_len, y, batch_size):
73 | feed_dict = {
74 | model.left_x: l_x_batch,
75 | model.right_x: r_x_batch,
76 | model.y: y_batch,
77 | model.left_seq_lens: l_len_batch,
78 | model.right_seq_lens: r_len_batch
79 | }
80 | yield feed_dict, start_batch
81 |
82 |
83 | def print_info(epoch, step, train_loss, dev_loss, y, pre_y):
84 | loss = round(float(np.mean(train_loss)), 3)
85 | val_loss = round(float(np.mean(dev_loss)), 3)
86 | f1 = round(f1_score(y, pre_y), 4)
87 | recall = round(recall_score(y, pre_y), 4)
88 | precision = round(precision_score(y, pre_y), 4)
89 | print('**************************************************')
90 | print("epoch\t{}\tstep\t{}\ttrain_loss\t{}\tdev_loss\t{}\t".format(epoch, step, loss, val_loss))
91 | print("precision\t{}\trecall\t{}\tf1\t{}\n\n".format(precision, recall, f1))
92 |
93 |
94 | def load_test_data(filename):
95 | vocab = load_data(VOCAB_PKL)
96 | max_len = vocab.max_len
97 | data = read_csv(filename)
98 | data = [kk[:3] for kk in data]
99 | idx, left_x, right_x = zip(*data)
100 |
101 | left_x = [trim(kk) for kk in left_x]
102 | right_x = [trim(kk) for kk in right_x]
103 |
104 | left_x, left_len = pad_sequence(left_x, vocab, max_len)
105 | right_x, right_len = pad_sequence(right_x, vocab, max_len)
106 |
107 | return idx, left_x, left_len, right_x, right_len, vocab
108 |
109 |
110 | def save_test_result(filename, idx, predicts):
111 | import codecs
112 | with codecs.open(filename, 'w', encoding='utf-8') as fp:
113 | for _id, pre in zip(idx, predicts):
114 | fp.writelines('{}\t{}\n'.format(_id, pre))
115 |
--------------------------------------------------------------------------------
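
A minimal sketch of how `gen_batch_data()` handles the final, partially filled batch: the window is shifted back so every batch stays full, and `start_batch` tells the caller how many leading items are repeats to skip (this is what `main.py` relies on with `pre_y[start_batch:]`). Assumes the repository root is the working directory and the project's dependencies (numpy, jieba, scikit-learn) are installed, since importing `core.load_data` pulls in `core.preprocessor`:

```python
import numpy as np

from core.load_data import gen_batch_data

if __name__ == "__main__":
    n, batch_size = 10, 4
    l_x = r_x = np.arange(n)             # stand-ins for the padded id sequences
    l_len = r_len = np.ones(n, dtype=int)
    y = np.zeros(n, dtype=int)
    for *_, y_batch, start_batch in gen_batch_data(l_x, r_x, l_len, r_len, y, batch_size):
        print(len(y_batch), start_batch)
    # prints: 4 0 / 4 0 / 4 2 -> the last batch repeats two earlier items
```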
/core/preprocessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-18 下午3:49
4 | # @Author : 林利芳
5 | # @File : preprocessor.py
6 | import pprint
7 |
8 | from sklearn.model_selection import train_test_split
9 | import numpy as np
10 | from config.config import DATA_PKL, VOCAB_PKL, ATEC_NLP_DATA, ADD_ATEC_NLP_DATA, CORPUS_DATA, EXPEND_ATEC_NLP_DATA, \
11 | WordChar
12 | from core.utils import save_data, read_csv, load_data
13 | from core.word_embedding import Vocab
14 | import re
15 | import jieba
16 | import collections
17 | from config.synonym import SYNONYM_DICT, SYNONYM_WRONG, PATTERN
18 | import itertools
19 | from config.hyperparams import HyperParams as hp
20 | import sys
21 |
22 | try:
23 | reload(sys)
24 | sys.setdefaultencoding('utf8')
25 | except:
26 | pass
27 | jieba.load_userdict(CORPUS_DATA)
28 | PAD = "<PAD>"
29 | UNK = "<UNK>"
30 | PAD2ID = 0
31 | UNK2ID = 1  # ids 0 and 1 are reserved; build_vocab() assigns real words from id 2
32 |
33 |
34 | def extended_corpus(data, is_training=True, filename="train"):
35 | """
36 | Expand the corpus with synonym-substituted variants.
37 | :param data:
38 | :param is_training:
39 | :param filename:
40 | :return:
41 | """
42 | print("同义词替换...\n")
43 | similar_data = []
44 | for sub_data in data:
45 | idx, left_s, right_s, y = sub_data
46 | idx = idx.replace('\ufeff', '')
47 | left_s = trim(left_s)
48 | right_s = trim(right_s)
49 | if is_training:
50 | data = combine_data(idx, left_s, right_s, y)
51 | else:
52 | data = [[idx, ' '.join(left_s), ' '.join(right_s), y]]
53 | similar_data.extend(data)
54 |
55 | save_expend_data(similar_data, EXPEND_ATEC_NLP_DATA.format(filename))
56 |
57 |
58 | # return similar_data
59 |
60 |
61 | def save_expend_data(data, filename):
62 | import codecs
63 | with codecs.open(filename, 'w', encoding='utf-8') as fp:
64 | for line in data:
65 | idx, left_x, right_x, y = line
66 | temp = [idx, left_x, right_x, str(y)]
67 | fp.writelines('\t'.join(temp) + '\n')
68 |
69 |
70 | def synonym_replace(sentence):
71 | """
72 | Synonym replacement: generate variants of a segmented sentence.
73 | :param sentence:
74 | :return:
75 | """
76 | sentences = []
77 | for word in sentence:
78 | words = SYNONYM_DICT.get(word, [word])
79 | sentences.append(words)
80 | sentences = list(set(itertools.product(*sentences)))
81 | result = []
82 | for ii, sub_data in enumerate(sentences):
83 | sub_data = list(sub_data)
84 | if sub_data == sentence:
85 | continue
86 | result.append(sub_data)
87 | return result
88 |
89 |
90 | def combine_data(idx, left_s, right_s, y):
91 | similar_data = [[idx, ' '.join(left_s), ' '.join(right_s), y]]
92 | left_sentence = synonym_replace(left_s)
93 | right_sentence = synonym_replace(right_s)
94 | left_len, right_len = len(left_sentence), len(right_sentence)
95 | max_num = max(left_len, right_len)
96 | if y == '0':
97 | max_num = 0
98 | for sub_s in left_sentence[:max_num]:
99 | temp = [idx, ' '.join(sub_s), ' '.join(right_s), y]
100 | similar_data.append(temp)
101 | for sub_s in right_sentence[:max_num]:
102 | temp = [idx, ' '.join(left_s), ' '.join(sub_s), y]
103 | similar_data.append(temp)
104 | return similar_data
105 |
106 |
107 | # if y == '1':
108 | # for sub_left_s, sub_right_s in zip(left_sentence[:3], right_sentence[:3]):
109 | # temp = [idx, sub_left_s, sub_right_s, y]
110 | # similar_data.append(temp)
111 | #
112 | # if left_len > right_len:
113 | # for sub_left_s, sub_right_s in zip(left_sentence[1:], right_sentence):
114 | # temp = [idx, sub_left_s, sub_right_s, y]
115 | # similar_data.append(temp)
116 | # elif right_len > left_len:
117 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence[1:]):
118 | # temp = [idx, sub_left_s, sub_right_s, y]
119 | # similar_data.append(temp)
120 | # else:
121 | # data = left_sentence.pop()
122 | # left_sentence.insert(0, data)
123 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence):
124 | # temp = [idx, sub_left_s, sub_right_s, y]
125 | # similar_data.append(temp)
126 |
127 |
128 | def trim(text):
129 | for rule, region in PATTERN:
130 | text = rule.sub(region, text)
131 | sentence = list(jieba.cut(text))
132 | for ii, word in enumerate(sentence):
133 | if word in SYNONYM_WRONG:
134 | word = SYNONYM_WRONG.get(word, word)
135 | sentence[ii] = word
136 | return sentence
137 |
138 |
139 | def build_vocab(text, max_len):
140 | """
141 | Build the vocabulary.
142 | :param text: text = [sentence]
143 | :param max_len: int
144 | :return:
145 | """
146 | vocab = []
147 | for sentence in text:
148 | vocab.extend(sentence)
149 | count = collections.Counter(vocab).most_common()
150 | vocab = {v: k + 2 for k, (v, _) in enumerate(count)}
151 | vocab[PAD] = PAD2ID
152 | vocab[UNK] = UNK2ID
153 |
154 | v = Vocab()
155 | v.word2idx = vocab
156 | v.max_len = max_len
157 | return v
158 |
159 |
160 | def process_label(y):
161 | result = []
162 | num = 0
163 | for label in y:
164 | if label == '1':
165 | num += 1
166 | try:
167 | result.append(int(label))
168 | except:
169 | result.append(0)
170 | print("正样本数\t{}\t负样本数\t{}".format(num, len(y) - num))
171 | return result
172 |
173 |
174 | def preprocessor(synonym=False):
175 | """数据预处理"""
176 | if synonym:
177 | data = read_csv(ATEC_NLP_DATA)
178 | data.extend(read_csv(ADD_ATEC_NLP_DATA))
179 | init_num = len(data)
180 | train_data, dev_data = train_test_split(data, test_size=0.1, random_state=50)
181 | extended_corpus(train_data)
182 | extended_corpus(dev_data, False, 'dev')
183 | # expand_num = len(train_data) + len(dev_data)
184 | # print("初始语料\t{}\t扩展语料\t{}\t新增语料\t{}".format(init_num, expand_num, expand_num - init_num))
185 | # else:
186 | train_data = read_csv(EXPEND_ATEC_NLP_DATA.format('train'))
187 | dev_data = read_csv(EXPEND_ATEC_NLP_DATA.format('dev'))
188 | train_idx, train_left_x, train_right_x, train_y = zip(*train_data)
189 | dev_idx, dev_left_x, dev_right_x, dev_y = zip(*dev_data)
190 |
191 | train_left_x = split_data(train_left_x)
192 |
193 | train_right_x = split_data(train_right_x)
194 | dev_left_x = split_data(dev_left_x)
195 | dev_right_x = split_data(dev_right_x)
196 | train_y = process_label(train_y)
197 | dev_y = process_label(dev_y)
198 | max_len = max(len(kk) for kk in train_left_x + train_right_x + dev_right_x + dev_left_x)
199 | vocab = build_vocab(train_left_x + train_right_x + dev_right_x + dev_left_x, max_len)
200 |
201 | print("最大长度\t{}\t词汇量\t{}".format(max_len, len(vocab.word2idx)))
202 |
203 | train_left_x, train_left_len = pad_sequence(train_left_x, vocab, max_len)
204 | train_right_x, train_right_len = pad_sequence(train_right_x, vocab, max_len)
205 | dev_left_x, dev_left_len = pad_sequence(dev_left_x, vocab, max_len)
206 | dev_right_x, dev_right_len = pad_sequence(dev_right_x, vocab, max_len)
207 |
208 | data = {
209 | "train_l_x": train_left_x,
210 | "train_r_x": train_right_x,
211 | "train_l_len": train_left_len,
212 | "train_r_len": train_right_len,
213 | "train_y": train_y,
214 | "val_l_x": dev_left_x,
215 | "val_r_x": dev_right_x,
216 | "val_l_len": dev_left_len,
217 | "val_r_len": dev_right_len,
218 | "val_y": dev_y,
219 | }
220 | save_data(DATA_PKL, data)
221 | save_data(VOCAB_PKL, vocab)
222 | return data, vocab
223 |
224 |
225 | def split_data(data):
226 | result = []
227 | if WordChar == 'char':
228 | for sentence in data:
229 | sentence = sentence.replace(' ', '')
230 | new_sentence = [char for char in sentence]
231 | result.append(new_sentence)
232 | else:
233 | for sentence in data:
234 | sentence = sentence.split(' ')
235 | result.append(sentence)
236 | return result
237 |
238 |
239 | def pad_sequence(data, vocab, max_len):
240 | """
241 | Pad sequences to max_len and record their true lengths.
242 | :param data:
243 | :param vocab:
244 | :param max_len:
245 | :return:
246 | """
247 | seqs_data = []
248 | seqs_len = []
249 | for sentence in data:
250 | seq_len = len(sentence)
251 | seqs_len.append(len(sentence))
252 | sentence = [vocab.word2idx.get(kk, UNK2ID) for kk in sentence] + [PAD2ID] * (max_len - seq_len)
253 | seqs_data.append(sentence[:max_len])
254 | return seqs_data, seqs_len
255 |
--------------------------------------------------------------------------------
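
A minimal usage sketch of `build_vocab()` and `pad_sequence()` on toy, already-segmented data (run from the repository root with jieba and scikit-learn installed, since importing this module loads them; the exact ids shown are indicative):

```python
from core.preprocessor import build_vocab, pad_sequence

if __name__ == "__main__":
    text = [["花呗", "如何", "还款"], ["花呗", "怎么", "还款"]]
    max_len = 5
    vocab = build_vocab(text, max_len)   # ids 0 and 1 are reserved for <PAD>/<UNK>
    ids, lens = pad_sequence(text, vocab, max_len)
    print(lens)      # [3, 3]
    print(ids[0])    # e.g. [2, 4, 3, 0, 0] -- trailing positions padded with the <PAD> id
```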
/core/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # @author: Linlifang
5 | # @file: utils.py
6 | # @time: 18-6-27下午6:13
7 | import csv
8 |
9 | try:
10 | import cPickle as pickle
11 | except:
12 | import pickle
13 | try:
14 | import sys
15 |
16 | reload(sys)
17 | sys.setdefaultencoding('utf8')
18 | except:
19 | pass
20 |
21 |
22 | def read_csv(filename, delimiter='\t'):
23 | """
24 | Read a delimited csv/tsv file.
25 | :param filename:
26 | :param delimiter:
27 | :return:
28 | """
29 | import codecs
30 | with codecs.open(filename, 'r', encoding='utf-8') as fp:
31 | data = [[ii for ii in each] for each in csv.reader(fp, delimiter=delimiter)]
32 | return data
33 |
34 |
35 | def load_text(filename):
36 | """
37 | Load whitespace-tokenized text.
38 | :param filename:
39 | :return:
40 | """
41 | data = []
42 | with open(filename, 'r') as fp:
43 | for idx, line in enumerate(fp):
44 | line = line.strip('\n')
45 | tokens = line.split()
46 | data.append(tokens)
47 | return data
48 |
49 |
50 | def load_data(filename):
51 | """
52 | Load pickled data (vocabulary or dataset).
53 | :return:
54 | """
55 | try:
56 | with open(filename, 'rb') as fp:
57 | data = pickle.load(fp)
58 | except:
59 | with open('data/vocab2.pkl', 'rb') as fp:
60 | data = pickle.load(fp)
61 | return data
62 |
63 |
64 | def save_data(filename, data):
65 | with open(filename, 'wb') as fp:
66 | pickle.dump(data, fp)
67 |
--------------------------------------------------------------------------------
/core/word_embedding.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-17 下午5:08
4 | # @Author : 林利芳
5 | # @File : word_embedding.py
6 | import numpy as np
7 |
8 | from config.config import WORD2VEC_DATA
9 |
10 |
11 | class Vocab(object):
12 | def __init__(self):
13 | self.word2vec = []
14 | self.word2idx = {'<PAD>': 0, '<UNK>': 1}
15 | self.max_len = 0
16 |
17 | def add_word(self, word, vector):
18 | self.word2idx[word] = len(self.word2idx)
19 | self.word2vec.append(vector)
20 |
21 | def load_word_vectors(self):
22 | with open(WORD2VEC_DATA, 'r') as f:
23 | vocab_size, embedding_dim = [int(_) for _ in f.readline().strip().split(' ')]
24 | self.word2vec = [[0.0] * embedding_dim]
25 | self.word2vec.append(np.random.uniform(-0.25, 0.25, embedding_dim).round(6).tolist())
26 | lines = f.readlines()
27 | for line in lines:
28 | word, vector = line.strip().split(' ', 1)
29 | self.add_word(word, [float(_) for _ in vector.split(' ')])
30 | self.word2vec = np.array(self.word2vec).astype(np.float32)
31 |
--------------------------------------------------------------------------------
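
A minimal sketch of the word2vec text format `load_word_vectors()` expects: a `"vocab_size embedding_dim"` header line followed by one `"word v1 v2 ..."` line per word. `data/char2vec_300` itself is git-ignored, so a hypothetical toy file is written here instead (run from the repository root):

```python
import core.word_embedding as we

if __name__ == "__main__":
    toy_path = "data/char2vec_toy"   # hypothetical throw-away file for illustration
    with open(toy_path, "w") as f:
        f.write("2 3\n花 0.1 0.2 0.3\n呗 0.4 0.5 0.6\n")
    we.WORD2VEC_DATA = toy_path      # point the loader at the toy file
    vocab = we.Vocab()
    vocab.load_word_vectors()
    print(vocab.word2vec.shape)      # (4, 3): <PAD> row, random <UNK> row, then the 2 words
    print(vocab.word2idx["花"])      # 2
```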
/data/corpus.txt:
--------------------------------------------------------------------------------
1 | 怎么
2 | 怎样
3 | 如何
4 | 更改
5 | 更换
6 | 更新
7 | 修改
8 | 未
9 | 没有
10 | 可以
11 | 为什么
12 | 为何
13 | 为啥
14 | 零时额度
15 | 临时额度
16 | 这么久
17 | 降低
18 | 下降
19 | 日息
20 | 不能
21 | 不让
22 | 不可以
23 | 能不能
24 | 行不行
25 | 可不可以
26 | 用不了
27 | 不能用
28 | 被冻结
29 | 被封了
30 | 下月
31 | 下个月
32 | 蚂蚁借呗
33 | 借呗
34 | 花呗
35 | 花贝
36 | 花唄
37 | 花被
38 | 蚂蚁借呗
39 | 蚂蚁花呗
40 | 蚂蚁花贝
41 | 蚂蚁花唄
42 | 蚂蚁花被
43 | ***
44 | ofo
45 | 借呗
46 | 余额宝
47 | 代扣完
48 | 更改成
49 | 用了
50 | 届不了
51 | 借不了
52 | 上个月
53 | 上月
54 | 老有
55 | 总有
--------------------------------------------------------------------------------
/logdir/graph/match_pyramid:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/match_pyramid
--------------------------------------------------------------------------------
/logdir/graph/siamese.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/siamese.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-18 下午5:46
4 | # @Author : 林利芳
5 | # @File : main.py
6 | import os
7 | import sys
8 |
9 | from core.load_data import get_feed_dict, load_test_data, save_test_result
10 | from config.config import checkpoint_dir, TEST_DATA, TEST_RESULT
11 | from model.match_pyramid import MatchPyramidNetwork
12 | from model.rnn_siamese import RnnSiameseNetwork
13 | from config.hyperparams import HyperParams as hp
14 | import tensorflow as tf
15 | import numpy as np
16 |
17 |
18 | def test(filename=TEST_DATA, outfile=TEST_RESULT, network='rnn'):
19 | checkpoint_file = checkpoint_dir.format(network)
20 | idx, left_x, left_len, right_x, right_len, vocab = load_test_data(filename)
21 | y = np.ones(len(idx), dtype=np.int32)  # placeholder labels; the test set has no gold labels
22 | vocab_size = len(vocab.word2idx)
23 | if network == 'rnn':
24 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False)
25 | elif network == 'match_pyramid':
26 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False)
27 | else:
28 | return
29 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=0)
30 | with sv.managed_session() as sess:
31 | predicts = []
32 | for feed_dict, start_batch in get_feed_dict(model, left_x, right_x, left_len, right_len, y, hp.batch_size):
33 | pre_y, distance = sess.run([model.pre_y, model.distance], feed_dict=feed_dict)
34 | predicts.extend(pre_y[start_batch:])
35 | save_test_result(outfile, idx, predicts)
36 |
37 |
38 | if __name__ == "__main__":
39 | test(sys.argv[1], sys.argv[2], 'rnn')
40 |
--------------------------------------------------------------------------------
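
A minimal sketch of running inference with the trained RNN siamese model (assumes a checkpoint exists under `logdir/checkpoints-rnn` and that `data/data.pkl` / `data/vocab.pkl` were produced by the preprocessing step):

```python
from main import test
from config.config import TEST_DATA, TEST_RESULT

if __name__ == "__main__":
    # equivalent to: python main.py data/atec_nlp_sim_test.csv data/test_result.csv
    test(TEST_DATA, TEST_RESULT, network="rnn")
```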
/model/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-29 下午3:17
4 | # @Author : 林利芳
5 | # @File : __init__.py
6 |
--------------------------------------------------------------------------------
/model/cnn_siamese.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-29 下午3:07
4 | # @Author : 林利芳
5 | # @File : cnn_siamese.py
6 | import tensorflow as tf
7 | from config.hyperparams import CnnParams as hp
8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize
9 |
10 |
11 | class CnnSiameseNetwork(object):
12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True):
13 | self.vocab_size = vocab_size
14 | self.embedding_size = embedding_size
15 | self.max_len = max_len
16 | self.is_training = is_training
17 | self.graph = tf.Graph()
18 | with self.graph.as_default():
19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x")
20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x")
21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target")
22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
24 | self.global_step = tf.train.create_global_step()
25 |
26 | key, value = self.siamese()
27 | self.distance, self.pre_y = self.similar(key, value)
28 | self.accuracy = self.predict()
29 | self.loss = self.loss_layer()
30 | self.train_op = self.optimize()
31 |
32 | def siamese(self):
33 | """
34 | Siamese network: shared embedding followed by stacked CNN layers and mean pooling.
35 | :return:
36 | """
37 | x = tf.concat([self.left_x, self.right_x], axis=0)
38 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0)
39 | # layers embedding multi_head_attention rnn
40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed")
41 |
42 | # output = self.transformer(embed, x)
43 | inputs = tf.expand_dims(embed, -1)
44 | output = self.cnn_layer(inputs, 1)
45 | output = tf.expand_dims(output, -1)
46 | output = self.cnn_layer(output, 2)
47 | output = self.attention(embed, output)
48 | key, value = tf.split(output, 2, axis=0)
49 | return key, value
50 |
51 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg):
52 | """
53 | Build a bidirectional RNN layer.
54 | :param inputs:
55 | :param seq_lens:
56 | :param seg: LSTM GRU F-LSTM, IndRNN
57 | :return:
58 | """
59 | if seg == 'LSTM':
60 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
61 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
62 |
63 | elif seg == 'GRU':
64 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
65 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
66 | else:
67 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
68 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
69 | # bidirectional rnn
70 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn(
71 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32)
72 | # merge the forward/backward outputs by summation: batch_size * max_seq * hidden_dim
73 | output = tf.add(fw_output, bw_output)
74 | return output
75 |
76 | def cnn_layer(self, inputs, layer=1):
77 | """
78 | Convolution block: one convolution per kernel width in hp.kernel, ReLU activation, outputs concatenated.
79 | :param inputs: batch * T * D * channel tensor
80 | :param layer: layer index, used only for variable scoping
81 | :return:
82 | """
83 | outputs = []
84 | d_dim, channel = inputs.get_shape().as_list()[-2:]
85 | for ii, width in enumerate(hp.kernel):
86 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)):
87 | weight = tf.Variable(tf.truncated_normal([width, d_dim, channel, hp.channel], stddev=0.1, name='w'))
88 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0))
89 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, d_dim, 1], padding='SAME') # batch T T channel
90 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC"))
91 |
92 | output = tf.reshape(output, shape=[-1, self.max_len, hp.channel])
93 | outputs.append(output)
94 | outputs = tf.concat(outputs, axis=-1)
95 | return outputs
96 |
97 | def transformer(self, embed, value):
98 | with tf.variable_scope("Transformer_Encoder"):
99 | # Positional Encoding
100 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post")
101 | # Dropout
102 | output = self.multi_head_block(embed)
103 | return output
104 |
105 | def multi_head_block(self, query, causality=False):
106 | """
107 | Multi-head attention blocks.
108 | :param query:
109 | :param causality:
110 | :return:
111 | """
112 | for i in range(hp.num_blocks):
113 | with tf.variable_scope("num_blocks_{}".format(i)):
114 | # multi head Attention ( self-attention)
115 | query = multihead_attention(
116 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads,
117 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality,
118 | scope="self_attention")
119 | # Feed Forward
120 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units])
121 | return query
122 |
123 | def loss_layer(self):
124 | """
125 | Contrastive loss: L+ = (1 - Ew)^2 / 4, L- = Ew^2 if Ew > margin else 0
126 | :return:
127 | """
128 | y = tf.cast(self.y, tf.float32)
129 | with tf.name_scope("output"):
130 | loss_p = tf.square(1 - self.distance) / 4
131 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin))
132 | loss_m = tf.square(mask * self.distance)
133 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m)
134 | return loss
135 |
136 | def attention(self, embed, query):
137 | """
138 | Attention pooling.
139 | :param embed:
140 | :param query:
141 | :return:
142 | """
143 | output = tf.reduce_mean(query, axis=1)  # mean pooling over time
144 | return output  # NOTE: returning here leaves the attention code below unreachable
145 | with tf.name_scope("attention"):
146 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32)
147 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32)
148 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32)
149 | value = tf.concat([embed, query], axis=-1)
150 | value = tf.reshape(value, [-1, 2 * hp.num_units])
151 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u)
152 | attention = tf.reshape(attention, shape=[-1, self.max_len])
153 | attention = tf.nn.softmax(attention, axis=-1)
154 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units])
155 |
156 | output = tf.reduce_sum(attention * query, axis=1)
157 | output = layer_normalize(output)
158 | return output
159 |
160 | @staticmethod
161 | def similar(key, value):
162 | """
163 | cosine(key,value) = key * value/(|key|*|value|)
164 | :param key:
165 | :param value:
166 | :return:
167 | """
168 | dot_value = tf.reduce_sum(key * value, axis=-1)
169 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps)
170 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps)
171 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar")
172 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin))
173 | pre_y = tf.cast(pre_y, tf.int32, name='pre')
174 | return distance, pre_y
175 |
176 | def predict(self):
177 | correct_predictions = tf.equal(self.pre_y, self.y)
178 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
179 | return accuracy
180 |
181 | def optimize(self):
182 | """
183 | Adam optimizer.
184 | :return:
185 | """
186 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
187 | train_op = optimizer.minimize(self.loss, global_step=self.global_step)
188 | return train_op
189 |
--------------------------------------------------------------------------------
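
A minimal NumPy sketch of the decision rule in `similar()` above: cosine similarity between the two sentence vectors, thresholded at the margin (prediction 1 iff the similarity exceeds `hp.margin`, which is 0.01 for `CnnParams`):

```python
import numpy as np


def cosine_predict(key, value, margin=0.01, eps=1e-9):
    """Cosine similarity per row, then a hard threshold at the margin."""
    dot = np.sum(key * value, axis=-1)
    norm = np.sqrt(np.sum(key ** 2, axis=-1) + eps) * np.sqrt(np.sum(value ** 2, axis=-1) + eps)
    distance = dot / norm
    pre_y = (distance > margin).astype(np.int32)
    return distance, pre_y


if __name__ == "__main__":
    key = np.array([[1.0, 0.0], [1.0, 1.0]])
    value = np.array([[0.0, 1.0], [1.0, 0.9]])
    print(cosine_predict(key, value))  # orthogonal pair -> 0, near-parallel pair -> 1
```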
/model/match_pyramid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-25 上午11:17
4 | # @Author : 林利芳
5 | # @File : match_pyramid.py
6 | import tensorflow as tf
7 | from config.hyperparams import MatchPyramidParams as hp
8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize
9 |
10 |
11 | class MatchPyramidNetwork(object):
12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True):
13 | self.vocab_size = vocab_size
14 | self.embedding_size = embedding_size
15 | self.max_len = max_len
16 | self.is_training = is_training
17 | self.graph = tf.Graph()
18 | with self.graph.as_default():
19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x")
20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x")
21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target")
22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
24 | self.global_step = tf.train.create_global_step()
25 |
26 | outputs = self.match_pyramid()
27 | outputs, self.pre_y = self.multi_dense_layer(outputs)
28 | self.acc = self.predict()
29 | self.loss = self.loss_layer(outputs)
30 | self.train_op = self.optimize()
31 |
32 | def match_pyramid(self):
33 | """
34 | pyramid
35 | :return:
36 | """
37 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True,
38 | scope="left_embed")
39 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True,
40 | scope="right_embed")
41 | outputs = self.match_text(left_embed, right_embed)
42 | outputs = self.cnn_layer(outputs, 1)
43 | outputs = self.cnn_layer(outputs, 2)
44 | return outputs
45 |
46 | @staticmethod
47 | def match_text(left_embed, right_embed):
48 | """
49 | Text matching matrices: dot product, cosine and binary (exact match) channels.
50 | :param left_embed: word embeddings, batch * T * D
51 | :param right_embed: word embeddings, batch * T * D
52 | :return:
53 | """
54 | with tf.variable_scope("match-text"):
55 | dot_output = tf.matmul(left_embed, tf.transpose(right_embed, [0, 2, 1])) # batch * T * T
56 | left_norm = tf.sqrt(tf.matmul(left_embed, tf.transpose(left_embed, [0, 2, 1]))+hp.eps)
57 | right_norm = tf.sqrt(tf.matmul(right_embed, tf.transpose(right_embed, [0, 2, 1]))+hp.eps)
58 | cosine_outputs = tf.div(dot_output, left_norm * right_norm)
59 | binary_outputs = tf.cast(tf.equal(cosine_outputs, 1), tf.float32)
60 | dot_output = tf.expand_dims(dot_output, axis=-1)
61 | cosine_outputs = tf.expand_dims(cosine_outputs, axis=-1)
62 | binary_outputs = tf.expand_dims(binary_outputs, axis=-1)
63 |
64 | outputs = tf.concat([dot_output, cosine_outputs, binary_outputs], axis=-1)
65 | # outputs: batch * T * T * 3 (dot, cosine and binary matching channels)
66 | return outputs
67 |
68 | @staticmethod
69 | def cnn_layer(inputs, layer=1):
70 | """
71 | Convolution block: one convolution per kernel width in hp.kernel, ReLU activation, 2x2 max pooling.
72 | :param inputs: batch * T * T * channel matching tensor
73 | :param layer: layer index, used only for variable scoping
74 | :return:
75 | """
76 | outputs = []
77 | channel = inputs.get_shape().as_list()[-1]
78 | for ii, width in enumerate(hp.kernel):
79 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)):
80 | weight = tf.Variable(tf.truncated_normal([width, width, channel, hp.channel], stddev=0.1, name='w'))
81 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0))
82 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, 1, 1], padding='SAME') # batch T T channel
83 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC"))
84 | pool = tf.nn.max_pool(output, ksize=[1, hp.pool_size, hp.pool_size, 1], strides=[1, 1, 1, 1],
85 | padding='VALID')
86 | outputs.append(pool)
87 | outputs = tf.concat(outputs, axis=-1)
88 | return outputs
89 |
90 | @staticmethod
91 | def multi_dense_layer(inputs):
92 | """
93 | Multi-layer perceptron: T*T*channel -> dense_size -> 2 (softmax)
94 | :param inputs: batch T T channel
95 | :return:
96 | """
97 | _, width, height, channel = inputs.get_shape().as_list()
98 | size = width * height * channel
99 | inputs = tf.reshape(inputs, shape=[-1, size])
100 | with tf.variable_scope("dense_layer"):
101 | w = tf.get_variable(name='w', dtype=tf.float32, shape=[size, hp.dense_size])
102 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[hp.dense_size])
103 | outputs = layer_normalize(tf.matmul(inputs, w) + b, )
104 |
105 | with tf.variable_scope("logit_layer"):
106 | w = tf.get_variable(name='w', dtype=tf.float32, shape=[hp.dense_size, 2])
107 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[2])
108 | outputs = tf.nn.softmax(tf.matmul(outputs, w) + b, axis=-1)
109 | pre_y = tf.cast(tf.argmax(outputs, axis=-1), dtype=tf.int32)
110 | return outputs, pre_y
111 |
112 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg):
113 | """
114 | Build a bidirectional RNN layer.
115 | :param inputs:
116 | :param seq_lens:
117 | :param seg: LSTM GRU F-LSTM, IndRNN
118 | :return:
119 | """
120 | if seg == 'LSTM':
121 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
122 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
123 |
124 | elif seg == 'GRU':
125 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
126 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
127 | else:
128 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
129 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
130 | # bidirectional rnn
131 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn(
132 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32)
133 | # merge the forward/backward outputs by summation: batch_size * max_seq * hidden_dim
134 | output = tf.add(fw_output, bw_output)
135 | return output
136 |
137 | def transformer(self, embed, value):
138 | with tf.variable_scope("Transformer_Encoder"):
139 | # Positional Encoding
140 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post")
141 | # Dropout
142 | output = self.multi_head_block(embed)
143 | return output
144 |
145 | def multi_head_block(self, query, causality=False):
146 | """
147 | Multi-head attention blocks.
148 | :param query:
149 | :param causality:
150 | :return:
151 | """
152 | for i in range(hp.num_blocks):
153 | with tf.variable_scope("num_blocks_{}".format(i)):
154 | # multi head Attention ( self-attention)
155 | query = multihead_attention(
156 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads,
157 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality,
158 | scope="self_attention")
159 | # Feed Forward
160 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units])
161 | return query
162 |
163 | def loss_layer(self, inputs):
164 | """
165 | Cross-entropy loss over the two-way softmax output.
166 | :return:
167 | """
168 | y = tf.cast(self.y, tf.float32)
169 | with tf.name_scope("loss_layer"):
170 | loss_p = y * tf.log(tf.clip_by_value(inputs[:, -1], hp.eps, 1.0))
171 | loss_m = (1 - y) * tf.log(tf.clip_by_value(inputs[:, 0], hp.eps, 1.0))
172 | loss = -tf.reduce_sum(loss_p + loss_m)
173 | return loss
174 |
175 | def attention(self, embed, query):
176 | """
177 | Attention pooling (not called in this model).
178 | :param embed:
179 | :param query:
180 | :return:
181 | """
182 | with tf.name_scope("attention"):
183 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32)
184 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32)
185 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32)
186 | value = tf.concat([embed, query], axis=-1)
187 | value = tf.reshape(value, [-1, 2 * hp.num_units])
188 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u)
189 | attention = tf.reshape(attention, shape=[-1, self.max_len])
190 | attention = tf.nn.softmax(attention, axis=-1)
191 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units])
192 |
193 | output = tf.reduce_sum(attention * query, axis=1)
194 | output = layer_normalize(output)
195 | return output
196 |
197 | def predict(self):
198 | correct_predictions = tf.equal(self.pre_y, self.y)
199 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
200 | return accuracy
201 |
202 | def optimize(self):
203 | """
204 | Adam optimizer.
205 | :return:
206 | """
207 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
208 | train_op = optimizer.minimize(self.loss, global_step=self.global_step)
209 | return train_op
210 |
--------------------------------------------------------------------------------
/model/module/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-25 下午6:02
4 | # @Author : 林利芳
5 | # @File : __init__.py
6 |
--------------------------------------------------------------------------------
/model/module/feature.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-11 17:44
4 | # @Author : Lin lifang
5 | # @File : feature.py
6 | from utils.utils import read_template  # NOTE: utils.utils is not part of this repository; read_template comes from an external package
7 | import numpy as np
8 |
9 |
10 | class Feature(object):
11 | def __init__(self, fd=5):
12 | self.fd = fd
13 | self.fss = None
14 | self.bf_size = 0
15 | self.uf_size = 0
16 | self.f_size = 0
17 | self.num_k = 0
18 | self.node_obs = dict()
19 | self.edge_obs = dict()
20 | self.oby_dict = dict()
21 | self.node_fs = []
22 | self.edge_fs = []
23 | self.tp_list = [
24 | ['U00', ['-2', '0']],
25 | ['U01', ['-1', '0']],
26 | ['U02', ['0', '0']],
27 | ['U03', ['1', '0']],
28 | ['U04', ['2', '0']],
29 | ['U05', ['-2', '0'], ['-1', '0'], ['0', '0']],
30 | ['U06', ['-1', '0'], ['0', '0'], ['1', '0']],
31 | ['U07', ['0', '0'], ['1', '0'], ['2', '0']],
32 | ['U08', ['-1', '0'], ['0', '0']],
33 | ['U09', ['0', '0'], ['1', '0']],
34 | ['B'], ]
35 |
36 | def process_features(self, texts):
37 | """
38 | Feature extraction.
39 | :param texts: tokenized sequences, e.g. [[['你'], ['好']], [['你'], ['好']]]
40 | :return:
41 | """
42 | print("特征提取...")
43 | uf_obs = dict()
44 | bf_obs = dict()
45 |
46 | for text in texts:
47 | seq_uf, seq_bf = self.feature_vector(text)
48 | for loc_id, (loc_uf, loc_bf) in enumerate(zip(seq_uf, seq_bf)):
49 | for fs in loc_bf:
50 | fs_id = bf_obs.get(fs)
51 | bf_obs[fs] = fs_id + 1 if fs_id is not None else 1
52 | for fs in loc_uf:
53 | fs_id = uf_obs.get(fs)
54 | uf_obs[fs] = fs_id + 1 if fs_id is not None else 1
55 |
56 | node_fs = [key for key, v in sorted(uf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd]
57 | edge_fs = [key for key, v in sorted(bf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd]
58 | self.node_obs = {key: kk * self.num_k for kk, key in enumerate(node_fs)}
59 | self.edge_obs = {key: kk * self.num_k * self.num_k for kk, key in enumerate(edge_fs)}
60 |
61 | self.uf_size = len(node_fs) * self.num_k
62 | self.bf_size = len(edge_fs) * self.num_k * self.num_k
63 | self.f_size = self.uf_size + self.bf_size
64 | print("B 特征:\t{}\nU 特征:\t{}\n总特征:\t{}\n".format(self.bf_size, self.uf_size, self.f_size))
65 |
66 | def feature_vector(self, text, init=True):
67 | """
68 | Expand one sequence into its template features.
69 | :param text:
70 | :param init:
71 | :return:
72 | """
73 | seq_bf = []
74 | seq_uf = []
75 | for loc_id in range(len(text)):
76 | loc_uf, loc_bf = self.expand_observation(text, loc_id, init)
77 | seq_bf.append(loc_bf)
78 | seq_uf.append(loc_uf)
79 | return seq_uf, seq_bf
80 |
81 | def expand_observation(self, sentence, loc_id, init=True):
82 | """
83 | Expand the observation at loc_id for a sequence.
84 | :param sentence: character sequence
85 | :param loc_id: position of the character within sentence
86 | :param init: whether this is the initial (string-building) pass
87 | :return:
88 | """
89 | loc_uf = []
90 | loc_bf = []
91 | for tp in self.tp_list:
92 | fs = tp[0]
93 | for li in tp[1::]:
94 | row = loc_id + int(li[0])
95 | col = int(li[1])
96 | if len(sentence) > row >= 0:
97 | if len(sentence[row]) > col >= 0:
98 | fs += ":" + sentence[row][col]
99 | else:
100 | fs += ':B' + li[0]
101 | if fs[0] == "U":
102 | if init:
103 | loc_uf.append(fs)
104 | else:
105 | fs_id = self.node_obs.get(fs)
106 | if fs_id is not None:
107 | loc_uf.append(fs_id)
108 | if fs[0] == "B":
109 | if init:
110 | loc_bf.append(fs)
111 | else:
112 | fs_id = self.edge_obs.get(fs)
113 | if fs_id is not None:
114 | loc_bf.append(fs_id)
115 | return loc_uf, loc_bf
116 |
117 | def cal_observe_on(self, texts, init=False):
118 | """
119 | Convert texts to feature ids, e.g. [[['U:你','U:你:好'], ...], ...] -> [[[145, 456, 566], [3455]], ...]
120 | :param texts:
121 | :param init:
122 | :return:
123 | """
124 | self.node_fs = []
125 | self.edge_fs = []
126 | for text in texts:
127 | seq_uf, seq_bf = self.feature_vector(text, init)
128 | self.node_fs.append(seq_uf)
129 | self.edge_fs.append(seq_bf)
130 | return self.node_fs, self.edge_fs
131 |
132 | def cal_fss(self, labels, y0):
133 | """
134 | Accumulate empirical feature counts; each observation feature expands to num_k (or num_k * num_k) weights.
135 | :param labels: label sequences
136 | :param y0: initial previous-state value (0)
137 | :return:
138 | """
139 | self.fss = np.zeros((self.f_size,))
140 | fss_b = self.fss[0:self.bf_size]
141 | fss_u = self.fss[self.bf_size:]
142 | for seq_id, label in enumerate(labels):
143 | y_p = y0
144 | for loc_id, y in enumerate(label):
145 | for fs_id in self.node_fs[seq_id][loc_id]:
146 | fss_u[fs_id + y] += 1.0
147 | for fs_id in self.edge_fs[seq_id][loc_id]:
148 | fss_b[fs_id + y_p * self.num_k + y] += 1.0
149 | y_p = y
150 |
151 | def save_feature(self):
152 | result = ['#CRF Feature Templates.\n\n']
153 | for tp in self.tp_list:
154 | feature = tp[0] + ':'
155 | for start, end in tp[1:]:
156 | feature += '%x[' + start + ',' + end + ']'
157 | result.append(feature)
158 | result.append('\n\n#U')
159 | u_feature = list(sorted(self.node_obs.keys(), key=lambda x: x))
160 | result.extend(u_feature)
161 | with open('feature.txt', 'w', encoding='utf-8') as fp:
162 | fp.write('\n'.join(result))
163 |
164 | def process_state(self, labels):
165 | """
166 | Map label strings to integer state ids.
167 | :param labels:
168 | :return:
169 | """
170 | new_label = []
171 | oby_id = 0
172 | for sentence in labels:
173 | s_label = []
174 | for label in sentence:
175 | label_id = self.oby_dict.get(label)
176 | if label_id is None:
177 | label_id = oby_id
178 | self.oby_dict[label] = oby_id
179 | oby_id += 1
180 | s_label.append(label_id)
181 | new_label.append(s_label)
182 | self.num_k = len(self.oby_dict)
183 | return new_label
184 |
185 | def __call__(self, texts, labels, template_file, y0=0, *args, **kwargs):
186 | if template_file:
187 | self.tp_list = read_template(template_file)
188 | self.seq_lens = [len(x) for x in labels]
189 | labels = self.process_state(labels)
190 | self.process_features(texts)
191 | self.cal_observe_on(texts)
192 | self.cal_fss(labels, y0)
193 | self.save_feature()
194 |
--------------------------------------------------------------------------------
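
A minimal sketch of how one CRF template from `tp_list` expands into an observation string at a given position (mirrors `Feature.expand_observation()` with `init=True`; the toy sentence is purely illustrative):

```python
def expand(template, sentence, loc_id):
    """Expand a single template, e.g. ['U01', ['-1', '0']], at position loc_id."""
    fs = template[0]
    for li in template[1:]:
        row, col = loc_id + int(li[0]), int(li[1])
        if 0 <= row < len(sentence):
            fs += ":" + sentence[row][col]
        else:
            fs += ":B" + li[0]  # out-of-range positions get a boundary marker
    return fs


if __name__ == "__main__":
    sentence = [["你"], ["好"], ["吗"]]  # one observation column per character
    print(expand(["U01", ["-1", "0"]], sentence, 1))                          # U01:你
    print(expand(["U06", ["-1", "0"], ["0", "0"], ["1", "0"]], sentence, 1))  # U06:你:好:吗
    print(expand(["U00", ["-2", "0"]], sentence, 0))                          # U00:B-2
```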
/model/module/modules.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-25 上午10:37
4 | # @Author : 林利芳
5 | # @File : modules.py
6 |
7 | from __future__ import print_function
8 | import tensorflow as tf
9 | import numpy as np
10 |
11 |
12 | def layer_normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
13 | """Applies layer normalization.
14 | Args:
15 | inputs: A tensor with 2 or more dimensions, where the first dimension has
16 | `batch_size`.
17 | epsilon: A floating number. A very small number for preventing ZeroDivision Error.
18 | scope: Optional scope for `variable_scope`.
19 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
20 | Returns:
21 | A tensor with the same shape and data dtype as `inputs`.
22 | """
23 | with tf.variable_scope(scope, reuse=reuse):
24 | inputs_shape = inputs.get_shape()
25 | params_shape = inputs_shape[-1:]
26 |
27 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
28 | beta = tf.Variable(tf.zeros(params_shape))
29 | gamma = tf.Variable(tf.ones(params_shape))
30 | normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
31 | outputs = gamma * normalized + beta
32 |
33 | return outputs
34 |
35 |
36 | def embedding(inputs, vocab_size, num_units, zero_pad=True, scale=True, scope="embedding", reuse=None):
37 | """Embeds a given tensor.
38 | Args:
39 | inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up in `lookup table`.
40 | vocab_size: An int. Vocabulary size.
41 | num_units: An int. Number of embedding hidden units.
42 | zero_pad: A boolean. If True, all the values of the first row (id 0) should be constant zeros.
43 | scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
44 | scope: Optional scope for `variable_scope`.
45 | reuse: Boolean, whether to reuse the weights of a previous layer
46 | by the same name.
47 |
48 | Returns:
49 | A `Tensor` with one more rank than inputs's. The last dimensionality should be `num_units`.
50 |
51 | For example,
52 |
53 | ```
54 | import tensorflow as tf
55 |
56 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
57 | outputs = embedding(inputs, 6, 2, zero_pad=True)
58 | with tf.Session() as sess:
59 | sess.run(tf.global_variables_initializer())
60 | print(sess.run(outputs))
61 | >>
62 | [[[ 0. 0. ]
63 | [ 0.09754146 0.67385566]
64 | [ 0.37864095 -0.35689294]]
65 |
66 | [[-1.01329422 -1.09939694]
67 | [ 0.7521342 0.38203377]
68 | [-0.04973143 -0.06210355]]]
69 | ```
70 |
71 | ```
72 | import tensorflow as tf
73 |
74 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
75 | outputs = embedding(inputs, 6, 2, zero_pad=False)
76 | with tf.Session() as sess:
77 | sess.run(tf.global_variables_initializer())
78 | print(sess.run(outputs))
79 | >>
80 | [[[-0.19172323 -0.39159766]
81 | [-0.43212751 -0.66207761]
82 | [ 1.03452027 -0.26704335]]
83 |
84 | [[-0.11634696 -0.35983452]
85 | [ 0.50208133 0.53509563]
86 | [ 1.22204471 -0.96587461]]]
87 | ```
88 | """
89 | with tf.variable_scope(scope, reuse=reuse):
90 | lookup_table = tf.get_variable('lookup_table', dtype=tf.float32, shape=[vocab_size, num_units],
91 | initializer=tf.contrib.layers.xavier_initializer())
92 | if zero_pad:
93 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
94 | outputs = tf.nn.embedding_lookup(lookup_table, inputs)
95 |
96 | if scale:
97 | outputs = outputs * (num_units ** 0.5)
98 |
99 | return outputs
100 |
101 |
102 | def positional_encoding(inputs, num_units, zero_pad=True, scale=True, scope="positional_encoding", reuse=None):
103 | """Sinusoidal Positional_Encoding.
104 |
105 | Args:
106 | inputs: A 2d Tensor with shape of (N, T).
107 | num_units: Output dimensionality
108 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
109 | scale: Boolean. If True, the output will be multiplied by sqrt(num_units) (see the paper for details)
110 | scope: Optional scope for `variable_scope`.
111 | reuse: Boolean, whether to reuse the weights of a previous layer
112 | by the same name.
113 |
114 | Returns:
115 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units'
116 | """
117 |
118 | N, T = inputs.get_shape().as_list()
119 | with tf.variable_scope(scope, reuse=reuse):
120 | position_ind = tf.ones_like(inputs) * tf.range(T)
121 |
122 | # First part of the PE function: sin and cos argument
123 | position_enc = np.array([
124 | [pos / np.power(10000, 2. * i / num_units) for i in range(num_units)] for pos in range(T)], dtype=np.float32)
125 |
126 | # Second part, apply the cosine to even columns and sin to odds.
127 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i
128 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1
129 |
130 | # Convert to a tensor
131 | lookup_table = tf.convert_to_tensor(position_enc)
132 |
133 | if zero_pad:
134 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
135 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
136 |
137 | if scale:
138 | outputs = outputs * num_units ** 0.5
139 |
140 | return outputs
141 |
142 |
143 | def multihead_attention(
144 | queries, keys, num_units=None, num_heads=8, dropout_rate=0, is_training=True, causality=False,
145 | scope="multihead_attention", reuse=None):
146 | """Applies multihead attention.
147 | Args:
148 | queries: A 3d tensor with shape of [N, T_q, C_q].
149 | keys: A 3d tensor with shape of [N, T_k, C_k].
150 | num_units: A scalar. Attention size.
151 | dropout_rate: A floating point number.
152 | is_training: Boolean. Controller of mechanism for dropout.
153 | causality: Boolean. If true, units that reference the future are masked.
154 | num_heads: An int. Number of heads.
155 | scope: Optional scope for `variable_scope`.
156 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
157 | Returns:
158 | A 3d tensor with shape of (N, T_q, C)
159 | """
160 | with tf.variable_scope(scope, reuse=reuse):
161 | # Set the fallback option for num_units
162 | if num_units is None:
163 | num_units = queries.get_shape().as_list()[-1]
164 |
165 | # Linear projections
166 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
167 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
168 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
169 |
170 | # Split and concat
171 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h)
172 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)
173 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)
174 |
175 | # Multiplication
176 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
177 |
178 | # Scale
179 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
180 |
181 | # Key Masking
182 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k)
183 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
184 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
185 |
186 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
187 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)
188 |
189 | # Causality = Future blinding
190 | if causality:
191 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
192 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k)
193 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
194 |
195 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
196 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)
197 |
198 | # Activation
199 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
200 |
201 | # Query Masking
202 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q)
203 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
204 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
205 | outputs *= query_masks # broadcasting over the last axis. (h*N, T_q, T_k)
206 |
207 | # Dropouts
208 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
209 |
210 | # Weighted sum
211 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
212 |
213 | # Restore shape
214 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C)
215 |
216 | # Residual connection
217 | outputs += queries
218 |
219 | # Layer normalization
220 | outputs = layer_normalize(outputs) # (N, T_q, C)
221 |
222 | return outputs
223 |
224 |
225 | def feedforward(inputs, num_units=[2048, 512], scope="feedforward", reuse=None):
226 | """Point-wise feed forward net.
227 | Args:
228 | inputs: A 3d tensor with shape of [N, T, C].
229 | num_units: A list of two integers.
230 | scope: Optional scope for `variable_scope`.
231 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
232 | Returns:
233 | A 3d tensor with the same shape and dtype as inputs
234 | """
235 | with tf.variable_scope(scope, reuse=reuse):
236 | # Inner layer
237 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, "activation": tf.nn.relu,
238 | "use_bias": True, "reuse": False}
239 | outputs = tf.layers.conv1d(**params)
240 |
241 | # Readout layer
242 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, "activation": None, "use_bias": True, "reuse": False}
243 | outputs = tf.layers.conv1d(**params)
244 |
245 | # Residual connection
246 | outputs += inputs
247 |
248 | # Normalize
249 | outputs = layer_normalize(outputs)
250 |
251 | return outputs
252 |
253 |
254 | def label_smoothing(inputs, epsilon=0.1):
255 | """Applies label smoothing. See https://arxiv.org/abs/1512.00567.
256 |
257 | Args:
258 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary.
259 | epsilon: Smoothing rate.
260 |
261 | For example,
262 |
263 | ```
264 | import tensorflow as tf
265 | inputs = tf.convert_to_tensor([[[0, 0, 1], [0, 1, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0], [0, 1, 0]]], tf.float32)
266 |
267 | outputs = label_smoothing(inputs)
268 |
269 | with tf.Session() as sess:
270 | print(sess.run([outputs]))
271 |
272 | >>
273 | [array([[[ 0.03333334, 0.03333334, 0.93333334],
274 | [ 0.03333334, 0.93333334, 0.03333334],
275 | [ 0.93333334, 0.03333334, 0.03333334]],
276 |
277 | [[ 0.93333334, 0.03333334, 0.03333334],
278 | [ 0.93333334, 0.03333334, 0.03333334],
279 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)]
280 | ```
281 | """
282 | K = inputs.get_shape().as_list()[-1] # number of channels
283 | return ((1 - epsilon) * inputs) + (epsilon / K)
284 |
--------------------------------------------------------------------------------
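
A minimal usage sketch (not part of the repo) for the modules above, assuming TensorFlow 1.x as used throughout this project; the toy vocabulary size, `num_units=64` and `num_heads=4` are illustrative values only. With `zero_pad=True`, id 0 embeds to a zero vector, so the key/query masking inside `multihead_attention` treats it as padding:

```
import tensorflow as tf
from model.module.modules import embedding, multihead_attention, feedforward

ids = tf.constant([[3, 7, 2, 0], [5, 1, 0, 0]], dtype=tf.int32)        # (N=2, T=4), 0 = PAD
emb = embedding(ids, vocab_size=100, num_units=64, scope="toy_embed")  # (2, 4, 64)
enc = multihead_attention(queries=emb, keys=emb, num_units=64, num_heads=4,
                          dropout_rate=0.0, is_training=False)         # (2, 4, 64)
enc = feedforward(enc, num_units=[4 * 64, 64])                         # (2, 4, 64)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(enc).shape)  # -> (2, 4, 64)
```
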
/model/module/rnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-28 10:54 AM
4 | # @Author : 林利芳
5 | # @File : rnn.py
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | from tensorflow.python.framework import constant_op
11 | from tensorflow.python.framework import dtypes
12 | from tensorflow.python.layers import base as base_layer
13 | from tensorflow.python.ops import array_ops, clip_ops
14 | from tensorflow.python.ops import init_ops
15 | from tensorflow.python.ops import math_ops
16 | from tensorflow.python.ops import nn_ops
17 | from tensorflow.python.platform import tf_logging as logging
18 | from tensorflow.python.ops.rnn_cell_impl import LayerRNNCell, LSTMStateTuple
19 |
20 | _BIAS_VARIABLE_NAME = "bias"
21 | _WEIGHTS_VARIABLE_NAME = "kernel"
22 |
23 |
24 | class ForgetLSTMCell(LayerRNNCell):
25 | """Basic LSTM recurrent network cell.
26 |
27 | The implementation is based on: http://arxiv.org/abs/1409.2329.
28 |
29 | We add forget_bias (default: 1) to the biases of the forget gate in order to
30 | reduce the scale of forgetting in the beginning of the training.
31 |
32 | It does not allow cell clipping, a projection layer, and does not
33 | use peep-hole connections: it is the basic baseline.
34 |
35 | For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell}
36 | that follows.
37 | """
38 |
39 | def __init__(self, num_units, forget_bias=1.0,
40 | state_is_tuple=True, activation=None, reuse=None, name=None):
41 | """Initialize the basic LSTM cell.
42 |
43 | Args:
44 | num_units: int, The number of units in the LSTM cell.
45 | forget_bias: float, The bias added to forget gates (see above).
46 | Must set to `0.0` manually when restoring from CudnnLSTM-trained
47 | checkpoints.
48 | state_is_tuple: If True, accepted and returned states are 2-tuples of
49 | the `c_state` and `m_state`. If False, they are concatenated
50 | along the column axis. The latter behavior will soon be deprecated.
51 | activation: Activation function of the inner states. Default: `tanh`.
52 | reuse: (optional) Python boolean describing whether to reuse variables
53 | in an existing scope. If not `True`, and the existing scope already has
54 | the given variables, an error is raised.
55 | name: String, the name of the layer. Layers with the same name will
56 | share weights, but to avoid mistakes we require reuse=True in such
57 | cases.
58 |
59 | When restoring from CudnnLSTM-trained checkpoints, must use
60 | `CudnnCompatibleLSTMCell` instead.
61 | """
62 | super(ForgetLSTMCell, self).__init__(_reuse=reuse, name=name)
63 | if not state_is_tuple:
64 | logging.warn("%s: Using a concatenated state is slower and will soon be "
65 | "deprecated. Use state_is_tuple=True.", self)
66 |
67 | # Inputs must be 2-dimensional.
68 | self.input_spec = base_layer.InputSpec(ndim=2)
69 |
70 | self._num_units = num_units
71 | self._forget_bias = forget_bias
72 | self._state_is_tuple = state_is_tuple
73 | self._activation = activation or math_ops.tanh
74 |
75 | @property
76 | def state_size(self):
77 | return (LSTMStateTuple(self._num_units, self._num_units)
78 | if self._state_is_tuple else 2 * self._num_units)
79 |
80 | @property
81 | def output_size(self):
82 | return self._num_units
83 |
84 | def build(self, inputs_shape):
85 | if inputs_shape[1].value is None:
86 | raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
87 | % inputs_shape)
88 |
89 | input_depth = inputs_shape[1].value
90 | h_depth = self._num_units
91 | self._kernel = self.add_variable(
92 | _WEIGHTS_VARIABLE_NAME,
93 | shape=[input_depth + h_depth, 2 * self._num_units])
94 | self._bias = self.add_variable(
95 | _BIAS_VARIABLE_NAME,
96 | shape=[2 * self._num_units],
97 | initializer=init_ops.zeros_initializer(dtype=self.dtype))
98 |
99 | self.built = True
100 |
101 | def call(self, inputs, state):
102 | """Long short-term memory cell (LSTM).
103 |
104 | Args:
105 | inputs: `2-D` tensor with shape `[batch_size, input_size]`.
106 | state: An `LSTMStateTuple` of state tensors, each shaped
107 | `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
108 | `True`. Otherwise, a `Tensor` shaped
109 | `[batch_size, 2 * self.state_size]`.
110 |
111 | Returns:
112 | A pair containing the new hidden state, and the new state (either a
113 | `LSTMStateTuple` or a concatenated state, depending on
114 | `state_is_tuple`).
115 | """
116 | sigmoid = math_ops.sigmoid
117 | one = constant_op.constant(1, dtype=dtypes.int32)
118 | # Parameters of gates are concatenated into one multiply for efficiency.
119 | if self._state_is_tuple:
120 | c, h = state
121 | else:
122 | c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one)
123 |
124 | gate_inputs = math_ops.matmul(
125 | array_ops.concat([inputs, h], 1), self._kernel)
126 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
127 |
128 | # j = new_input, f = forget_gate; the input gate is coupled as i = 1 - f and there is no output gate
129 | j, f = array_ops.split(
130 | value=gate_inputs, num_or_size_splits=2, axis=one)
131 | i = 1 - f
132 | forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype)
133 | # Note that using `add` and `multiply` instead of `+` and `*` gives a
134 | # performance improvement. So using those at the cost of readability.
135 | add = math_ops.add
136 | multiply = math_ops.multiply
137 | new_c = add(multiply(c, sigmoid(add(f, forget_bias_tensor))), multiply(sigmoid(i), self._activation(j)))
138 | new_h = new_c  # no output gate: the hidden state equals the cell state
139 |
140 | if self._state_is_tuple:
141 | new_state = LSTMStateTuple(new_c, new_h)
142 | else:
143 | new_state = array_ops.concat([new_c, new_h], 1)
144 | return new_h, new_state
145 |
146 |
147 | class IndRNNCell(LayerRNNCell):  # inherits from LayerRNNCell
148 |
149 | def __init__(self,
150 | num_units,
151 | recurrent_min_abs=0,
152 | recurrent_max_abs=None,
153 | recurrent_kernel_initializer=None,
154 | input_kernel_initializer=None,
155 | activation=None,
156 | reuse=None,
157 | name=None):
158 | super(IndRNNCell, self).__init__(_reuse=reuse, name=name)
159 |
160 | self.input_spec = base_layer.InputSpec(ndim=2)
161 |
162 | # initialization
163 | self._num_units = num_units
164 | self._recurrent_min_abs = recurrent_min_abs
165 |
166 | self._recurrent_max_abs = recurrent_max_abs
167 | self._recurrent_recurrent_kernel_initializer = recurrent_kernel_initializer
168 | self._input_kernel_initializer = input_kernel_initializer
169 | self._activation = activation or nn_ops.relu
170 |
171 | @property
172 | def state_size(self):
173 | return self._num_units
174 |
175 | @property
176 | def output_size(self):
177 | return self._num_units
178 |
179 | def build(self, inputs_shape):
180 | """Construct the IndRNN cell variables."""
181 | if inputs_shape[1].value is None:
182 | raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
183 |
184 | input_depth = inputs_shape[1].value
185 | if self._input_kernel_initializer is None:
186 | self._input_kernel_initializer = init_ops.random_normal_initializer(mean=0,
187 | stddev=1e-3)
188 | # matrix W
189 | self._input_kernel = self.add_variable(
190 | "input_kernel",
191 | shape=[input_depth, self._num_units],
192 | initializer=self._input_kernel_initializer
193 | )
194 |
195 | if self._recurrent_recurrent_kernel_initializer is None:
196 | self._recurrent_recurrent_kernel_initializer = init_ops.constant_initializer(1.)
197 |
198 | # matrix U
199 | self._recurrent_kernel = self.add_variable(
200 | "recurrent_kernel",
201 | shape=[self._num_units],
202 | initializer=self._recurrent_recurrent_kernel_initializer
203 | )
204 |
205 | # Clip the U to min - max
206 | if self._recurrent_min_abs:
207 | abs_kernel = math_ops.abs(self._recurrent_kernel)
208 | min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs)
209 | self._recurrent_kernel = math_ops.multiply(
210 | math_ops.sign(self._recurrent_kernel),
211 | min_abs_kernel
212 | )
213 | if self._recurrent_max_abs:
214 | self._recurrent_kernel = clip_ops.clip_by_value(
215 | self._recurrent_kernel,
216 | -self._recurrent_max_abs,
217 | self._recurrent_max_abs
218 | )
219 |
220 | self._bias = self.add_variable(
221 | "bias",
222 | shape=[self._num_units],
223 | initializer=init_ops.zeros_initializer(dtype=self.dtype)
224 | )
225 | # build finished
226 | self.built = True
227 |
228 | def call(self, inputs, state):
229 | """IndRNN step: output = new_state = activation(W * x + u (*) h_{t-1} + b), where (*) is element-wise."""
230 |
231 | gate_inputs = math_ops.matmul(inputs, self._input_kernel)
232 | # element-wise recurrent term: u (*) h_{t-1}
233 | state_update = math_ops.multiply(state, self._recurrent_kernel)
234 | gate_inputs = math_ops.add(gate_inputs, state_update)
235 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
236 | output = self._activation(gate_inputs)
237 | return output, output
238 |
--------------------------------------------------------------------------------
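
Both cells above follow the standard `RNNCell` contract, so they can be dropped into `tf.nn.dynamic_rnn` or the bidirectional wrapper used by the siamese models. A minimal sketch (TensorFlow 1.x; the shapes and `recurrent_max_abs` value are illustrative only):

```
import tensorflow as tf
from model.module.rnn import IndRNNCell

x = tf.random_normal([8, 20, 32])                      # (batch, time, features)
cell = IndRNNCell(num_units=64, recurrent_max_abs=2.0)
outputs, state = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(outputs).shape)  # -> (8, 20, 64)
```
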
/model/module/templates.txt:
--------------------------------------------------------------------------------
1 | # Unigram
2 |
3 | U00:%x[-2,0]
4 | U01:%x[-1,0]
5 | U02:%x[0,0]
6 | U03:%x[1,0]
7 | U04:%x[2,0]
8 | U05:%x[-2,0]/%x[-1,0]/%x[0,0]
9 | U06:%x[-1,0]/%x[0,0]/%x[1,0]
10 | U07:%x[0,0]/%x[1,0]/%x[2,0]
11 | U08:%x[-1,0]/%x[0,0]
12 | U09:%x[0,0]/%x[1,0]
13 |
14 | # Bigram
15 | B
--------------------------------------------------------------------------------
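
The file above is a CRF++/CRFsuite-style feature template (it does not appear to be consumed by the code in this dump). In that convention, `%x[r,c]` denotes the token at row offset `r` and column `c` relative to the current position; an illustrative expansion, with the cursor on 如 in the character column 花 呗 如 何 还 款:

```
# U00:%x[-2,0]           -> U00:花
# U02:%x[0,0]            -> U02:如
# U08:%x[-1,0]/%x[0,0]   -> U08:呗/如
# B                      -> bigram feature over the previous and current labels
```
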
/model/rnn_siamese.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-29 3:07 PM
4 | # @Author : 林利芳
5 | # @File : rnn_siamese.py
6 | import tensorflow as tf
7 | from config.hyperparams import RnnParams as hp
8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize
9 |
10 |
11 | class RnnSiameseNetwork(object):
12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True):
13 | self.vocab_size = vocab_size
14 | self.embedding_size = embedding_size
15 | self.max_len = max_len
16 | self.is_training = is_training
17 | self.graph = tf.Graph()
18 | with self.graph.as_default():
19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x")
20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x")
21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target")
22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
24 | self.global_step = tf.train.create_global_step()
25 |
26 | key, value = self.siamese()
27 | self.distance, self.pre_y = self.similar(key, value)
28 | self.accuracy = self.predict()
29 | self.loss = self.loss_layer()
30 | self.train_op = self.optimize()
31 |
32 | def siamese(self):
33 | """
34 | Siamese network: shared embedding + bidirectional RNN + attention pooling (a transformer encoder is also available)
35 | :return:
36 | """
37 | x = tf.concat([self.left_x, self.right_x], axis=0)
38 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0)
39 | # layers embedding multi_head_attention rnn
40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed")
41 |
42 | # output = self.transformer(embed, x)
43 | output = self.rnn_layer(embed, seq_lens)
44 | output = self.attention(embed, output)
45 | key, value = tf.split(output, 2, axis=0)
46 | return key, value
47 |
48 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg):
49 | """
50 | Build a bidirectional RNN layer.
51 | :param inputs:
52 | :param seq_lens:
53 | :param seg: 'LSTM' or 'GRU'; any other value falls back to BasicRNNCell
54 | :return:
55 | """
56 | if seg == 'LSTM':
57 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
58 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
59 |
60 | elif seg == 'GRU':
61 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
62 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
63 | else:
64 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
65 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
66 | # bidirectional RNN
67 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn(
68 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32)
69 | # merge the forward/backward outputs by element-wise addition: batch_size * max_seq * hidden_dim
70 | output = tf.add(fw_output, bw_output)
71 | return output
72 |
73 | def transformer(self, embed, value):
74 | with tf.variable_scope("Transformer_Encoder"):
75 | # Positional Encoding
76 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post")
77 | # Dropout
78 | output = self.multi_head_block(embed)
79 | return output
80 |
81 | def multi_head_block(self, query, causality=False):
82 | """
83 | Multi-head self-attention blocks, stacked hp.num_blocks times
84 | :param query:
85 | :param causality:
86 | :return:
87 | """
88 | for i in range(hp.num_blocks):
89 | with tf.variable_scope("num_blocks_{}".format(i)):
90 | # multi head Attention ( self-attention)
91 | query = multihead_attention(
92 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads,
93 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality,
94 | scope="self_attention")
95 | # Feed Forward
96 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units])
97 | return query
98 |
99 | def loss_layer(self):
100 | """
101 | Loss function: L+ = (1 - E_w)^2 / 4, L- = E_w^2 if E_w > margin else 0
102 | :return:
103 | """
104 | y = tf.cast(self.y, tf.float32)
105 | with tf.name_scope("output"):
106 | loss_p = tf.square(1 - self.distance) / 4
107 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin))
108 | loss_m = tf.square(mask * self.distance)
109 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m)
110 | return loss
111 |
112 | def attention(self, embed, query):
113 | """
114 | Attention pooling over the time dimension
115 | :param embed:
116 | :param query:
117 | :return:
118 | """
119 | with tf.name_scope("attention"):
120 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32)
121 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32)
122 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32)
123 | value = tf.concat([embed, query], axis=-1)
124 | value = tf.reshape(value, [-1, 2 * hp.num_units])
125 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u)
126 | attention = tf.reshape(attention, shape=[-1, self.max_len])
127 | attention = tf.nn.softmax(attention, axis=-1)
128 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units])
129 |
130 | output = tf.reduce_sum(attention * query, axis=1)
131 | output = layer_normalize(output)
132 | return output
133 |
134 | @staticmethod
135 | def similar(key, value):
136 | """
137 | cosine(key, value) = key * value / (|key| * |value|)
138 | :param key:
139 | :param value:
140 | :return:
141 | """
142 | dot_value = tf.reduce_sum(key * value, axis=-1)
143 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps)
144 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps)
145 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar")
146 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin))
147 | pre_y = tf.cast(pre_y, tf.int32, name='pre')
148 | return distance, pre_y
149 |
150 | def predict(self):
151 | correct_predictions = tf.equal(self.pre_y, self.y)
152 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
153 | return accuracy
154 |
155 | def optimize(self):
156 | """
157 | Adam optimizer
158 | :return:
159 | """
160 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
161 | train_op = optimizer.minimize(self.loss, global_step=self.global_step)
162 | return train_op
163 |
--------------------------------------------------------------------------------
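
loss_layer implements the contrastive-style objective from its docstring: similar pairs (y = 1) are pulled toward cosine similarity 1 via (1 - E_w)^2 / 4, and dissimilar pairs (y = 0) are penalized only when E_w exceeds hp.margin. A NumPy sketch of the same arithmetic, with a hypothetical margin of 0.7 standing in for the value defined in config.hyperparams.RnnParams (not shown here):

```
import numpy as np

margin = 0.7                                    # hypothetical; see RnnParams.margin
e_w = np.array([0.95, 0.40, 0.85, 0.10])        # cosine similarity per pair
y = np.array([1.0, 1.0, 0.0, 0.0])              # 1 = same intent, 0 = different

loss_p = np.square(1.0 - e_w) / 4.0             # pull similar pairs toward 1
mask = np.sign(np.maximum(e_w - margin, 0.0))   # penalize only pairs with E_w > margin
loss_m = np.square(mask * e_w)                  # push dissimilar pairs below the margin
loss = np.sum(y * loss_p + (1.0 - y) * loss_m)
print(loss)                                     # 0.0906... + 0.7225 = 0.8131...
```
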
/model/transformer_siamese.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 19-1-22 10:48 AM
4 | # @Author : 林利芳
5 | # @File : transformer_siamese.py
6 | import tensorflow as tf
7 | from config.hyperparams import HyperParams as hp
8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize
9 |
10 |
11 | class TransformerSiameseNetwork(object):
12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True, seg='LSTM'):
13 | self.vocab_size = vocab_size
14 | self.embedding_size = embedding_size
15 | self.max_len = max_len
16 | self.is_training = is_training
17 | self.graph = tf.Graph()
18 | with self.graph.as_default():
19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x")
20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x")
21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target")
22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size])
24 | self.global_step = tf.train.create_global_step()
25 |
26 | query, key = self.siamese(seg)
27 | self.distance, self.pre_y = self.similar(query, key)
28 | self.accuracy = self.predict()
29 | self.loss = self.loss_layer()
30 | self.train_op = self.optimize()
31 |
32 | def siamese(self, seg):
33 | """
34 | Siamese network: transformer encoders + attention pooling (the RNN layer below is available but unused here)
35 | :param seg:
36 | :return:
37 | """
38 | x = tf.concat([self.left_x, self.right_x], axis=0)
39 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0)
40 | # layers embedding multi_head_attention rnn
41 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True,
42 | scope="lembed")
43 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True,
44 | scope="rembed")
45 |
46 | query, key = self.transformer(left_embed, right_embed)
47 | # output = self.rnn_layer(embed, seq_lens, seg)
48 | query = self.attention(query, query)
49 | key = self.attention(key, key)
50 | return query, key
51 |
52 | def rnn_layer(self, inputs, seq_lens, seg):
53 | """
54 | Build a bidirectional RNN layer.
55 | :param inputs:
56 | :param seq_lens:
57 | :param seg: 'LSTM' or 'GRU'; any other value falls back to BasicRNNCell
58 | :return:
59 | """
60 | if seg == 'LSTM':
61 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
62 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units)
63 |
64 | elif seg == 'GRU':
65 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
66 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units)
67 | else:
68 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
69 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units)
70 | # bidirectional RNN
71 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn(
72 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32)
73 | # merge the forward/backward outputs by element-wise addition: batch_size * max_seq * hidden_dim
74 | output = tf.add(fw_output, bw_output)
75 | return output
76 |
77 | def transformer(self, query, key):
78 | with tf.variable_scope("Transformer_Encoder"):
79 | # Positional Encoding
80 | query += positional_encoding(self.left_x, num_units=hp.num_units, zero_pad=False, scale=False)
81 | key += positional_encoding(self.right_x, num_units=hp.num_units, zero_pad=False, scale=False)
82 | # Dropout
83 | output = self.multi_head_block(query, key)
84 | return output
85 |
86 | def multi_head_block(self, query, key, causality=False):
87 | """
88 | Multi-head attention blocks: self-attention on each sentence plus cross-attention between the two
89 | :param query:
90 | :param key:
91 | :param causality:
92 | :return:
93 | """
94 | for i in range(hp.num_blocks):
95 | with tf.variable_scope("num_blocks_{}".format(i)):
96 | # multi head Attention ( self-attention)
97 | query = self.multihead_attention(query, query, name="query_attention", causality=causality)
98 | key = self.multihead_attention(key, key, name="key_attention", causality=causality)
99 | query = self.multihead_attention(query, key, name="query_key_attention")
100 | key = self.multihead_attention(key, query, name="key_query_attention")
101 | return query, key
102 |
103 | def multihead_attention(self, query, key, name="key_attention", causality=False):
104 | value = multihead_attention(
105 | queries=query, keys=key, num_units=hp.num_units, num_heads=hp.num_heads,
106 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality,
107 | scope=name)
108 | # Feed Forward
109 | value = feedforward(value, num_units=[4 * hp.num_units, hp.num_units])
110 | return value
111 |
112 | def loss_layer(self):
113 | """
114 | Loss function: L+ = (1 - E_w)^2 / 4, L- = E_w^2 if E_w > margin else 0
115 | :return:
116 | """
117 | y = tf.cast(self.y, tf.float32)
118 | with tf.name_scope("output"):
119 | loss_p = tf.square(1 - self.distance) / 4
120 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin))
121 | loss_m = tf.square(mask * self.distance)
122 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m)
123 | return loss
124 |
125 | def attention(self, embed, query):
126 | """
127 | Attention pooling over the time dimension
128 | :param embed:
129 | :param query:
130 | :return:
131 | """
132 | with tf.name_scope("attention"):
133 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32)
134 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32)
135 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32)
136 | value = tf.concat([embed, query], axis=-1)
137 | value = tf.reshape(value, [-1, 2 * hp.num_units])
138 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u)
139 | attention = tf.reshape(attention, shape=[-1, self.max_len])
140 | attention = tf.nn.softmax(attention, axis=-1)
141 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units])
142 |
143 | output = tf.reduce_sum(attention * query, axis=1)
144 | output = layer_normalize(output)
145 | return output
146 |
147 | @staticmethod
148 | def similar(query, key):
149 | """
150 | cosine(query, key) = query * key / (|query| * |key|)
151 | :param query:
152 | :param key:
153 | :return:
154 | """
155 | dot_value = tf.reduce_sum(query * key, axis=-1)
156 | query_sqrt = tf.sqrt(tf.reduce_sum(tf.square(query), axis=-1) + hp.eps)
157 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps)
158 | distance = tf.div(dot_value, key_sqrt * query_sqrt, name="similar")
159 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin))
160 | pre_y = tf.cast(pre_y, tf.int32, name='pre')
161 | return distance, pre_y
162 |
163 | def predict(self):
164 | correct_predictions = tf.equal(self.pre_y, self.y)
165 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
166 | return accuracy
167 |
168 | def optimize(self):
169 | """
170 | Adam optimizer
171 | :return:
172 | """
173 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
174 | train_op = optimizer.minimize(self.loss, global_step=self.global_step)
175 | return train_op
176 |
--------------------------------------------------------------------------------
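
similar() scores each sentence pair by cosine similarity (with hp.eps added for numerical stability) and the prediction rule sign(relu(distance - margin)) fires only when the similarity clears hp.margin. A NumPy sketch of that decision rule, with hypothetical eps and margin values standing in for the ones in config.hyperparams.HyperParams:

```
import numpy as np

eps, margin = 1e-8, 0.7                       # hypothetical; see HyperParams

query = np.random.randn(4, 128)               # pooled sentence vectors (batch, units)
key = np.random.randn(4, 128)

dot = np.sum(query * key, axis=-1)
query_norm = np.sqrt(np.sum(np.square(query), axis=-1) + eps)
key_norm = np.sqrt(np.sum(np.square(key), axis=-1) + eps)
distance = dot / (query_norm * key_norm)      # cosine similarity in [-1, 1]
pre_y = np.sign(np.maximum(distance - margin, 0.0)).astype(np.int32)  # 1 iff distance > margin
print(distance, pre_y)
```
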
/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 18-12-29 3:06 PM
4 | # @Author : 林利芳
5 | # @File : run.py
6 | import os
7 | from core.load_data import load_train_data, get_feed_dict, print_info, preprocessor
8 | from config.config import checkpoint_dir, VOCAB_PKL
9 | from core.utils import load_data
10 | from model.rnn_siamese import RnnSiameseNetwork
11 | from model.match_pyramid import MatchPyramidNetwork
12 | from model.cnn_siamese import CnnSiameseNetwork
13 | from model.transformer_siamese import TransformerSiameseNetwork
14 | from config.hyperparams import HyperParams as hp
15 | import tensorflow as tf
16 |
17 |
18 | def run(network='rnn'):
19 | checkpoint_file = checkpoint_dir.format(network)
20 | if not os.path.exists(checkpoint_file):
21 | os.makedirs(checkpoint_file)
22 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = load_train_data()
23 | vocab = load_data(VOCAB_PKL)
24 | vocab_size = len(vocab.word2idx)
25 |
26 | batch_size = hp.batch_size
27 | if network == 'rnn':
28 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True)
29 | elif network == 'match_pyramid':
30 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True)
31 | elif network == 'cnn':
32 | model = CnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True)
33 | elif network == "transformer":
34 | model = TransformerSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True)
35 | else:
36 | return
37 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=150)
38 | with sv.managed_session() as sess:
39 | print("start training...\n")
40 | for epoch in range(1, hp.num_epochs + 1):
41 | if sv.should_stop():
42 | break
43 | train_loss = []
44 |
45 | for feed_dict, _ in get_feed_dict(model, train_l_x, train_r_x, train_l_len, train_r_len, train_y,
46 | batch_size):
47 | loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)
48 | train_loss.append(loss)
49 | dev_loss = []
50 | predicts = []
51 | for feed_dict, start in get_feed_dict(model, val_l_x, val_r_x, val_l_len, val_r_len, val_y, batch_size):
52 | loss, gs, pre_y = sess.run([model.loss, model.global_step, model.pre_y], feed_dict=feed_dict)
53 | dev_loss.append(loss)
54 | predicts.extend(pre_y[start:])
55 | print_info(epoch, gs, train_loss, dev_loss, val_y, predicts)
56 |
57 |
58 | if __name__ == "__main__":
59 | # preprocessor(True)
60 | network = 'transformer' # network = [rnn match_pyramid cnn transformer]
61 | run(network)
62 |
--------------------------------------------------------------------------------
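
run() above only trains and evaluates; for completeness, a minimal inference sketch (not part of the repo) showing how the checkpoints written by the Supervisor could be restored to score new pairs. It assumes the preprocessing artifacts (VOCAB_PKL, the checkpoint directory) already exist and that the input arrays are padded to vocab.max_len with exactly hp.batch_size rows, matching the fixed placeholder shapes:

```
import tensorflow as tf
from config.config import checkpoint_dir, VOCAB_PKL
from config.hyperparams import HyperParams as hp
from core.utils import load_data
from model.rnn_siamese import RnnSiameseNetwork


def score_batch(left_x, right_x, left_len, right_len, network='rnn'):
    vocab = load_data(VOCAB_PKL)
    model = RnnSiameseNetwork(len(vocab.word2idx), hp.embedding_size, vocab.max_len, hp.batch_size, False)
    with model.graph.as_default():
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir.format(network)))
            feed = {model.left_x: left_x, model.right_x: right_x,
                    model.left_seq_lens: left_len, model.right_seq_lens: right_len}
            # distance: cosine similarity; pre_y: thresholded prediction (1 = same intent)
            return sess.run([model.distance, model.pre_y], feed_dict=feed)
```
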
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python main.py "$1" "$2"
--------------------------------------------------------------------------------