├── .gitignore ├── .idea └── vcs.xml ├── README.md ├── config ├── __init__.py ├── config.py ├── hyperparams.py └── synonym.py ├── core ├── __init__.py ├── load_data.py ├── preprocessor.py ├── utils.py └── word_embedding.py ├── data ├── atec_nlp_sim_test.csv ├── atec_nlp_sim_train.csv ├── atec_nlp_sim_train_add.csv └── corpus.txt ├── logdir └── graph │ ├── match_pyramid │ └── siamese.png ├── main.py ├── model ├── __init__.py ├── cnn_siamese.py ├── match_pyramid.py ├── module │ ├── __init__.py │ ├── feature.py │ ├── modules.py │ ├── rnn.py │ └── templates.txt ├── rnn_siamese.py └── transformer_siamese.py ├── run.py └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | data/char2vec_300 9 | data/data.pkl 10 | data/vocab.pkl 11 | data/expend_atec_nlp.csv 12 | logdir/checkpoints/* 13 | logdir/checkpoints-match_pyramid/* 14 | logdir/checkpoints-rnn/* 15 | logdir/model/* 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextSimilar 2 | 短文本相似度 3 | ### 孪生网络 4 | [Learning Text Similarity with Siamese Recurrent Networks](http://www.aclweb.org/anthology/W/W16/W16-1617.pdf) 5 | loss函数 6 | 7 | --- 8 | ### match pyramid 9 | [Text Matching as Image Recognition](https://arxiv.org/abs/1602.06359) 10 | 11 | --- 12 | 数据来源于[蚂蚁金融NLP之问题相似度计算](https://dc.cloud.alipay.com/index#/topic/intro?id=8) 13 | >问题相似度计算,即给定客服里用户描述的两句话,用算法来判断是否表示了相同的语义。 14 | > 15 | >示例: 16 | > 17 | >1. “花呗如何还款” --“花呗怎么还款”:同义问句 18 | > 19 | >2. “花呗如何还款” -- “我怎么还我的花被呢”:同义问句 20 | > 21 | >3. 
“花呗分期后逾期了如何还款”-- “花呗分期后逾期了哪里还款”:非同义问句 22 | > 23 | >对于例子a,比较简单的方法就可以判定同义;对于例子b,包含了错别字、同义词、词序变换等问题,两个句子乍一看并不类似,想正确判断比较有挑战;对于例子c,两句话很类似,仅仅有一处细微的差别 “如何”和“哪里”,就导致语义不一致。 24 | 25 | 数据预处理python3 run.py, 在data目录得到data.pkl和vocab.pkl。 26 | ```python 27 | if __name__ == "__main__": 28 | preprocessor(True) 29 | network = 'rnn' # network = [rnn match_pyramid cnn] 30 | run(network) 31 | ``` 32 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /config/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-27 上午9:54 4 | # @Author : 林利芳 5 | # @File : config.py 6 | 7 | import os 8 | 9 | PATH = os.getcwd() 10 | ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train.csv') 11 | ADD_ATEC_NLP_DATA = os.path.join(PATH, 'data/atec_nlp_sim_train_add.csv') 12 | 13 | TEST_DATA = os.path.join(PATH, 'data/atec_nlp_sim_test.csv') 14 | TEST_RESULT = os.path.join(PATH, 'data/test_result.csv') 15 | 16 | EXPEND_ATEC_NLP_DATA = os.path.join(PATH, 'data/expend_atec_nlp_{}.csv') 17 | 18 | DATA_PKL = os.path.join(PATH, 'data/data.pkl') 19 | VOCAB_PKL = os.path.join(PATH, 'data/vocab.pkl') 20 | 21 | CORPUS_DATA = os.path.join(PATH, 'data/corpus.txt') 22 | 23 | WORD2VEC_DATA = os.path.join(PATH, 'data/char2vec_300') 24 | logdir = os.path.join(PATH, 'logdir') 25 | checkpoint_dir = "logdir/checkpoints-{}" 26 | model_dir = os.path.join(logdir, "model") 27 | 28 | WordChar = "char" 29 | -------------------------------------------------------------------------------- /config/hyperparams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-25 上午10:41 4 | # @Author : 林利芳 5 | # @File : hyperparams.py 6 | 7 | 8 | class HyperParams: 9 | # training 10 | batch_size = 32 # alias = N 11 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 12 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 13 | # model 14 | max_len = 50 # Maximum number of words in a sentence. alias = T. 15 | # Feel free to increase this if you are ambitious. 16 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 17 | num_units = 512 # alias = C 18 | embedding_size = 512 19 | vocab_size = 10000 20 | num_blocks = 1 # number of encoder/decoder blocks 21 | num_epochs = 100 22 | num_heads = 8 23 | attention_size = 100 24 | clip = 5 25 | dropout_rate = 0.1 26 | eps = 1e-9 27 | margin = 0.7 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | 30 | 31 | class RnnParams: 32 | # training 33 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 34 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 35 | # model 36 | max_len = 50 # Maximum number of words in a sentence. alias = T. 37 | # Feel free to increase this if you are ambitious. 38 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 
39 | num_units = 100 # alias = C 40 | embedding_size = 100 41 | num_epochs = 40 42 | attention_size = 100 43 | clip = 5 44 | dropout_rate = 0.1 45 | eps = 1e-9 46 | margin = 0.1 47 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 48 | 49 | 50 | class CnnParams: 51 | # training 52 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 53 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 54 | # model 55 | max_len = 50 # Maximum number of words in a sentence. alias = T. 56 | # Feel free to increase this if you are ambitious. 57 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 58 | num_units = 100 # alias = C 59 | embedding_size = 100 60 | num_epochs = 40 61 | attention_size = 100 62 | clip = 5 63 | dropout_rate = 0.1 64 | eps = 1e-9 65 | margin = 0.01 66 | channel = 64 # 通道数 67 | kernel = [3, 5] # 核大小 68 | pool_size = 2 # 池化层大小 69 | dense_size = 100 # 全连接层大小 70 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 71 | 72 | 73 | class TransformerParams: 74 | # training 75 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 76 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 77 | # model 78 | max_len = 50 # Maximum number of words in a sentence. alias = T. 79 | # Feel free to increase this if you are ambitious. 80 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 81 | num_units = 512 # alias = C 82 | embedding_size = 512 83 | num_epochs = 40 84 | num_blocks = 6 # number of encoder/decoder blocks 85 | num_heads = 8 86 | attention_size = 100 87 | clip = 5 88 | dropout_rate = 0.1 89 | eps = 1e-9 90 | margin = 0.3 91 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 92 | 93 | 94 | class MatchPyramidParams: 95 | # training 96 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 97 | seg = 'GRU' # seg = [GRU,LSTM,IndRNN,F-LSTM] 98 | # model 99 | max_len = 50 # Maximum number of words in a sentence. alias = T. 100 | # Feel free to increase this if you are ambitious. 101 | min_cnt = 20 # words whose occurred less than min_cnt are encoded as . 102 | num_units = 100 # alias = C 103 | embedding_size = 100 104 | num_epochs = 40 105 | attention_size = 100 106 | clip = 5 107 | dropout_rate = 0.1 108 | eps = 1e-9 109 | margin = 0.3 110 | channel = 64 # 通道数 111 | kernel = [3, 5] # 核大小 112 | pool_size = 2 # 池化层大小 113 | dense_size = 100 # 全连接层大小 114 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
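# --- Illustrative usage sketch (not part of the original repository) ---
# The classes above are plain attribute holders; each model module imports the
# one matching its architecture (cnn_siamese.py imports CnnParams as `hp`,
# match_pyramid.py imports MatchPyramidParams, main.py uses HyperParams).
# The lookup helper below is a hypothetical convenience for selecting a class
# by the network name used in run.py ("rnn", "cnn", "match_pyramid"); the
# repository itself hard-codes the import inside each model file instead.
PARAMS_BY_NETWORK = {
    "rnn": RnnParams,
    "cnn": CnnParams,
    "transformer": TransformerParams,
    "match_pyramid": MatchPyramidParams,
}


def get_params(network):
    """Return the hyper-parameter class for `network`, falling back to HyperParams."""
    return PARAMS_BY_NETWORK.get(network, HyperParams)


# Example (values taken from RnnParams above):
# hp = get_params("rnn")
# assert hp.max_len == 50 and hp.margin == 0.1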
115 | -------------------------------------------------------------------------------- /config/synonym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-24 上午11:11 4 | # @Author : 林利芳 5 | # @File : synonym.py 6 | import re 7 | 8 | SYNONYM_DICT = { 9 | "更改": ["更改", '更换'], 10 | "改为": ["更改成", "改为", "更改为"], 11 | "可以": ["可以", "能"], 12 | "下降": ['降低', "下降"], 13 | "为什么": ["为何", "为啥"], 14 | "能不能": ["能不能", "行不行", "可不可以"], 15 | "不能用": ["不能用", "用不了"], 16 | } 17 | 18 | SYNONYM_WRONG = { 19 | "零时额度": "临时额度", 20 | "花贝": '花呗', 21 | "花唄": '花呗', 22 | "花被": '花呗', 23 | "蚂蚁花贝": '花呗', 24 | "蚂蚁花唄": '花呗', 25 | "蚂蚁花被": '花呗', 26 | "蚂蚁花呗": '花呗', 27 | "蚂蚁借呗": '借呗', 28 | "届不了": '借不了', 29 | "为何": "为什么", 30 | "为啥": "为什么", 31 | "下个月": '下月', 32 | "上个月": '上月', 33 | "行不行": '能不能', 34 | "可不可以": '能不能', 35 | "用不了": "不能用", 36 | "不让": '不能', 37 | "不可以": '不能', 38 | "不行": '不能', 39 | "老有": '总有', 40 | "日息": "利息", 41 | "更改成": "改为", 42 | "更改为": "改为", 43 | "更换": "更改", 44 | "能": "可以", 45 | "降低": "下降", 46 | "受到": "收到", 47 | ',': ',', 48 | '?': '?', 49 | '!': '!', 50 | ';': ';', 51 | '***': '0', 52 | } 53 | 54 | PATTERN = [ 55 | [re.compile('\*+'), '*'], 56 | [re.compile('\?'), '?'], 57 | [re.compile('\.$'), '。'], 58 | [re.compile('!'), '!'], 59 | [re.compile(','), ','], 60 | [re.compile(';'), ';'], 61 | [re.compile('\s+'), ''], 62 | [re.compile('\ufeff'), ''], 63 | ] 64 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /core/load_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午4:08 4 | # @Author : 林利芳 5 | # @File : load_data.py 6 | from config.config import DATA_PKL, VOCAB_PKL 7 | from core.preprocessor import preprocessor, pad_sequence, trim 8 | from core.utils import load_data, read_csv 9 | import numpy as np 10 | from sklearn.metrics import recall_score, precision_score, f1_score 11 | 12 | 13 | def gen_batch_data(l_x, r_x, l_len, r_len, y, batch_size): 14 | """ 15 | 生成batch数据 16 | :param l_x: 17 | :param r_x: 18 | :param l_len: 19 | :param r_len: 20 | :param y: 21 | :param batch_size: 22 | :return: 23 | """ 24 | data_size = len(y) 25 | num_batch = data_size // batch_size + 1 26 | 27 | for ii in range(num_batch): 28 | start, end = ii * batch_size, (ii + 1) * batch_size 29 | start_batch = 0 30 | if end > data_size: 31 | start_batch = end - data_size 32 | start, end = data_size - batch_size, data_size 33 | l_x_batch = l_x[start:end] 34 | r_x_batch = r_x[start:end] 35 | l_len_batch = l_len[start:end] 36 | r_len_batch = r_len[start:end] 37 | y_batch = y[start:end] 38 | yield l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch 39 | 40 | 41 | def load_train_data(): 42 | data = load_data(DATA_PKL) 43 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = \ 44 | data['train_l_x'], data['val_l_x'], data['train_l_len'], data['val_l_len'], data['train_r_x'], data[ 45 | 'val_r_x'], data['train_r_len'], data['val_r_len'], data['train_y'], data['val_y'] 46 | train_l_x = np.array(train_l_x) 47 | val_l_x = np.array(val_l_x) 48 | 
train_l_len = np.array(train_l_len) 49 | val_l_len = np.array(val_l_len) 50 | train_r_x = np.array(train_r_x) 51 | val_r_x = np.array(val_r_x) 52 | train_r_len = np.array(train_r_len) 53 | val_r_len = np.array(val_r_len) 54 | train_y = np.array(train_y) 55 | val_y = np.array(val_y) 56 | return train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y 57 | 58 | 59 | def get_feed_dict(model, l_x, r_x, l_len, r_len, y, batch_size): 60 | """ 61 | 生成feed_dict 62 | :param model: 63 | :param l_x: 64 | :param r_x: 65 | :param l_len: 66 | :param r_len: 67 | :param y: 68 | :param batch_size: 69 | :return: 70 | """ 71 | for l_x_batch, r_x_batch, l_len_batch, r_len_batch, y_batch, start_batch in gen_batch_data( 72 | l_x, r_x, l_len, r_len, y, batch_size): 73 | feed_dict = { 74 | model.left_x: l_x_batch, 75 | model.right_x: r_x_batch, 76 | model.y: y_batch, 77 | model.left_seq_lens: l_len_batch, 78 | model.right_seq_lens: r_len_batch 79 | } 80 | yield feed_dict, start_batch 81 | 82 | 83 | def print_info(epoch, step, train_loss, dev_loss, y, pre_y): 84 | loss = round(float(np.mean(train_loss)), 3) 85 | val_loss = round(float(np.mean(dev_loss)), 3) 86 | f1 = round(f1_score(y, pre_y), 4) 87 | recall = round(recall_score(y, pre_y), 4) 88 | precision = round(precision_score(y, pre_y), 4) 89 | print('**************************************************') 90 | print("epoch\t{}\tstep\t{}\ttrain_loss\t{}\tdev_loss\t{}\t".format(epoch, step, loss, val_loss)) 91 | print("precision\t{}\trecall\t{}\tf1\t{}\n\n".format(precision, recall, f1)) 92 | 93 | 94 | def load_test_data(filename): 95 | vocab = load_data(VOCAB_PKL) 96 | max_len = vocab.max_len 97 | data = read_csv(filename) 98 | data = [kk[:3] for kk in data] 99 | idx, left_x, right_x = zip(*data) 100 | 101 | left_x = [trim(kk) for kk in left_x] 102 | right_x = [trim(kk) for kk in right_x] 103 | 104 | left_x, left_len = pad_sequence(left_x, vocab, max_len) 105 | right_x, right_len = pad_sequence(right_x, vocab, max_len) 106 | 107 | return idx, left_x, left_len, right_x, right_len, vocab 108 | 109 | 110 | def save_test_result(filename, idx, predicts): 111 | import codecs 112 | with codecs.open(filename, 'w', encoding='utf-8') as fp: 113 | for _id, pre in zip(idx, predicts): 114 | fp.writelines('{}\t{}\n'.format(_id, pre)) 115 | -------------------------------------------------------------------------------- /core/preprocessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午3:49 4 | # @Author : 林利芳 5 | # @File : preprocessor.py 6 | import pprint 7 | 8 | from sklearn.model_selection import train_test_split 9 | import numpy as np 10 | from config.config import DATA_PKL, VOCAB_PKL, ATEC_NLP_DATA, ADD_ATEC_NLP_DATA, CORPUS_DATA, EXPEND_ATEC_NLP_DATA, \ 11 | WordChar 12 | from core.utils import save_data, read_csv, load_data 13 | from core.word_embedding import Vocab 14 | import re 15 | import jieba 16 | import collections 17 | from config.synonym import SYNONYM_DICT, SYNONYM_WRONG, PATTERN 18 | import itertools 19 | from config.hyperparams import HyperParams as hp 20 | import sys 21 | 22 | try: 23 | reload(sys) 24 | sys.setdefaultencoding('utf8') 25 | except: 26 | pass 27 | jieba.load_userdict(CORPUS_DATA) 28 | PAD = "" 29 | UNK = "" 30 | PAD2ID = 0 31 | UNK2ID = 0 32 | 33 | 34 | def extended_corpus(data, is_training=True, filename="train"): 35 | """ 36 | 扩展语料 37 | :param data: 38 | :param is_training: 39 | :param 
filename: 40 | :return: 41 | """ 42 | print("同义词替换...\n") 43 | similar_data = [] 44 | for sub_data in data: 45 | idx, left_s, right_s, y = sub_data 46 | idx = idx.replace('\ufeff', '') 47 | left_s = trim(left_s) 48 | right_s = trim(right_s) 49 | if is_training: 50 | data = combine_data(idx, left_s, right_s, y) 51 | else: 52 | data = [[idx, ' '.join(left_s), ' '.join(right_s), y]] 53 | similar_data.extend(data) 54 | 55 | save_expend_data(similar_data, EXPEND_ATEC_NLP_DATA.format(filename)) 56 | 57 | 58 | # return similar_data 59 | 60 | 61 | def save_expend_data(data, filename): 62 | import codecs 63 | with codecs.open(filename, 'w', encoding='utf-8') as fp: 64 | for line in data: 65 | idx, left_x, right_x, y = line 66 | temp = [idx, left_x, right_x, str(y)] 67 | fp.writelines('\t'.join(temp) + '\n') 68 | 69 | 70 | def synonym_replace(sentence): 71 | """ 72 | 同义词替换 73 | :param sentence: 74 | :return: 75 | """ 76 | sentences = [] 77 | for word in sentence: 78 | words = SYNONYM_DICT.get(word, [word]) 79 | sentences.append(words) 80 | sentences = list(set(itertools.product(*sentences))) 81 | result = [] 82 | for ii, sub_data in enumerate(sentences): 83 | sub_data = list(sub_data) 84 | if sub_data == sentence: 85 | continue 86 | result.append(sub_data) 87 | return result 88 | 89 | 90 | def combine_data(idx, left_s, right_s, y): 91 | similar_data = [[idx, ' '.join(left_s), ' '.join(right_s), y]] 92 | left_sentence = synonym_replace(left_s) 93 | right_sentence = synonym_replace(right_s) 94 | left_len, right_len = len(left_sentence), len(right_sentence) 95 | max_num = max(left_len, right_len) 96 | if y == '0': 97 | max_num = 0 98 | for sub_s in left_sentence[:max_num]: 99 | temp = [idx, ' '.join(sub_s), ' '.join(right_s), y] 100 | similar_data.append(temp) 101 | for sub_s in right_sentence[:max_num]: 102 | temp = [idx, ' '.join(left_s), ' '.join(sub_s), y] 103 | similar_data.append(temp) 104 | return similar_data 105 | 106 | 107 | # if y == '1': 108 | # for sub_left_s, sub_right_s in zip(left_sentence[:3], right_sentence[:3]): 109 | # temp = [idx, sub_left_s, sub_right_s, y] 110 | # similar_data.append(temp) 111 | # 112 | # if left_len > right_len: 113 | # for sub_left_s, sub_right_s in zip(left_sentence[1:], right_sentence): 114 | # temp = [idx, sub_left_s, sub_right_s, y] 115 | # similar_data.append(temp) 116 | # elif right_len > left_len: 117 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence[1:]): 118 | # temp = [idx, sub_left_s, sub_right_s, y] 119 | # similar_data.append(temp) 120 | # else: 121 | # data = left_sentence.pop() 122 | # left_sentence.insert(0, data) 123 | # for sub_left_s, sub_right_s in zip(left_sentence, right_sentence): 124 | # temp = [idx, sub_left_s, sub_right_s, y] 125 | # similar_data.append(temp) 126 | 127 | 128 | def trim(text): 129 | for rule, region in PATTERN: 130 | text = rule.sub(region, text) 131 | sentence = list(jieba.cut(text)) 132 | for ii, word in enumerate(sentence): 133 | if word in SYNONYM_WRONG: 134 | word = SYNONYM_WRONG.get(word, word) 135 | sentence[ii] = word 136 | return sentence 137 | 138 | 139 | def build_vocab(text, max_len): 140 | """ 141 | 构建词库 142 | :param text: text = [sentence] 143 | :param max_len: int 144 | :return: 145 | """ 146 | vocab = [] 147 | for sentence in text: 148 | vocab.extend(sentence) 149 | count = collections.Counter(vocab).most_common() 150 | vocab = {v: k + 2 for k, (v, _) in enumerate(count)} 151 | vocab[PAD] = PAD2ID 152 | vocab[UNK] = UNK2ID 153 | 154 | v = Vocab() 155 | v.word2idx = vocab 156 | 
v.max_len = max_len 157 | return v 158 | 159 | 160 | def process_label(y): 161 | result = [] 162 | num = 0 163 | for label in y: 164 | if label == '1': 165 | num += 1 166 | try: 167 | result.append(int(label)) 168 | except: 169 | result.append(0) 170 | print("正样本数\t{}\t负样本数\t{}".format(num, len(y) - num)) 171 | return result 172 | 173 | 174 | def preprocessor(synonym=False): 175 | """数据预处理""" 176 | if synonym: 177 | data = read_csv(ATEC_NLP_DATA) 178 | data.extend(read_csv(ADD_ATEC_NLP_DATA)) 179 | init_num = len(data) 180 | train_data, dev_data = train_test_split(data, test_size=0.1, random_state=50) 181 | extended_corpus(train_data) 182 | extended_corpus(dev_data, False, 'dev') 183 | # expand_num = len(train_data) + len(dev_data) 184 | # print("初始语料\t{}\t扩展语料\t{}\t新增语料\t{}".format(init_num, expand_num, expand_num - init_num)) 185 | # else: 186 | train_data = read_csv(EXPEND_ATEC_NLP_DATA.format('train')) 187 | dev_data = read_csv(EXPEND_ATEC_NLP_DATA.format('dev')) 188 | train_idx, train_left_x, train_right_x, train_y = zip(*train_data) 189 | dev_idx, dev_left_x, dev_right_x, dev_y = zip(*dev_data) 190 | 191 | train_left_x = split_data(train_left_x) 192 | 193 | train_right_x = split_data(train_right_x) 194 | dev_left_x = split_data(dev_left_x) 195 | dev_right_x = split_data(dev_right_x) 196 | train_y = process_label(train_y) 197 | dev_y = process_label(dev_y) 198 | max_len = max(len(kk) for kk in train_left_x + train_right_x + dev_right_x + dev_left_x) 199 | vocab = build_vocab(train_left_x + train_right_x + dev_right_x + dev_left_x, max_len) 200 | 201 | print("最大长度\t{}\t词汇量\t{}".format(max_len, len(vocab.word2idx))) 202 | 203 | train_left_x, train_left_len = pad_sequence(train_left_x, vocab, max_len) 204 | train_right_x, train_right_len = pad_sequence(train_right_x, vocab, max_len) 205 | dev_left_x, dev_left_len = pad_sequence(dev_left_x, vocab, max_len) 206 | dev_right_x, dev_right_len = pad_sequence(dev_right_x, vocab, max_len) 207 | 208 | data = { 209 | "train_l_x": train_left_x, 210 | "train_r_x": train_right_x, 211 | "train_l_len": train_left_len, 212 | "train_r_len": train_right_len, 213 | "train_y": train_y, 214 | "val_l_x": dev_left_x, 215 | "val_r_x": dev_right_x, 216 | "val_l_len": dev_left_len, 217 | "val_r_len": dev_right_len, 218 | "val_y": dev_y, 219 | } 220 | save_data(DATA_PKL, data) 221 | save_data(VOCAB_PKL, vocab) 222 | return data, vocab 223 | 224 | 225 | def split_data(data): 226 | result = [] 227 | if WordChar == 'char': 228 | for sentence in data: 229 | sentence = sentence.replace(' ', '') 230 | new_sentence = [char for char in sentence] 231 | result.append(new_sentence) 232 | else: 233 | for sentence in data: 234 | sentence = sentence.split(' ') 235 | result.append(sentence) 236 | return result 237 | 238 | 239 | def pad_sequence(data, vocab, max_len): 240 | """ 241 | 补全数据 242 | :param data: 243 | :param vocab: 244 | :param max_len: 245 | :return: 246 | """ 247 | seqs_data = [] 248 | seqs_len = [] 249 | for sentence in data: 250 | seq_len = len(sentence) 251 | seqs_len.append(len(sentence)) 252 | sentence = [vocab.word2idx.get(kk, UNK2ID) for kk in sentence] + [PAD2ID] * (max_len - seq_len) 253 | seqs_data.append(sentence[:max_len]) 254 | return seqs_data, seqs_len 255 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # @author: Linlifang 5 | # @file: utils.py 6 | # @time: 
18-6-27下午6:13 7 | import csv 8 | 9 | try: 10 | import cPickle as pickle 11 | except: 12 | import pickle 13 | try: 14 | import sys 15 | 16 | reload(sys) 17 | sys.setdefaultencoding('utf8') 18 | except: 19 | pass 20 | 21 | 22 | def read_csv(filename, delimiter='\t'): 23 | """ 24 | 读取csv 25 | :param filename: 26 | :param delimiter: 27 | :return: 28 | """ 29 | import codecs 30 | with codecs.open(filename, 'r', encoding='utf-8') as fp: 31 | data = [[ii for ii in each] for each in csv.reader(fp, delimiter=delimiter)] 32 | return data 33 | 34 | 35 | def load_text(filename): 36 | """ 37 | 加载数据 38 | :param filename: 39 | :return: 40 | """ 41 | data = [] 42 | with open(filename, 'r') as fp: 43 | for idx, line in enumerate(fp): 44 | line = line.strip('\n') 45 | tokens = line.split() 46 | data.append(tokens) 47 | return data 48 | 49 | 50 | def load_data(filename): 51 | """ 52 | 加载词汇信息 53 | :return: 54 | """ 55 | try: 56 | with open(filename, 'rb') as fp: 57 | data = pickle.load(fp) 58 | except: 59 | with open('data/vocab2.pkl', 'rb') as fp: 60 | data = pickle.load(fp) 61 | return data 62 | 63 | 64 | def save_data(filename, data): 65 | with open(filename, 'wb') as fp: 66 | pickle.dump(data, fp) 67 | -------------------------------------------------------------------------------- /core/word_embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-17 下午5:08 4 | # @Author : 林利芳 5 | # @File : word_embedding.py 6 | import numpy as np 7 | 8 | from config.config import WORD2VEC_DATA 9 | 10 | 11 | class Vocab(object): 12 | def __init__(self): 13 | self.word2vec = [] 14 | self.word2idx = {'': 0, '': 1} 15 | self.max_len = 0 16 | 17 | def add_word(self, word, vector): 18 | self.word2idx[word] = len(self.word2idx) 19 | self.word2vec.append(vector) 20 | 21 | def load_word_vectors(self): 22 | with open(WORD2VEC_DATA, 'r') as f: 23 | vocab_size, embedding_dim = [int(_) for _ in f.readline().strip().split(' ')] 24 | self.word2vec = [[0.0] * embedding_dim] 25 | self.word2vec.append(np.random.uniform(-0.25, 0.25, embedding_dim).round(6).tolist()) 26 | lines = f.readlines() 27 | for line in lines: 28 | word, vector = line.strip().split(' ', 1) 29 | self.add_word(word, [float(_) for _ in vector.split(' ')]) 30 | self.word2vec = np.array(self.word2vec).astype(np.float32) 31 | -------------------------------------------------------------------------------- /data/corpus.txt: -------------------------------------------------------------------------------- 1 | 怎么 2 | 怎样 3 | 如何 4 | 更改 5 | 更换 6 | 更新 7 | 修改 8 | 未 9 | 没有 10 | 可以 11 | 为什么 12 | 为何 13 | 为啥 14 | 零时额度 15 | 临时额度 16 | 这么久 17 | 降低 18 | 下降 19 | 日息 20 | 不能 21 | 不让 22 | 不可以 23 | 能不能 24 | 行不行 25 | 可不可以 26 | 用不了 27 | 不能用 28 | 被冻结 29 | 被封了 30 | 下月 31 | 下个月 32 | 蚂蚁借呗 33 | 借呗 34 | 花呗 35 | 花贝 36 | 花唄 37 | 花被 38 | 蚂蚁借呗 39 | 蚂蚁花呗 40 | 蚂蚁花贝 41 | 蚂蚁花唄 42 | 蚂蚁花被 43 | *** 44 | ofo 45 | 借呗 46 | 余额宝 47 | 代扣完 48 | 更改成 49 | 用了 50 | 届不了 51 | 借不了 52 | 上个月 53 | 上月 54 | 老有 55 | 总有 -------------------------------------------------------------------------------- /logdir/graph/match_pyramid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/match_pyramid -------------------------------------------------------------------------------- /logdir/graph/siamese.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/phychaos/TextSimilar/5c3e23bceba3e2aebf5c2db390ab1ddeb728e30e/logdir/graph/siamese.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-18 下午5:46 4 | # @Author : 林利芳 5 | # @File : main.py 6 | import os 7 | import sys 8 | 9 | from core.load_data import get_feed_dict, load_test_data, save_test_result 10 | from config.config import checkpoint_dir, TEST_DATA, TEST_RESULT 11 | from model.match_pyramid import MatchPyramidNetwork 12 | from model.rnn_siamese import RnnSiameseNetwork 13 | from config.hyperparams import HyperParams as hp 14 | import tensorflow as tf 15 | import numpy as np 16 | 17 | 18 | def test(filename=TEST_DATA, outfile=TEST_RESULT, network='rnn'): 19 | checkpoint_file = checkpoint_dir.format(network) 20 | idx, left_x, left_len, right_x, right_len, vocab = load_test_data(filename) 21 | y = np.ones_like(idx) 22 | vocab_size = len(vocab.word2idx) 23 | if network == 'rnn': 24 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False) 25 | elif network == 'match_pyramid': 26 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, hp.batch_size, False) 27 | else: 28 | return 29 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=0) 30 | with sv.managed_session() as sess: 31 | predicts = [] 32 | for feed_dict, start_batch in get_feed_dict(model, left_x, right_x, left_len, right_len, y, hp.batch_size): 33 | pre_y, distince = sess.run([model.pre_y, model.distance], feed_dict=feed_dict) 34 | predicts.extend(pre_y[start_batch:]) 35 | save_test_result(outfile, idx, predicts) 36 | 37 | 38 | if __name__ == "__main__": 39 | test(sys.argv[1], sys.argv[2], 'rnn') 40 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:17 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /model/cnn_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:07 4 | # @Author : 林利芳 5 | # @File : rnn_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import CnnParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class CnnSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | key, 
value = self.siamese() 27 | self.distance, self.pre_y = self.similar(key, value) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :return: 36 | """ 37 | x = tf.concat([self.left_x, self.right_x], axis=0) 38 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 39 | # layers embedding multi_head_attention rnn 40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed") 41 | 42 | # output = self.transformer(embed, x) 43 | inputs = tf.expand_dims(embed, -1) 44 | output = self.cnn_layer(inputs, 1) 45 | output = tf.expand_dims(output, -1) 46 | output = self.cnn_layer(output, 2) 47 | output = self.attention(embed, output) 48 | key, value = tf.split(output, 2, axis=0) 49 | return key, value 50 | 51 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 52 | """ 53 | 创建双向RNN层 54 | :param inputs: 55 | :param seq_lens: 56 | :param seg: LSTM GRU F-LSTM, IndRNN 57 | :return: 58 | """ 59 | if seg == 'LSTM': 60 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 61 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 62 | 63 | elif seg == 'GRU': 64 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 65 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 66 | else: 67 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 68 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 69 | # 双向rnn 70 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 71 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 72 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 73 | output = tf.add(fw_output, bw_output) 74 | return output 75 | 76 | def cnn_layer(self, inputs, layer=1): 77 | """ 78 | 卷积层 卷积核2,3,4,5 激活层relu 池化层 size=2 79 | :param inputs: batch T * T 80 | :param layer: batch T * T 81 | :return: 82 | """ 83 | outputs = [] 84 | d_dim, channel = inputs.get_shape().as_list()[-2:] 85 | for ii, width in enumerate(hp.kernel): 86 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)): 87 | weight = tf.Variable(tf.truncated_normal([width, d_dim, channel, hp.channel], stddev=0.1, name='w')) 88 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0)) 89 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, d_dim, 1], padding='SAME') # batch T T channel 90 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC")) 91 | 92 | output = tf.reshape(output, shape=[-1, self.max_len, hp.channel]) 93 | outputs.append(output) 94 | outputs = tf.concat(outputs, axis=-1) 95 | return outputs 96 | 97 | def transformer(self, embed, value): 98 | with tf.variable_scope("Transformer_Encoder"): 99 | # Positional Encoding 100 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 101 | # Dropout 102 | output = self.multi_head_block(embed) 103 | return output 104 | 105 | def multi_head_block(self, query, causality=False): 106 | """ 107 | 多头注意力机制 108 | :param query: 109 | :param causality: 110 | :return: 111 | """ 112 | for i in range(hp.num_blocks): 113 | with tf.variable_scope("num_blocks_{}".format(i)): 114 | # multi head Attention ( self-attention) 115 | query = multihead_attention( 116 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 117 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 118 | 
scope="self_attention") 119 | # Feed Forward 120 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 121 | return query 122 | 123 | def loss_layer(self): 124 | """ 125 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 126 | :return: 127 | """ 128 | y = tf.cast(self.y, tf.float32) 129 | with tf.name_scope("output"): 130 | loss_p = tf.square(1 - self.distance) / 4 131 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 132 | loss_m = tf.square(mask * self.distance) 133 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 134 | return loss 135 | 136 | def attention(self, embed, query): 137 | """ 138 | 注意力机制 139 | :param embed: 140 | :param query: 141 | :return: 142 | """ 143 | output = tf.reduce_mean(query, axis=1) 144 | return output 145 | with tf.name_scope("attention"): 146 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 147 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 148 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 149 | value = tf.concat([embed, query], axis=-1) 150 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 151 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 152 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 153 | attention = tf.nn.softmax(attention, axis=-1) 154 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 155 | 156 | output = tf.reduce_sum(attention * query, axis=1) 157 | output = layer_normalize(output) 158 | return output 159 | 160 | @staticmethod 161 | def similar(key, value): 162 | """ 163 | cosine(key,value) = key * value/(|key|*|value|) 164 | :param key: 165 | :param value: 166 | :return: 167 | """ 168 | dot_value = tf.reduce_sum(key * value, axis=-1) 169 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 170 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps) 171 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar") 172 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 173 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 174 | return distance, pre_y 175 | 176 | def predict(self): 177 | correct_predictions = tf.equal(self.pre_y, self.y) 178 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 179 | return accuracy 180 | 181 | def optimize(self): 182 | """ 183 | 优化器 184 | :return: 185 | """ 186 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 187 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 188 | return train_op 189 | -------------------------------------------------------------------------------- /model/match_pyramid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 上午11:17 4 | # @Author : 林利芳 5 | # @File : match_pyramid.py 6 | import tensorflow as tf 7 | from config.hyperparams import MatchPyramidParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class MatchPyramidNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | 
self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | outputs = self.match_pyramid() 27 | outputs, self.pre_y = self.multi_dense_layer(outputs) 28 | self.acc = self.predict() 29 | self.loss = self.loss_layer(outputs) 30 | self.train_op = self.optimize() 31 | 32 | def match_pyramid(self): 33 | """ 34 | pyramid 35 | :return: 36 | """ 37 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, 38 | scope="left_embed") 39 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, 40 | scope="right_embed") 41 | outputs = self.match_text(left_embed, right_embed) 42 | outputs = self.cnn_layer(outputs, 1) 43 | outputs = self.cnn_layer(outputs, 2) 44 | return outputs 45 | 46 | @staticmethod 47 | def match_text(left_embed, right_embed): 48 | """ 49 | 文本匹配 cosine dot binary 50 | :param left_embed: 词嵌入 batch * T * D 51 | :param right_embed: 词嵌入 batch * T * D 52 | :return: 53 | """ 54 | with tf.variable_scope("match-text"): 55 | dot_output = tf.matmul(left_embed, tf.transpose(right_embed, [0, 2, 1])) # batch * T * T 56 | left_norm = tf.sqrt(tf.matmul(left_embed, tf.transpose(left_embed, [0, 2, 1]))+hp.eps) 57 | right_norm = tf.sqrt(tf.matmul(right_embed, tf.transpose(right_embed, [0, 2, 1]))+hp.eps) 58 | cosine_outputs = tf.div(dot_output, left_norm * right_norm) 59 | binary_outputs = tf.cast(tf.equal(cosine_outputs, 1), tf.float32) 60 | dot_output = tf.expand_dims(dot_output, axis=-1) 61 | cosine_outputs = tf.expand_dims(cosine_outputs, axis=-1) 62 | binary_outputs = tf.expand_dims(binary_outputs, axis=-1) 63 | 64 | outputs = tf.concat([dot_output, cosine_outputs, binary_outputs], axis=-1) 65 | print(outputs.get_shape().as_list()) 66 | return dot_output 67 | 68 | @staticmethod 69 | def cnn_layer(inputs, layer=1): 70 | """ 71 | 卷积层 卷积核2,3,4,5 激活层relu 池化层 size=2 72 | :param inputs: batch T * T 73 | :param layer: batch T * T 74 | :return: 75 | """ 76 | outputs = [] 77 | channel = inputs.get_shape().as_list()[-1] 78 | for ii, width in enumerate(hp.kernel): 79 | with tf.variable_scope("cnn_{}_{}_layer".format(layer, ii + 1)): 80 | weight = tf.Variable(tf.truncated_normal([width, width, channel, hp.channel], stddev=0.1, name='w')) 81 | bias = tf.get_variable('bias', [hp.channel], initializer=tf.constant_initializer(0.0)) 82 | output = tf.nn.conv2d(inputs, weight, strides=[1, 1, 1, 1], padding='SAME') # batch T T channel 83 | output = tf.nn.relu(tf.nn.bias_add(output, bias, data_format="NHWC")) 84 | pool = tf.nn.max_pool(output, ksize=[1, hp.pool_size, hp.pool_size, 1], strides=[1, 1, 1, 1], 85 | padding='VALID') 86 | outputs.append(pool) 87 | outputs = tf.concat(outputs, axis=-1) 88 | return outputs 89 | 90 | @staticmethod 91 | def multi_dense_layer(inputs): 92 | """ 93 | 多层感知机 T*T*channel -> dense_size ->2 94 | :param inputs: batch T T channel 95 | :return: 96 | """ 97 | _, width, height, channel = inputs.get_shape().as_list() 98 | size = width * height * channel 99 | inputs = tf.reshape(inputs, shape=[-1, size]) 100 | with tf.variable_scope("dense_layer"): 101 | w = tf.get_variable(name='w', 
dtype=tf.float32, shape=[size, hp.dense_size]) 102 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[hp.dense_size]) 103 | outputs = layer_normalize(tf.matmul(inputs, w) + b, ) 104 | 105 | with tf.variable_scope("logit_layer"): 106 | w = tf.get_variable(name='w', dtype=tf.float32, shape=[hp.dense_size, 2]) 107 | b = tf.get_variable(name='b', dtype=tf.float32, shape=[2]) 108 | outputs = tf.nn.softmax(tf.matmul(outputs, w) + b, axis=-1) 109 | pre_y = tf.cast(tf.argmax(outputs, axis=-1), dtype=tf.int32) 110 | return outputs, pre_y 111 | 112 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 113 | """ 114 | 创建双向RNN层 115 | :param inputs: 116 | :param seq_lens: 117 | :param seg: LSTM GRU F-LSTM, IndRNN 118 | :return: 119 | """ 120 | if seg == 'LSTM': 121 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 122 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 123 | 124 | elif seg == 'GRU': 125 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 126 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 127 | else: 128 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 129 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 130 | # 双向rnn 131 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 132 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 133 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 134 | output = tf.add(fw_output, bw_output) 135 | return output 136 | 137 | def transformer(self, embed, value): 138 | with tf.variable_scope("Transformer_Encoder"): 139 | # Positional Encoding 140 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 141 | # Dropout 142 | output = self.multi_head_block(embed) 143 | return output 144 | 145 | def multi_head_block(self, query, causality=False): 146 | """ 147 | 多头注意力机制 148 | :param query: 149 | :param causality: 150 | :return: 151 | """ 152 | for i in range(hp.num_blocks): 153 | with tf.variable_scope("num_blocks_{}".format(i)): 154 | # multi head Attention ( self-attention) 155 | query = multihead_attention( 156 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 157 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 158 | scope="self_attention") 159 | # Feed Forward 160 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 161 | return query 162 | 163 | def loss_layer(self, inputs): 164 | """ 165 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 166 | :return: 167 | """ 168 | y = tf.cast(self.y, tf.float32) 169 | with tf.name_scope("loss_layer"): 170 | loss_p = y * tf.log(tf.clip_by_value(inputs[:, -1], hp.eps, 1.0)) 171 | loss_m = (1 - y) * tf.log(tf.clip_by_value(inputs[:, 0], hp.eps, 1.0)) 172 | loss = -tf.reduce_sum(loss_p + loss_m) 173 | return loss 174 | 175 | def attention(self, embed, query): 176 | """ 177 | 注意力机制 178 | :param embed: 179 | :param query: 180 | :return: 181 | """ 182 | with tf.name_scope("attention"): 183 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 184 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 185 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 186 | value = tf.concat([embed, query], axis=-1) 187 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 188 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 189 | attention = tf.reshape(attention, shape=[-1, 
self.max_len]) 190 | attention = tf.nn.softmax(attention, axis=-1) 191 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 192 | 193 | output = tf.reduce_sum(attention * query, axis=1) 194 | output = layer_normalize(output) 195 | return output 196 | 197 | def predict(self): 198 | correct_predictions = tf.equal(self.pre_y, self.y) 199 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 200 | return accuracy 201 | 202 | def optimize(self): 203 | """ 204 | 优化器 205 | :return: 206 | """ 207 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 208 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 209 | return train_op 210 | -------------------------------------------------------------------------------- /model/module/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-25 下午6:02 4 | # @Author : 林利芳 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /model/module/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @时间 : 18-12-11 下午5:44 4 | # @作者 : Lin lifang 5 | # @文件 : feature.py 6 | from utils.utils import read_template 7 | import numpy as np 8 | 9 | 10 | class Feature(object): 11 | def __init__(self, fd=5): 12 | self.fd = fd 13 | self.fss = None 14 | self.bf_size = 0 15 | self.uf_size = 0 16 | self.f_size = 0 17 | self.num_k = 0 18 | self.node_obs = dict() 19 | self.edge_obs = dict() 20 | self.oby_dict = dict() 21 | self.node_fs = [] 22 | self.edge_fs = [] 23 | self.tp_list = [ 24 | ['U00', ['-2', '0']], 25 | ['U01', ['-1', '0']], 26 | ['U02', ['0', '0']], 27 | ['U03', ['1', '0']], 28 | ['U04', ['2', '0']], 29 | ['U05', ['-2', '0'], ['-1', '0'], ['0', '0']], 30 | ['U06', ['-1', '0'], ['0', '0'], ['1', '0']], 31 | ['U07', ['0', '0'], ['1', '0'], ['2', '0']], 32 | ['U08', ['-1', '0'], ['0', '0']], 33 | ['U09', ['0', '0'], ['1', '0']], 34 | ['B'], ] 35 | 36 | def process_features(self, texts): 37 | """ 38 | 特征提取 39 | :param texts: 序列文本 [[['你',],['好',]],[['你',],['好',]]] 40 | :return: 41 | """ 42 | print("特征提取...") 43 | uf_obs = dict() 44 | bf_obs = dict() 45 | 46 | for text in texts: 47 | seq_uf, seq_bf = self.feature_vector(text) 48 | for loc_id, (loc_uf, loc_bf) in enumerate(zip(seq_uf, seq_bf)): 49 | for fs in loc_bf: 50 | fs_id = bf_obs.get(fs) 51 | bf_obs[fs] = fs_id + 1 if fs_id is not None else 1 52 | for fs in loc_uf: 53 | fs_id = uf_obs.get(fs) 54 | uf_obs[fs] = fs_id + 1 if fs_id is not None else 1 55 | 56 | node_fs = [key for key, v in sorted(uf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd] 57 | edge_fs = [key for key, v in sorted(bf_obs.items(), key=lambda x: x[1], reverse=True) if v >= self.fd] 58 | self.node_obs = {key: kk * self.num_k for kk, key in enumerate(node_fs)} 59 | self.edge_obs = {key: kk * self.num_k * self.num_k for kk, key in enumerate(edge_fs)} 60 | 61 | self.uf_size = len(node_fs) * self.num_k 62 | self.bf_size = len(edge_fs) * self.num_k * self.num_k 63 | self.f_size = self.uf_size + self.bf_size 64 | print("B 特征:\t{}\nU 特征:\t{}\n总特征:\t{}\n".format(self.bf_size, self.uf_size, self.f_size)) 65 | 66 | def feature_vector(self, text, init=True): 67 | """ 68 | 特征序列化 69 | :param text: 70 | :param init: 71 | :return: 72 | """ 73 | seq_bf = [] 74 | seq_uf = [] 
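        # Walk every position of the sequence and expand the CRF feature templates
        # there: "U" templates yield unigram (node) observations collected in seq_uf,
        # while the "B" template yields bigram (edge) observations collected in seq_bf.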
75 | for loc_id in range(len(text)): 76 | loc_uf, loc_bf = self.expand_observation(text, loc_id, init) 77 | seq_bf.append(loc_bf) 78 | seq_uf.append(loc_uf) 79 | return seq_uf, seq_bf 80 | 81 | def expand_observation(self, sentence, loc_id, init=True): 82 | """ 83 | expend the observation at loc_id for sequence 84 | :param sentence: 字符序列 85 | :param loc_id: 字符在sentence的位置序号 86 | :param init: 是否初始化 87 | :return: 88 | """ 89 | loc_uf = [] 90 | loc_bf = [] 91 | for tp in self.tp_list: 92 | fs = tp[0] 93 | for li in tp[1::]: 94 | row = loc_id + int(li[0]) 95 | col = int(li[1]) 96 | if len(sentence) > row >= 0: 97 | if len(sentence[row][col]) > col >= 0: 98 | fs += ":" + sentence[row][col] 99 | else: 100 | fs += ':B' + li[0] 101 | if fs[0] == "U": 102 | if init: 103 | loc_uf.append(fs) 104 | else: 105 | fs_id = self.node_obs.get(fs) 106 | if fs_id is not None: 107 | loc_uf.append(fs_id) 108 | if fs[0] == "B": 109 | if init: 110 | loc_bf.append(fs) 111 | else: 112 | fs_id = self.edge_obs.get(fs) 113 | if fs_id is not None: 114 | loc_bf.append(fs_id) 115 | return loc_uf, loc_bf 116 | 117 | def cal_observe_on(self, texts, init=False): 118 | """ 119 | 获取文本特征 [[['U:你','U:你:好'],['U:你','U:你:好'],[]],[],[]] =[[[145,456,566],[3455,]],[]] 120 | :param texts: 121 | :param init: 122 | :return: 123 | """ 124 | self.node_fs = [] 125 | self.edge_fs = [] 126 | for text in texts: 127 | seq_uf, seq_bf = self.feature_vector(text, init) 128 | self.node_fs.append(seq_uf) 129 | self.edge_fs.append(seq_bf) 130 | return self.node_fs, self.edge_fs 131 | 132 | def cal_fss(self, labels, y0): 133 | """ 134 | 统计特征数量 每个特征对应 num_k 个特征 135 | :param labels: 标签 136 | :param y0: 起始值0 137 | :return: 138 | """ 139 | self.fss = np.zeros((self.f_size,)) 140 | fss_b = self.fss[0:self.bf_size] 141 | fss_u = self.fss[self.bf_size:] 142 | for seq_id, label in enumerate(labels): 143 | y_p = y0 144 | for loc_id, y in enumerate(label): 145 | for fs_id in self.node_fs[seq_id][loc_id]: 146 | fss_u[fs_id + y] += 1.0 147 | for fs_id in self.edge_fs[seq_id][loc_id]: 148 | fss_b[fs_id + y_p * self.num_k + y] += 1.0 149 | y_p = y 150 | 151 | def save_feature(self): 152 | result = ['#CRF Feature Templates.\n\n'] 153 | for tp in self.tp_list: 154 | feature = tp[0] + ':' 155 | for start, end in tp[1:]: 156 | feature += '%x[' + start + ',' + end + ']' 157 | result.append(feature) 158 | result.append('\n\n#U') 159 | u_feature = list(sorted(self.node_obs.keys(), key=lambda x: x)) 160 | result.extend(u_feature) 161 | with open('feature.txt', 'w', encoding='utf-8') as fp: 162 | fp.write('\n'.join(result)) 163 | 164 | def process_state(self, labels): 165 | """ 166 | 状态预处理 167 | :param labels: 168 | :return: 169 | """ 170 | new_label = [] 171 | oby_id = 0 172 | for sentence in labels: 173 | s_label = [] 174 | for label in sentence: 175 | label_id = self.oby_dict.get(label) 176 | if label_id is None: 177 | label_id = oby_id 178 | self.oby_dict[label] = oby_id 179 | oby_id += 1 180 | s_label.append(label_id) 181 | new_label.append(s_label) 182 | self.num_k = len(self.oby_dict) 183 | return new_label 184 | 185 | def __call__(self, texts, labels, template_file, y0=0, *args, **kwargs): 186 | if template_file: 187 | self.tp_list = read_template(template_file) 188 | self.seq_lens = [len(x) for x in labels] 189 | labels = self.process_state(labels) 190 | self.process_features(texts) 191 | self.cal_observe_on(texts) 192 | self.cal_fss(labels, y0) 193 | self.save_feature() 194 | -------------------------------------------------------------------------------- 
/model/module/modules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-25 上午10:37 4 | # @Author : 林利芳 5 | # @File : modules.py 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | 12 | def layer_normalize(inputs, epsilon=1e-8, scope="ln", reuse=None): 13 | """Applies layer normalization. 14 | Args: 15 | inputs: A tensor with 2 or more dimensions, where the first dimension has 16 | `batch_size`. 17 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 18 | scope: Optional scope for `variable_scope`. 19 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 20 | Returns: 21 | A tensor with the same shape and data dtype as `inputs`. 22 | """ 23 | with tf.variable_scope(scope, reuse=reuse): 24 | inputs_shape = inputs.get_shape() 25 | params_shape = inputs_shape[-1:] 26 | 27 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 28 | beta = tf.Variable(tf.zeros(params_shape)) 29 | gamma = tf.Variable(tf.ones(params_shape)) 30 | normalized = (inputs - mean) / ((variance + epsilon) ** (.5)) 31 | outputs = gamma * normalized + beta 32 | 33 | return outputs 34 | 35 | 36 | def embedding(inputs, vocab_size, num_units, zero_pad=True, scale=True, scope="embedding", reuse=None): 37 | """Embeds a given tensor. 38 | Args: 39 | inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up in `lookup table`. 40 | vocab_size: An int. Vocabulary size. 41 | num_units: An int. Number of embedding hidden units. 42 | zero_pad: A boolean. If True, all the values of the fist row (id 0) should be constant zeros. 43 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 44 | scope: Optional scope for `variable_scope`. 45 | reuse: Boolean, whether to reuse the weights of a previous layer 46 | by the same name. 47 | 48 | Returns: 49 | A `Tensor` with one more rank than inputs's. The last dimensionality should be `num_units`. 50 | 51 | For example, 52 | 53 | ``` 54 | import tensorflow as tf 55 | 56 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 57 | outputs = embedding(inputs, 6, 2, zero_pad=True) 58 | with tf.Session() as sess: 59 | sess.run(tf.global_variables_initializer()) 60 | print sess.run(outputs) 61 | >> 62 | [[[ 0. 0. 
] 63 | [ 0.09754146 0.67385566] 64 | [ 0.37864095 -0.35689294]] 65 | 66 | [[-1.01329422 -1.09939694] 67 | [ 0.7521342 0.38203377] 68 | [-0.04973143 -0.06210355]]] 69 | ``` 70 | 71 | ``` 72 | import tensorflow as tf 73 | 74 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 75 | outputs = embedding(inputs, 6, 2, zero_pad=False) 76 | with tf.Session() as sess: 77 | sess.run(tf.global_variables_initializer()) 78 | print sess.run(outputs) 79 | >> 80 | [[[-0.19172323 -0.39159766] 81 | [-0.43212751 -0.66207761] 82 | [ 1.03452027 -0.26704335]] 83 | 84 | [[-0.11634696 -0.35983452] 85 | [ 0.50208133 0.53509563] 86 | [ 1.22204471 -0.96587461]]] 87 | ``` 88 | """ 89 | with tf.variable_scope(scope, reuse=reuse): 90 | lookup_table = tf.get_variable('lookup_table', dtype=tf.float32, shape=[vocab_size, num_units], 91 | initializer=tf.contrib.layers.xavier_initializer()) 92 | if zero_pad: 93 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0) 94 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 95 | 96 | if scale: 97 | outputs = outputs * (num_units ** 0.5) 98 | 99 | return outputs 100 | 101 | 102 | def positional_encoding(inputs, num_units, zero_pad=True, scale=True, scope="positional_encoding", reuse=None): 103 | """Sinusoidal Positional_Encoding. 104 | 105 | Args: 106 | inputs: A 2d Tensor with shape of (N, T). 107 | num_units: Output dimensionality 108 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 109 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 110 | scope: Optional scope for `variable_scope`. 111 | reuse: Boolean, whether to reuse the weights of a previous layer 112 | by the same name. 113 | 114 | Returns: 115 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 116 | """ 117 | 118 | N, T = inputs.get_shape().as_list() 119 | with tf.variable_scope(scope, reuse=reuse): 120 | position_ind = tf.ones_like(inputs) * tf.range(T) 121 | 122 | # First part of the PE function: sin and cos argument 123 | position_enc = np.array([ 124 | [pos / np.power(10000, 2. * i / num_units) for i in range(num_units)] for pos in range(T)], dtype=np.float32) 125 | 126 | # Second part, apply the cosine to even columns and sin to odds. 127 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 128 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 129 | 130 | # Convert to a tensor 131 | lookup_table = tf.convert_to_tensor(position_enc) 132 | 133 | if zero_pad: 134 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0) 135 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 136 | 137 | if scale: 138 | outputs = outputs * num_units ** 0.5 139 | 140 | return outputs 141 | 142 | 143 | def multihead_attention( 144 | queries, keys, num_units=None, num_heads=8, dropout_rate=0, is_training=True, causality=False, 145 | scope="multihead_attention", reuse=None): 146 | """Applies multihead attention. 147 | Args: 148 | queries: A 3d tensor with shape of [N, T_q, C_q]. 149 | keys: A 3d tensor with shape of [N, T_k, C_k]. 150 | num_units: A scalar. Attention size. 151 | dropout_rate: A floating point number. 152 | is_training: Boolean. Controller of mechanism for dropout. 153 | causality: Boolean. If true, units that reference the future are masked. 154 | num_heads: An int. Number of heads. 155 | scope: Optional scope for `variable_scope`. 
156 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 157 | Returns: 158 | A 3d tensor with shape of (N, T_q, C) 159 | """ 160 | with tf.variable_scope(scope, reuse=reuse): 161 | # Set the fall back option for num_units 162 | if num_units is None: 163 | num_units = queries.get_shape().as_list()[-1] 164 | 165 | # Linear projections 166 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 167 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 168 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 169 | 170 | # Split and concat 171 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 172 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 173 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 174 | 175 | # Multiplication 176 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 177 | 178 | # Scale 179 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 180 | 181 | # Key Masking 182 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 183 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 184 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 185 | 186 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) 187 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 188 | 189 | # Causality = Future blinding 190 | if causality: 191 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 192 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k) 193 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 194 | 195 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1) 196 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 197 | 198 | # Activation 199 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 200 | 201 | # Query Masking 202 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 203 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 204 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 205 | outputs *= query_masks # broadcasting. (N, T_q, C) 206 | 207 | # Dropouts 208 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 209 | 210 | # Weighted sum 211 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 212 | 213 | # Restore shape 214 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C) 215 | 216 | # Residual connection 残差 217 | outputs += queries 218 | 219 | # Normalize 层归一化 220 | outputs = layer_normalize(outputs) # (N, T_q, C) 221 | 222 | return outputs 223 | 224 | 225 | def feedforward(inputs, num_units=[2048, 512], scope="multihead_attention", reuse=None): 226 | """Point-wise feed forward net. 227 | Args: 228 | inputs: A 3d tensor with shape of [N, T, C]. 229 | num_units: A list of two integers. 230 | scope: Optional scope for `variable_scope`. 231 | reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
232 | Returns: 233 | A 3d tensor with the same shape and dtype as inputs 234 | """ 235 | with tf.variable_scope(scope, reuse=reuse): 236 | # Inner layer 237 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, "activation": tf.nn.relu, 238 | "use_bias": True,"reuse":False} 239 | outputs = tf.layers.conv1d(**params) 240 | 241 | # Readout layer 242 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, "activation": None, "use_bias": True,"reuse":False} 243 | outputs = tf.layers.conv1d(**params) 244 | 245 | # Residual connection 246 | outputs += inputs 247 | 248 | # Normalize 249 | outputs = layer_normalize(outputs) 250 | 251 | return outputs 252 | 253 | 254 | def label_smoothing(inputs, epsilon=0.1): 255 | """Applies label smoothing. See https://arxiv.org/abs/1512.00567. 256 | 257 | Args: 258 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 259 | epsilon: Smoothing rate. 260 | 261 | For example, 262 | 263 | ``` 264 | import tensorflow as tf 265 | inputs = tf.convert_to_tensor([[[0, 0, 1], [0, 1, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0], [0, 1, 0]]], tf.float32) 266 | 267 | outputs = label_smoothing(inputs) 268 | 269 | with tf.Session() as sess: 270 | print(sess.run([outputs])) 271 | 272 | >> 273 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 274 | [ 0.03333334, 0.93333334, 0.03333334], 275 | [ 0.93333334, 0.03333334, 0.03333334]], 276 | 277 | [[ 0.93333334, 0.03333334, 0.03333334], 278 | [ 0.93333334, 0.03333334, 0.03333334], 279 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 280 | ``` 281 | """ 282 | K = inputs.get_shape().as_list()[-1] # number of channels 283 | return ((1 - epsilon) * inputs) + (epsilon / K) 284 | -------------------------------------------------------------------------------- /model/module/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-28 上午10:54 4 | # @Author : 林利芳 5 | # @File : rnn.py 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from tensorflow.python.framework import constant_op 11 | from tensorflow.python.framework import dtypes 12 | from tensorflow.python.layers import base as base_layer 13 | from tensorflow.python.ops import array_ops, clip_ops 14 | from tensorflow.python.ops import init_ops 15 | from tensorflow.python.ops import math_ops 16 | from tensorflow.python.ops import nn_ops 17 | from tensorflow.python.platform import tf_logging as logging 18 | from tensorflow.python.ops.rnn_cell_impl import LayerRNNCell, LSTMStateTuple 19 | 20 | _BIAS_VARIABLE_NAME = "bias" 21 | _WEIGHTS_VARIABLE_NAME = "kernel" 22 | 23 | 24 | class ForgetLSTMCell(LayerRNNCell): 25 | """Basic LSTM recurrent network cell. 26 | 27 | The implementation is based on: http://arxiv.org/abs/1409.2329. 28 | 29 | We add forget_bias (default: 1) to the biases of the forget gate in order to 30 | reduce the scale of forgetting in the beginning of the training. 31 | 32 | It does not allow cell clipping, a projection layer, and does not 33 | use peep-hole connections: it is the basic baseline. 34 | 35 | For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell} 36 | that follows. 37 | """ 38 | 39 | def __init__(self, num_units, forget_bias=1.0, 40 | state_is_tuple=True, activation=None, reuse=None, name=None): 41 | """Initialize the basic LSTM cell. 
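Unlike the standard BasicLSTMCell, this variant computes only a candidate and a forget gate: the input gate is tied to the forget gate (i = 1 - f) and the cell state is returned directly as the hidden state (see `call` below).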
42 | 43 | Args: 44 | num_units: int, The number of units in the LSTM cell. 45 | forget_bias: float, The bias added to forget gates (see above). 46 | Must set to `0.0` manually when restoring from CudnnLSTM-trained 47 | checkpoints. 48 | state_is_tuple: If True, accepted and returned states are 2-tuples of 49 | the `c_state` and `m_state`. If False, they are concatenated 50 | along the column axis. The latter behavior will soon be deprecated. 51 | activation: Activation function of the inner states. Default: `tanh`. 52 | reuse: (optional) Python boolean describing whether to reuse variables 53 | in an existing scope. If not `True`, and the existing scope already has 54 | the given variables, an error is raised. 55 | name: String, the name of the layer. Layers with the same name will 56 | share weights, but to avoid mistakes we require reuse=True in such 57 | cases. 58 | 59 | When restoring from CudnnLSTM-trained checkpoints, must use 60 | `CudnnCompatibleLSTMCell` instead. 61 | """ 62 | super(ForgetLSTMCell, self).__init__(_reuse=reuse, name=name) 63 | if not state_is_tuple: 64 | logging.warn("%s: Using a concatenated state is slower and will soon be " 65 | "deprecated. Use state_is_tuple=True.", self) 66 | 67 | # Inputs must be 2-dimensional. 68 | self.input_spec = base_layer.InputSpec(ndim=2) 69 | 70 | self._num_units = num_units 71 | self._forget_bias = forget_bias 72 | self._state_is_tuple = state_is_tuple 73 | self._activation = activation or math_ops.tanh 74 | 75 | @property 76 | def state_size(self): 77 | return (LSTMStateTuple(self._num_units, self._num_units) 78 | if self._state_is_tuple else 2 * self._num_units) 79 | 80 | @property 81 | def output_size(self): 82 | return self._num_units 83 | 84 | def build(self, inputs_shape): 85 | if inputs_shape[1].value is None: 86 | raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" 87 | % inputs_shape) 88 | 89 | input_depth = inputs_shape[1].value 90 | h_depth = self._num_units 91 | self._kernel = self.add_variable( 92 | _WEIGHTS_VARIABLE_NAME, 93 | shape=[input_depth + h_depth, 2 * self._num_units]) 94 | self._bias = self.add_variable( 95 | _BIAS_VARIABLE_NAME, 96 | shape=[2 * self._num_units], 97 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 98 | 99 | self.built = True 100 | 101 | def call(self, inputs, state): 102 | """Long short-term memory cell (LSTM). 103 | 104 | Args: 105 | inputs: `2-D` tensor with shape `[batch_size, input_size]`. 106 | state: An `LSTMStateTuple` of state tensors, each shaped 107 | `[batch_size, self.state_size]`, if `state_is_tuple` has been set to 108 | `True`. Otherwise, a `Tensor` shaped 109 | `[batch_size, 2 * self.state_size]`. 110 | 111 | Returns: 112 | A pair containing the new hidden state, and the new state (either a 113 | `LSTMStateTuple` or a concatenated state, depending on 114 | `state_is_tuple`). 115 | """ 116 | sigmoid = math_ops.sigmoid 117 | one = constant_op.constant(1, dtype=dtypes.int32) 118 | # Parameters of gates are concatenated into one multiply for efficiency. 
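# Unlike a standard LSTM, the kernel created in `build()` has shape [input_depth + h_depth, 2 * num_units],
# so only the candidate j and the forget gate f are produced; the input gate is the complement i = 1 - f
# and there is no output gate, which is why new_h below is simply new_c.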
119 | if self._state_is_tuple: 120 | c, h = state 121 | else: 122 | c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one) 123 | 124 | gate_inputs = math_ops.matmul( 125 | array_ops.concat([inputs, h], 1), self._kernel) 126 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias) 127 | 128 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 129 | j, f = array_ops.split( 130 | value=gate_inputs, num_or_size_splits=2, axis=one) 131 | i = 1 - f 132 | forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype) 133 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 134 | # performance improvement. So using those at the cost of readability. 135 | add = math_ops.add 136 | multiply = math_ops.multiply 137 | new_c = add(multiply(c, sigmoid(add(f, forget_bias_tensor))), multiply(sigmoid(i), self._activation(j))) 138 | new_h = new_c 139 | 140 | if self._state_is_tuple: 141 | new_state = LSTMStateTuple(new_c, new_h) 142 | else: 143 | new_state = array_ops.concat([new_c, new_h], 1) 144 | return new_h, new_state 145 | 146 | 147 | class IndRNNCell(LayerRNNCell): # 继承 LayerRNNCell 148 | 149 | def __init__(self, 150 | num_units, 151 | recurrent_min_abs=0, 152 | recurrent_max_abs=None, 153 | recurrent_kernel_initializer=None, 154 | input_kernel_initializer=None, 155 | activation=None, 156 | reuse=None, 157 | name=None): 158 | super(IndRNNCell, self).__init__(_reuse=reuse, name=name) 159 | 160 | self.input_spec = base_layer.InputSpec(ndim=2) 161 | 162 | # initialization 163 | self._num_units = num_units 164 | self._recurrent_min_abs = recurrent_min_abs 165 | 166 | self._recurrent_max_abs = recurrent_max_abs 167 | self._recurrent_recurrent_kernel_initializer = recurrent_kernel_initializer 168 | self._input_kernel_initializer = input_kernel_initializer 169 | self._activation = activation or nn_ops.relu 170 | 171 | @property 172 | def state_size(self): 173 | return self._num_units 174 | 175 | @property 176 | def output_size(self): 177 | return self._num_units 178 | 179 | def build(self, inputs_shape): 180 | '''construct the IndRNN Cell''' 181 | if inputs_shape[1].value is None: 182 | raise ValueError("Expected input shape[1] is known") 183 | 184 | input_depth = inputs_shape[1] 185 | if self._input_kernel_initializer is None: 186 | self._input_kernel_initializer = init_ops.random_normal_initializer(mean=0, 187 | stddev=1e-3) 188 | # matrix W 189 | self._input_kernel = self.add_variable( 190 | "input_kernel", 191 | shape=[input_depth, self._num_units], 192 | initializer=self._input_kernel_initializer 193 | ) 194 | 195 | if self._recurrent_recurrent_kernel_initializer is None: 196 | self._recurrent_recurrent_kernel_initializer = init_ops.constant_initializer(1.) 
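# The recurrent kernel below is a vector of length num_units rather than a full matrix, so the
# recurrence in `call()` is element-wise: h_t = activation(W x_t + u * h_{t-1} + b).
# The optional recurrent_min_abs / recurrent_max_abs bounds clip |u| to keep that recurrence stable.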
197 | 198 | # matrix U 199 | self._recurrent_kernel = self.add_variable( 200 | "recurrent_kernel", 201 | shape=[self._num_units], 202 | initializer=self._recurrent_recurrent_kernel_initializer 203 | ) 204 | 205 | # Clip the U to min - max 206 | if self._recurrent_min_abs: 207 | abs_kernel = math_ops.abs(self._recurrent_kernel) 208 | min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs) 209 | self._recurrent_kernel = math_ops.multiply( 210 | math_ops.sign(self._recurrent_kernel), 211 | min_abs_kernel 212 | ) 213 | if self._recurrent_max_abs: 214 | self._recurrent_kernel = clip_ops.clip_by_value( 215 | self._recurrent_kernel, 216 | -self._recurrent_max_abs, 217 | self._recurrent_max_abs 218 | ) 219 | 220 | self._bias = self.add_variable( 221 | "bias", 222 | shape=[self._num_units], 223 | initializer=init_ops.zeros_initializer(dtype=self.dtype) 224 | ) 225 | # built finished 226 | self.built = True 227 | 228 | def call(self, inputs, state): 229 | '''output = new state = activation(W * x + U (*) h_t-1 + b)''' 230 | 231 | gate_inputs = math_ops.matmul(inputs, self._input_kernel) 232 | # (*) 233 | state_update = math_ops.multiply(state, self._recurrent_kernel) 234 | gate_inputs = math_ops.add(gate_inputs, state_update) 235 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias) 236 | output = self._activation(gate_inputs) 237 | return output, output 238 | -------------------------------------------------------------------------------- /model/module/templates.txt: -------------------------------------------------------------------------------- 1 | # Unigram 2 | 3 | U00:%x[-2,0] 4 | U01:%x[-1,0] 5 | U02:%x[0,0] 6 | U03:%x[1,0] 7 | U04:%x[2,0] 8 | U05:%x[-2,0]/%x[-1,0]/%x[0,0] 9 | U06:%x[-1,0]/%x[0,0]/%x[1,0] 10 | U07:%x[0,0]/%x[1,0]/%x[2,0] 11 | U08:%x[-1,0]/%x[0,0] 12 | U09:%x[0,0]/%x[1,0] 13 | 14 | # Bigram 15 | B -------------------------------------------------------------------------------- /model/rnn_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:07 4 | # @Author : 林利芳 5 | # @File : rnn_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import RnnParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class RnnSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | key, value = self.siamese() 27 | self.distance, self.pre_y = self.similar(key, value) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :return: 36 | """ 37 | x = tf.concat([self.left_x, self.right_x], axis=0) 38 | seq_lens = 
tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 39 | # layers embedding multi_head_attention rnn 40 | embed = embedding(x, vocab_size=self.vocab_size, num_units=self.embedding_size, scale=True, scope="embed") 41 | 42 | # output = self.transformer(embed, x) 43 | output = self.rnn_layer(embed, seq_lens) 44 | output = self.attention(embed, output) 45 | key, value = tf.split(output, 2, axis=0) 46 | return key, value 47 | 48 | def rnn_layer(self, inputs, seq_lens, seg=hp.seg): 49 | """ 50 | 创建双向RNN层 51 | :param inputs: 52 | :param seq_lens: 53 | :param seg: LSTM GRU F-LSTM, IndRNN 54 | :return: 55 | """ 56 | if seg == 'LSTM': 57 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 58 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 59 | 60 | elif seg == 'GRU': 61 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 62 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 63 | else: 64 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 65 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 66 | # 双向rnn 67 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 68 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 69 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 70 | output = tf.add(fw_output, bw_output) 71 | return output 72 | 73 | def transformer(self, embed, value): 74 | with tf.variable_scope("Transformer_Encoder"): 75 | # Positional Encoding 76 | embed += positional_encoding(value, num_units=hp.num_units, zero_pad=False, scale=False, scope="post") 77 | # Dropout 78 | output = self.multi_head_block(embed) 79 | return output 80 | 81 | def multi_head_block(self, query, causality=False): 82 | """ 83 | 多头注意力机制 84 | :param query: 85 | :param causality: 86 | :return: 87 | """ 88 | for i in range(hp.num_blocks): 89 | with tf.variable_scope("num_blocks_{}".format(i)): 90 | # multi head Attention ( self-attention) 91 | query = multihead_attention( 92 | queries=query, keys=query, num_units=hp.num_units, num_heads=hp.num_heads, 93 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 94 | scope="self_attention") 95 | # Feed Forward 96 | query = feedforward(query, num_units=[4 * hp.num_units, hp.num_units]) 97 | return query 98 | 99 | def loss_layer(self): 100 | """ 101 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 102 | :return: 103 | """ 104 | y = tf.cast(self.y, tf.float32) 105 | with tf.name_scope("output"): 106 | loss_p = tf.square(1 - self.distance) / 4 107 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 108 | loss_m = tf.square(mask * self.distance) 109 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 110 | return loss 111 | 112 | def attention(self, embed, query): 113 | """ 114 | 注意力机制 115 | :param embed: 116 | :param query: 117 | :return: 118 | """ 119 | with tf.name_scope("attention"): 120 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 121 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 122 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 123 | value = tf.concat([embed, query], axis=-1) 124 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 125 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 126 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 127 | attention = tf.nn.softmax(attention, axis=-1) 128 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, 
hp.num_units]) 129 | 130 | output = tf.reduce_sum(attention * query, axis=1) 131 | output = layer_normalize(output) 132 | return output 133 | 134 | @staticmethod 135 | def similar(key, value): 136 | """ 137 | cosine(key,value) = key * value/(|key|*|value|) 138 | :param key: 139 | :param value: 140 | :return: 141 | """ 142 | dot_value = tf.reduce_sum(key * value, axis=-1) 143 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 144 | value_sqrt = tf.sqrt(tf.reduce_sum(tf.square(value), axis=-1) + hp.eps) 145 | distance = tf.div(dot_value, key_sqrt * value_sqrt, name="similar") 146 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 147 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 148 | return distance, pre_y 149 | 150 | def predict(self): 151 | correct_predictions = tf.equal(self.pre_y, self.y) 152 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 153 | return accuracy 154 | 155 | def optimize(self): 156 | """ 157 | 优化器 158 | :return: 159 | """ 160 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 161 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 162 | return train_op 163 | -------------------------------------------------------------------------------- /model/transformer_siamese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 19-1-22 上午10:48 4 | # @Author : 林利芳 5 | # @File : transformer_siamese.py 6 | import tensorflow as tf 7 | from config.hyperparams import HyperParams as hp 8 | from model.module.modules import embedding, positional_encoding, multihead_attention, feedforward, layer_normalize 9 | 10 | 11 | class TransformerSiameseNetwork(object): 12 | def __init__(self, vocab_size, embedding_size, max_len, batch_size, is_training=True, seg='LSTM'): 13 | self.vocab_size = vocab_size 14 | self.embedding_size = embedding_size 15 | self.max_len = max_len 16 | self.is_training = is_training 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | self.left_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="left_x") 20 | self.right_x = tf.placeholder(tf.int32, shape=(batch_size, max_len), name="right_x") 21 | self.y = tf.placeholder(tf.int32, shape=(batch_size,), name="target") 22 | self.left_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 23 | self.right_seq_lens = tf.placeholder(dtype=tf.int32, shape=[batch_size]) 24 | self.global_step = tf.train.create_global_step() 25 | 26 | query, key = self.siamese(seg) 27 | self.distance, self.pre_y = self.similar(query, key) 28 | self.accuracy = self.predict() 29 | self.loss = self.loss_layer() 30 | self.train_op = self.optimize() 31 | 32 | def siamese(self, seg): 33 | """ 34 | 孪生网络 transformer + rnn 35 | :param seg: 36 | :return: 37 | """ 38 | x = tf.concat([self.left_x, self.right_x], axis=0) 39 | seq_lens = tf.concat([self.left_seq_lens, self.right_seq_lens], axis=0) 40 | # layers embedding multi_head_attention rnn 41 | left_embed = embedding(self.left_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True, 42 | scope="lembed") 43 | right_embed = embedding(self.right_x, vocab_size=self.vocab_size, num_units=hp.num_units, scale=True, 44 | scope="rembed") 45 | 46 | query, key = self.transformer(left_embed, right_embed) 47 | # output = self.rnn_layer(embed, seq_lens, seg) 48 | query = self.attention(query, query) 49 | key = self.attention(key, key) 50 | return query, key 51 | 52 | 
def rnn_layer(self, inputs, seq_lens, seg): 53 | """ 54 | 创建双向RNN层 55 | :param inputs: 56 | :param seq_lens: 57 | :param seg: LSTM GRU F-LSTM, IndRNN 58 | :return: 59 | """ 60 | if seg == 'LSTM': 61 | fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 62 | bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hp.num_units) 63 | 64 | elif seg == 'GRU': 65 | fw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 66 | bw_cell = tf.nn.rnn_cell.GRUCell(num_units=hp.num_units) 67 | else: 68 | fw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 69 | bw_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hp.num_units) 70 | # 双向rnn 71 | (fw_output, bw_output), _ = tf.nn.bidirectional_dynamic_rnn( 72 | fw_cell, bw_cell, inputs, sequence_length=seq_lens, dtype=tf.float32) 73 | # 合并双向rnn的output batch_size * max_seq * (hidden_dim*2) 74 | output = tf.add(fw_output, bw_output) 75 | return output 76 | 77 | def transformer(self, query, key): 78 | with tf.variable_scope("Transformer_Encoder"): 79 | # Positional Encoding 80 | query += positional_encoding(self.left_x, num_units=hp.num_units, zero_pad=False, scale=False) 81 | key += positional_encoding(self.right_x, num_units=hp.num_units, zero_pad=False, scale=False) 82 | # Dropout 83 | output = self.multi_head_block(query, key) 84 | return output 85 | 86 | def multi_head_block(self, query, key, causality=False): 87 | """ 88 | 多头注意力机制 89 | :param query: 90 | :param key: 91 | :param causality: 92 | :return: 93 | """ 94 | for i in range(hp.num_blocks): 95 | with tf.variable_scope("num_blocks_{}".format(i)): 96 | # multi head Attention ( self-attention) 97 | query = self.multihead_attention(query, query, name="query_attention", causality=causality) 98 | key = self.multihead_attention(key, key, name="key_attention", causality=causality) 99 | query = self.multihead_attention(query, key, name="query_key_attention") 100 | key = self.multihead_attention(key, query, name="query_key_attention") 101 | return query, key 102 | 103 | def multihead_attention(self, query, key, name="key_attention", causality=False): 104 | value = multihead_attention( 105 | queries=query, keys=key, num_units=hp.num_units, num_heads=hp.num_heads, 106 | dropout_rate=hp.dropout_rate, is_training=self.is_training, causality=causality, 107 | scope=name) 108 | # Feed Forward 109 | value = feedforward(value, num_units=[4 * hp.num_units, hp.num_units]) 110 | return value 111 | 112 | def loss_layer(self): 113 | """ 114 | 损失函数 L+ = (1-Ew)^2/4 L_ = max(Ex,0)^2 115 | :return: 116 | """ 117 | y = tf.cast(self.y, tf.float32) 118 | with tf.name_scope("output"): 119 | loss_p = tf.square(1 - self.distance) / 4 120 | mask = tf.sign(tf.nn.relu(self.distance - hp.margin)) 121 | loss_m = tf.square(mask * self.distance) 122 | loss = tf.reduce_sum(y * loss_p + (1 - y) * loss_m) 123 | return loss 124 | 125 | def attention(self, embed, query): 126 | """ 127 | 注意力机制 128 | :param embed: 129 | :param query: 130 | :return: 131 | """ 132 | with tf.name_scope("attention"): 133 | w = tf.get_variable(name="attention_w", shape=[2 * hp.num_units, hp.attention_size], dtype=tf.float32) 134 | b = tf.get_variable(name="attention_b", shape=[hp.attention_size], dtype=tf.float32) 135 | u = tf.get_variable(name="attention_u", shape=[hp.attention_size, 1], dtype=tf.float32) 136 | value = tf.concat([embed, query], axis=-1) 137 | value = tf.reshape(value, [-1, 2 * hp.num_units]) 138 | attention = tf.matmul(tf.tanh(tf.matmul(value, w) + b), u) 139 | attention = tf.reshape(attention, shape=[-1, self.max_len]) 140 | attention = 
tf.nn.softmax(attention, axis=-1) 141 | attention = tf.tile(tf.expand_dims(attention, axis=-1), multiples=[1, 1, hp.num_units]) 142 | 143 | output = tf.reduce_sum(attention * query, axis=1) 144 | output = layer_normalize(output) 145 | return output 146 | 147 | @staticmethod 148 | def similar(query, key): 149 | """ 150 | cosine(key,value) = key * value/(|key|*|value|) 151 | :param key: 152 | :param value: 153 | :return: 154 | """ 155 | dot_value = tf.reduce_sum(query * key, axis=-1) 156 | query_sqrt = tf.sqrt(tf.reduce_sum(tf.square(query), axis=-1) + hp.eps) 157 | key_sqrt = tf.sqrt(tf.reduce_sum(tf.square(key), axis=-1) + hp.eps) 158 | distance = tf.div(dot_value, key_sqrt * query_sqrt, name="similar") 159 | pre_y = tf.sign(tf.nn.relu(distance - hp.margin)) 160 | pre_y = tf.cast(pre_y, tf.int32, name='pre') 161 | return distance, pre_y 162 | 163 | def predict(self): 164 | correct_predictions = tf.equal(self.pre_y, self.y) 165 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 166 | return accuracy 167 | 168 | def optimize(self): 169 | """ 170 | 优化器 171 | :return: 172 | """ 173 | optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 174 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 175 | return train_op 176 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 18-12-29 下午3:06 4 | # @Author : 林利芳 5 | # @File : run.py 6 | import os 7 | from core.load_data import load_train_data, get_feed_dict, print_info, preprocessor 8 | from config.config import checkpoint_dir, VOCAB_PKL 9 | from core.utils import load_data 10 | from model.rnn_siamese import RnnSiameseNetwork 11 | from model.match_pyramid import MatchPyramidNetwork 12 | from model.cnn_siamese import CnnSiameseNetwork 13 | from model.transformer_siamese import TransformerSiameseNetwork 14 | from config.hyperparams import HyperParams as hp 15 | import tensorflow as tf 16 | 17 | 18 | def run(network='rnn'): 19 | checkpoint_file = checkpoint_dir.format(network) 20 | if not os.path.exists(checkpoint_file): 21 | os.mkdir(checkpoint_file) 22 | train_l_x, val_l_x, train_l_len, val_l_len, train_r_x, val_r_x, train_r_len, val_r_len, train_y, val_y = load_train_data() 23 | vocab = load_data(VOCAB_PKL) 24 | vocab_size = len(vocab.word2idx) 25 | 26 | batch_size = hp.batch_size 27 | if network == 'rnn': 28 | model = RnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 29 | elif network == 'match_pyramid': 30 | model = MatchPyramidNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 31 | elif network == 'cnn': 32 | model = CnnSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 33 | elif network == "transformer": 34 | model = TransformerSiameseNetwork(vocab_size, hp.embedding_size, vocab.max_len, batch_size, True) 35 | else: 36 | return 37 | sv = tf.train.Supervisor(graph=model.graph, logdir=checkpoint_file, save_model_secs=150) 38 | with sv.managed_session() as sess: 39 | print("start training...\n") 40 | for epoch in range(1, hp.num_epochs + 1): 41 | if sv.should_stop(): 42 | break 43 | train_loss = [] 44 | 45 | for feed_dict, _ in get_feed_dict(model, train_l_x, train_r_x, train_l_len, train_r_len, train_y, 46 | batch_size): 47 | loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict) 48 
| train_loss.append(loss) 49 | dev_loss = [] 50 | predicts = [] 51 | for feed_dict, start in get_feed_dict(model, val_l_x, val_r_x, val_l_len, val_r_len, val_y, batch_size): 52 | loss, gs, pre_y = sess.run([model.loss, model.global_step, model.pre_y], feed_dict=feed_dict) 53 | dev_loss.append(loss) 54 | predicts.extend(pre_y[start:]) 55 | print_info(epoch, gs, train_loss, dev_loss, val_y, predicts) 56 | 57 | 58 | if __name__ == "__main__": 59 | # preprocessor(True) 60 | network = 'transformer' # network = [rnn match_pyramid cnn transformer] 61 | run(network) 62 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | python main.py $1 $2 --------------------------------------------------------------------------------
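Note on the training objective: `rnn_siamese.py` and `transformer_siamese.py` above share the same cosine-similarity head and contrastive loss (L+ = (1 - Ew)^2 / 4 for matching pairs, L- = Ew^2 only when Ew exceeds the margin). The sketch below is a minimal NumPy re-implementation for checking the arithmetic outside the TensorFlow graph; it is not part of the repository, and the helper names, toy inputs, and default `margin` / `eps` values are illustrative assumptions mirroring `similar()` and `loss_layer()`.

```python
import numpy as np


def cosine_similarity(key, value, eps=1e-9):
    """Row-wise cosine similarity, mirroring the graph's `similar()` method."""
    dot = np.sum(key * value, axis=-1)
    key_norm = np.sqrt(np.sum(np.square(key), axis=-1) + eps)
    value_norm = np.sqrt(np.sum(np.square(value), axis=-1) + eps)
    return dot / (key_norm * value_norm)


def contrastive_loss(distance, y, margin=0.7):
    """y=1: pull cosine toward 1; y=0: penalize only pairs whose cosine exceeds the margin."""
    loss_p = np.square(1.0 - distance) / 4.0          # similar pairs: (1 - Ew)^2 / 4
    mask = (distance > margin).astype(np.float32)     # dissimilar pairs inside the margin cost nothing
    loss_m = np.square(mask * distance)
    return np.sum(y * loss_p + (1.0 - y) * loss_m)


# Toy usage: four sentence pairs with 8-dimensional encodings.
key = np.random.randn(4, 8).astype(np.float32)
value = np.random.randn(4, 8).astype(np.float32)
y = np.array([1, 0, 1, 0], dtype=np.float32)          # 1 = same meaning, 0 = different
d = cosine_similarity(key, value)
pred = (d > 0.7).astype(np.int32)                     # same decision rule as `pre_y`
print(contrastive_loss(d, y), pred)
```

With this decision rule, a pair is predicted as a paraphrase only when its cosine similarity exceeds the margin, matching the `pre_y` computation in both models.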