├── results
│   ├── model_epoch_20_gs_0
│   ├── model_epoch_20_gs_120
│   └── model_epoch_20_gs_240
├── requirements.txt
├── images
│   ├── train_loss.png
│   ├── NMT_res_BLEU.png
│   ├── train_accuracy.png
│   ├── rc_model_train_loss.png
│   ├── infersent_train_loss.png
│   ├── infersent_train_accuracy.png
│   ├── infersent_train_SNLI_loss.png
│   ├── rc_model_train_loss_200epoch.png
│   └── infersent_train_with_SNLI_accuracy.png
├── __pycache__
│   ├── modules.cpython-36.pyc
│   ├── data_load.cpython-36.pyc
│   ├── hyperparams.cpython-35.pyc
│   └── hyperparams.cpython-36.pyc
├── transformer_RC
│   ├── __pycache__
│   │   ├── models.cpython-35.pyc
│   │   ├── modules.cpython-35.pyc
│   │   ├── data_load.cpython-35.pyc
│   │   └── hyperparams.cpython-35.pyc
│   ├── layers
│   │   ├── __pycache__
│   │   │   ├── basic_rnn.cpython-35.pyc
│   │   │   ├── match_layer.cpython-35.pyc
│   │   │   └── pointer_net.cpython-35.pyc
│   │   ├── basic_rnn.py
│   │   ├── match_layer.py
│   │   └── pointer_net.py
│   ├── README.md
│   ├── prepro.py
│   ├── hyperparams.py
│   ├── data_load.py
│   ├── eval.py
│   ├── train.py
│   └── modules.py
├── en-zh_NMT
│   ├── README.MD
│   ├── prepro.py
│   ├── data_pre.py
│   ├── eval.py
│   ├── data_load.py
│   ├── train.py
│   └── modules.py
├── transformer_text_Classfication
│   ├── README.MD
│   ├── prepro.py
│   ├── data_load.py
│   ├── eval.py
│   ├── hyperparams.py
│   ├── data_pre.py
│   ├── train.py
│   └── modules.py
├── transformer_infersent
│   ├── data_prepare.py
│   ├── README.MD
│   ├── prepro.py
│   ├── hyperparams.py
│   ├── data_load.py
│   ├── eval.py
│   ├── train.py
│   └── modules.py
├── transformer_jieba
│   ├── prepro.py
│   ├── train.py
│   ├── data_pre.py
│   ├── eval.py
│   └── data_load.py
├── .circleci
│   └── config.yml
├── hyperparams.py
├── README.md
└── Models
    └── models.py

/results/model_epoch_20_gs_0:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/results/model_epoch_20_gs_120:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/results/model_epoch_20_gs_240:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | nltk>=3.2.4
 2 | numpy>=1.13.0
 3 | regex>=2017.6.7
 4 | tensorflow==1.12.0
--------------------------------------------------------------------------------
/images/train_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/train_loss.png
--------------------------------------------------------------------------------
/images/NMT_res_BLEU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/NMT_res_BLEU.png
--------------------------------------------------------------------------------
/images/train_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/train_accuracy.png
--------------------------------------------------------------------------------
/images/rc_model_train_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/rc_model_train_loss.png
-------------------------------------------------------------------------------- /images/infersent_train_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_loss.png -------------------------------------------------------------------------------- /__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /images/infersent_train_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_accuracy.png -------------------------------------------------------------------------------- /__pycache__/data_load.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/data_load.cpython-36.pyc -------------------------------------------------------------------------------- /images/infersent_train_SNLI_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_SNLI_loss.png -------------------------------------------------------------------------------- /__pycache__/hyperparams.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/hyperparams.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/hyperparams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/hyperparams.cpython-36.pyc -------------------------------------------------------------------------------- /images/rc_model_train_loss_200epoch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/rc_model_train_loss_200epoch.png -------------------------------------------------------------------------------- /images/infersent_train_with_SNLI_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_with_SNLI_accuracy.png -------------------------------------------------------------------------------- /transformer_RC/__pycache__/models.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/models.cpython-35.pyc -------------------------------------------------------------------------------- /transformer_RC/__pycache__/modules.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/modules.cpython-35.pyc 
--------------------------------------------------------------------------------
/transformer_RC/__pycache__/data_load.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/data_load.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/__pycache__/hyperparams.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/hyperparams.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/basic_rnn.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/basic_rnn.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/match_layer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/match_layer.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/pointer_net.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/pointer_net.cpython-35.pyc
--------------------------------------------------------------------------------
/en-zh_NMT/README.MD:
--------------------------------------------------------------------------------
 1 | # ***Second - zh-en NMT***
 2 | - The train and test data come from the `Web Inventory of Transcribed and Translated Talks` (**WIT3**); we train an English-Chinese translation model ([data source](https://wit3.fbk.eu/mt.php?release=2015-01)).
 3 | - Test result:
 4 | ![NMT result](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/NMT_res_BLEU.png)
 5 | 
 6 | 
--------------------------------------------------------------------------------
/transformer_text_Classfication/README.MD:
--------------------------------------------------------------------------------
 1 | # Result of Chinese sentence classification (char-level)
 2 | ```
 3 |               precision    recall  f1-score   support
 4 | 
 5 |            0       0.99      1.00      0.99       992
 6 |            1       1.00      0.99      0.99       980
 7 | 
 8 |    micro avg       0.99      0.99      0.99      1972
 9 |    macro avg       0.99      0.99      0.99      1972
10 | weighted avg       0.99      0.99      0.99      1972
11 | 
12 | Done
13 | ```
14 | 
--------------------------------------------------------------------------------
/transformer_RC/README.md:
--------------------------------------------------------------------------------
 1 | A reading comprehension model built with the transformer
 2 | - The architecture of this model combines the **transformer's** parallelized attention + **BiDAF query-aware passage states** + a **pointer network**.
 3 | 
 4 | 
 5 | - Train loss: ![loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/rc_model_train_loss.png)
 6 | 
 7 | - You may want to inspect the prediction results [here](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/results/rc_model_epoch_50_gs_10500)
 8 | 
 9 | Final result: **`Rouge-L: 0.2651, BLEU_1: 0.36.`**
10 | 
11 | Results are still being updated; welcome to star and follow.
12 | 
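To make the architecture summary above concrete, here is a minimal NumPy sketch of the two ideas it names: BiDAF-style query-aware fusion of passage states, followed by pointer-style start/end scoring over passage positions. The names `bidaf_fuse`, `pointer_spans`, `w_start`, and `w_end` are hypothetical illustrations; they do not mirror the repository's actual `layers/match_layer.py` or `layers/pointer_net.py`, which are written in TensorFlow.

```python
# Self-contained sketch (NumPy only) of query-aware passage fusion + span pointing.
# Shapes follow rc_Hyperparams: p_maxlen=200, q_maxlen=50, hidden_units=256.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def bidaf_fuse(passage, question):
    """passage: (P, d), question: (Q, d) encoder outputs -> query-aware states (P, 4d)."""
    sim = passage @ question.T                 # (P, Q) similarity matrix
    p2q = softmax(sim, axis=1) @ question      # passage-to-question attention, (P, d)
    q2p = softmax(sim.max(axis=1)) @ passage   # question-to-passage attention, (d,)
    q2p = np.tile(q2p, (passage.shape[0], 1))  # broadcast over passage positions
    return np.concatenate([passage, p2q, passage * p2q, passage * q2p], axis=1)

def pointer_spans(fused, w_start, w_end):
    """Score every passage position as the answer start / end, pointer-network style."""
    return softmax(fused @ w_start), softmax(fused @ w_end)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    P, Q, d = 200, 50, 256
    fused = bidaf_fuse(rng.normal(size=(P, d)), rng.normal(size=(Q, d)))
    p_start, p_end = pointer_spans(fused, rng.normal(size=4 * d), rng.normal(size=4 * d))
    print(int(p_start.argmax()), int(p_end.argmax()))
```

This only illustrates the data flow; the repository's TensorFlow implementation under `transformer_RC/` remains the authoritative version.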
--------------------------------------------------------------------------------
/transformer_infersent/data_prepare.py:
--------------------------------------------------------------------------------
1 | # encoding = utf-8 2 | # /usr/bin/python3 3 | 4 | import json 5 | 6 | #{'entailment', '-', 'contradiction', 'neutral'} 7 | hashmap = {'entailment':'0', 'contradiction':'1', 'neutral':'2'} 8 | 9 | 10 | def prepare(): 11 | train, dev, test = [[json.loads(line) for line in open('./snli_1.0/snli_1.0_{}.jsonl'.format(x)).readlines()]\ 12 | for x in ['train', 'dev', 'test']] 13 | 14 | train, dev, test = ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in train if x['gold_label'] in hashmap], ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in dev if x['gold_label'] in hashmap], ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in test if x['gold_label'] in hashmap] 15 | 16 | 17 | with open('./train.csv', 'w') as f1: 18 | for line in train: f1.write(line + '\n') 19 | 20 | with open('./dev.csv', 'w') as f2: 21 | for line in dev: f2.write(line + '\n') 22 | 23 | with open('./test.csv', 'w') as f3: 24 | for line in test: f3.write(line + '\n') 25 | 26 | 27 | if __name__ == '__main__': 28 | prepare() 29 | 30 | 
--------------------------------------------------------------------------------
/transformer_infersent/README.MD:
--------------------------------------------------------------------------------
 1 | ***We implemented a sentence entailment inference task with the transformer***
 2 | ---
 3 | **Data source**: [Stanford SNLI](https://nlp.stanford.edu/projects/snli/snli_1.0.zip)
 4 | 
 5 | - *Download the source data and unzip*: `wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip`
 6 | - *Preprocess data*: `python data_prepare.py && python prepro.py`
 7 | - *Train*: run `python train.py`
 8 | - *Eval*: run `python eval.py --task infersent`
 9 | 
10 | Experiment results:
11 | - accuracy:
12 | ![train accuracy](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_with_SNLI_accuracy.png)
13 | 
14 | - loss:
15 | ![train loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_SNLI_loss.png)
16 | 
17 | - eval result:
18 | ```
19 |               precision    recall  f1-score   support
20 | 
21 |            0       0.82      0.76      0.79      3358
22 |            1       0.77      0.80      0.79      3226
23 |            2       0.70      0.73      0.72      3208
24 | 
25 |     accuracy                           0.76      9792
26 |    macro avg       0.76      0.76      0.76      9792
27 | weighted avg       0.76      0.76      0.76      9792
28 | ```
29 | 
--------------------------------------------------------------------------------
/transformer_text_Classfication/prepro.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import feature_Block_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | 10 | #import regex 11 | import re 12 | from collections import Counter 13 | 14 | #import tokenize 15 | import jieba 16 | 17 | def make_vocab(fpath, fname): 18 | '''Constructs vocabulary. 19 | 20 | Args: 21 | fpath: A list. Input file paths. 22 | fname: A string. Output file name.
23 | 24 | Writes vocabulary line by line to `preprocessed/fname` 25 | ''' 26 | texts = [] 27 | for path in fpath: 28 | text = [x.strip().split()[1] for x in codecs.open(path, 'r', 'utf-8').readlines()] 29 | texts.extend(text) 30 | 31 | corpus = ''.join(texts) 32 | corpus = re.sub("[\s\p']", "", corpus) 33 | corpus = re.sub('[0-9]+', 'N', corpus) 34 | corpus = re.sub('[a-zA-Z]+', 'α', corpus) 35 | #words = jieba.cut(corpus) 36 | words = list(corpus) 37 | 38 | word2cnt = Counter(words) 39 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 40 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 41 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 42 | for word, cnt in word2cnt.most_common(len(word2cnt)): 43 | fout.write(u"{}\t{}\n".format(word, cnt)) 44 | 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | make_vocab([hp.trainset, hp.testset], "vocabs.txt") 50 | print("Done") 51 | -------------------------------------------------------------------------------- /en-zh_NMT/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | from hyperparams import seq2seq_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | import re 11 | from collections import Counter 12 | 13 | #import tokenize 14 | import jieba 15 | 16 | def make_vocab(fpath, fname, tokenizer = None): 17 | '''Constructs vocabulary. 18 | 19 | Args: 20 | fpath: A string. Input file path. 21 | fname: A string. Output file name. 22 | 23 | Writes vocabulary line by line to `preprocessed/fname` 24 | ''' 25 | #text = codecs.open(fpath, 'r', 'utf-8').read() 26 | text = open(fpath, 'r', encoding = 'utf-8').readlines() 27 | text = [line.strip() for line in text if not line.startswith("<")] 28 | print('length of senteces from path:{} is {}'.format(fpath, len(text))) 29 | text = ' '.join(text) 30 | 31 | if tokenizer == 'jieba': 32 | text = re.sub("[\s\p']", "", text) 33 | words = jieba.cut(text) 34 | elif tokenizer == None: 35 | text = re.sub("[^a-zA-Z]", " ", text) 36 | words = text.split() 37 | else: 38 | raise Exception('Could not find tokenizer...') 39 | 40 | word2cnt = Counter(words) 41 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 42 | with open('preprocessed/{}'.format(fname), 'w', encoding = 'utf-8') as fout: 43 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 44 | for word, cnt in word2cnt.most_common(len(word2cnt)): 45 | fout.write(u"{}\t{}\n".format(word, cnt)) 46 | 47 | 48 | 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | make_vocab(hp.source_train, "en.vocab.tsv") 54 | make_vocab(hp.target_train, "zh.vocab.tsv", tokenizer = 'jieba') 55 | print("Done") 56 | -------------------------------------------------------------------------------- /transformer_jieba/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | from hyperparams import seq2seq_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | import re 11 | from collections import Counter 12 | 13 | #import tokenize 14 | import jieba 15 | 16 | def make_vocab(fpath, fname, tokenizer = None): 17 | '''Constructs vocabulary. 18 | 19 | Args: 20 | fpath: A string. Input file path. 21 | fname: A string. Output file name. 
22 | 23 | Writes vocabulary line by line to `preprocessed/fname` 24 | ''' 25 | #text = codecs.open(fpath, 'r', 'utf-8').read() 26 | text = open(fpath, 'r', encoding = 'utf-8').readlines() 27 | text = [line.strip() for line in text if not line.startswith("<")] 28 | print('length of senteces from path:{} is {}'.format(fpath, len(text))) 29 | text = ' '.join(text) 30 | 31 | if tokenizer == 'jieba': 32 | text = re.sub("[\s\p']", "", text) 33 | words = jieba.cut(text) 34 | elif tokenizer == None: 35 | #text = re.sub("[^a-zA-Z]", " ", text) 36 | words = text.split() 37 | else: 38 | raise Exception('Could not find tokenizer...') 39 | 40 | word2cnt = Counter(words) 41 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 42 | with open('preprocessed/{}'.format(fname), 'w', encoding = 'utf-8') as fout: 43 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 44 | for word, cnt in word2cnt.most_common(len(word2cnt)): 45 | fout.write(u"{}\t{}\n".format(word, cnt)) 46 | 47 | 48 | 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | make_vocab(hp.source_train, "src.vocab.tsv") 54 | make_vocab(hp.target_train, "tgt.vocab.tsv", tokenizer = None) 55 | print("Done") 56 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | seq2seq: 4 | working_directory: ~/transfromer_NN_Block/en-zh_NMT 5 | docker: 6 | - image: circleci/python:3.5 7 | resource_class: middle 8 | parallelism: 4 9 | steps: 10 | - checkout 11 | - run: sudo pip install -r requirements.txt 12 | 13 | text_classfication: 14 | working_directory: ~/transfromer_NN_Block/transformer_text_Classfication 15 | resource_class: middle 16 | parallelism: 4 17 | docker: 18 | - image: circleci/python:3.5 19 | steps: 20 | - checkout 21 | - run: sudo pip install -r requirements.txt 22 | 23 | sentences_entailments: 24 | working_directory: ~/transfromer_NN_Block/transformer_infersent 25 | resource_class: middle 26 | parallelism: 4 27 | docker: 28 | - image: circleci/python:3.5 29 | steps: 30 | - checkout 31 | - run: sudo pip install -r requirements.txt 32 | - run: wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip 33 | - run: python data_prepare.py && python prepro.py 34 | - run: python train.py 35 | - run: python eval.py --task infersent 36 | 37 | transformer_jieba: 38 | working_directory: ~/transfromer_NN_Block/transformer_jieba 39 | docker: 40 | - image: circleci/python:3.5 41 | resource_class: middle 42 | parallelism: 4 43 | steps: 44 | - checkout 45 | - run: sudo pip install -r requirements.txt 46 | - run: python data_pre.py 47 | - run: python prepro.py 48 | - run: python train.py 49 | - run: python eval.py --task infersent 50 | 51 | 52 | 53 | workflows: 54 | version: 2 55 | build_and_test: 56 | jobs: 57 | - seq2seq 58 | - text_classfication -------------------------------------------------------------------------------- /transformer_infersent/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import infersent_Block_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | 10 | #import regex 11 | import re 12 | from collections import Counter 13 | 14 | #import tokenize 15 | import jieba 16 | 17 | def make_vocab(fpath, fname, lan = 'zh'): 18 | 
'''Constructs vocabulary. 19 | 20 | Args: 21 | fpath: A list. Input file paths. 22 | fname: A string. Output file name. 23 | 24 | Writes vocabulary line by line to `preprocessed/fname` 25 | ''' 26 | if lan == 'zh': 27 | texts = [] 28 | for path in fpath: 29 | text = [x.strip() for x in codecs.open(path, 'r', 'utf-8').readlines()] 30 | texts.extend(text) 31 | 32 | 33 | corpus = ''.join(texts) 34 | corpus = re.sub("[\s\p']", "", corpus) 35 | #replace numbers with NUM 36 | corpus = re.sub(r'[0-9]+', 'n', corpus) 37 | corpus = re.sub(r'[a-zA-Z]+', 'α', corpus) 38 | words = jieba.cut(corpus) 39 | elif lan == 'en': 40 | texts = [] 41 | for path in fpath: 42 | texts.extend([x.strip().split('<>', 1)[1] for x in codecs.open(path, 'r', 'utf-8').readlines()]) 43 | corpus = ' '.join(texts) 44 | corpus = re.sub(r"[^a-zA-Z]", " ", corpus) 45 | words = corpus.split() 46 | 47 | word2cnt = Counter(words) 48 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 49 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 50 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 51 | for word, cnt in word2cnt.most_common(len(word2cnt)): 52 | fout.write(u"{}\t{}\n".format(word, cnt)) 53 | 54 | 55 | 56 | 57 | if __name__ == '__main__': 58 | make_vocab([hp.trainset, hp.testset], "vocabs.txt", lan = 'en') 59 | print("Done") 60 | -------------------------------------------------------------------------------- /transformer_RC/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import rc_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | import pandas as pd 10 | 11 | #import regex 12 | import re 13 | from collections import Counter 14 | 15 | #import tokenize 16 | import jieba 17 | 18 | def make_vocab(fpath, fname, lan = 'zh'): 19 | '''Constructs vocabulary. 20 | 21 | Args: 22 | fpath: A list. Input file paths. 23 | fname: A string. Output file name. 
24 | 25 | Writes vocabulary line by line to `preprocessed/fname` 26 | ''' 27 | if lan == 'zh': 28 | texts = [] 29 | for path in fpath: 30 | data = pd.read_csv(path) 31 | q_text, p_text = list(data['question']), \ 32 | list(data['content1']) + list(data['content2']) + list(data['content3']) + \ 33 | list(data['content4']) + list(data['content5']) 34 | 35 | texts = q_text + p_text 36 | 37 | 38 | corpus = ''.join(texts) 39 | corpus = re.sub("[\s\p']", "", corpus) 40 | #replace numbers with NUM 41 | #corpus = re.sub(r'[0-9]+', ' n', corpus) 42 | #corpus = re.sub(r'[a-zA-Z]+', ' α', corpus) 43 | words = jieba.cut(corpus) 44 | elif lan == 'en': 45 | texts = [] 46 | for path in fpath: 47 | texts.extend([x.strip().split('<>', 1)[1] for x in codecs.open(path, 'r', 'utf-8').readlines()]) 48 | corpus = ' '.join(texts) 49 | corpus = re.sub(r"[^a-zA-Z]", " ", corpus) 50 | words = corpus.split() 51 | 52 | word2cnt = Counter(words) 53 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 54 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 55 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 56 | for word, cnt in word2cnt.most_common(len(word2cnt)): 57 | fout.write(u"{}\t{}\n".format(word, cnt)) 58 | 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | make_vocab([hp.trainset, hp.testset], "vocabs.txt") 64 | print("Done") 65 | -------------------------------------------------------------------------------- /transformer_infersent/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | class seq2seq_Hyperparams: 6 | '''Hyperparameters''' 7 | # data 8 | source_train = './datasets/zh-en/train.tags.zh-en.en' 9 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 10 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 11 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 12 | 13 | # training 14 | batch_size = 32 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'seq2seq_model_dir' # log directory 17 | 18 | # model 19 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 20 | # Feel free to increase this if you are ambitious. 21 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 22 | hidden_units = 512 # alias = C 23 | num_blocks = 5 # number of encoder/decoder blocks 24 | num_epochs = 20 25 | num_heads = 8 26 | dropout_rate = 0.1 27 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 28 | 29 | 30 | 31 | 32 | class feature_Block_Hyperparams: 33 | '''Hyperparameters''' 34 | # data 35 | trainset = './datasets/trainset.txt' 36 | testset = './datasets/testset.txt' 37 | 38 | 39 | # training 40 | batch_size = 4 # alias = N 41 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 42 | logdir = 'Block_model_dir' # log directory 43 | 44 | # model 45 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 46 | # Feel free to increase this if you are ambitious. 47 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 48 | hidden_units = 512 # alias = C 49 | num_blocks = 5 # number of encoder/decoder blocks 50 | num_epochs = 20 51 | num_heads = 8 52 | dropout_rate = 0.1 53 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
54 | n_class = 2 55 | 56 | 57 | class infersent_Block_Hyperparams: 58 | '''Hyperparameters''' 59 | # data 60 | trainset = './opensrc_dta/train.csv' 61 | testset = './opensrc_dta/test.csv' 62 | 63 | 64 | # training 65 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 66 | 67 | batch_size = 64 # alias = N 68 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 69 | logdir = 'infersent_model_dir' # log directory 70 | 71 | # model 72 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 73 | # Feel free to increase this if you are ambitious. 74 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 75 | hidden_units = 512 # alias = C 76 | num_blocks = 5 # number of encoder/decoder blocks 77 | num_epochs = 20 78 | num_heads = 8 79 | dropout_rate = 0.1 80 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 81 | #n_class = 2 82 | dropout_keep_prob = 0.55 83 | reg_lambda = 0.1 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /transformer_text_Classfication/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from __future__ import print_function 5 | from hyperparams import feature_Block_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import codecs 9 | import re 10 | from jieba import cut 11 | from collections import Counter 12 | 13 | tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} 14 | 15 | 16 | def load_vocabs(): 17 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/vocabs.txt', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] #raw code is hp.mincnt 18 | word2idx = {word: idx for idx, word in enumerate(vocab)} 19 | idx2word = {idx: word for idx, word in enumerate(vocab)} 20 | return word2idx, idx2word 21 | 22 | 23 | 24 | def create_data(corpus, labels): 25 | word2idx, idx2word = load_vocabs() 26 | 27 | 28 | # Index 29 | x_list, y_list, Sources, Targets = [], [], [], [] 30 | for sent, label in zip(corpus, labels): 31 | x = [word2idx.get(word, 1) for word in (sent + u" ").split()[:hp.maxlen]] # 1: OOV, : End of Text 32 | x_list.append(np.array(x)) 33 | 34 | 35 | # Pad 36 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 37 | 38 | for i, x in enumerate(x_list): 39 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 40 | 41 | return X, np.array(labels), corpus, labels 42 | 43 | 44 | 45 | def _refine(line): 46 | line = re.sub("[\s\p']", "", line) 47 | line = re.sub('[0-9]+', 'N', line) 48 | line = re.sub('[a-zA-Z]+', 'α', line) 49 | return ' '.join(list(line)) 50 | 51 | 52 | def load_train_data(tokenizer = None): 53 | if tokenizer == None: 54 | corpus = [line.strip().split() for line in codecs.open(hp.trainset, 'r', 'utf-8').readlines()] 55 | corpus = [line for line in corpus if line[0] in tagging] 56 | texts, labels = [_refine(line[1]) for line in corpus], [tagging[line[0]] for line in corpus] 57 | 58 | X, Y, Sources, labels = create_data(texts, labels) 59 | return X, Y 60 | 61 | def load_test_data(tokenizer = None): 62 | if tokenizer == None: 63 | corpus = [line.strip().split() for line in codecs.open(hp.testset, 'r', 'utf-8').readlines()] 64 | corpus = [line for line in corpus if line[0] in tagging] 65 | texts, labels = [_refine(line[1]) for line in corpus], [tagging[line[0]] for 
line in corpus] 66 | 67 | X, Y, Sources, labels = create_data(texts, labels) 68 | return X, Y, Sources, labels 69 | 70 | 71 | def get_batch_data(): 72 | # Load data 73 | X, Y = load_train_data() 74 | 75 | # calc total batch count 76 | num_batch = len(X) // hp.batch_size 77 | 78 | # Convert to tensor 79 | X = tf.convert_to_tensor(X, tf.int32) 80 | Y = tf.convert_to_tensor(Y, tf.int32) 81 | X, Y 82 | # Create Queues 83 | input_queues = tf.train.slice_input_producer([X, Y]) 84 | 85 | # create batch queues 86 | x, y = tf.train.shuffle_batch(input_queues, 87 | num_threads=8, 88 | batch_size=hp.batch_size, 89 | capacity=hp.batch_size*64, 90 | min_after_dequeue=hp.batch_size*32, 91 | allow_smaller_final_batch=False) 92 | 93 | return x, y, num_batch # (N, T), (N, T), () 94 | 95 | -------------------------------------------------------------------------------- /transformer_jieba/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | import tensorflow as tf 6 | 7 | from hyperparams import seq2seq_Hyperparams as hp 8 | from data_load import get_batch_data, load_en_vocab, load_zh_vocab 9 | from modules import * 10 | import os, codecs 11 | from tqdm import tqdm 12 | 13 | os.sys.path.append('../Models') 14 | from models import vanilla_transformer 15 | 16 | 17 | class Graph(): 18 | def __init__(self, is_training=True): 19 | self.graph = tf.Graph() 20 | with self.graph.as_default(): 21 | if is_training: 22 | self.x, self.y, self.num_batch = get_batch_data() # (N, T) 23 | else: # inference 24 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 25 | self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 26 | 27 | # define decoder inputs 28 | self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 29 | 30 | # Load vocabulary 31 | en2idx, idx2en = load_en_vocab() 32 | zh2idx, idx2zh = load_zh_vocab() 33 | 34 | # initialize transformer 35 | transformer = vanilla_transformer(hp, self.is_training) 36 | self.enc = transformer.encode(self.x, len(en2idx)) 37 | 38 | # Decoder 39 | self.dec = transformer.decode(self.decoder_inputs, self.enc, len(zh2idx), hp.maxlen) 40 | 41 | # Final linear projection 42 | self.logits = tf.layers.dense(self.dec, len(zh2idx)) 43 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 44 | self.istarget = tf.to_float(tf.not_equal(self.y, 0)) 45 | self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget)) 46 | tf.summary.scalar('acc', self.acc) 47 | 48 | if is_training: 49 | # Loss 50 | self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(zh2idx))) 51 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 52 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 53 | 54 | # Training Scheme 55 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 56 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 57 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 58 | 59 | # Summary 60 | tf.summary.scalar('mean_loss', self.mean_loss) 61 | self.merged = tf.summary.merge_all() 62 | 63 | if __name__ == '__main__': 64 | # Load vocabulary 65 | en2idx, idx2en = load_en_vocab() 66 | zh2idx, idx2zh = load_zh_vocab() 67 | 68 | # Construct graph 69 | g = Graph("train"); print("Graph loaded") 70 | 71 | # Start 
session 72 | sv = tf.train.Supervisor(graph=g.graph, 73 | logdir=hp.logdir, 74 | save_model_secs=0) 75 | with sv.managed_session() as sess: 76 | for epoch in range(1, hp.num_epochs+1): 77 | if sv.should_stop(): break 78 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 79 | sess.run(g.train_op) 80 | 81 | gs = sess.run(g.global_step) 82 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 83 | 84 | print("Done") 85 | 86 | 87 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 200 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 256 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 20 26 | num_heads = 8 27 | dropout_rate = 0.5 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | 30 | dropout_keep_prob = 0.55 31 | reg_lambda = 0.1 32 | use_dropout = True 33 | weight_decay = 0.1 34 | 35 | 36 | 37 | 38 | 39 | class seq2seq_Hyperparams: 40 | '''Hyperparameters''' 41 | # data 42 | source_train = './datasets/zh-en/train.tags.zh-en.en' 43 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 44 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 45 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 46 | 47 | # training 48 | batch_size = 32 # alias = N 49 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 50 | logdir = 'seq2seq_model_dir' # log directory 51 | 52 | # model 53 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 54 | # Feel free to increase this if you are ambitious. 55 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 56 | hidden_units = 512 # alias = C 57 | num_blocks = 5 # number of encoder/decoder blocks 58 | num_epochs = 20 59 | num_heads = 8 60 | dropout_rate = 0.1 61 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 62 | 63 | 64 | 65 | 66 | class feature_Block_Hyperparams: 67 | '''Hyperparameters''' 68 | # data 69 | trainset = './datasets/trainset.txt' 70 | testset = './datasets/testset.txt' 71 | 72 | 73 | # training 74 | batch_size = 4 # alias = N 75 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 76 | logdir = 'Block_model_dir' # log directory 77 | 78 | # model 79 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 80 | # Feel free to increase this if you are ambitious. 81 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 82 | hidden_units = 512 # alias = C 83 | num_blocks = 5 # number of encoder/decoder blocks 84 | num_epochs = 20 85 | num_heads = 8 86 | dropout_rate = 0.1 87 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
88 | n_class = 2 89 | 90 | 91 | class infersent_Block_Hyperparams: 92 | '''Hyperparameters''' 93 | # data 94 | trainset = './opensrc_dta/train.csv' 95 | testset = './opensrc_dta/test.csv' 96 | 97 | 98 | # training 99 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 100 | 101 | batch_size = 64 # alias = N 102 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 103 | logdir = 'infersent_model_dir' # log directory 104 | 105 | # model 106 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 107 | # Feel free to increase this if you are ambitious. 108 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 109 | hidden_units = 512 # alias = C 110 | num_blocks = 5 # number of encoder/decoder blocks 111 | num_epochs = 20 112 | num_heads = 8 113 | dropout_rate = 0.1 114 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 115 | dropout_keep_prob = 0.55 116 | reg_lambda = 0.1 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /transformer_text_Classfication/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import feature_Block_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from sklearn.metrics import classification_report 16 | 17 | 18 | 19 | 20 | def eval(task_name): 21 | # Load graph 22 | g = Graph(is_training=False) 23 | print("Graph loaded") 24 | 25 | # Load data 26 | X, _, Texts, Labels = load_test_data() 27 | 28 | word2idx, idx2word = load_vocabs() 29 | 30 | # Start session 31 | with g.graph.as_default(): 32 | sv = tf.train.Supervisor() 33 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 34 | ## Restore parameters 35 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 36 | print("Restored!") 37 | 38 | ## Get model name 39 | print('Model dir:', hp.logdir) 40 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 41 | print("Model name:", mname) 42 | 43 | ## Inference 44 | if not os.path.exists('results'): os.mkdir('results') 45 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 46 | list_of_refs, hypotheses = [], [] 47 | print("Iterator:", len(X), hp.batch_size) 48 | 49 | predict_label = [] 50 | for i in range(len(X) // hp.batch_size + 1): 51 | print('Step:\t', i, '/', len(X) // hp.batch_size) 52 | ### Get mini-batches 53 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 54 | sentences = Texts[i*hp.batch_size: (i+1)*hp.batch_size] 55 | labels = Labels[i*hp.batch_size: (i+1)*hp.batch_size] 56 | 57 | 58 | preds = sess.run(g.preds, {g.x:x}) 59 | preds = [int(x) for x in preds] 60 | predict_label.extend(preds) 61 | 62 | ### Write to file 63 | for sent, label, pred in zip(sentences, labels, preds): # sentence-wise 64 | #got = " ".join(idx2word[idx] for idx in pred).split("")[0].strip() 65 | fout.write("- sent: " + sent +"\n") 66 | fout.write('- label: {}, -predict: {} \n'.format(label, pred)) 67 | fout.flush() 68 | 69 | # bleu score 70 | if task_name == 'seq2seq': 71 | ref = target.split() 72 | hypothesis = got.split() 73 | if len(ref) > 3 and len(hypothesis) > 
3: 74 | list_of_refs.append([ref]) 75 | hypotheses.append(hypothesis) 76 | 77 | 78 | ## Calculate bleu score 79 | if task_name == 'seq2seq': 80 | score = corpus_bleu(list_of_refs, hypotheses) 81 | fout.write("Bleu Score = " + str(100*score)) 82 | elif task_name == 'classfication': 83 | assert len(Labels) == len(predict_label), 'The length of label and predicts\ 84 | are not alignmentted.' 85 | res = classification_report(Labels, predict_label) 86 | print(res) 87 | fout.write(res + '\n') 88 | 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 92 | parser.add_argument('--task', help='task name(default: classfication)') 93 | 94 | args = parser.parse_args() 95 | task_name = args.task 96 | eval(task_name) 97 | print("Done") 98 | 99 | 100 | -------------------------------------------------------------------------------- /transformer_RC/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 200 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 256 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 20 26 | num_heads = 8 27 | dropout_rate = 0.5 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | #n_class = 2 30 | dropout_keep_prob = 0.55 31 | reg_lambda = 0.1 32 | Passage_fuse = 'bi-rnn' # bi-rnn or Pooling 33 | use_dropout = True 34 | weight_decay = 0.1 35 | 36 | 37 | 38 | 39 | 40 | class seq2seq_Hyperparams: 41 | '''Hyperparameters''' 42 | # data 43 | source_train = './datasets/zh-en/train.tags.zh-en.en' 44 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 45 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 46 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 47 | 48 | # training 49 | batch_size = 32 # alias = N 50 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 51 | logdir = 'seq2seq_model_dir' # log directory 52 | 53 | # model 54 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 55 | # Feel free to increase this if you are ambitious. 56 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 57 | hidden_units = 512 # alias = C 58 | num_blocks = 5 # number of encoder/decoder blocks 59 | num_epochs = 20 60 | num_heads = 8 61 | dropout_rate = 0.1 62 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 63 | 64 | 65 | 66 | 67 | class feature_Block_Hyperparams: 68 | '''Hyperparameters''' 69 | # data 70 | trainset = './datasets/trainset.txt' 71 | testset = './datasets/testset.txt' 72 | 73 | 74 | # training 75 | batch_size = 4 # alias = N 76 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 77 | logdir = 'Block_model_dir' # log directory 78 | 79 | # model 80 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 81 | # Feel free to increase this if you are ambitious. 
82 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 83 | hidden_units = 512 # alias = C 84 | num_blocks = 5 # number of encoder/decoder blocks 85 | num_epochs = 20 86 | num_heads = 8 87 | dropout_rate = 0.1 88 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 89 | n_class = 2 90 | 91 | 92 | class infersent_Block_Hyperparams: 93 | '''Hyperparameters''' 94 | # data 95 | trainset = './opensrc_dta/train.csv' 96 | testset = './opensrc_dta/test.csv' 97 | 98 | 99 | # training 100 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 101 | 102 | batch_size = 64 # alias = N 103 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 104 | logdir = 'infersent_model_dir' # log directory 105 | 106 | # model 107 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 108 | # Feel free to increase this if you are ambitious. 109 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 110 | hidden_units = 512 # alias = C 111 | num_blocks = 5 # number of encoder/decoder blocks 112 | num_epochs = 20 113 | num_heads = 8 114 | dropout_rate = 0.1 115 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 116 | #n_class = 2 117 | dropout_keep_prob = 0.55 118 | reg_lambda = 0.1 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /transformer_RC/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, states = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c = [state.c for state in states] 45 | h = [state.h for state in states] 46 | states = h 47 | else: 48 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 50 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 51 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32 52 | ) 53 | states_fw, states_bw = states 54 | if rnn_type.endswith('lstm'): 55 | c_fw = [state_fw.c for state_fw in states_fw] 56 | h_fw = [state_fw.h for state_fw in states_fw] 57 | c_bw = [state_bw.c for state_bw in states_bw] 58 | h_bw = [state_bw.h for state_bw in states_bw] 59 | states_fw, states_bw = h_fw, h_bw 60 | if concat: 61 | outputs = tf.concat(outputs, 2) 62 | states = tf.concat([states_fw, states_bw], 1) 63 | else: 64 | outputs = outputs[0] + outputs[1] 65 | states = states_fw + states_bw 66 | return outputs, states 67 | 68 | 69 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 70 | """ 71 | Gets the RNN Cell 72 | Args: 73 | rnn_type: 'lstm', 'gru' or 'rnn' 74 | hidden_size: The size of hidden units 75 | layer_num: MultiRNNCell are used if layer_num > 1 76 | dropout_keep_prob: dropout in RNN 77 | Returns: 78 | An RNN Cell 79 | """ 80 | cells = [] 81 | for i in range(layer_num): 82 | if rnn_type.endswith('lstm'): 83 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 84 | elif rnn_type.endswith('gru'): 85 | cell = tc.rnn.GRUCell(num_units=hidden_size) 86 | elif rnn_type.endswith('rnn'): 87 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 88 | else: 89 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 90 | if dropout_keep_prob is not None: 91 | cell = tc.rnn.DropoutWrapper(cell, 92 | input_keep_prob=dropout_keep_prob, 93 | output_keep_prob=dropout_keep_prob) 94 | cells.append(cell) 95 | cells = tc.rnn.MultiRNNCell(cells, state_is_tuple=True) 96 | return cells 97 | 98 | 99 | -------------------------------------------------------------------------------- /transformer_jieba/data_pre.py: -------------------------------------------------------------------------------- 1 | #data preparation 2 | import codecs 3 | import os 4 | import argparse 5 | import jieba 6 | #from hyperparams import Hyperparams as hp 7 | 8 | 9 | def jieba_data_pre(): 10 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 11 | vocabset = f.readlines() 12 | vocabset = [x.strip() for x in vocabset] 13 | #print(vocabset[:10]) 14 | zh = '' 15 | en = '' 16 | 17 | for pair in vocabset: 18 | try: 19 | z, e = pair.strip().split('\t') 20 | zh += z + ' ' 21 | en += e + ' ' 22 | except: 23 | zh += '' 24 | en += '' 25 | 26 | 27 | zh_sent = zh.split('') 28 | en_sent = en.split('') 29 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 30 | 31 | files = [] 32 | for root, dirs, file in os.walk(".", topdown=False): 33 | files.append(file) 34 | #print(files) 35 | 36 | if 'train.tags.zh-en.zh' not in files: 37 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 38 | for i in zh_sent[:int(0.8*len(zh_sent))]: 39 | f.write(i+'\n') 40 | 41 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 42 | 
for i in en_sent[:int(0.8*len(en_sent))]: 43 | f.write(i+'\n') 44 | 45 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 46 | for i in zh_sent[int(0.8*len(zh_sent)):]: 47 | f.write(i+'\n') 48 | 49 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 50 | for i in en_sent[int(0.8*len(en_sent)):]: 51 | f.write(i+'\n') 52 | 53 | 54 | 55 | def text_sum_pre(): 56 | cnt_title_path = './dataset/content-title.txt' 57 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 58 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 59 | 60 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 61 | pad = ['', '', "", ""] 62 | content_vocabs, title_vocabs = {}, {} 63 | 64 | for x in content_set: 65 | vocabs = jieba.cut(x) 66 | for x in vocabs: 67 | if x not in content_vocabs: 68 | content_vocabs[x] = 1 69 | else: 70 | content_vocabs[x] += 1 71 | 72 | for x in sum_set: 73 | vocabs = jieba.cut(x) 74 | for x in vocabs: 75 | if x not in title_vocabs: 76 | title_vocabs[x] = 1 77 | else: 78 | title_vocabs[x] += 1 79 | 80 | #save vocab 81 | 82 | if not 'textSummary' in os.listdir('./preprocessed'): 83 | os.mkdir('./preprocessed/textSummary') 84 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 85 | for token in pad: 86 | f.write(token + '\t' + '1000000000' + '\n') 87 | for token, val in content_vocabs.items(): 88 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 89 | 90 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 91 | for token in pad: 92 | f.write(token + '\t' + '1000000000' + '\n') 93 | for token, val in title_vocabs.items(): 94 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 95 | 96 | 97 | if 'textSummary' in os.listdir('./dataset'): 98 | os._exit(0) 99 | else: 100 | os.mkdir('./dataset/textSummary') 101 | 102 | n = len(sum_set) 103 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 104 | for x in content_set[:int(0.8*n)]: 105 | f.write(x+'\n') 106 | 107 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 108 | for x in sum_set[:int(0.8*n)]: 109 | f.write(x+'\n') 110 | 111 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 112 | for x in content_set[int(0.8*n):]: 113 | f.write(x+'\n') 114 | 115 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 116 | for x in sum_set[int(0.8*n):]: 117 | f.write(x+'\n') 118 | 119 | 120 | def main(): 121 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 122 | parser.add_argument('--task', default = 'jieba', 123 | help='task name(default: tokenize)') 124 | 125 | args = parser.parse_args() 126 | task_name = args.task 127 | 128 | if task_name == 'jieba': jieba_data_pre() 129 | elif task_name == 'textsum': text_sum_pre() 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /en-zh_NMT/data_pre.py: -------------------------------------------------------------------------------- 1 | # encoding = 'utf-8' 2 | #/usr/bin/python3 3 | 4 | 5 | #data preparation 6 | import codecs 7 | import os 8 | import argparse 9 | import jieba 10 | #from hyperparams import Hyperparams as hp 11 | 12 | 13 | def jieba_data_pre(): 14 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 15 | vocabset = f.readlines() 16 | 
vocabset = [x.strip() for x in vocabset] 17 | #print(vocabset[:10]) 18 | zh = '' 19 | en = '' 20 | 21 | for pair in vocabset: 22 | try: 23 | z, e = pair.strip().split('\t') 24 | zh += z + ' ' 25 | en += e + ' ' 26 | except: 27 | zh += '' 28 | en += '' 29 | 30 | 31 | zh_sent = zh.split('') 32 | en_sent = en.split('') 33 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 34 | 35 | files = [] 36 | for root, dirs, file in os.walk(".", topdown=False): 37 | files.append(file) 38 | 39 | if 'train.tags.zh-en.zh' not in files: 40 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 41 | for i in zh_sent[:int(0.8*len(zh_sent))]: 42 | f.write(i+'\n') 43 | 44 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 45 | for i in en_sent[:int(0.8*len(en_sent))]: 46 | f.write(i+'\n') 47 | 48 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 49 | for i in zh_sent[int(0.8*len(zh_sent)):]: 50 | f.write(i+'\n') 51 | 52 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 53 | for i in en_sent[int(0.8*len(en_sent)):]: 54 | f.write(i+'\n') 55 | 56 | 57 | 58 | def text_sum_pre(): 59 | cnt_title_path = './dataset/content-title.txt' 60 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 61 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 62 | 63 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 64 | pad = ['', '', "", ""] 65 | content_vocabs, title_vocabs = {}, {} 66 | 67 | for x in content_set: 68 | vocabs = jieba.cut(x) 69 | for x in vocabs: 70 | if x not in content_vocabs: 71 | content_vocabs[x] = 1 72 | else: 73 | content_vocabs[x] += 1 74 | 75 | for x in sum_set: 76 | vocabs = jieba.cut(x) 77 | for x in vocabs: 78 | if x not in title_vocabs: 79 | title_vocabs[x] = 1 80 | else: 81 | title_vocabs[x] += 1 82 | 83 | #save vocab 84 | 85 | if not 'textSummary' in os.listdir('./preprocessed'): 86 | os.mkdir('./preprocessed/textSummary') 87 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 88 | for token in pad: 89 | f.write(token + '\t' + '1000000000' + '\n') 90 | for token, val in content_vocabs.items(): 91 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 92 | 93 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 94 | for token in pad: 95 | f.write(token + '\t' + '1000000000' + '\n') 96 | for token, val in title_vocabs.items(): 97 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 98 | 99 | 100 | if 'textSummary' in os.listdir('./dataset'): 101 | os._exit(0) 102 | else: 103 | os.mkdir('./dataset/textSummary') 104 | 105 | n = len(sum_set) 106 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 107 | for x in content_set[:int(0.8*n)]: 108 | f.write(x+'\n') 109 | 110 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 111 | for x in sum_set[:int(0.8*n)]: 112 | f.write(x+'\n') 113 | 114 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 115 | for x in content_set[int(0.8*n):]: 116 | f.write(x+'\n') 117 | 118 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 119 | for x in sum_set[int(0.8*n):]: 120 | f.write(x+'\n') 121 | 122 | 123 | def main(): 124 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 125 | parser.add_argument('--task', default = 'jieba', 126 | help='task 
name(default: tokenize)') 127 | 128 | args = parser.parse_args() 129 | task_name = args.task 130 | 131 | if task_name == 'jieba': jieba_data_pre() 132 | elif task_name == 'textsum': text_sum_pre() 133 | 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /transformer_infersent/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from __future__ import print_function 5 | from hyperparams import infersent_Block_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import codecs 9 | import re 10 | from jieba import cut 11 | 12 | 13 | def load_vocabs(): 14 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/vocabs.txt', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] #raw code is hp.mincnt 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | 20 | 21 | def create_data(s1, s2, labels): 22 | word2idx, idx2word = load_vocabs() 23 | 24 | #max token numbers 25 | max_token_num = len(word2idx.keys()) + 100 26 | 27 | # Index 28 | x1_list, x2_list, Sources, Targets = [], [], [], [] 29 | for sent1, sent2 in zip(s1, s2): 30 | x1 = [word2idx.get(word, 1) for word in (sent1 + u" ").split()[:hp.maxlen-5]] # 1: OOV, : End of Text 31 | x2 = [word2idx.get(word, 1) for word in (sent2 + u" ").split()[:hp.maxlen-5]] 32 | 33 | x1_list.append(np.array(x1)) 34 | x2_list.append(np.array(x2)) 35 | print('demo', x1_list[0], x2_list[0], labels[0]) 36 | 37 | # Pad 38 | X1 = np.zeros([len(x1_list), hp.maxlen], np.int32) 39 | X2 = np.zeros([len(x2_list), hp.maxlen], np.int32) 40 | 41 | for i, x in enumerate(x1_list): 42 | X1[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 43 | 44 | for i, x in enumerate(x2_list): 45 | X2[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 46 | 47 | labels = [int(x) for x in labels] 48 | 49 | return X1, X2, np.array(labels) 50 | 51 | 52 | def _refine(line, lan = 'zh'): 53 | #line = re.sub("[^\s\p{Latin}']", "", line) 54 | if lan == 'zh': 55 | line = re.sub("[\s\p']", "", line) 56 | line = re.sub(r'[0-9]+', 'n', line) 57 | line = re.sub(r'[a-zA-Z]+', 'α', line) 58 | line = jieba.cut(line) 59 | return ' '.join(list(line)) 60 | elif lan == 'en': 61 | line = re.sub("[^a-zA-Z]", " ", line) 62 | return line 63 | 64 | else: 65 | raise Exception('Havn\'t specified language!') 66 | return 67 | 68 | 69 | 70 | def load_train_data(tokenizer = None): 71 | corpus = [line.strip().split('<>') for line in codecs.open(hp.trainset, 'r', 'utf-8').readlines()[:100000]] 72 | s1, s2, labels = [_refine(line[1], lan = 'en') for line in corpus], [_refine(line[2], lan = 'en') for line in corpus], \ 73 | [int(line[0]) for line in corpus] 74 | 75 | X1, X2, Label = create_data(s1, s2, labels) 76 | return X1, X2, Label 77 | 78 | 79 | 80 | def load_test_data(tokenizer = None): 81 | corpus = [line.strip().split('<>') for line in codecs.open(hp.testset, 'r', 'utf-8').readlines()] 82 | s1, s2, labels = [_refine(line[1], lan = 'en') for line in corpus], [_refine(line[2], lan = 'en') for line in corpus], \ 83 | [line[0] for line in corpus] 84 | 85 | X1, X2, Label = create_data(s1, s2, labels) 86 | return X1, X2, Label 87 | 88 | 89 | def get_batch_data(): 90 | # Load data 91 | X1, X2, Label = load_train_data() 92 | 93 
| # calc total batch count 94 | num_batch = len(X1) // hp.batch_size 95 | 96 | # Convert to tensor 97 | X1 = tf.convert_to_tensor(X1, tf.int32) 98 | X2 = tf.convert_to_tensor(X2, tf.int32) 99 | Label = tf.convert_to_tensor(Label, tf.int32) 100 | 101 | # Create Queues 102 | input_queues = tf.train.slice_input_producer([X1, X2, Label]) 103 | 104 | # create batch queues 105 | x1, x2, label = tf.train.shuffle_batch(input_queues, 106 | num_threads=8, 107 | batch_size=hp.batch_size, 108 | capacity=hp.batch_size*64, 109 | min_after_dequeue=hp.batch_size*32, 110 | allow_smaller_final_batch=False) 111 | return x1, x2, label, num_batch # (N, T), (N, T), () 112 | 113 | -------------------------------------------------------------------------------- /transformer_text_Classfication/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | predictfile = './inference_QA.csv' 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 300 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 512 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 200 26 | num_heads = 8 27 | dropout_rate = 0.33 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | #n_class = 2 30 | dropout_keep_prob = 0.33 31 | reg_lambda = 0.1 32 | Passage_fuse = 'bi-rnn' # bi-rnn or Pooling 33 | use_dropout = True 34 | weight_decay = 0.1 35 | 36 | 37 | 38 | 39 | 40 | class seq2seq_Hyperparams: 41 | '''Hyperparameters''' 42 | # data 43 | source_train = './datasets/zh-en/train.tags.zh-en.en' 44 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 45 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 46 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 47 | 48 | # training 49 | batch_size = 32 # alias = N 50 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 51 | logdir = 'seq2seq_model_dir' # log directory 52 | 53 | # model 54 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 55 | # Feel free to increase this if you are ambitious. 56 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 57 | hidden_units = 512 # alias = C 58 | num_blocks = 5 # number of encoder/decoder blocks 59 | num_epochs = 20 60 | num_heads = 8 61 | dropout_rate = 0.1 62 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 63 | 64 | 65 | 66 | 67 | class feature_Block_Hyperparams: 68 | '''Hyperparameters''' 69 | # data 70 | trainset = './datasets/cnews.train.txt' 71 | testset = './datasets/cnews.test.txt' 72 | tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} 73 | 74 | # training 75 | batch_size = 4 # alias = N 76 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 77 | logdir = 'Block_model_dir' # log directory 78 | 79 | # model 80 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 81 | # Feel free to increase this if you are ambitious. 
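# Note: the create_data helpers in this repo (see e.g. transformer_infersent/data_load.py)
# truncate each sentence to maxlen-5 tokens and then zero-pad it back up to maxlen, so maxlen
# is the fixed, padded sequence length the model actually sees; raising it mainly adds padding
# for short inputs and increases memory per batch.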
82 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 83 | hidden_units = 512 # alias = C 84 | num_blocks = 5 # number of encoder/decoder blocks 85 | num_epochs = 20 86 | num_heads = 8 87 | dropout_rate = 0.1 88 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 89 | n_class = 10 90 | 91 | 92 | class infersent_Block_Hyperparams: 93 | '''Hyperparameters''' 94 | # data 95 | trainset = './opensrc_dta/train.csv' 96 | testset = './opensrc_dta/test.csv' 97 | 98 | 99 | # training 100 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 101 | 102 | batch_size = 64 # alias = N 103 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 104 | logdir = 'infersent_model_dir' # log directory 105 | 106 | # model 107 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 108 | # Feel free to increase this if you are ambitious. 109 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 110 | hidden_units = 512 # alias = C 111 | num_blocks = 5 # number of encoder/decoder blocks 112 | num_epochs = 20 113 | num_heads = 8 114 | dropout_rate = 0.1 115 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 116 | #n_class = 2 117 | dropout_keep_prob = 0.55 118 | reg_lambda = 0.1 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /transformer_infersent/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import infersent_Block_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from sklearn.metrics import classification_report 16 | 17 | 18 | 19 | 20 | def eval(task_name): 21 | # Load graph 22 | g = Graph(is_training=False) 23 | print("Graph loaded") 24 | 25 | # Load data 26 | #X, _, Texts, Labels = load_test_data() 27 | s1, s2, raw_labels = load_test_data() 28 | raw_labels = [int(x) for x in raw_labels] 29 | 30 | word2idx, idx2word = load_vocabs() 31 | 32 | # Start session 33 | with g.graph.as_default(): 34 | sv = tf.train.Supervisor() 35 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 36 | ## Restore parameters 37 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 38 | print("Restored!") 39 | 40 | ## Get model name 41 | print('Model dir:', hp.logdir) 42 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 43 | print("Model name:", mname) 44 | 45 | ## Inference 46 | if not os.path.exists('results'): os.mkdir('results') 47 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 48 | #list_of_refs, hypotheses = [], [] 49 | #print("Iterator: {} / {}".format(len(s1), hp.batch_size)) 50 | 51 | test_labels, predict_label = [], [] 52 | for i in range(len(s1) // hp.batch_size): 53 | print("Iterator: {} / {}".format(i, len(s1)//hp.batch_size)) 54 | ### Get mini-batches 55 | x1 = s1[i*hp.batch_size: (i+1)*hp.batch_size] 56 | x2 = s2[i*hp.batch_size: (i+1)*hp.batch_size] 57 | #sentences = Texts[i*hp.batch_size: (i+1)*hp.batch_size] 58 | labels = raw_labels[i*hp.batch_size: (i+1)*hp.batch_size] 59 | test_labels.extend([int(x) for x in labels]) 60 | 61 | preds 
= sess.run(g.preds, {g.x1:x1, g.x2:x2}) 62 | predict_label.extend([int(x) for x in preds]) 63 | assert len(preds) == len(labels), 'not alignmented...' 64 | 65 | 66 | ### Write to file 67 | #for sent, label, pred in zip(sentences, labels, preds): # sentence-wise 68 | for label, pred in zip(labels, preds): 69 | #got = " ".join(idx2word[idx] for idx in pred).split("")[0].strip() 70 | 71 | 72 | # bleu score 73 | if task_name == 'seq2seq': 74 | ref = target.split() 75 | hypothesis = got.split() 76 | if len(ref) > 3 and len(hypothesis) > 3: 77 | list_of_refs.append([ref]) 78 | hypotheses.append(hypothesis) 79 | 80 | 81 | ## Calculate bleu score 82 | if task_name == 'seq2seq': 83 | score = corpus_bleu(list_of_refs, hypotheses) 84 | fout.write("Bleu Score = " + str(100*score)) 85 | elif task_name == 'classfication' or task_name == 'infersent': 86 | assert len(test_labels) == len(predict_label), 'The length of label and predicts\ 87 | are not alignmentted.' 88 | res = classification_report(test_labels, predict_label) 89 | print(res) 90 | fout.write(res + '\n') 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 95 | parser.add_argument('--task', help='task name(default: infersent)') 96 | 97 | args = parser.parse_args() 98 | task_name = args.task 99 | eval(task_name) 100 | print("Done") 101 | 102 | 103 | -------------------------------------------------------------------------------- /transformer_jieba/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import seq2seq_Hyperparams as hp 11 | from data_load import load_en_vocab, load_zh_vocab, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | 14 | import argparse 15 | from modules import bleu 16 | import math 17 | from modules import cut 18 | 19 | 20 | 21 | 22 | def eval(task_name): 23 | # Load graph 24 | g = Graph(is_training=False) 25 | print("Graph loaded") 26 | 27 | # Load data 28 | X, Sources, Targets = load_test_data() 29 | #print(X, Sources, Targets) 30 | en2idx, idx2en = load_en_vocab() 31 | zh2idx, idx2zh = load_zh_vocab() 32 | 33 | 34 | # Start session 35 | with g.graph.as_default(): 36 | sv = tf.train.Supervisor() 37 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 38 | ## Restore parameters 39 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 40 | print("Restored!") 41 | 42 | ## Get model name 43 | print('Model dir:', hp.logdir) 44 | mname = '{}'.format(''.join(hp.source_test.split('/')[-1].split('.', 3)[:-1])) + open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 45 | print("Model name:", mname) 46 | 47 | ## Inference 48 | if not os.path.exists('results'): os.mkdir('results') 49 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 50 | list_of_refs, hypotheses, scores = [], [], [] 51 | print("Iterator:", len(X), hp.batch_size) 52 | for i in range(len(X) // hp.batch_size): 53 | print('Step:\t', i) 54 | ### Get mini-batches 55 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 56 | sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size] 57 | targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size] 58 | 59 | ### Autoregressive inference 60 | preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) 61 | for j in range(hp.maxlen): 62 | _preds = sess.run(g.preds, {g.x: 
x, g.y: preds}) 63 | preds[:, j] = _preds[:, j] 64 | 65 | 66 | ### Write to file 67 | for source, target, pred in zip(sources, targets, preds): # sentence-wise 68 | #print('Inspecting:', source, target, pred) 69 | #got = " ".join(idx2zh[idx] for idx in pred).split("。", 2)[0].strip() + ' 。' 70 | #got = ''.join(idx2zh[idx] for idx in pred).split('。')[0].strip() 71 | got = ' '.join(idx2zh[idx] for idx in pred).split('')[0] 72 | if task_name == 'jieba': 73 | fout.write("- source: " + source +"\n") 74 | fout.write("- expected: " + ' '.join(cut(source, target)) + "\n") 75 | fout.write("- got: " + ' '.join(cut(source, got)) + "\n\n") 76 | fout.flush() 77 | else: 78 | fout.write("- source: " + source +"\n") 79 | fout.write("- expected: " + target + "\n") 80 | fout.write("- got: " + got + "\n\n") 81 | fout.flush() 82 | 83 | 84 | # accumlate accuracty 85 | ref = cut(source, target) 86 | hypothesis = cut(source, got) 87 | acc = len([x for x in hypothesis if x in ref])/len(ref) 88 | scores.append(min(1, acc)) 89 | 90 | 91 | 92 | ## Calculate bleu score 93 | #score = corpus_bleu(list_of_refs, hypotheses) 94 | fout.write("Tokenization Accuracy = " + str(100*(sum(scores)/len(scores)))) 95 | 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 99 | parser.add_argument('--task', help='task name(default: seq2seq)') 100 | 101 | args = parser.parse_args() 102 | task_name = args.task 103 | eval(task_name) 104 | print("Done") 105 | 106 | 107 | -------------------------------------------------------------------------------- /en-zh_NMT/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import seq2seq_Hyperparams as hp 11 | from data_load import load_en_vocab, load_zh_vocab, load_test_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from modules import bleu 16 | import math 17 | 18 | 19 | 20 | 21 | def eval(task_name): 22 | # Load graph 23 | g = Graph(is_training=False) 24 | print("Graph loaded") 25 | 26 | # Load data 27 | X, Sources, Targets = load_test_data() 28 | en2idx, idx2en = load_en_vocab() 29 | zh2idx, idx2zh = load_zh_vocab() 30 | 31 | 32 | # Start session 33 | with g.graph.as_default(): 34 | sv = tf.train.Supervisor() 35 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 36 | ## Restore parameters 37 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 38 | print("Restored!") 39 | 40 | ## Get model name 41 | print('Model dir:', hp.logdir) 42 | mname = '{}'.format(''.join(hp.source_test.split('/')[-1].split('.', 3)[:-1])) + open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 43 | print("Model name:", mname) 44 | 45 | ## Inference 46 | if not os.path.exists('results'): os.mkdir('results') 47 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 48 | list_of_refs, hypotheses, scores = [], [], [] 49 | print("Iterator:", len(X), hp.batch_size) 50 | for i in range(len(X) // hp.batch_size): 51 | print('Step:\t', i) 52 | ### Get mini-batches 53 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 54 | sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size] 55 | targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size] 56 | 57 | ### Autoregressive inference 58 | preds = np.zeros((hp.batch_size, hp.maxlen), 
np.int32) 59 | for j in range(hp.maxlen): 60 | _preds = sess.run(g.preds, {g.x: x, g.y: preds}) 61 | preds[:, j] = _preds[:, j] 62 | 63 | 64 | ### Write to file 65 | for source, target, pred in zip(sources, targets, preds): # sentence-wise 66 | #print('Inspecting:', source, target, pred) 67 | got = " ".join(idx2zh[idx] for idx in pred).split("。", 2)[0].strip() + ' 。' 68 | #got = ''.join(idx2zh[idx] for idx in pred).split('。')[0].strip() 69 | if task_name == 'jieba': 70 | fout.write("- source: " + source +"\n") 71 | if len(got) < len(target): got += target[len(got):] 72 | fout.write("- expected: " + cut(source, target) + "\n") 73 | fout.write("- got: " + cut(source, got) + "\n\n") 74 | fout.flush() 75 | else: 76 | fout.write("- source: " + source +"\n") 77 | fout.write("- expected: " + target + "\n") 78 | fout.write("- got: " + got + "\n\n") 79 | fout.flush() 80 | 81 | # bleu score- BLEU-2 82 | ref = target.split() 83 | hypothesis = got.split() 84 | print(ref, '\n', hypothesis) 85 | if len(ref) > 2 and len(hypothesis) > 2: 86 | scores.append(bleu(hypothesis, ref, 2)) 87 | #list_of_refs.append([ref]) 88 | #hypotheses.append(hypothesis) 89 | 90 | 91 | ## Calculate bleu score 92 | #score = corpus_bleu(list_of_refs, hypotheses) 93 | fout.write("Bleu Score = " + str(100*(sum(scores)/len(scores)))) 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 98 | parser.add_argument('--task', help='task name(default: seq2seq)') 99 | 100 | args = parser.parse_args() 101 | task_name = args.task 102 | eval(task_name) 103 | print("Done") 104 | 105 | 106 | -------------------------------------------------------------------------------- /transformer_jieba/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python3 3 | 4 | from hyperparams import seq2seq_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import re 9 | import jieba 10 | from bs4 import BeautifulSoup as bs 11 | 12 | 13 | def load_en_vocab(): 14 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/src.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | def load_zh_vocab(): 20 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/tgt.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 21 | word2idx = {word: idx for idx, word in enumerate(vocab)} 22 | idx2word = {idx: word for idx, word in enumerate(vocab)} 23 | return word2idx, idx2word 24 | 25 | def create_data(source_sents, target_sents): 26 | en2idx, idx2en = load_en_vocab() 27 | zh2idx, idx2zh = load_zh_vocab() 28 | #max token numbers 29 | max_token_num = max(len(en2idx.keys()), len(zh2idx.keys())) + 100 30 | 31 | # Index 32 | x_list, y_list, Sources, Targets = [], [], [], [] 33 | for source_sent, target_sent in zip(source_sents, target_sents): 34 | #the default source senteces is english and target sentences is chinese 35 | x = [en2idx.get(word, 1) for word in source_sent.split()[:hp.maxlen-5] + [u" "]] 36 | y = [zh2idx.get(word, 1) for word in target_sent.split()[:hp.maxlen-5] + [u" "]] 37 | 38 | x_list.append(np.array(x)) 39 | y_list.append(np.array(y)) 40 | Sources.append(source_sent) 41 | Targets.append(target_sent) 42 | print('Demo: 
{}->\n{}'.format(Sources[0], Targets[0])) 43 | 44 | # Pad 45 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 46 | Y = np.zeros([len(y_list), hp.maxlen], np.int32) 47 | for i, (x, y) in enumerate(zip(x_list, y_list)): 48 | #print(x, y, hp.maxlen, len(x), len(y)) 49 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 50 | Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0)) 51 | 52 | return X, Y, Sources, Targets 53 | 54 | 55 | def refine(line, tokenizer): 56 | if tokenizer == 'jieba': 57 | line = re.sub("[\s\p']", "", line) 58 | return ' '.join(jieba.cut(line)) 59 | elif tokenizer == 'en': 60 | line = re.sub("[^a-zA-Z]", " ", line) 61 | return line 62 | else: 63 | raise Exception('Could not find tokenizer...') 64 | 65 | def load_train_data(): 66 | en_sents = [line.strip() \ 67 | for line in open(hp.source_train, 'r', encoding = 'utf-8').read().split("\n") \ 68 | if not line.startswith('<')] 69 | zh_sents = [line.strip() \ 70 | for line in open(hp.target_train, 'r', encoding = 'utf-8').read().split("\n") \ 71 | if not line.startswith('<')] 72 | 73 | X, Y, Sources, Targets = create_data(en_sents, zh_sents) 74 | return X, Y 75 | 76 | def load_test_data(): 77 | def _parser(text): 78 | return [x.text for x in bs(text).find_all('seg')] 79 | 80 | 81 | en_sents = [line.strip() \ 82 | for line in open(hp.source_test, 'r', encoding = 'utf-8').read().split("\n") \ 83 | if not line.startswith('=hp.min_cnt] 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | def load_zh_vocab(): 20 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/zh.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 21 | word2idx = {word: idx for idx, word in enumerate(vocab)} 22 | idx2word = {idx: word for idx, word in enumerate(vocab)} 23 | return word2idx, idx2word 24 | 25 | def create_data(source_sents, target_sents): 26 | en2idx, idx2en = load_en_vocab() 27 | zh2idx, idx2zh = load_zh_vocab() 28 | #max token numbers 29 | max_token_num = max(len(en2idx.keys()), len(zh2idx.keys())) + 100 30 | 31 | # Index 32 | x_list, y_list, Sources, Targets = [], [], [], [] 33 | for source_sent, target_sent in zip(source_sents, target_sents): 34 | #the default source senteces is english and target sentences is chinese 35 | x = [en2idx.get(word, max_token_num) for word in source_sent.split()[:hp.maxlen-5] + [u" "]] 36 | y = [zh2idx.get(word, max_token_num) for word in target_sent.split()[:hp.maxlen-5] + [u" "]] 37 | 38 | x_list.append(np.array(x)) 39 | y_list.append(np.array(y)) 40 | Sources.append(source_sent) 41 | Targets.append(target_sent) 42 | print('Inspect data: {}->\n{}'.format(Sources[0], Targets[0])) 43 | 44 | # Pad 45 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 46 | Y = np.zeros([len(y_list), hp.maxlen], np.int32) 47 | for i, (x, y) in enumerate(zip(x_list, y_list)): 48 | #print(x, y, hp.maxlen, len(x), len(y)) 49 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 50 | Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0)) 51 | 52 | return X, Y, Sources, Targets 53 | 54 | 55 | def refine(line, tokenizer): 56 | if tokenizer == 'jieba': 57 | line = re.sub("[\s\p']", "", line) 58 | return ' '.join(jieba.cut(line)) 59 | elif tokenizer == 'en': 60 | line = re.sub("[^a-zA-Z]", " ", line) 61 | return line 62 | else: 63 | raise Exception('Could not find 
tokenizer...') 64 | 65 | def load_train_data(): 66 | en_sents = [refine(line, tokenizer = 'en') \ 67 | for line in open(hp.source_train, 'r', encoding = 'utf-8').read().split("\n") \ 68 | if not line.startswith('<')] 69 | zh_sents = [refine(line, tokenizer = 'jieba') \ 70 | for line in open(hp.target_train, 'r', encoding = 'utf-8').read().split("\n") \ 71 | if not line.startswith('<')] 72 | 73 | X, Y, Sources, Targets = create_data(en_sents, zh_sents) 74 | return X, Y 75 | 76 | def load_test_data(): 77 | def _parser(text): 78 | return [x.text for x in bs(text).find_all('seg')] 79 | 80 | ''' 81 | en_sents = [refine(line, tokenizer = 'en') \ 82 | for line in open(hp.source_test, 'r', encoding = 'utf-8').read().split("\n") \ 83 | if line.startswith('=hp.min_cnt] #raw code is hp.mincnt 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | 20 | 21 | def create_data(s1, s2, answer_span): 22 | """ 23 | the default s1 is the question and s2 is the content 24 | """ 25 | word2idx, idx2word = load_vocabs() 26 | 27 | 28 | # Index 29 | x1_list, x2_list, q_lens, p_lens, s_labels, e_labels, Questions, Contents = \ 30 | [], [], [], [], [], [], [], [] 31 | 32 | for sent1, sent2, span in zip(s1, s2, answer_span): 33 | x1 = [word2idx.get(word, 1) for word in (sent1 + u" ").split()[:hp.q_maxlen-5]] # 1: OOV, : End of Text 34 | x2 = [word2idx.get(word, 1) for word in (sent2 + u" ").split()[:hp.p_maxlen-5]] 35 | 36 | x1_list.append(np.array(x1)) 37 | x2_list.append(np.array(x2)) 38 | 39 | q_lens.append(len(x1)) 40 | p_lens.append(len(x2)) 41 | 42 | s_labels.append(span[0]) 43 | e_labels.append(span[1]) 44 | 45 | print('Demo:', x1_list[0], x2_list[0], q_lens[0], p_lens[0], s_labels[0], e_labels[0]) 46 | 47 | # Pad 48 | X1 = np.zeros([len(x1_list), hp.q_maxlen], np.int32) 49 | X2 = np.zeros([len(x2_list), hp.p_maxlen], np.int32) 50 | 51 | for i, x in enumerate(x1_list): 52 | X1[i] = np.lib.pad(x, [0, hp.q_maxlen-len(x)], 'constant', constant_values=(0, 0)) 53 | 54 | for i, x in enumerate(x2_list): 55 | X2[i] = np.lib.pad(x, [0, hp.p_maxlen-len(x)], 'constant', constant_values=(0, 0)) 56 | 57 | 58 | 59 | return X1, X2, q_lens, p_lens, s_labels, e_labels 60 | 61 | 62 | def _refine(line, lan = 'zh'): 63 | if lan == 'zh': 64 | line = re.sub("[\s\p']", "", line) 65 | #line = re.sub(r'[0-9]+', ' n', line) 66 | #line = re.sub(r'[a-zA-Z]+', ' α', line) 67 | line = jieba.cut(line) 68 | return ' '.join(list(line)) 69 | elif lan == 'en': 70 | line = re.sub("[^a-zA-Z]", " ", line) 71 | return line 72 | 73 | else: 74 | raise Exception('Havn\'t specified language!') 75 | return 76 | 77 | 78 | 79 | def load_train_data(tokenizer = None): 80 | train_data = pd.read_csv(hp.trainfile) 81 | questions, contents, answer_spans = list(train_data['question']), list(train_data['content']), \ 82 | list(train_data['answer_span']) 83 | 84 | questions, contents = [_refine(line) for line in questions], [_refine(line) for line in contents] 85 | 86 | answer_spans = [eval(line) for line in answer_spans] 87 | 88 | X1, X2, q_lens, p_lens, start_labels, end_labels = create_data(questions, contents, answer_spans) 89 | return X1, X2, q_lens, p_lens, start_labels, end_labels 90 | 91 | 92 | 93 | def load_test_data(tokenizer = None): 94 | test_data = pd.read_csv(hp.testfile) 95 | questions, contents, answer_spans = list(test_data['question']), list(test_data['content']), \ 96 | list(test_data['answer_span']) 97 | 98 | answer_spans = [eval(line) 
for line in answer_spans] 99 | 100 | questions, contents = [_refine(line) for line in questions], [_refine(line) for line in contents] 101 | 102 | 103 | X1, X2, q_lens, p_lens, start_labels, end_labels = create_data(questions, contents, answer_spans) 104 | return X1, X2, q_lens, p_lens, start_labels, end_labels 105 | 106 | 107 | def get_batch_data(): 108 | # Load data 109 | X1, X2, q_lens, p_lens, start_labels, end_labels = load_train_data() 110 | 111 | # calc total batch count 112 | num_batch = len(X1) // hp.batch_size 113 | 114 | # Convert to tensor 115 | X1 = tf.convert_to_tensor(X1, tf.int32) 116 | X2 = tf.convert_to_tensor(X2, tf.int32) 117 | q_lens = tf.convert_to_tensor(q_lens, tf.int32) 118 | p_lens = tf.convert_to_tensor(p_lens, tf.int32) 119 | start_labels = tf.convert_to_tensor(start_labels, tf.int32) 120 | end_labels = tf.convert_to_tensor(end_labels, tf.int32) 121 | 122 | 123 | # Create Queues 124 | input_queues = tf.train.slice_input_producer([X1, X2, q_lens, p_lens, start_labels, end_labels]) 125 | 126 | # create batch queues 127 | q, p, q_length, p_length, start_pos, end_pos = tf.train.shuffle_batch(input_queues, 128 | num_threads=8, 129 | batch_size=hp.batch_size, 130 | capacity=hp.batch_size*64, 131 | min_after_dequeue=hp.batch_size*32, 132 | allow_smaller_final_batch=False) 133 | 134 | 135 | return q, p, q_length, p_length, start_pos, end_pos, num_batch # (N, T), (N, T), () 136 | 137 | -------------------------------------------------------------------------------- /transformer_text_Classfication/data_pre.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | #data preparation 5 | import codecs 6 | import os 7 | import argparse 8 | import jieba 9 | from hyperparams import feature_Block_Hyperparams as hp 10 | from sklearn.externals import joblib 11 | import re 12 | 13 | def jieba_data_pre(): 14 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 15 | vocabset = f.readlines() 16 | vocabset = [x.strip() for x in vocabset] 17 | #print(vocabset[:10]) 18 | zh = '' 19 | en = '' 20 | 21 | for pair in vocabset: 22 | try: 23 | z, e = pair.strip().split('\t') 24 | zh += z + ' ' 25 | en += e + ' ' 26 | except: 27 | zh += '' 28 | en += '' 29 | 30 | 31 | zh_sent = zh.split('') 32 | en_sent = en.split('') 33 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 34 | 35 | files = [] 36 | for root, dirs, file in os.walk(".", topdown=False): 37 | files.append(file) 38 | #print(files) 39 | 40 | if 'train.tags.zh-en.zh' not in files: 41 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 42 | for i in zh_sent[:int(0.8*len(zh_sent))]: 43 | f.write(i+'\n') 44 | 45 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 46 | for i in en_sent[:int(0.8*len(en_sent))]: 47 | f.write(i+'\n') 48 | 49 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 50 | for i in zh_sent[int(0.8*len(zh_sent)):]: 51 | f.write(i+'\n') 52 | 53 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 54 | for i in en_sent[int(0.8*len(en_sent)):]: 55 | f.write(i+'\n') 56 | 57 | 58 | 59 | def text_sum_pre(): 60 | cnt_title_path = './dataset/content-title.txt' 61 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 62 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 63 | 64 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 65 | 
pad = ['', '', "", ""] 66 | content_vocabs, title_vocabs = {}, {} 67 | 68 | for x in content_set: 69 | vocabs = jieba.cut(x) 70 | for x in vocabs: 71 | if x not in content_vocabs: 72 | content_vocabs[x] = 1 73 | else: 74 | content_vocabs[x] += 1 75 | 76 | for x in sum_set: 77 | vocabs = jieba.cut(x) 78 | for x in vocabs: 79 | if x not in title_vocabs: 80 | title_vocabs[x] = 1 81 | else: 82 | title_vocabs[x] += 1 83 | 84 | #save vocab 85 | 86 | if not 'textSummary' in os.listdir('./preprocessed'): 87 | os.mkdir('./preprocessed/textSummary') 88 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 89 | for token in pad: 90 | f.write(token + '\t' + '1000000000' + '\n') 91 | for token, val in content_vocabs.items(): 92 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 93 | 94 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 95 | for token in pad: 96 | f.write(token + '\t' + '1000000000' + '\n') 97 | for token, val in title_vocabs.items(): 98 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 99 | 100 | 101 | if 'textSummary' in os.listdir('./dataset'): 102 | os._exit(0) 103 | else: 104 | os.mkdir('./dataset/textSummary') 105 | 106 | n = len(sum_set) 107 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 108 | for x in content_set[:int(0.8*n)]: 109 | f.write(x+'\n') 110 | 111 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 112 | for x in sum_set[:int(0.8*n)]: 113 | f.write(x+'\n') 114 | 115 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 116 | for x in content_set[int(0.8*n):]: 117 | f.write(x+'\n') 118 | 119 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 120 | for x in sum_set[int(0.8*n):]: 121 | f.write(x+'\n') 122 | 123 | 124 | def main(): 125 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 126 | parser.add_argument('--task', default = 'jieba', 127 | help='task name(default: tokenize)') 128 | 129 | args = parser.parse_args() 130 | task_name = args.task 131 | 132 | if task_name == 'jieba': jieba_data_pre() 133 | elif task_name == 'textsum': text_sum_pre() 134 | 135 | 136 | if __name__ == '__main__': 137 | import pandas as pd 138 | import random 139 | adspath = ['./datasets/ADs_detection.csv', './datasets/20190723_ads_annotation.csv'] 140 | 141 | df = pd.read_csv(adspath.pop()) 142 | while adspath: 143 | df = pd.concat([df, pd.read_csv(adspath.pop())], axis = 0) 144 | 145 | ads = [re.sub("[\s\p']", "", x)+'\t'+'1' for x in df['text']] 146 | asr = [x.strip() for x in joblib.load('./datasets/corpus.json')['content'] if len(x)>3] 147 | no_ads = random.sample(asr, min(len(ads)*1, len(asr))) 148 | no_ads = [re.sub("[\s\p']", "", x)+'\t'+'0' for x in no_ads] 149 | 150 | corpus = ads[:] 151 | corpus.extend(no_ads) 152 | 153 | random.shuffle(corpus) 154 | sep = int(0.9*len(corpus)) 155 | with open('./datasets/trainset.txt', 'w') as f1: 156 | for line in corpus[:sep]: 157 | f1.write(line+'\n') 158 | 159 | with open('./datasets/testset.txt', 'w') as f2: 160 | for line in corpus[sep:]: 161 | f2.write(line+'\n') 162 | 163 | print('Done!') 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # transfromer_NN_Block 2 | We are doing this to implemented transformer as a neural network building block 
to tackle several tasks in NLP research; this repo follows the original formulation of [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 3 | 4 | [![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/fooSynaptic/transfromer_NN_Block) 5 | 6 | This repo covers **several** tasks: 7 | - [Seq2seq text generation: we apply the Transformer to a classical NLP problem, Chinese word segmentation.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_jieba) 8 | - [Neural machine translation: Chinese-English translation on the WIT3 datasets.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/en-zh_NMT) 9 | - [An encoder-only language-model architecture for text classification.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_text_Classfication) 10 | - [Sentence entailment (natural language inference) experiments on the Stanford SNLI dataset.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_infersent) 11 | - [An updated reading comprehension task.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_RC) 12 | 13 | 14 | 15 | # INSTALL ENV: 16 | Please run `pip install -r requirements.txt` first. 17 | 18 | 19 | # ***First - the encoder-decoder architecture*** 20 | # train 21 | - The aim is to train a sequence labeling model with the **Transformer**. We follow the 22 | conventional sentence tokenization scheme - **B/E/S/M** (marking the beginning of a word, the end of a word, a single-character word, and the middle of a word, respectively). 23 | 24 | - We use labeled Chinese documents to train the model. The raw data is in the `./transformer_jieba/dataset` dir, or you can use `./transformer_jieba/prepro.py` to preprocess the raw data. 25 | 26 | - Run `python train.py` to train the model. 27 | 28 | 29 | # eval 30 | - Run `python eval.py`; we achieved a BLEU score of nearly 80. 31 | 32 | 33 | # ***Second - zh-en NMT*** 34 | - The train and test data come from the `Web Inventory of Transcribed and Translated Talks` (**WIT3**); we train an English-Chinese translation model ([data source](https://wit3.fbk.eu/mt.php?release=2015-01)). 35 | - Test result: 36 | ![NMT result](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/NMT_res_BLEU.png) 37 | 38 | 39 | 40 | 41 | # ***Third - the transformer feature extraction block*** 42 | - You can find the code in `./transformer_text_Classfication`; the preprocessing, training and evaluation scripts live in that path, and their usage mirrors the encoder-decoder architecture above. 43 | - The Chinese corpus was downloaded from [THUCTC (THU Chinese Text Classification)](http://thuctc.thunlp.org/), and we obtain a macro avg f1-score that is better by over 0.05; the metric is computed as shown in the sketch below.
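The per-class table reported further down is the output of scikit-learn's `classification_report`, which is what the evaluation scripts in this repo print; the `macro avg` f1 line of that table is the number used for the comparison above. A minimal, self-contained sketch (the `y_true` / `y_pred` lists here are made-up placeholders, not real predictions):

```python
from sklearn.metrics import classification_report, f1_score

# Placeholder labels only - in this repo the true/predicted class ids are
# collected batch by batch inside eval.py before the report is printed.
y_true = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1]
y_pred = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 2]

# Per-class precision/recall/f1 table, same layout as the result block below.
print(classification_report(y_true, y_pred))

# Single macro-averaged f1 number (unweighted mean over the 10 classes).
macro_f1 = f1_score(y_true, y_pred, average="macro")
print("macro avg f1-score:", round(macro_f1, 2))
```

Macro averaging weights every class equally regardless of its support, which makes it a fair summary number here since the test set is balanced (1000 examples per class, as in the report below).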
44 | - ***Our model is very raw and shallow(only 8 multi-head attention projection and final linear projection) and without pre-trained embedding, you can explore performance with our code.*** 45 | 46 | # result of chinese sentences classfication(char-level) 47 | ` tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} ` 48 | ``` 49 | precision recall f1-score support 50 | 51 | 0 0.91 0.95 0.93 1000 52 | 1 0.96 0.77 0.85 1000 53 | 2 0.92 0.93 0.92 1000 54 | 3 0.95 0.93 0.94 1000 55 | 4 0.86 0.91 0.88 1000 56 | 5 0.83 0.47 0.60 1000 57 | 6 0.86 0.85 0.86 1000 58 | 7 0.64 0.87 0.74 1000 59 | 8 0.79 0.91 0.85 1000 60 | 9 0.88 0.91 0.89 1000 61 | 62 | accuracy 0.85 10000 63 | macro avg 0.86 0.85 0.85 10000 64 | weighted avg 0.86 0.85 0.85 10000 65 | 66 | Done 67 | ``` 68 | [***We Also implemented a sentences entailment inference task with transformer***](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_infersent) 69 | --- 70 | **Data source** [standord SNLI](https://nlp.stanford.edu/projects/snli/snli_1.0.zip) 71 | 72 | - *Download source data and unzip* : `wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip` 73 | - *preprocess data*: `python data_prepare.py && python prepro.py` 74 | - *train*: run `python train.py` 75 | - *eval*: run `python eval.py --task infersent` 76 | 77 | Experiment result: 78 | - train accuracy: 79 | ![train accuracy](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_with_SNLI_accuracy.png) 80 | 81 | - train loss: 82 | ![train loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_SNLI_loss.png) 83 | 84 | 85 | - eval result: 86 | ``` 87 | precision recall f1-score support 88 | 89 | 0 0.82 0.76 0.79 3358 90 | 1 0.77 0.80 0.79 3226 91 | 2 0.70 0.73 0.72 3208 92 | 93 | accuracy 0.76 9792 94 | macro avg 0.76 0.76 0.76 9792 95 | weighted avg 0.76 0.76 0.76 9792 96 | ``` 97 | 98 | 99 | # Ref 100 | 101 | - https://github.com/Kyubyong/transformer 102 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 103 | -------------------------------------------------------------------------------- /transformer_RC/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module implements the core layer of Match-LSTM and BiDAF 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 25 | """ 26 | Implements the Match-LSTM attention cell 27 | """ 28 | def __init__(self, num_units, context_to_attend): 29 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 30 | self.context_to_attend = context_to_attend 31 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 32 | num_outputs=self._num_units, 33 | activation_fn=None) 34 | 35 | def __call__(self, inputs, state, scope=None): 36 | (c_prev, h_prev) = state 37 | with tf.variable_scope(scope or type(self).__name__): 38 | ref_vector = tf.concat([inputs, h_prev], -1) 39 | G = tf.tanh(self.fc_context 40 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 41 | num_outputs=self._num_units, 42 | activation_fn=None), 1)) 43 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 44 | scores = tf.nn.softmax(logits, 1) 45 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 46 | new_inputs = tf.concat([inputs, attended_context, 47 | inputs - attended_context, inputs * attended_context], 48 | -1) 49 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 50 | 51 | 52 | class MatchLSTMLayer(object): 53 | """ 54 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 55 | """ 56 | def __init__(self, hidden_size): 57 | self.hidden_size = hidden_size 58 | 59 | def match(self, passage_encodes, question_encodes, p_length, q_length): 60 | """ 61 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 62 | """ 63 | with tf.variable_scope('match_lstm'): 64 | cell_fw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 65 | cell_bw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 66 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 67 | inputs=passage_encodes, 68 | sequence_length=p_length, 69 | dtype=tf.float32) 70 | match_outputs = tf.concat(outputs, 2) 71 | state_fw, state_bw = state 72 | c_fw, h_fw = state_fw 73 | c_bw, h_bw = state_bw 74 | match_state = tf.concat([h_fw, h_bw], 1) 75 | return match_outputs, match_state 76 | 77 | 78 | class AttentionFlowMatchLayer(object): 79 | """ 80 | Implements the Attention Flow layer, 81 | which computes Context-to-question Attention and question-to-context Attention 82 | """ 83 | def __init__(self, hidden_size): 84 | self.hidden_size = hidden_size 85 | 86 | def match(self, passage_encodes, question_encodes, p_length, q_length): 87 | """ 88 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 89 | """ 90 | #p_encodes = (batch_size, p_length, hidden_size),\ 91 | # q_encodes = (batch_size, q_length, hidden_size) 92 | with tf.variable_scope('bidaf'): 93 | sim_matrix = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 94 | #sim_matrix = (batch_size, p_length, q_length) 95 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix, -1), question_encodes) 96 | #c2q_atten_weight = (batch_size, p_length, hidden_size) 97 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix, 2), 1), -1) 98 | # b = (batch_size, 1, p_length) 99 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 100 | [1, tf.shape(passage_encodes)[1], 1]) 101 | # q2c_atten_weight = (batch_size, 1, 
p_length) @ (batch_size, p_length, hidden_size) \ 102 | # = (batch_size, 1, hidden_size) 103 | # (tile) => (batch_size, p_length, hidden_size) 104 | 105 | assert tf.shape(question2context_attn) == context2question_attn, \ 106 | print("Dimension not fixed to cancate.") 107 | concat_outputs = tf.concat([passage_encodes, context2question_attn, 108 | passage_encodes * context2question_attn, 109 | passage_encodes * question2context_attn], -1) 110 | 111 | 112 | return concat_outputs, None 113 | -------------------------------------------------------------------------------- /Models/models.py: -------------------------------------------------------------------------------- 1 | # encoding = utf-8 2 | # /usr/bin/python3 3 | import tensorflow as tf 4 | from modules import * 5 | 6 | 7 | class vanilla_transformer(): 8 | def __init__(self, hp, is_training): 9 | self.hp = hp 10 | self.train = is_training 11 | 12 | def encode(self, Input, Vocabs_length): 13 | with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): 14 | ## Embedding 15 | enc = embedding(Input, 16 | vocab_size=Vocabs_length, 17 | num_units=self.hp.hidden_units, 18 | scale=True, 19 | scope="enc_embed") 20 | 21 | ## Positional Encoding 22 | if self.hp.sinusoid: 23 | enc += positional_encoding(Input, 24 | num_units=self.hp.hidden_units, 25 | zero_pad=False, 26 | scale=False, 27 | scope="enc_pe") 28 | else: 29 | enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(Input)[1]), 0), [tf.shape(Input)[0], 1]), 30 | vocab_size=Vocabs_length, 31 | num_units=self.hp.hidden_units, 32 | zero_pad=False, 33 | scale=False, 34 | scope="enc_pe") 35 | 36 | ## Dropout 37 | enc = tf.layers.dropout(enc, 38 | rate=self.hp.dropout_rate, 39 | training=tf.convert_to_tensor(self.train)) 40 | 41 | ## Blocks 42 | for i in range(self.hp.num_blocks): 43 | with tf.variable_scope("num_blocks", reuse = tf.AUTO_REUSE): 44 | ### Multihead Attention 45 | enc = multihead_attention(queries=enc, 46 | keys=enc, 47 | num_units=self.hp.hidden_units, 48 | num_heads=self.hp.num_heads, 49 | dropout_rate=self.hp.dropout_rate, 50 | is_training=self.train, 51 | causality=False) 52 | 53 | ### Feed Forward 54 | enc = feedforward(enc, num_units=[4*self.hp.hidden_units, self.hp.hidden_units]) 55 | state = enc 56 | return state 57 | 58 | 59 | def decode(self, decoder_inputs, key_states, Vocabs_length, decode_length): 60 | with tf.variable_scope("decoder", reuse = tf.AUTO_REUSE): 61 | ## Embedding 62 | self.dec = embedding(decoder_inputs, 63 | vocab_size=Vocabs_length, 64 | num_units=self.hp.hidden_units, 65 | scale=True, 66 | scope="dec_embed") 67 | 68 | ## Positional Encoding 69 | if self.hp.sinusoid: 70 | self.dec += positional_encoding(decoder_inputs, 71 | vocab_size=decode_length, 72 | num_units=self.hp.hidden_units, 73 | zero_pad=False, 74 | scale=False, 75 | scope="dec_pe") 76 | else: 77 | self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(decoder_inputs)[1]), 0), [tf.shape(decoder_inputs)[0], 1]), 78 | vocab_size=decode_length, 79 | num_units=self.hp.hidden_units, 80 | zero_pad=False, 81 | scale=False, 82 | scope="dec_pe") 83 | 84 | ## Dropout 85 | self.dec = tf.layers.dropout(self.dec, 86 | rate=self.hp.dropout_rate, 87 | training=tf.convert_to_tensor(self.train)) 88 | 89 | ## Blocks 90 | for i in range(self.hp.num_blocks): 91 | with tf.variable_scope("num_blocks_{}".format(i)): 92 | ## Multihead Attention ( self-attention) 93 | self.dec = multihead_attention(queries=self.dec, 94 | keys=self.dec, 95 | num_units=self.hp.hidden_units, 96 | num_heads=self.hp.num_heads, 97 
| dropout_rate=self.hp.dropout_rate, 98 | is_training=self.train, 99 | causality=True, 100 | scope="self_attention") 101 | 102 | 103 | ## Multihead Attention ( vanilla attention) 104 | self.dec = multihead_attention(queries=self.dec, 105 | keys=key_states, 106 | num_units=self.hp.hidden_units, 107 | num_heads=self.hp.num_heads, 108 | dropout_rate=self.hp.dropout_rate, 109 | is_training=self.train, 110 | causality=False, 111 | scope="vanilla_attention") 112 | 113 | 114 | ## Feed Forward 115 | self.dec = feedforward(self.dec, num_units=[4*self.hp.hidden_units, self.hp.hidden_units]) 116 | 117 | output_state = self.dec 118 | return output_state 119 | -------------------------------------------------------------------------------- /transformer_RC/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import rc_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | #from sklearn.metrics import classification_report 16 | #from utils import compute_bleu_rouge 17 | import pandas as pd 18 | from modules import bleu 19 | 20 | 21 | def find_best_answer_for_passage(start_probs, end_probs, passage_len=None): 22 | """ 23 | Finds the best answer with the maximum start_prob * end_prob from a single passage 24 | """ 25 | if passage_len is None: 26 | passage_len = len(start_probs) 27 | else: 28 | passage_len = min(len(start_probs), passage_len) 29 | 30 | best_start, best_end, max_prob = -1, -1, 0 31 | 32 | for start_idx in range(passage_len): 33 | #within the span of answer limit 34 | for ans_len in range(hp.ans_maxlen): 35 | end_idx = start_idx + ans_len 36 | if end_idx >= passage_len: 37 | continue 38 | 39 | prob = start_probs[start_idx] * end_probs[end_idx] 40 | if prob > max_prob: 41 | best_start = start_idx 42 | best_end = end_idx 43 | max_prob = prob 44 | return (best_start, best_end), max_prob 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | def eval(task_name): 53 | # Load graph 54 | g = Graph(is_training=False) 55 | print("Graph loaded") 56 | 57 | # Load data 58 | test_data = pd.read_csv(hp.testfile) 59 | questions, contents, q_lens, p_lens, start_pos, end_pos = load_test_data() 60 | raw_passages = list(test_data['content']) 61 | reference_answers = list(test_data['answer']) 62 | 63 | 64 | word2idx, idx2word = load_vocabs() 65 | 66 | # Start session 67 | with g.graph.as_default(): 68 | sv = tf.train.Supervisor() 69 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 70 | ## Restore parameters 71 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 72 | print("Restored!") 73 | 74 | ## Get model name 75 | print('Model dir:', hp.logdir) 76 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 77 | print("Model name:", mname) 78 | 79 | ## Inference 80 | if not os.path.exists('results'): os.mkdir('results') 81 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 82 | 83 | pred_answers, ref_answers = [], [] 84 | pred_dict, ref_dict = {}, {} 85 | ques_id = 0 86 | eval_dict = {'bleu_1':[], 'bleu_2':[], 'bleu_3':[], 'bleu_4':[]} 87 | 88 | for i in range(len(questions) // hp.batch_size): 89 | print("Iterator: {} / {}".format(i, len(questions)//hp.batch_size)) 90 | 91 | ### Get 
mini-batches 92 | q = questions[i*hp.batch_size: (i+1)*hp.batch_size] 93 | p = contents[i*hp.batch_size: (i+1)*hp.batch_size] 94 | q_length = q_lens[i*hp.batch_size: (i+1)*hp.batch_size] 95 | p_length = p_lens[i*hp.batch_size: (i+1)*hp.batch_size] 96 | s_pos = start_pos[i*hp.batch_size: (i+1)*hp.batch_size] 97 | e_pos = end_pos[i*hp.batch_size: (i+1)*hp.batch_size] 98 | passages = raw_passages[i*hp.batch_size: (i+1)*hp.batch_size] 99 | ref_answers = reference_answers[i*hp.batch_size: (i+1)*hp.batch_size] 100 | 101 | feed_dict = {g.q: q, 102 | g.p: p, 103 | g.q_length: q_length, 104 | g.p_length: p_length, 105 | g.start_label: s_pos, 106 | g.end_label: e_pos} 107 | 108 | start_probs, end_probs = sess.run([g.start_probs, g.end_probs], feed_dict) 109 | 110 | 111 | ### Write to file 112 | for start_prob, end_prob, passage, ref in zip(start_probs, end_probs, passages, ref_answers): 113 | pred_span, prob = find_best_answer_for_passage(start_prob, end_prob) 114 | pred_answer = passage[pred_span[0]: pred_span[1]+1] 115 | 116 | if not len(pred_answer) > 0: continue 117 | 118 | pred_dict[str(ques_id)] = [pred_answer] 119 | ref_dict[str(ques_id)] = [ref] 120 | ques_id += 1 121 | 122 | fout.write('-ref: '+ ref) 123 | fout.write("-pred: "+ pred_answer) 124 | 125 | b1, b2, b3, b4 = bleu(list(pred_answer), list(ref), 1), \ 126 | bleu(list(pred_answer), list(ref), 2), \ 127 | bleu(list(pred_answer), list(ref), 3), \ 128 | bleu(list(pred_answer), list(ref), 4) 129 | 130 | 131 | eval_dict['bleu_1'].append(b1) 132 | eval_dict['bleu_2'].append(b2) 133 | eval_dict['bleu_3'].append(b3) 134 | eval_dict['bleu_2'].append(b2) 135 | 136 | for metric in eval_dict: 137 | fout.write(metric + '\t' + str(np.mean(eval_dict[metric])) + '\n') 138 | print(metric + '\t' + str(np.mean(eval_dict[metric]))) 139 | 140 | if __name__ == '__main__': 141 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 142 | parser.add_argument('--task', help='task name(default: RC)') 143 | 144 | args = parser.parse_args() 145 | task_name = args.task 146 | eval(task_name) 147 | print("Done") 148 | 149 | 150 | -------------------------------------------------------------------------------- /transformer_infersent/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import infersent_Block_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | 12 | os.sys.path.append('../Models') 13 | from models import vanilla_transformer 14 | 15 | 16 | class Graph(): 17 | def __init__(self, is_training=True): 18 | self.graph = tf.Graph() 19 | with self.graph.as_default(): 20 | if is_training: 21 | self.x1, self.x2, self.y, self.num_batch = get_batch_data() 22 | #self.x, self.label, self.num_batch = get_batch_data() # (N, T) 23 | #self.y = tf.one_hot(self.label, depth = hp.n_class) 24 | 25 | else: # inference 26 | self.x1 = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 27 | self.x2 = tf.placeholder(tf.int32, shape = (None, hp.maxlen)) 28 | #self.label = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 29 | #self.y = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 30 | #self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 31 | 32 | self.l2_loss = tf.constant(0.0) 33 | # define decoder inputs 34 | #for sentence relationship learning task we want to encoder sent1 to e1, then 
decoder(e1 + sent2) 35 | #to get a more sementic relationship across corpus 36 | self.decoder_inputs = tf.concat((tf.ones_like(self.x2[:, :1])*2, self.x2[:, :-1]), -1) # 2: 37 | 38 | # Load vocabulary 39 | word2idx, idx2word = load_vocabs() 40 | 41 | 42 | # initialize transformer 43 | transformer = vanilla_transformer(hp, self.is_training) 44 | 45 | #encode 46 | self.encode1, self.encode2 = transformer.encode(self.x1, len(word2idx)), \ 47 | transformer.encode(self.x2, len(word2idx)) 48 | 49 | #concated 50 | self.enc = tf.divide(tf.add(self.encode1, encode2), 2) 51 | self.enc = normalize(self.enc) 52 | 53 | #for sentence relationship learning task we want to encoder sent1 to e1, then decoder(e1 + sent2) 54 | #to get a more sementic relationship across corpus 55 | 56 | # Decoder 57 | self.dec = transformer.decode(self.decoder_inputs, self.enc, len(word2idx), hp.p_maxlen) 58 | 59 | 60 | self.logits = tf.add(self.enc, tf.multiply(self.enc, self.dec)) 61 | #self.logits = self.enc 62 | 63 | #self.logits = tf.layers.dense(self.logits, 64, activation = 'tanh') 64 | self.logits = tf.layers.flatten(self.logits) 65 | #self.logits = tf.reshape(self.logits, [64, -1]) 66 | self.h_drop = tf.nn.dropout(self.logits, hp.dropout_keep_prob) 67 | 68 | with tf.name_scope("output_logit"): 69 | W = tf.get_variable( 70 | "W", 71 | shape=[hp.maxlen * hp.hidden_units, len(hp.relations)], 72 | initializer=tf.contrib.layers.xavier_initializer()) 73 | 74 | b = tf.Variable(tf.constant(0.1, shape=[len(hp.relations)]), name="b") 75 | self.l2_loss += tf.nn.l2_loss(W) 76 | self.l2_loss += tf.nn.l2_loss(b) 77 | self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logit") 78 | #self.preds = tf.argmax(self.scores, 1, name="predictions") 79 | 80 | self.preds = tf.to_int32(tf.argmax(self.logits, dimension = -1)) 81 | 82 | 83 | if is_training: 84 | self.y_hotting = tf.one_hot(self.y, depth = len(hp.relations)) 85 | 86 | #Accuracy 87 | self.cpl = tf.equal(tf.convert_to_tensor(self.y, tf.int32), self.preds) 88 | self.cpl = tf.to_int32(self.cpl) 89 | self.acc = tf.reduce_sum(self.cpl) / tf.to_int32(tf.reduce_sum(self.y_hotting)) 90 | tf.summary.scalar('acc', self.acc) 91 | 92 | # Loss 93 | #self.y_smoothed = label_smoothing(self.y_hotting) 94 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_hotting) 95 | self.mean_loss = (tf.reduce_sum(self.loss) + self.l2_loss*hp.reg_lambda)/tf.reduce_sum(self.y_hotting) 96 | 97 | # Training Scheme 98 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 99 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 100 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 101 | 102 | # Summary 103 | tf.summary.scalar('mean_loss', self.mean_loss) 104 | self.merged = tf.summary.merge_all() 105 | 106 | 107 | if __name__ == '__main__': 108 | # Load vocabulary 109 | word2idx, idx2word = load_vocabs() 110 | 111 | # Construct graph 112 | g = Graph("train"); print("Graph loaded") 113 | 114 | # Start session 115 | sv = tf.train.Supervisor(graph=g.graph, 116 | logdir=hp.logdir, 117 | save_model_secs=0) 118 | with sv.managed_session() as sess: 119 | with open('acc_mean_loss.rec', 'w') as rec: 120 | for epoch in range(1, hp.num_epochs+1): 121 | if sv.should_stop(): break 122 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 123 | sess.run(g.train_op) 124 | acc, los = sess.run(g.acc), sess.run(g.mean_loss) 125 | #print(acc, los) 126 | 
rec.write('{}\t{}\n'.format(acc, los)) 127 | #print(sess.run(g.preds), sess.run(g.y)) 128 | #print(sess.run(tf.equal(tf.convert_to_tensor(g.y, tf.int32), g.preds))) 129 | 130 | gs = sess.run(g.global_step) 131 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 132 | 133 | print("Done") 134 | 135 | 136 | -------------------------------------------------------------------------------- /transformer_text_Classfication/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import feature_Block_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | 12 | 13 | class Graph(): 14 | def __init__(self, is_training=True): 15 | self.graph = tf.Graph() 16 | with self.graph.as_default(): 17 | if is_training: 18 | self.x, self.label, self.num_batch = get_batch_data() # (N, T) 19 | self.y = tf.one_hot(self.label, depth = hp.n_class) 20 | else: # inference 21 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 22 | self.label = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 23 | #self.y = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 24 | #self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 25 | 26 | # define decoder inputs 27 | #self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 28 | 29 | # Load vocabulary 30 | word2idx, idx2word = load_vocabs() 31 | 32 | 33 | # Encoder 34 | with tf.variable_scope("encoder"): 35 | ## Embedding 36 | self.enc = embedding(self.x, 37 | vocab_size=len(word2idx), 38 | num_units=hp.hidden_units, 39 | scale=True, 40 | scope="enc_embed") 41 | 42 | ## Positional Encoding 43 | if hp.sinusoid: 44 | self.enc += positional_encoding(self.x, 45 | num_units=hp.hidden_units, 46 | zero_pad=False, 47 | scale=False, 48 | scope="enc_pe") 49 | else: 50 | self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), 51 | vocab_size=hp.maxlen, 52 | num_units=hp.hidden_units, 53 | zero_pad=False, 54 | scale=False, 55 | scope="enc_pe") 56 | 57 | 58 | ## Dropout 59 | self.enc = tf.layers.dropout(self.enc, 60 | rate=hp.dropout_rate, 61 | training=tf.convert_to_tensor(is_training)) 62 | 63 | ## Blocks 64 | for i in range(hp.num_blocks): 65 | with tf.variable_scope("num_blocks_{}".format(i)): 66 | ### Multihead Attention 67 | self.enc = multihead_attention(queries=self.enc, 68 | keys=self.enc, 69 | num_units=hp.hidden_units, 70 | num_heads=hp.num_heads, 71 | dropout_rate=hp.dropout_rate, 72 | is_training=is_training, 73 | causality=False) 74 | 75 | ### Feed Forward 76 | self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units]) 77 | 78 | 79 | 80 | # Final linear projection 81 | #print(self.enc.shape) #4, 500, 512 82 | self.enc = tf.reduce_sum(self.enc, axis=2) #4, 500 83 | self.enc = tf.layers.batch_normalization(self.enc, True) 84 | self.logits = tf.layers.dense(self.enc, hp.n_class) #4, 2 85 | #print(self.logits.shape) 86 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 87 | 88 | 89 | if is_training: 90 | #Accuracy 91 | self.cpl = tf.equal(tf.convert_to_tensor(self.label, tf.int32), self.preds) 92 | self.cpl = tf.to_int32(self.cpl) 93 | self.acc = tf.reduce_sum(self.cpl) / tf.reduce_sum(tf.to_int32(self.y)) 94 | tf.summary.scalar('acc', self.acc) 95 | 96 | # Loss 97 | self.y_smoothed = 
label_smoothing(self.y) 98 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 99 | self.mean_loss = tf.reduce_sum(self.loss)/tf.reduce_sum(self.y) 100 | #self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 101 | 102 | 103 | # Training Scheme 104 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 105 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 106 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 107 | 108 | # Summary 109 | tf.summary.scalar('mean_loss', self.mean_loss) 110 | self.merged = tf.summary.merge_all() 111 | 112 | if __name__ == '__main__': 113 | # Load vocabulary 114 | word2idx, idx2word = load_vocabs() 115 | 116 | # Construct graph 117 | g = Graph("train"); print("Graph loaded") 118 | 119 | # Start session 120 | sv = tf.train.Supervisor(graph=g.graph, 121 | logdir=hp.logdir, 122 | save_model_secs=0) 123 | with sv.managed_session() as sess: 124 | with open("acc_loss_rec.log", 'w') as f: 125 | for epoch in range(1, hp.num_epochs+1): 126 | if sv.should_stop(): break 127 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 128 | sess.run(g.train_op) 129 | acc, loss = sess.run([g.acc, g.mean_loss]) 130 | f.write('{}\t{}\n'.format(acc, loss)) 131 | gs = sess.run(g.global_step) 132 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 133 | 134 | print("Done") 135 | 136 | 137 | -------------------------------------------------------------------------------- /transformer_RC/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import rc_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | from models import vanilla_transformer 12 | 13 | # custom wrapper 14 | from layers.basic_rnn import rnn 15 | from layers.match_layer import MatchLSTMLayer 16 | from layers.match_layer import AttentionFlowMatchLayer 17 | from layers.pointer_net import PointerNetDecoder 18 | 19 | os.sys.path.append('../Models') 20 | from models import vanilla_transformer 21 | 22 | 23 | 24 | class Graph(): 25 | def __init__(self, is_training=True): 26 | self.graph = tf.Graph() 27 | with self.graph.as_default(): 28 | if is_training: 29 | self.q, self.p, self.q_length, self.p_length, \ 30 | self.start_label, self.end_label, self.num_batch = get_batch_data() 31 | self.dropout_keep_prob = hp.dropout_keep_prob 32 | 33 | else: # inference 34 | self.q = tf.placeholder(tf.int32, [None, hp.q_maxlen]) 35 | self.p = tf.placeholder(tf.int32, [None, hp.p_maxlen]) 36 | self.q_length = tf.placeholder(tf.int32, [None]) 37 | self.p_length = tf.placeholder(tf.int32, [None]) 38 | self.start_label = tf.placeholder(tf.int32, [None]) 39 | self.end_label = tf.placeholder(tf.int32, [None]) 40 | 41 | self.dropout_keep_prob = hp.dropout_keep_prob 42 | self.l2_loss = tf.constant(0.0) 43 | # define decoder input 44 | self.decoder_inputs = tf.concat((tf.ones_like(self.p[:, :1])*2, self.p[:, :-1]), -1) # 2: 45 | 46 | # Load vocabulary 47 | word2idx, idx2word = load_vocabs() 48 | 49 | # initialize transformer 50 | transformer = vanilla_transformer(hp, self.is_training) 51 | ### encode 52 | self.q_encodes, self.p_encodes = transformer.encode(self.q, 
len(word2idx)), \ 53 | transformer.encode(self.q, len(word2idx)) 54 | 55 | #concated features to attend p with q 56 | # first pad q_encodes to the length of p_encodes 57 | pad_dim = hp.p_maxlen - hp.q_maxlen 58 | pad_ = tf.zeros([tf.shape(self.q_encodes)[0], pad_dim, hp.hidden_units], dtype = self.q_encodes.dtype) 59 | self.padded_q_encodes = tf.concat([self.q_encodes, pad_,], 1) 60 | #normalization 61 | self.padded_q_encodes = normalize(self.padded_q_encodes) 62 | 63 | # Decoder 64 | self.dec = transformer.decode(self.decoder_inputs, self.padded_q_encodes, len(word2idx), hp.p_maxlen) 65 | 66 | # fix paragraph tensor with self.dec 67 | self.p_encodes = self.dec 68 | 69 | """ 70 | The core of RC model, get the question-aware passage encoding 71 | """ 72 | match_layer = AttentionFlowMatchLayer(hp.hidden_units) 73 | self.match_p_encodes, _ = match_layer.match(self.p_encodes, self.q_encodes, 74 | self.p_length, self.q_length) 75 | 76 | # pooling or bi-rnn to fuision passage encodes 77 | if hp.Passage_fuse == 'Pooling': 78 | #pooling layer 79 | self.match_p_encodes = \ 80 | tf.keras.layers.MaxPool1D(pool_size=4, strides=None, padding='valid')\ 81 | (self.match_p_encodes) 82 | 83 | self.match_p_encodes = tf.reshape(self.match_p_encodes, [-1, hp.p_maxlen, hp.hidden_units]) 84 | #normalization 85 | self.match_p_encodes = tf.layers.batch_normalization(self.match_p_encodes) 86 | if hp.use_dropout: 87 | self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob) 88 | elif hp.Passage_fuse == 'bi-rnn': 89 | self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length, 90 | hp.hidden_units, layer_num=1, concat = False) 91 | if hp.use_dropout: 92 | self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob) 93 | 94 | 95 | decoder = PointerNetDecoder(hp.hidden_units) 96 | self.start_probs, self.end_probs = decoder.decode(self.match_p_encodes, 97 | self.q_encodes) 98 | 99 | 100 | if is_training: 101 | self.start_loss = self.sparse_nll_loss(probs=self.start_probs, labels=self.start_label) 102 | self.end_loss = self.sparse_nll_loss(probs=self.end_probs, labels=self.end_label) 103 | self.all_params = tf.trainable_variables() 104 | self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss)) 105 | if hp.weight_decay > 0: 106 | with tf.variable_scope('l2_loss'): 107 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params]) 108 | self.loss += hp.weight_decay * l2_loss 109 | 110 | 111 | 112 | # Training Scheme 113 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 114 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 115 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 116 | 117 | # Summary 118 | tf.summary.scalar('mean_loss', self.loss) 119 | self.merged = tf.summary.merge_all() 120 | 121 | 122 | 123 | def sparse_nll_loss(self, probs, labels, epsilon=1e-9, scope=None): 124 | """ 125 | negative log likelyhood loss 126 | """ 127 | with tf.name_scope(scope, "log_loss"): 128 | labels = tf.one_hot(labels, tf.shape(probs)[1], axis=1) 129 | losses = - tf.reduce_sum(labels * tf.log(probs + epsilon), 1) 130 | return losses 131 | 132 | 133 | if __name__ == '__main__': 134 | # Load vocabulary 135 | word2idx, idx2word = load_vocabs() 136 | 137 | # Construct graph 138 | g = Graph("train"); print("Graph loaded") 139 | 140 | # Start session 141 | sv = tf.train.Supervisor(graph=g.graph, 142 | logdir=hp.logdir, 143 | save_model_secs=0) 144 | with 
sv.managed_session() as sess: 145 | with open('acc_mean_loss.rec', 'w') as rec: 146 | for epoch in range(1, hp.num_epochs+1): 147 | if sv.should_stop(): break 148 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 149 | sess.run(g.train_op) 150 | #acc, los = sess.run(g.acc), sess.run(g.mean_loss) 151 | los = sess.run(g.loss) 152 | if not los > float('-inf'): 153 | print("loss: ",los) 154 | gs = sess.run(g.global_step) 155 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 156 | break 157 | 158 | rec.write('epochs {}\tstep {}\t{}\t{}\n'.format(epoch, step, 'Loss:', los)) 159 | 160 | gs = sess.run(g.global_step) 161 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 162 | 163 | print("Done") 164 | 165 | 166 | -------------------------------------------------------------------------------- /transformer_RC/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Pointer Network for selecting answer spans, as described in: 19 | https://openreview.net/pdf?id=B1-q5Pqxl 20 | """ 21 | 22 | import tensorflow as tf 23 | import tensorflow.contrib as tc 24 | 25 | 26 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 27 | """ 28 | Implements a dynamic rnn that can store scores in the pointer network, 29 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 30 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 31 | store the scores directly in the hidden unit. 
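A minimal usage sketch (it mirrors PointerNetDecoder.decode further below; `batch_size`, `hidden_size`, `passage_vectors` and `init_state` are illustrative placeholders, not arguments of this function):

        cell = PointerNetLSTMCell(hidden_size, passage_vectors)   # emits per-position scores at each step
        fake_inputs = tf.zeros([batch_size, 2, 1])                # two decode steps: answer start, answer end
        sequence_len = tf.tile([2], [batch_size])
        outputs, state = custom_dynamic_rnn(cell, fake_inputs, sequence_len, init_state)
        # outputs: (batch_size, 2, passage_len) -- the score logits written by the cell at each step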
32 | Args: 33 | cell: RNN cell 34 | inputs: the input sequence to rnn 35 | inputs_len: valid length 36 | initial_state: initial_state of the cell 37 | Returns: 38 | outputs and state 39 | """ 40 | batch_size, max_time = tf.shape(inputs)[0], tf.shape(inputs)[1] 41 | 42 | 43 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 44 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 45 | 46 | # record cells 47 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 48 | 49 | # iter timesteps 50 | t0 = tf.constant(0, dtype=tf.int32) 51 | if initial_state is not None: 52 | 53 | # initial state 54 | s0 = initial_state 55 | else: 56 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 57 | # 58 | f0 = tf.zeros([batch_size], dtype=tf.bool) 59 | 60 | def loop_fn(t, prev_s, emit_ta, finished): 61 | """ 62 | the loop function of rnn 63 | """ 64 | cur_x = inputs_ta.read(t) 65 | # use pre cell state and current input to predict the scores and current state 66 | ### dimension of scores: (batchsize, hiddensize) equal to cur_x 67 | ### the score is the logit of each position at each sample 68 | 69 | ### current state is a tuple (hidden state, cell state) 70 | scores, cur_state = cell(cur_x, prev_s) 71 | 72 | # copy through 73 | scores = tf.where(finished, tf.zeros_like(scores), scores) 74 | 75 | if isinstance(cell, tc.rnn.LSTMCell): 76 | cur_c, cur_h = cur_state 77 | prev_c, prev_h = prev_s 78 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 79 | tf.where(finished, prev_h, cur_h)) 80 | else: 81 | cur_state = tf.where(finished, prev_s, cur_state) 82 | 83 | ### store the logit scores of each step 84 | emit_ta = emit_ta.write(t, scores) 85 | finished = tf.greater_equal(t + 1, inputs_len) 86 | return [t + 1, cur_state, emit_ta, finished] 87 | 88 | _, state, emit_ta, _ = tf.while_loop( 89 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 90 | body=loop_fn, 91 | loop_vars=(t0, s0, emit_ta, f0), 92 | parallel_iterations=32, 93 | swap_memory=False) 94 | 95 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 96 | return outputs, state 97 | 98 | 99 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 100 | """ 101 | Applies attend pooling to a set of vectors according to a reference vector. 
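A minimal usage sketch (this is how PointerNetDecoder.decode below builds its initial state; the names are illustrative placeholders):

        ref_vector = tf.Variable(tf.random_normal([1, hidden_size]))          # trainable reference vector
        pooled = attend_pooling(question_vectors, ref_vector, hidden_size)    # (batch, q_len, C) -> (batch, C)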
102 | Args: 103 | pooling_vectors: the vectors to pool 104 | ref_vector: the reference vector 105 | hidden_size: the hidden size for attention function 106 | scope: score name 107 | Returns: 108 | the pooled vector 109 | pooling to vector with one dimension 110 | """ 111 | with tf.variable_scope(scope or 'attend_pooling'): 112 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 113 | activation_fn=None, biases_initializer=None) 114 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 115 | num_outputs=hidden_size, 116 | activation_fn=None)) 117 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 118 | scores = tf.nn.softmax(logits, 1) 119 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 120 | return pooled_vector 121 | 122 | 123 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 124 | """ 125 | Implements the Pointer Network Cell 126 | """ 127 | def __init__(self, num_units, context_to_point): 128 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 129 | self.context_to_point = context_to_point 130 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 131 | num_outputs=self._num_units, 132 | activation_fn=None) 133 | 134 | def __call__(self, inputs, state, scope=None): 135 | (c_prev, m_prev) = state 136 | with tf.variable_scope(scope or type(self).__name__): 137 | U = tf.tanh(self.fc_context 138 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 139 | num_outputs=self._num_units, 140 | activation_fn=None), 141 | 1)) 142 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 143 | scores = tf.nn.softmax(logits, 1) 144 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 145 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 146 | return tf.squeeze(scores, -1), lstm_state 147 | 148 | 149 | class PointerNetDecoder(object): 150 | """ 151 | Implements the Pointer Network 152 | """ 153 | def __init__(self, hidden_size): 154 | self.hidden_size = hidden_size 155 | 156 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 157 | """ 158 | Use Pointer Network to compute the probabilities of each position 159 | to be start and end of the answer 160 | Args: 161 | passage_vectors: the encoded passage vectors 162 | question_vectors: the encoded question vectors 163 | init_with_question: if set to be true, 164 | we will use the question_vectors to init the state of Pointer Network 165 | Returns: 166 | the probs of evary position to be start and end of the answer 167 | """ 168 | with tf.variable_scope('pn_decoder'): 169 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 170 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 171 | if init_with_question: 172 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 173 | trainable=True, name="random_attn_vector") 174 | pooled_question_rep = tc.layers.fully_connected( 175 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 176 | num_outputs=self.hidden_size, activation_fn=None 177 | ) 178 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 179 | else: 180 | init_state = None 181 | with tf.variable_scope('fw'): 182 | fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 183 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, init_state) 184 | with tf.variable_scope('bw'): 185 | bw_cell = 
PointerNetLSTMCell(self.hidden_size, passage_vectors) 186 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 187 | 188 | # the start prob and end prob of each position 189 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 190 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 191 | return start_prob, end_prob 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /en-zh_NMT/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | import tensorflow as tf 6 | 7 | from hyperparams import seq2seq_Hyperparams as hp 8 | from data_load import get_batch_data, load_en_vocab, load_zh_vocab 9 | from modules import * 10 | import os, codecs 11 | from tqdm import tqdm 12 | 13 | class Graph(): 14 | def __init__(self, is_training=True): 15 | self.graph = tf.Graph() 16 | with self.graph.as_default(): 17 | if is_training: 18 | self.x, self.y, self.num_batch = get_batch_data() # (N, T) 19 | else: # inference 20 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 21 | self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 22 | 23 | # define decoder inputs 24 | self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 25 | 26 | # Load vocabulary 27 | en2idx, idx2en = load_en_vocab() 28 | zh2idx, idx2zh = load_zh_vocab() 29 | 30 | # Encoder 31 | with tf.variable_scope("encoder"): 32 | ## Embedding 33 | self.enc = embedding(self.x, 34 | vocab_size=len(en2idx), 35 | num_units=hp.hidden_units, 36 | scale=True, 37 | scope="enc_embed") 38 | 39 | ## Positional Encoding 40 | if hp.sinusoid: 41 | self.enc += positional_encoding(self.x, 42 | num_units=hp.hidden_units, 43 | zero_pad=False, 44 | scale=False, 45 | scope="enc_pe") 46 | else: 47 | self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), 48 | vocab_size=hp.maxlen, 49 | num_units=hp.hidden_units, 50 | zero_pad=False, 51 | scale=False, 52 | scope="enc_pe") 53 | 54 | 55 | ## Dropout 56 | self.enc = tf.layers.dropout(self.enc, 57 | rate=hp.dropout_rate, 58 | training=tf.convert_to_tensor(is_training)) 59 | 60 | ## Blocks 61 | for i in range(hp.num_blocks): 62 | with tf.variable_scope("num_blocks_{}".format(i)): 63 | ### Multihead Attention 64 | self.enc = multihead_attention(queries=self.enc, 65 | keys=self.enc, 66 | num_units=hp.hidden_units, 67 | num_heads=hp.num_heads, 68 | dropout_rate=hp.dropout_rate, 69 | is_training=is_training, 70 | causality=False) 71 | 72 | ### Feed Forward 73 | self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units]) 74 | 75 | # Decoder 76 | with tf.variable_scope("decoder"): 77 | ## Embedding 78 | self.dec = embedding(self.decoder_inputs, 79 | vocab_size=len(zh2idx), 80 | num_units=hp.hidden_units, 81 | scale=True, 82 | scope="dec_embed") 83 | 84 | ## Positional Encoding 85 | if hp.sinusoid: 86 | self.dec += positional_encoding(self.decoder_inputs, 87 | vocab_size=hp.maxlen, 88 | num_units=hp.hidden_units, 89 | zero_pad=False, 90 | scale=False, 91 | scope="dec_pe") 92 | else: 93 | self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), 94 | vocab_size=hp.maxlen, 95 | num_units=hp.hidden_units, 96 | zero_pad=False, 97 | scale=False, 98 | scope="dec_pe") 99 | 100 | ## Dropout 101 | self.dec = tf.layers.dropout(self.dec, 102 | 
rate=hp.dropout_rate, 103 | training=tf.convert_to_tensor(is_training)) 104 | 105 | ## Blocks 106 | for i in range(hp.num_blocks): 107 | with tf.variable_scope("num_blocks_{}".format(i)): 108 | ## Multihead Attention ( self-attention) 109 | self.dec = multihead_attention(queries=self.dec, 110 | keys=self.dec, 111 | num_units=hp.hidden_units, 112 | num_heads=hp.num_heads, 113 | dropout_rate=hp.dropout_rate, 114 | is_training=is_training, 115 | causality=True, 116 | scope="self_attention") 117 | 118 | ## Multihead Attention ( vanilla attention) 119 | self.dec = multihead_attention(queries=self.dec, 120 | keys=self.enc, 121 | num_units=hp.hidden_units, 122 | num_heads=hp.num_heads, 123 | dropout_rate=hp.dropout_rate, 124 | is_training=is_training, 125 | causality=False, 126 | scope="vanilla_attention") 127 | 128 | ## Feed Forward 129 | self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units]) 130 | 131 | # Final linear projection 132 | self.logits = tf.layers.dense(self.dec, len(zh2idx)) 133 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 134 | self.istarget = tf.to_float(tf.not_equal(self.y, 0)) 135 | self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget)) 136 | tf.summary.scalar('acc', self.acc) 137 | 138 | if is_training: 139 | # Loss 140 | self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(zh2idx))) 141 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 142 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 143 | 144 | # Training Scheme 145 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 146 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 147 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 148 | 149 | # Summary 150 | tf.summary.scalar('mean_loss', self.mean_loss) 151 | self.merged = tf.summary.merge_all() 152 | 153 | if __name__ == '__main__': 154 | # Load vocabulary 155 | en2idx, idx2en = load_en_vocab() 156 | zh2idx, idx2zh = load_zh_vocab() 157 | 158 | # Construct graph 159 | g = Graph("train"); print("Graph loaded") 160 | 161 | # Start session 162 | sv = tf.train.Supervisor(graph=g.graph, 163 | logdir=hp.logdir, 164 | save_model_secs=0) 165 | with sv.managed_session() as sess: 166 | for epoch in range(1, hp.num_epochs+1): 167 | if sv.should_stop(): break 168 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 169 | sess.run(g.train_op) 170 | 171 | gs = sess.run(g.global_step) 172 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 173 | 174 | print("Done") 175 | 176 | 177 | -------------------------------------------------------------------------------- /en-zh_NMT/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import collections 7 | import math 8 | 9 | 10 | 11 | def normalize(inputs, 12 | epsilon = 1e-8, 13 | scope="ln", 14 | reuse=None): 15 | '''Applies layer normalization. 16 | 17 | Args: 18 | inputs: A tensor with 2 or more dimensions, where the first dimension has 19 | `batch_size`. 20 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 21 | scope: Optional scope for `variable_scope`. 
22 | reuse: Boolean, whether to reuse the weights of a previous layer 23 | by the same name. 24 | 25 | Returns: 26 | A tensor with the same shape and data dtype as `inputs`. 27 | ''' 28 | with tf.variable_scope(scope, reuse=reuse): 29 | inputs_shape = inputs.get_shape() 30 | params_shape = inputs_shape[-1:] 31 | 32 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 33 | beta= tf.Variable(tf.zeros(params_shape)) 34 | gamma = tf.Variable(tf.ones(params_shape)) 35 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 36 | outputs = gamma * normalized + beta 37 | 38 | return outputs 39 | 40 | def embedding(inputs, 41 | vocab_size, 42 | num_units, 43 | zero_pad=True, 44 | scale=True, 45 | scope="embedding", 46 | reuse=None): 47 | '''Embeds a given tensor. 48 | 49 | Args: 50 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 51 | to be looked up in `lookup table`. 52 | vocab_size: An int. Vocabulary size. 53 | num_units: An int. Number of embedding hidden units. 54 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 55 | should be constant zeros. 56 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 57 | scope: Optional scope for `variable_scope`. 58 | reuse: Boolean, whether to reuse the weights of a previous layer 59 | by the same name. 60 | 61 | Returns: 62 | A `Tensor` with one more rank than inputs's. The last dimensionality 63 | should be `num_units`. 64 | 65 | For example, 66 | 67 | ``` 68 | import tensorflow as tf 69 | 70 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 71 | outputs = embedding(inputs, 6, 2, zero_pad=True) 72 | with tf.Session() as sess: 73 | sess.run(tf.global_variables_initializer()) 74 | print sess.run(outputs) 75 | >> 76 | [[[ 0. 0. ] 77 | [ 0.09754146 0.67385566] 78 | [ 0.37864095 -0.35689294]] 79 | 80 | [[-1.01329422 -1.09939694] 81 | [ 0.7521342 0.38203377] 82 | [-0.04973143 -0.06210355]]] 83 | ``` 84 | 85 | ``` 86 | import tensorflow as tf 87 | 88 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 89 | outputs = embedding(inputs, 6, 2, zero_pad=False) 90 | with tf.Session() as sess: 91 | sess.run(tf.global_variables_initializer()) 92 | print sess.run(outputs) 93 | >> 94 | [[[-0.19172323 -0.39159766] 95 | [-0.43212751 -0.66207761] 96 | [ 1.03452027 -0.26704335]] 97 | 98 | [[-0.11634696 -0.35983452] 99 | [ 0.50208133 0.53509563] 100 | [ 1.22204471 -0.96587461]]] 101 | ``` 102 | ''' 103 | with tf.variable_scope(scope, reuse=reuse): 104 | lookup_table = tf.get_variable('lookup_table', 105 | dtype=tf.float32, 106 | shape=[vocab_size, num_units], 107 | initializer=tf.contrib.layers.xavier_initializer()) 108 | if zero_pad: 109 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 110 | lookup_table[1:, :]), 0) 111 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 112 | 113 | if scale: 114 | outputs = outputs * (num_units ** 0.5) 115 | 116 | return outputs 117 | 118 | 119 | def positional_encoding(inputs, 120 | num_units, 121 | zero_pad=True, 122 | scale=True, 123 | scope="positional_encoding", 124 | reuse=None): 125 | '''Sinusoidal Positional_Encoding. 126 | 127 | Args: 128 | inputs: A 2d Tensor with shape of (N, T). 129 | num_units: Output dimensionality 130 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 131 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 132 | scope: Optional scope for `variable_scope`. 
133 | reuse: Boolean, whether to reuse the weights of a previous layer 134 | by the same name. 135 | 136 | Returns: 137 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 138 | ''' 139 | 140 | N, T = inputs.get_shape().as_list() 141 | with tf.variable_scope(scope, reuse=reuse): 142 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 143 | 144 | # First part of the PE function: sin and cos argument 145 | position_enc = np.array([ 146 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 147 | for pos in range(T)]) 148 | 149 | # Second part, apply the cosine to even columns and sin to odds. 150 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 151 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 152 | 153 | # Convert to a tensor 154 | lookup_table = tf.convert_to_tensor(position_enc) 155 | 156 | if zero_pad: 157 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 158 | lookup_table[1:, :]), 0) 159 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 160 | 161 | if scale: 162 | outputs = outputs * num_units**0.5 163 | 164 | return outputs 165 | 166 | 167 | 168 | def multihead_attention(queries, 169 | keys, 170 | num_units=None, 171 | num_heads=8, 172 | dropout_rate=0, 173 | is_training=True, 174 | causality=False, 175 | scope="multihead_attention", 176 | reuse=None): 177 | '''Applies multihead attention. 178 | 179 | Args: 180 | queries: A 3d tensor with shape of [N, T_q, C_q]. 181 | keys: A 3d tensor with shape of [N, T_k, C_k]. 182 | num_units: A scalar. Attention size. 183 | dropout_rate: A floating point number. 184 | is_training: Boolean. Controller of mechanism for dropout. 185 | causality: Boolean. If true, units that reference the future are masked. 186 | num_heads: An int. Number of heads. 187 | scope: Optional scope for `variable_scope`. 188 | reuse: Boolean, whether to reuse the weights of a previous layer 189 | by the same name. 
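For example (an illustrative sketch of the encoder-style self-attention call made in train.py; the shapes and hyperparameter values shown are placeholders, not requirements of this function),

    ```
    import tensorflow as tf

    enc = tf.random_normal([2, 10, 512])        # (N, T, C) embedded inputs
    enc = multihead_attention(queries=enc,
                              keys=enc,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=False)  # -> (2, 10, 512)
    ```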
190 | 191 | Returns 192 | A 3d tensor with shape of (N, T_q, C) 193 | ''' 194 | with tf.variable_scope(scope, reuse=reuse): 195 | # Set the fall back option for num_units 196 | if num_units is None: 197 | num_units = queries.get_shape().as_list[-1] 198 | 199 | # Linear projections 200 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 201 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 203 | 204 | # Split and concat 205 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 206 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 208 | 209 | # Multiplication 210 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 211 | 212 | # Scale 213 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 214 | 215 | # Key Masking 216 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 217 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 218 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 219 | 220 | paddings = tf.ones_like(outputs)*(-2**32+1) 221 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 222 | 223 | # Causality = Future blinding 224 | if causality: 225 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 226 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 227 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 228 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 229 | 230 | paddings = tf.ones_like(masks)*(-2**32+1) 231 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 232 | 233 | # Activation 234 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 235 | 236 | # Query Masking 237 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 238 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 239 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 240 | outputs *= query_masks # broadcasting. (N, T_q, C) 241 | 242 | # Dropouts 243 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 244 | 245 | # Weighted sum 246 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 247 | 248 | # Restore shape 249 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 250 | 251 | # Residual connection 252 | outputs += queries 253 | 254 | # Normalize 255 | outputs = normalize(outputs) # (N, T_q, C) 256 | 257 | return outputs 258 | 259 | def feedforward(inputs, 260 | num_units=[2048, 512], 261 | scope="multihead_attention", 262 | reuse=None): 263 | '''Point-wise feed forward net. 264 | 265 | Args: 266 | inputs: A 3d tensor with shape of [N, T, C]. 267 | num_units: A list of two integers. 268 | scope: Optional scope for `variable_scope`. 269 | reuse: Boolean, whether to reuse the weights of a previous layer 270 | by the same name. 
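For example (a minimal sketch; the second entry of num_units must equal the channel size C of `inputs`, because of the residual connection at the end),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 10, 512])           # (N, T, C)
    y = feedforward(x, num_units=[4*512, 512])   # 2048-unit inner layer, projected back to C=512
    ```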
271 | 272 | Returns: 273 | A 3d tensor with the same shape and dtype as inputs 274 | ''' 275 | with tf.variable_scope(scope, reuse=reuse): 276 | # Inner layer 277 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, 278 | "activation": tf.nn.relu, "use_bias": True} 279 | outputs = tf.layers.conv1d(**params) 280 | 281 | # Readout layer 282 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, 283 | "activation": None, "use_bias": True} 284 | outputs = tf.layers.conv1d(**params) 285 | 286 | # Residual connection 287 | outputs += inputs 288 | 289 | # Normalize 290 | outputs = normalize(outputs) 291 | 292 | return outputs 293 | 294 | def label_smoothing(inputs, epsilon=0.1): 295 | '''Applies label smoothing. See https://arxiv.org/abs/1512.00567. 296 | 297 | Args: 298 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 299 | epsilon: Smoothing rate. 300 | 301 | For example, 302 | 303 | ``` 304 | import tensorflow as tf 305 | inputs = tf.convert_to_tensor([[[0, 0, 1], 306 | [0, 1, 0], 307 | [1, 0, 0]], 308 | 309 | [[1, 0, 0], 310 | [1, 0, 0], 311 | [0, 1, 0]]], tf.float32) 312 | 313 | outputs = label_smoothing(inputs) 314 | 315 | with tf.Session() as sess: 316 | print(sess.run([outputs])) 317 | 318 | >> 319 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 320 | [ 0.03333334, 0.93333334, 0.03333334], 321 | [ 0.93333334, 0.03333334, 0.03333334]], 322 | 323 | [[ 0.93333334, 0.03333334, 0.03333334], 324 | [ 0.93333334, 0.03333334, 0.03333334], 325 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 326 | ``` 327 | ''' 328 | K = inputs.get_shape().as_list()[-1] # number of channels 329 | return ((1-epsilon) * inputs) + (epsilon / K) 330 | 331 | 332 | def bleu(pred_tokens, label_tokens, k): 333 | len_pred, len_label = len(pred_tokens), len(label_tokens) 334 | score = math.exp(min(0, 1 - len_label / len_pred)) 335 | for n in range(1, k + 1): 336 | num_matches, label_subs = 0, collections.defaultdict(int) 337 | for i in range(len_label - n + 1): 338 | label_subs[''.join(label_tokens[i: i + n])] += 1 339 | for i in range(len_pred - n + 1): 340 | if label_subs[''.join(pred_tokens[i: i + n])] > 0: 341 | num_matches += 1 342 | label_subs[''.join(pred_tokens[i: i + n])] -= 1 343 | score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n)) 344 | return score 345 | 346 | 347 | 348 | -------------------------------------------------------------------------------- /transformer_infersent/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | 9 | 10 | def normalize(inputs, 11 | epsilon = 1e-8, 12 | scope="ln", 13 | reuse=None): 14 | '''Applies layer normalization. 15 | 16 | Args: 17 | inputs: A tensor with 2 or more dimensions, where the first dimension has 18 | `batch_size`. 19 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 20 | scope: Optional scope for `variable_scope`. 21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A tensor with the same shape and data dtype as `inputs`. 
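For example (a minimal sketch; the shape is arbitrary as long as the last axis is the feature axis),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 5, 512])
    y = normalize(x)    # layer-normalized over the last axis; same shape and dtype as x
    ```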
26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | inputs_shape = inputs.get_shape() 29 | params_shape = inputs_shape[-1:] 30 | 31 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 32 | beta= tf.Variable(tf.zeros(params_shape)) 33 | gamma = tf.Variable(tf.ones(params_shape)) 34 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 35 | outputs = gamma * normalized + beta 36 | 37 | return outputs 38 | 39 | def embedding(inputs, 40 | vocab_size, 41 | num_units, 42 | zero_pad=True, 43 | scale=True, 44 | scope="embedding", 45 | reuse=None): 46 | '''Embeds a given tensor. 47 | 48 | Args: 49 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 50 | to be looked up in `lookup table`. 51 | vocab_size: An int. Vocabulary size. 52 | num_units: An int. Number of embedding hidden units. 53 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 54 | should be constant zeros. 55 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 56 | scope: Optional scope for `variable_scope`. 57 | reuse: Boolean, whether to reuse the weights of a previous layer 58 | by the same name. 59 | 60 | Returns: 61 | A `Tensor` with one more rank than inputs's. The last dimensionality 62 | should be `num_units`. 63 | 64 | For example, 65 | 66 | ``` 67 | import tensorflow as tf 68 | 69 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 70 | outputs = embedding(inputs, 6, 2, zero_pad=True) 71 | with tf.Session() as sess: 72 | sess.run(tf.global_variables_initializer()) 73 | print sess.run(outputs) 74 | >> 75 | [[[ 0. 0. ] 76 | [ 0.09754146 0.67385566] 77 | [ 0.37864095 -0.35689294]] 78 | 79 | [[-1.01329422 -1.09939694] 80 | [ 0.7521342 0.38203377] 81 | [-0.04973143 -0.06210355]]] 82 | ``` 83 | 84 | ``` 85 | import tensorflow as tf 86 | 87 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 88 | outputs = embedding(inputs, 6, 2, zero_pad=False) 89 | with tf.Session() as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | print sess.run(outputs) 92 | >> 93 | [[[-0.19172323 -0.39159766] 94 | [-0.43212751 -0.66207761] 95 | [ 1.03452027 -0.26704335]] 96 | 97 | [[-0.11634696 -0.35983452] 98 | [ 0.50208133 0.53509563] 99 | [ 1.22204471 -0.96587461]]] 100 | ``` 101 | ''' 102 | with tf.variable_scope(scope, reuse=reuse): 103 | lookup_table = tf.get_variable('lookup_table', 104 | dtype=tf.float32, 105 | shape=[vocab_size, num_units], 106 | initializer=tf.contrib.layers.xavier_initializer()) 107 | if zero_pad: 108 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 109 | lookup_table[1:, :]), 0) 110 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 111 | 112 | if scale: 113 | outputs = outputs * (num_units ** 0.5) 114 | 115 | return outputs 116 | 117 | 118 | def positional_encoding(inputs, 119 | num_units, 120 | zero_pad=True, 121 | scale=True, 122 | scope="positional_encoding", 123 | reuse=None): 124 | '''Sinusoidal Positional_Encoding. 125 | 126 | Args: 127 | inputs: A 2d Tensor with shape of (N, T). 128 | num_units: Output dimensionality 129 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 130 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 131 | scope: Optional scope for `variable_scope`. 132 | reuse: Boolean, whether to reuse the weights of a previous layer 133 | by the same name. 
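For example (an illustrative sketch matching how the training graphs call it, with zero_pad=False and scale=False; only the static shape of `inputs` is used, its values are ignored),

    ```
    import tensorflow as tf

    inputs = tf.zeros([2, 10], tf.int32)     # (N, T) token ids
    pe = positional_encoding(inputs, num_units=512, zero_pad=False, scale=False)
    # pe: (2, 10, 512) sinusoidal position encodings
    ```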
134 | 135 | Returns: 136 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 137 | ''' 138 | 139 | N, T = inputs.get_shape().as_list() 140 | with tf.variable_scope(scope, reuse=reuse): 141 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 142 | 143 | # First part of the PE function: sin and cos argument 144 | position_enc = np.array([ 145 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 146 | for pos in range(T)]) 147 | 148 | # Second part, apply the cosine to even columns and sin to odds. 149 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 150 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 151 | 152 | # Convert to a tensor 153 | lookup_table = tf.convert_to_tensor(position_enc) 154 | 155 | if zero_pad: 156 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 157 | lookup_table[1:, :]), 0) 158 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 159 | 160 | if scale: 161 | outputs = outputs * num_units**0.5 162 | 163 | return outputs 164 | 165 | 166 | 167 | def multihead_attention(queries, 168 | keys, 169 | num_units=None, 170 | num_heads=8, 171 | dropout_rate=0, 172 | is_training=True, 173 | causality=False, 174 | scope="multihead_attention", 175 | reuse=None): 176 | '''Applies multihead attention. 177 | 178 | Args: 179 | queries: A 3d tensor with shape of [N, T_q, C_q]. 180 | keys: A 3d tensor with shape of [N, T_k, C_k]. 181 | num_units: A scalar. Attention size. 182 | dropout_rate: A floating point number. 183 | is_training: Boolean. Controller of mechanism for dropout. 184 | causality: Boolean. If true, units that reference the future are masked. 185 | num_heads: An int. Number of heads. 186 | scope: Optional scope for `variable_scope`. 187 | reuse: Boolean, whether to reuse the weights of a previous layer 188 | by the same name. 
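For example (an illustrative sketch of a decoder-style self-attention call with future positions masked; shapes and hyperparameter values are placeholders),

    ```
    import tensorflow as tf

    dec = tf.random_normal([2, 10, 512])        # (N, T_q, C)
    dec = multihead_attention(queries=dec,
                              keys=dec,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=True,   # each position attends only to itself and earlier positions
                              scope="self_attention")
    ```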
189 | 190 | Returns 191 | A 3d tensor with shape of (N, T_q, C) 192 | ''' 193 | with tf.variable_scope(scope, reuse=reuse): 194 | # Set the fall back option for num_units 195 | if num_units is None: 196 | num_units = queries.get_shape().as_list[-1] 197 | 198 | # Linear projections 199 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 200 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 201 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | 203 | # Split and concat 204 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 205 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 206 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | 208 | # Multiplication 209 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 210 | 211 | # Scale 212 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 213 | 214 | # Key Masking 215 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 216 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 217 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 218 | 219 | paddings = tf.ones_like(outputs)*(-2**32+1) 220 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 221 | 222 | # Causality = Future blinding 223 | if causality: 224 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 225 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 226 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 227 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 228 | 229 | paddings = tf.ones_like(masks)*(-2**32+1) 230 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 231 | 232 | # Activation 233 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 234 | 235 | # Query Masking 236 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 237 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 238 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 239 | outputs *= query_masks # broadcasting. (N, T_q, C) 240 | 241 | # Dropouts 242 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 243 | 244 | # Weighted sum 245 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 246 | 247 | # Restore shape 248 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 249 | 250 | # Residual connection 251 | outputs += queries 252 | 253 | # Normalize 254 | outputs = normalize(outputs) # (N, T_q, C) 255 | 256 | return outputs 257 | 258 | def feedforward(inputs, 259 | num_units=[2048, 512], 260 | scope="multihead_attention", 261 | reuse=None): 262 | '''Point-wise feed forward net. 263 | 264 | Args: 265 | inputs: A 3d tensor with shape of [N, T, C]. 266 | num_units: A list of two integers. 267 | scope: Optional scope for `variable_scope`. 268 | reuse: Boolean, whether to reuse the weights of a previous layer 269 | by the same name. 
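For example (a minimal sketch; because the convolutions use kernel_size=1, the same two-layer transform is applied independently at every time step),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 10, 512])
    y = feedforward(x, num_units=[2048, 512])   # position-wise; output shape (2, 10, 512)
    ```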
270 | 271 | Returns: 272 | A 3d tensor with the same shape and dtype as inputs 273 | ''' 274 | with tf.variable_scope(scope, reuse=reuse): 275 | # Inner layer 276 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, 277 | "activation": tf.nn.relu, "use_bias": True} 278 | outputs = tf.layers.conv1d(**params) 279 | 280 | # Readout layer 281 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, 282 | "activation": None, "use_bias": True} 283 | outputs = tf.layers.conv1d(**params) 284 | 285 | # Residual connection 286 | outputs += inputs 287 | 288 | # Normalize 289 | outputs = normalize(outputs) 290 | 291 | return outputs 292 | 293 | def label_smoothing(inputs, epsilon=0.1): 294 | '''Applies label smoothing. See https://arxiv.org/abs/1512.00567. 295 | 296 | Args: 297 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 298 | epsilon: Smoothing rate. 299 | 300 | For example, 301 | 302 | ``` 303 | import tensorflow as tf 304 | inputs = tf.convert_to_tensor([[[0, 0, 1], 305 | [0, 1, 0], 306 | [1, 0, 0]], 307 | 308 | [[1, 0, 0], 309 | [1, 0, 0], 310 | [0, 1, 0]]], tf.float32) 311 | 312 | outputs = label_smoothing(inputs) 313 | 314 | with tf.Session() as sess: 315 | print(sess.run([outputs])) 316 | 317 | >> 318 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 319 | [ 0.03333334, 0.93333334, 0.03333334], 320 | [ 0.93333334, 0.03333334, 0.03333334]], 321 | 322 | [[ 0.93333334, 0.03333334, 0.03333334], 323 | [ 0.93333334, 0.03333334, 0.03333334], 324 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 325 | ``` 326 | ''' 327 | K = inputs.get_shape().as_list()[-1] # number of channels 328 | return ((1-epsilon) * inputs) + (epsilon / K) 329 | 330 | 331 | 332 | 333 | def BME_cut(seq, label): 334 | ''' 335 | Tokenization with sequence tagging of /B/E/S/M 336 | represent the word begin/end/single word/in the middle respectively. 337 | Args: 338 | inputs: seq:str, label:str. 339 | output:List. 340 | 341 | Examples: 342 | >>> BME_cut('l i k e m e','B M M E B E') 343 | like me 344 | ''' 345 | if isinstance(seq, str): 346 | seq = seq.split() 347 | if isinstance(label, str): 348 | label = label.split() 349 | 350 | seq = seq + ['PAD']*(len(label) - len(seq)) 351 | assert len(seq) == len(label), "seq label is not compliable...{}, {}".format(seq, label) 352 | tokens = [] 353 | i = 0 354 | while i < len(seq): 355 | if label[i] == 'S': 356 | tokens.append(seq[i]) 357 | elif label[i] == 'B': 358 | tmp = seq[i] 359 | while i+1 < len(seq) and label[i+1] == 'M': 360 | tmp += seq[i+1] 361 | i += 1 362 | if not i+1 < len(seq): break 363 | #print(label[i+1], seq[i+1]) 364 | if label[i+1] == 'E': 365 | tmp += seq[i+1] 366 | tokens.append(tmp) 367 | i += 1 368 | return ' '.join(tokens) -------------------------------------------------------------------------------- /transformer_text_Classfication/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | 9 | 10 | def normalize(inputs, 11 | epsilon = 1e-8, 12 | scope="ln", 13 | reuse=None): 14 | '''Applies layer normalization. 15 | 16 | Args: 17 | inputs: A tensor with 2 or more dimensions, where the first dimension has 18 | `batch_size`. 19 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 20 | scope: Optional scope for `variable_scope`. 
21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A tensor with the same shape and data dtype as `inputs`. 26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | inputs_shape = inputs.get_shape() 29 | params_shape = inputs_shape[-1:] 30 | 31 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 32 | beta= tf.Variable(tf.zeros(params_shape)) 33 | gamma = tf.Variable(tf.ones(params_shape)) 34 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 35 | outputs = gamma * normalized + beta 36 | 37 | return outputs 38 | 39 | def embedding(inputs, 40 | vocab_size, 41 | num_units, 42 | zero_pad=True, 43 | scale=True, 44 | scope="embedding", 45 | reuse=None): 46 | '''Embeds a given tensor. 47 | 48 | Args: 49 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 50 | to be looked up in `lookup table`. 51 | vocab_size: An int. Vocabulary size. 52 | num_units: An int. Number of embedding hidden units. 53 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 54 | should be constant zeros. 55 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 56 | scope: Optional scope for `variable_scope`. 57 | reuse: Boolean, whether to reuse the weights of a previous layer 58 | by the same name. 59 | 60 | Returns: 61 | A `Tensor` with one more rank than inputs's. The last dimensionality 62 | should be `num_units`. 63 | 64 | For example, 65 | 66 | ``` 67 | import tensorflow as tf 68 | 69 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 70 | outputs = embedding(inputs, 6, 2, zero_pad=True) 71 | with tf.Session() as sess: 72 | sess.run(tf.global_variables_initializer()) 73 | print sess.run(outputs) 74 | >> 75 | [[[ 0. 0. ] 76 | [ 0.09754146 0.67385566] 77 | [ 0.37864095 -0.35689294]] 78 | 79 | [[-1.01329422 -1.09939694] 80 | [ 0.7521342 0.38203377] 81 | [-0.04973143 -0.06210355]]] 82 | ``` 83 | 84 | ``` 85 | import tensorflow as tf 86 | 87 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 88 | outputs = embedding(inputs, 6, 2, zero_pad=False) 89 | with tf.Session() as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | print sess.run(outputs) 92 | >> 93 | [[[-0.19172323 -0.39159766] 94 | [-0.43212751 -0.66207761] 95 | [ 1.03452027 -0.26704335]] 96 | 97 | [[-0.11634696 -0.35983452] 98 | [ 0.50208133 0.53509563] 99 | [ 1.22204471 -0.96587461]]] 100 | ``` 101 | ''' 102 | with tf.variable_scope(scope, reuse=reuse): 103 | lookup_table = tf.get_variable('lookup_table', 104 | dtype=tf.float32, 105 | shape=[vocab_size, num_units], 106 | initializer=tf.contrib.layers.xavier_initializer()) 107 | if zero_pad: 108 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 109 | lookup_table[1:, :]), 0) 110 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 111 | 112 | if scale: 113 | outputs = outputs * (num_units ** 0.5) 114 | 115 | return outputs 116 | 117 | 118 | def positional_encoding(inputs, 119 | num_units, 120 | zero_pad=True, 121 | scale=True, 122 | scope="positional_encoding", 123 | reuse=None): 124 | '''Sinusoidal Positional_Encoding. 125 | 126 | Args: 127 | inputs: A 2d Tensor with shape of (N, T). 128 | num_units: Output dimensionality 129 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 130 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 131 | scope: Optional scope for `variable_scope`. 
132 | reuse: Boolean, whether to reuse the weights of a previous layer 133 | by the same name. 134 | 135 | Returns: 136 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 137 | ''' 138 | 139 | N, T = inputs.get_shape().as_list() 140 | with tf.variable_scope(scope, reuse=reuse): 141 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 142 | 143 | # First part of the PE function: sin and cos argument 144 | position_enc = np.array([ 145 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 146 | for pos in range(T)]) 147 | 148 | # Second part, apply the cosine to even columns and sin to odds. 149 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 150 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 151 | 152 | # Convert to a tensor 153 | lookup_table = tf.convert_to_tensor(position_enc) 154 | 155 | if zero_pad: 156 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 157 | lookup_table[1:, :]), 0) 158 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 159 | 160 | if scale: 161 | outputs = outputs * num_units**0.5 162 | 163 | return outputs 164 | 165 | 166 | 167 | def multihead_attention(queries, 168 | keys, 169 | num_units=None, 170 | num_heads=8, 171 | dropout_rate=0, 172 | is_training=True, 173 | causality=False, 174 | scope="multihead_attention", 175 | reuse=None): 176 | '''Applies multihead attention. 177 | 178 | Args: 179 | queries: A 3d tensor with shape of [N, T_q, C_q]. 180 | keys: A 3d tensor with shape of [N, T_k, C_k]. 181 | num_units: A scalar. Attention size. 182 | dropout_rate: A floating point number. 183 | is_training: Boolean. Controller of mechanism for dropout. 184 | causality: Boolean. If true, units that reference the future are masked. 185 | num_heads: An int. Number of heads. 186 | scope: Optional scope for `variable_scope`. 187 | reuse: Boolean, whether to reuse the weights of a previous layer 188 | by the same name. 
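For example (an illustrative sketch of vanilla cross-attention, where queries and keys come from different sequences and may have different lengths, as in the seq2seq decoder elsewhere in this repo),

    ```
    import tensorflow as tf

    dec = tf.random_normal([2, 8, 512])     # (N, T_q, C)
    enc = tf.random_normal([2, 10, 512])    # (N, T_k, C)
    out = multihead_attention(queries=dec,
                              keys=enc,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=False,
                              scope="vanilla_attention")   # -> (2, 8, 512)
    ```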
189 | 190 | Returns 191 | A 3d tensor with shape of (N, T_q, C) 192 | ''' 193 | with tf.variable_scope(scope, reuse=reuse): 194 | # Set the fall back option for num_units 195 | if num_units is None: 196 | num_units = queries.get_shape().as_list[-1] 197 | 198 | # Linear projections 199 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 200 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 201 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | 203 | # Split and concat 204 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 205 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 206 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | 208 | # Multiplication 209 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 210 | 211 | # Scale 212 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 213 | 214 | # Key Masking 215 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 216 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 217 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 218 | 219 | paddings = tf.ones_like(outputs)*(-2**32+1) 220 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 221 | 222 | # Causality = Future blinding 223 | if causality: 224 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 225 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 226 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 227 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 228 | 229 | paddings = tf.ones_like(masks)*(-2**32+1) 230 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 231 | 232 | # Activation 233 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 234 | 235 | # Query Masking 236 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 237 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 238 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 239 | outputs *= query_masks # broadcasting. (N, T_q, C) 240 | 241 | # Dropouts 242 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 243 | 244 | # Weighted sum 245 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 246 | 247 | # Restore shape 248 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 249 | 250 | # Residual connection 251 | outputs += queries 252 | 253 | # Normalize 254 | outputs = normalize(outputs) # (N, T_q, C) 255 | 256 | return outputs 257 | 258 | def feedforward(inputs, 259 | num_units=[2048, 512], 260 | scope="multihead_attention", 261 | reuse=None): 262 | '''Point-wise feed forward net. 263 | 264 | Args: 265 | inputs: A 3d tensor with shape of [N, T, C]. 266 | num_units: A list of two integers. 267 | scope: Optional scope for `variable_scope`. 268 | reuse: Boolean, whether to reuse the weights of a previous layer 269 | by the same name. 
270 | 
271 |     Returns:
272 |       A 3d tensor with the same shape and dtype as inputs
273 |     '''
274 |     with tf.variable_scope(scope, reuse=reuse):
275 |         # Inner layer
276 |         params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
277 |                   "activation": tf.nn.relu, "use_bias": True}
278 |         outputs = tf.layers.conv1d(**params)
279 | 
280 |         # Readout layer
281 |         params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
282 |                   "activation": None, "use_bias": True}
283 |         outputs = tf.layers.conv1d(**params)
284 | 
285 |         # Residual connection
286 |         outputs += inputs
287 | 
288 |         # Normalize
289 |         outputs = normalize(outputs)
290 | 
291 |         return outputs
292 | 
293 | def label_smoothing(inputs, epsilon=0.1):
294 |     '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
295 | 
296 |     Args:
297 |       inputs: A 3d tensor with shape of [N, T, V], where V is the vocabulary size.
298 |       epsilon: Smoothing rate.
299 | 
300 |     For example,
301 | 
302 |     ```
303 |     import tensorflow as tf
304 |     inputs = tf.convert_to_tensor([[[0, 0, 1],
305 |                                     [0, 1, 0],
306 |                                     [1, 0, 0]],
307 | 
308 |                                    [[1, 0, 0],
309 |                                     [1, 0, 0],
310 |                                     [0, 1, 0]]], tf.float32)
311 | 
312 |     outputs = label_smoothing(inputs)
313 | 
314 |     with tf.Session() as sess:
315 |         print(sess.run([outputs]))
316 | 
317 |     >>
318 |     [array([[[ 0.03333334,  0.03333334,  0.93333334],
319 |              [ 0.03333334,  0.93333334,  0.03333334],
320 |              [ 0.93333334,  0.03333334,  0.03333334]],
321 | 
322 |             [[ 0.93333334,  0.03333334,  0.03333334],
323 |              [ 0.93333334,  0.03333334,  0.03333334],
324 |              [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
325 |     ```
326 |     '''
327 |     K = inputs.get_shape().as_list()[-1]  # number of channels
328 |     return ((1-epsilon) * inputs) + (epsilon / K)
329 | 
330 | 
331 | 
332 | 
333 | def BME_cut(seq, label):
334 |     '''Recovers tokens from a B/M/E/S tag sequence, where the tags mark the
335 |     beginning, middle and end of a multi-character word, or a single-character
336 |     word, respectively.
337 |     Args:
338 |         seq: str (space-separated characters) or list. label: str or list of tags.
339 |     Returns: a string of space-separated tokens.
340 | 
341 |     Examples:
342 |     >>> BME_cut('l i k e m e', 'B M M E B E')
343 |     'like me'
344 |     '''
345 |     if isinstance(seq, str):
346 |         seq = seq.split()
347 |     if isinstance(label, str):
348 |         label = label.split()
349 | 
350 |     seq = seq + ['PAD']*(len(label) - len(seq))
351 |     assert len(seq) == len(label), "seq and label lengths do not match...{}, {}".format(seq, label)
352 |     tokens = []
353 |     i = 0
354 |     while i < len(seq):
355 |         if label[i] == 'S':
356 |             tokens.append(seq[i])
357 |         elif label[i] == 'B':
358 |             tmp = seq[i]
359 |             while i+1 < len(seq) and label[i+1] == 'M':
360 |                 tmp += seq[i+1]
361 |                 i += 1
362 |             if not i+1 < len(seq): break  # ran off the end without seeing an 'E' tag
363 |             # label[i+1] is now either 'E' (close the word) or the start of the next tag
364 |             if label[i+1] == 'E':
365 |                 tmp += seq[i+1]
366 |                 tokens.append(tmp)
367 |         i += 1
368 |     return ' '.join(tokens)
--------------------------------------------------------------------------------
/transformer_RC/modules.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python3
3 | 
4 | import tensorflow as tf
5 | import numpy as np
6 | import math
7 | import collections  # used by bleu() below
8 | 
9 | 
10 | def normalize(inputs,
11 |               epsilon=1e-8,
12 |               scope="ln",
13 |               reuse=None):
14 |     '''Applies layer normalization.
15 | 
16 |     Args:
17 |       inputs: A tensor with 2 or more dimensions, where the first dimension is
18 |         `batch_size`.
19 |       epsilon: A small float added to the variance to avoid division by zero.
20 |       scope: Optional scope for `variable_scope`.
21 |       reuse: Boolean, whether to reuse the weights of a previous layer
22 |         by the same name.
23 | 
24 |     Returns:
25 |       A tensor with the same shape and dtype as `inputs`.
26 |     '''
27 |     with tf.variable_scope(scope, reuse=reuse):
28 |         inputs_shape = inputs.get_shape()
29 |         params_shape = inputs_shape[-1:]
30 | 
31 |         mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
32 |         beta = tf.Variable(tf.zeros(params_shape))
33 |         gamma = tf.Variable(tf.ones(params_shape))
34 |         normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
35 |         outputs = gamma * normalized + beta
36 | 
37 |         return outputs
38 | 
39 | 
40 | def embedding(inputs,
41 |               vocab_size,
42 |               num_units,
43 |               pretrained_embedding=None,
44 |               zero_pad=True,
45 |               scale=True,
46 |               scope="embedding",
47 |               reuse=None):
48 |     '''Embeds a given tensor.
49 | 
50 |     Args:
51 |       inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up.
52 |       vocab_size: An int. Vocabulary size.
53 |       num_units: An int. Number of embedding hidden units.
54 |       pretrained_embedding: Optional pre-trained embedding matrix of shape [vocab_size, d].
55 |       zero_pad: A boolean. If True, all the values of the first row (id 0)
56 |         should be constant zeros.
57 |       scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
58 |       scope: Optional scope for `variable_scope`.
59 |       reuse: Boolean, whether to reuse the weights of a previous layer
60 |         by the same name.
61 |     '''
62 |     with tf.variable_scope(scope, reuse=reuse):
63 |         if pretrained_embedding is not None:
64 |             if pretrained_embedding.shape[-1] != num_units:  # static width check at graph-construction time
65 |                 pre_emb = tf.convert_to_tensor(pretrained_embedding, tf.float32)
66 |                 fusion_emb = tf.layers.dense(pre_emb, num_units, activation=tf.nn.tanh)
67 |                 fusion_emb = normalize(fusion_emb)
68 | 
69 |                 lookup_table = fusion_emb
70 |             else:
71 |                 lookup_table = tf.convert_to_tensor(pretrained_embedding, tf.float32)
72 |         else:
73 |             lookup_table = tf.get_variable('lookup_table',
74 |                                            dtype=tf.float32,
75 |                                            shape=[vocab_size, num_units],
76 |                                            initializer=tf.contrib.layers.xavier_initializer())
77 |         if zero_pad:
78 |             lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
79 |                                       lookup_table[1:, :]), 0)
80 | 
81 | 
82 | 
83 |         outputs = tf.nn.embedding_lookup(lookup_table, inputs)
84 | 
85 |         if scale:
86 |             outputs = outputs * (num_units ** 0.5)
87 | 
88 |         return outputs
89 | 
90 | 
91 | def positional_encoding(inputs,
92 |                         num_units,
93 |                         zero_pad=True,
94 |                         scale=True,
95 |                         scope="positional_encoding",
96 |                         reuse=None):
97 |     '''Sinusoidal positional encoding.
98 | 
99 |     Args:
100 |       inputs: A 2d Tensor with shape of (N, T).
101 |       num_units: Output dimensionality.
102 |       zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zeros.
103 |       scale: Boolean. If True, the output will be multiplied by sqrt(num_units) (see the paper for details).
104 |       scope: Optional scope for `variable_scope`.
105 |       reuse: Boolean, whether to reuse the weights of a previous layer
106 |         by the same name.
107 | 
108 |     Returns:
109 |       A `Tensor` with one more rank than `inputs`, whose last dimension is `num_units`.
110 |     '''
111 | 
112 |     N, T = inputs.get_shape().as_list()
113 |     with tf.variable_scope(scope, reuse=reuse):
114 |         position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])
115 | 
116 |         # First part of the PE function: sin and cos argument
117 |         position_enc = np.array([
118 |             [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
119 |             for pos in range(T)])
120 | 
121 |         # Second part, apply the cosine to even columns and sin to odds.
122 |         position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
123 |         position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
124 | 
125 |         # Convert to a float32 tensor so it can be combined with the token embeddings
126 |         lookup_table = tf.convert_to_tensor(position_enc, tf.float32)
127 | 
128 |         if zero_pad:
129 |             lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
130 |                                       lookup_table[1:, :]), 0)
131 |         outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
132 | 
133 |         if scale:
134 |             outputs = outputs * num_units**0.5
135 | 
136 |         return outputs
137 | 
138 | 
139 | 
140 | def multihead_attention(queries,
141 |                         keys,
142 |                         num_units=None,
143 |                         num_heads=8,
144 |                         dropout_rate=0,
145 |                         is_training=True,
146 |                         causality=False,
147 |                         scope="multihead_attention",
148 |                         reuse=None):
149 |     '''Applies multihead attention.
150 | 
151 |     Args:
152 |       queries: A 3d tensor with shape of [N, T_q, C_q].
153 |       keys: A 3d tensor with shape of [N, T_k, C_k].
154 |       num_units: A scalar. Attention size.
155 |       dropout_rate: A floating point number.
156 |       is_training: Boolean. If True, dropout is applied.
157 |       causality: Boolean. If True, units that reference the future are masked.
158 |       num_heads: An int. Number of heads.
159 |       scope: Optional scope for `variable_scope`.
160 |       reuse: Boolean, whether to reuse the weights of a previous layer
161 |         by the same name.
162 | 
163 |     Returns:
164 |       A 3d tensor with shape of (N, T_q, C).
165 |     '''
166 |     with tf.variable_scope(scope, reuse=reuse):
167 |         # Set the fallback option for num_units
168 |         if num_units is None:
169 |             num_units = queries.get_shape().as_list()[-1]
170 | 
171 |         # Linear projections
172 |         Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
173 |         K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
174 |         V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
175 | 
176 |         # Split and concat
177 |         Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
178 |         K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
179 |         V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
180 | 
181 |         # Multiplication
182 |         # (h*N, T_q, C/h) @ (h*N, C/h, T_k)
183 |         outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
184 | 
185 |         # Scale
186 |         outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
187 | 
188 |         # Key Masking
189 |         key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
190 |         key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
191 |         key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)
192 | 
193 |         paddings = tf.ones_like(outputs)*(-2**32+1)
194 |         outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
195 | 
196 |         # Causality = Future blinding
197 |         if causality:
198 |             diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
199 |             #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
200 |             tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
201 |             masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)
202 | 
203 |             paddings = tf.ones_like(masks)*(-2**32+1)
204 |             outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
205 | 
206 |         # Activation
207 |         outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)
208 | 
209 |         # Query Masking
210 |         query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
211 |         query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
212 |         query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
213 |         outputs *= query_masks  # (h*N, T_q, T_k)
214 | 
215 |         # Dropouts
216 |         outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
217 | 
218 |         # Weighted sum
219 |         outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)
220 | 
221 |         # Restore shape
222 |         outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)
223 | 
224 |         # Residual connection
225 |         outputs += queries
226 | 
227 |         # Normalize
228 |         outputs = normalize(outputs)  # (N, T_q, C)
229 | 
230 |         return outputs
231 | 
232 | def feedforward(inputs,
233 |                 num_units=[2048, 512],
234 |                 scope="feedforward",
235 |                 reuse=None):
236 |     '''Point-wise feed forward net.
237 | 
238 |     Args:
239 |       inputs: A 3d tensor with shape of [N, T, C].
240 |       num_units: A list of two integers.
241 |       scope: Optional scope for `variable_scope`.
242 |       reuse: Boolean, whether to reuse the weights of a previous layer
243 |         by the same name.
244 | 
245 |     Returns:
246 |       A 3d tensor with the same shape and dtype as inputs
247 |     '''
248 |     with tf.variable_scope(scope, reuse=reuse):
249 |         # Inner layer
250 |         params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
251 |                   "activation": tf.nn.relu, "use_bias": True}
252 |         outputs = tf.layers.conv1d(**params)
253 | 
254 |         # Readout layer
255 |         params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
256 |                   "activation": None, "use_bias": True}
257 |         outputs = tf.layers.conv1d(**params)
258 | 
259 |         # Residual connection
260 |         outputs += inputs
261 | 
262 |         # Normalize
263 |         outputs = normalize(outputs)
264 | 
265 |         return outputs
266 | 
267 | def label_smoothing(inputs, epsilon=0.1):
268 |     '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
269 | 
270 |     Args:
271 |       inputs: A 3d tensor with shape of [N, T, V], where V is the vocabulary size.
272 |       epsilon: Smoothing rate.
273 | 
274 |     For example,
275 | 
276 |     ```
277 |     import tensorflow as tf
278 |     inputs = tf.convert_to_tensor([[[0, 0, 1],
279 |                                     [0, 1, 0],
280 |                                     [1, 0, 0]],
281 | 
282 |                                    [[1, 0, 0],
283 |                                     [1, 0, 0],
284 |                                     [0, 1, 0]]], tf.float32)
285 | 
286 |     outputs = label_smoothing(inputs)
287 | 
288 |     with tf.Session() as sess:
289 |         print(sess.run([outputs]))
290 | 
291 |     >>
292 |     [array([[[ 0.03333334,  0.03333334,  0.93333334],
293 |              [ 0.03333334,  0.93333334,  0.03333334],
294 |              [ 0.93333334,  0.03333334,  0.03333334]],
295 | 
296 |             [[ 0.93333334,  0.03333334,  0.03333334],
297 |              [ 0.93333334,  0.03333334,  0.03333334],
298 |              [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
299 |     ```
300 |     '''
301 |     K = inputs.get_shape().as_list()[-1]  # number of channels
302 |     return ((1-epsilon) * inputs) + (epsilon / K)
303 | 
304 | 
305 | 
306 | 
307 | def BME_cut(seq, label):
308 |     '''Recovers tokens from a B/M/E/S tag sequence, where the tags mark the
309 |     beginning, middle and end of a multi-character word, or a single-character
310 |     word, respectively.
311 |     Args:
312 |         seq: str (space-separated characters) or list. label: str or list of tags.
313 |     Returns: a string of space-separated tokens.
314 | 
315 |     Examples:
316 |     >>> BME_cut('l i k e m e', 'B M M E B E')
317 |     'like me'
318 |     '''
319 |     if isinstance(seq, str):
320 |         seq = seq.split()
321 |     if isinstance(label, str):
322 |         label = label.split()
323 | 
324 |     seq = seq + ['PAD']*(len(label) - len(seq))
325 |     assert len(seq) == len(label), "seq and label lengths do not match...{}, {}".format(seq, label)
326 |     tokens = []
327 |     i = 0
328 |     while i < len(seq):
329 |         if label[i] == 'S':
330 |             tokens.append(seq[i])
331 |         elif label[i] == 'B':
332 |             tmp = seq[i]
333 |             while i+1 < len(seq) and label[i+1] == 'M':
334 |                 tmp += seq[i+1]
335 |                 i += 1
336 |             if not i+1 < len(seq): break  # ran off the end without seeing an 'E' tag
337 |             # label[i+1] is now either 'E' (close the word) or the start of the next tag
338 |             if label[i+1] == 'E':
339 |                 tmp += seq[i+1]
340 |                 tokens.append(tmp)
341 |         i += 1
342 |     return ' '.join(tokens)
343 | 
344 | 
345 | 
346 | 
347 | 
348 | def bleu(pred_tokens, label_tokens, k):
349 |     """Hand-rolled BLEU score over n-grams of length 1..k."""
350 |     len_pred, len_label = len(pred_tokens), len(label_tokens)
351 |     score = math.exp(min(0, 1 - len_label / len_pred))  # brevity penalty
352 |     for n in range(1, k + 1):
353 |         num_matches, label_subs = 0, collections.defaultdict(int)
354 |         for i in range(len_label - n + 1):
355 |             label_subs[''.join(label_tokens[i: i + n])] += 1
356 |         for i in range(len_pred - n + 1):
357 |             if label_subs[''.join(pred_tokens[i: i + n])] > 0:
358 |                 num_matches += 1
359 |                 label_subs[''.join(pred_tokens[i: i + n])] -= 1
360 |         score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))  # clipped n-gram precision, weight 0.5**n
361 |     return score
362 | 
363 | 
--------------------------------------------------------------------------------
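`BME_cut` and `bleu` above are plain-Python helpers, so they can be sanity-checked without building a TensorFlow graph. A minimal sketch, assuming it is run from inside `transformer_RC/` so that `modules` resolves to the file above (importing the module still pulls in TensorFlow for the other functions):

```
# Sanity checks for the pure-Python helpers in transformer_RC/modules.py.
from modules import BME_cut, bleu

# Recover words from a character sequence and its B/M/E/S tag sequence.
assert BME_cut('l i k e m e', 'B M M E B E') == 'like me'

# BLEU over n-grams up to length 2: an exact match scores 1.0 ...
assert bleu(['like', 'me'], ['like', 'me'], k=2) == 1.0

# ... while partial overlap is discounted by both the n-gram precisions
# and the brevity penalty (roughly 0.49 for this pair).
print(bleu(['the', 'cat', 'sat'], ['the', 'cat', 'is', 'sitting'], k=2))
```

Note that the n-gram keys are built with `''.join(...)`, so this `bleu` is best suited to character-level tokens such as those produced by `BME_cut`; with longer tokens, concatenation can merge two distinct n-grams into the same key.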