├── results
│   ├── model_epoch_20_gs_0
│   ├── model_epoch_20_gs_120
│   └── model_epoch_20_gs_240
├── requirements.txt
├── images
│   ├── train_loss.png
│   ├── NMT_res_BLEU.png
│   ├── train_accuracy.png
│   ├── rc_model_train_loss.png
│   ├── infersent_train_loss.png
│   ├── infersent_train_accuracy.png
│   ├── infersent_train_SNLI_loss.png
│   ├── rc_model_train_loss_200epoch.png
│   └── infersent_train_with_SNLI_accuracy.png
├── __pycache__
│   ├── modules.cpython-36.pyc
│   ├── data_load.cpython-36.pyc
│   ├── hyperparams.cpython-35.pyc
│   └── hyperparams.cpython-36.pyc
├── transformer_RC
│   ├── __pycache__
│   │   ├── models.cpython-35.pyc
│   │   ├── modules.cpython-35.pyc
│   │   ├── data_load.cpython-35.pyc
│   │   └── hyperparams.cpython-35.pyc
│   ├── layers
│   │   ├── __pycache__
│   │   │   ├── basic_rnn.cpython-35.pyc
│   │   │   ├── match_layer.cpython-35.pyc
│   │   │   └── pointer_net.cpython-35.pyc
│   │   ├── basic_rnn.py
│   │   ├── match_layer.py
│   │   └── pointer_net.py
│   ├── README.md
│   ├── prepro.py
│   ├── hyperparams.py
│   ├── data_load.py
│   ├── eval.py
│   ├── train.py
│   └── modules.py
├── en-zh_NMT
│   ├── README.MD
│   ├── prepro.py
│   ├── data_pre.py
│   ├── eval.py
│   ├── data_load.py
│   ├── train.py
│   └── modules.py
├── transformer_text_Classfication
│   ├── README.MD
│   ├── prepro.py
│   ├── data_load.py
│   ├── eval.py
│   ├── hyperparams.py
│   ├── data_pre.py
│   ├── train.py
│   └── modules.py
├── transformer_infersent
│   ├── data_prepare.py
│   ├── README.MD
│   ├── prepro.py
│   ├── hyperparams.py
│   ├── data_load.py
│   ├── eval.py
│   ├── train.py
│   └── modules.py
├── transformer_jieba
│   ├── prepro.py
│   ├── train.py
│   ├── data_pre.py
│   ├── eval.py
│   └── data_load.py
├── .circleci
│   └── config.yml
├── hyperparams.py
├── README.md
└── Models
    └── models.py

/results/model_epoch_20_gs_0:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/results/model_epoch_20_gs_120:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/results/model_epoch_20_gs_240:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | nltk>=3.2.4
 2 | numpy>=1.13.0
 3 | regex>=2017.6.7
 4 | tensorflow==1.12.0
--------------------------------------------------------------------------------
/images/train_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/train_loss.png
--------------------------------------------------------------------------------
/images/NMT_res_BLEU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/NMT_res_BLEU.png
--------------------------------------------------------------------------------
/images/train_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/train_accuracy.png
--------------------------------------------------------------------------------
/images/rc_model_train_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/rc_model_train_loss.png
-------------------------------------------------------------------------------- /images/infersent_train_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_loss.png -------------------------------------------------------------------------------- /__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /images/infersent_train_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_accuracy.png -------------------------------------------------------------------------------- /__pycache__/data_load.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/data_load.cpython-36.pyc -------------------------------------------------------------------------------- /images/infersent_train_SNLI_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_SNLI_loss.png -------------------------------------------------------------------------------- /__pycache__/hyperparams.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/hyperparams.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/hyperparams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/__pycache__/hyperparams.cpython-36.pyc -------------------------------------------------------------------------------- /images/rc_model_train_loss_200epoch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/rc_model_train_loss_200epoch.png -------------------------------------------------------------------------------- /images/infersent_train_with_SNLI_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/images/infersent_train_with_SNLI_accuracy.png -------------------------------------------------------------------------------- /transformer_RC/__pycache__/models.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/models.cpython-35.pyc -------------------------------------------------------------------------------- /transformer_RC/__pycache__/modules.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/modules.cpython-35.pyc 
--------------------------------------------------------------------------------
/transformer_RC/__pycache__/data_load.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/data_load.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/__pycache__/hyperparams.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/__pycache__/hyperparams.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/basic_rnn.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/basic_rnn.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/match_layer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/match_layer.cpython-35.pyc
--------------------------------------------------------------------------------
/transformer_RC/layers/__pycache__/pointer_net.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fooSynaptic/transfromer_NN_Block/HEAD/transformer_RC/layers/__pycache__/pointer_net.cpython-35.pyc
--------------------------------------------------------------------------------
/en-zh_NMT/README.MD:
--------------------------------------------------------------------------------
 1 | # ***Second - zh-en NMT***
 2 | - The train and test data come from the `Web Inventory of Transcribed and Translated Talks` (**WIT3**); we train an English-Chinese translation model ([data source](https://wit3.fbk.eu/mt.php?release=2015-01)).
 3 | - Test result:
 4 | ![NMT result](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/NMT_res_BLEU.png)
 5 | 
 6 | 
--------------------------------------------------------------------------------
/transformer_text_Classfication/README.MD:
--------------------------------------------------------------------------------
 1 | # Result of Chinese sentence classification (char-level)
 2 | ```
 3 |               precision    recall  f1-score   support
 4 | 
 5 |            0       0.99      1.00      0.99       992
 6 |            1       1.00      0.99      0.99       980
 7 | 
 8 |    micro avg       0.99      0.99      0.99      1972
 9 |    macro avg       0.99      0.99      0.99      1972
10 | weighted avg       0.99      0.99      0.99      1972
11 | 
12 | Done
13 | ```
14 | 
--------------------------------------------------------------------------------
/transformer_RC/README.md:
--------------------------------------------------------------------------------
 1 | A reading comprehension model built with the transformer
 2 | - The architecture of this model combines the **transformer's** parallelized attention + **BiDAF query-aware passage states** + a **pointer network**.
 3 | 
 4 | 
 5 | - Train loss: ![loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/rc_model_train_loss.png)
 6 | 
 7 | - You may want to inspect the prediction results [here](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/results/rc_model_epoch_50_gs_10500)
 8 | 
 9 | Final result: **`Rouge-L: 0.2651, BLEU_1: 0.36.`**
10 | 
11 | Results are still being updated; welcome to star and follow.
12 | 
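To make the architecture summary above concrete, here is a minimal NumPy sketch of the two ideas it names: BiDAF-style query-aware fusion of passage states, followed by pointer-style start/end scoring over passage positions. The names `bidaf_fuse`, `pointer_spans`, `w_start`, and `w_end` are hypothetical illustrations; they do not mirror the repository's actual `layers/match_layer.py` or `layers/pointer_net.py`, which are written in TensorFlow.

```python
# Self-contained sketch (NumPy only) of query-aware passage fusion + span pointing.
# Shapes follow rc_Hyperparams: p_maxlen=200, q_maxlen=50, hidden_units=256.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def bidaf_fuse(passage, question):
    """passage: (P, d), question: (Q, d) encoder outputs -> query-aware states (P, 4d)."""
    sim = passage @ question.T                 # (P, Q) similarity matrix
    p2q = softmax(sim, axis=1) @ question      # passage-to-question attention, (P, d)
    q2p = softmax(sim.max(axis=1)) @ passage   # question-to-passage attention, (d,)
    q2p = np.tile(q2p, (passage.shape[0], 1))  # broadcast over passage positions
    return np.concatenate([passage, p2q, passage * p2q, passage * q2p], axis=1)

def pointer_spans(fused, w_start, w_end):
    """Score every passage position as the answer start / end, pointer-network style."""
    return softmax(fused @ w_start), softmax(fused @ w_end)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    P, Q, d = 200, 50, 256
    fused = bidaf_fuse(rng.normal(size=(P, d)), rng.normal(size=(Q, d)))
    p_start, p_end = pointer_spans(fused, rng.normal(size=4 * d), rng.normal(size=4 * d))
    print(int(p_start.argmax()), int(p_end.argmax()))
```

This only illustrates the data flow; the repository's TensorFlow implementation under `transformer_RC/` remains the authoritative version.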
--------------------------------------------------------------------------------
/transformer_infersent/data_prepare.py:
--------------------------------------------------------------------------------
1 | # encoding = utf-8 2 | # /usr/bin/python3 3 | 4 | import json 5 | 6 | #{'entailment', '-', 'contradiction', 'neutral'} 7 | hashmap = {'entailment':'0', 'contradiction':'1', 'neutral':'2'} 8 | 9 | 10 | def prepare(): 11 | train, dev, test = [[json.loads(line) for line in open('./snli_1.0/snli_1.0_{}.jsonl'.format(x)).readlines()]\ 12 | for x in ['train', 'dev', 'test']] 13 | 14 | train, dev, test = ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in train if x['gold_label'] in hashmap], ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in dev if x['gold_label'] in hashmap], ['<>'.join([hashmap[x['gold_label']], x['sentence1'], x['sentence2']]) for x in test if x['gold_label'] in hashmap] 15 | 16 | 17 | with open('./train.csv', 'w') as f1: 18 | for line in train: f1.write(line + '\n') 19 | 20 | with open('./dev.csv', 'w') as f2: 21 | for line in dev: f2.write(line + '\n') 22 | 23 | with open('./test.csv', 'w') as f3: 24 | for line in test: f3.write(line + '\n') 25 | 26 | 27 | if __name__ == '__main__': 28 | prepare() 29 | 30 | 
--------------------------------------------------------------------------------
/transformer_infersent/README.MD:
--------------------------------------------------------------------------------
 1 | ***We implemented a sentence entailment inference task with the transformer***
 2 | ---
 3 | **Data source**: [Stanford SNLI](https://nlp.stanford.edu/projects/snli/snli_1.0.zip)
 4 | 
 5 | - *Download the source data and unzip*: `wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip`
 6 | - *Preprocess data*: `python data_prepare.py && python prepro.py`
 7 | - *Train*: run `python train.py`
 8 | - *Eval*: run `python eval.py --task infersent`
 9 | 
10 | Experiment results:
11 | - accuracy:
12 | ![train accuracy](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_with_SNLI_accuracy.png)
13 | 
14 | - loss:
15 | ![train loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_SNLI_loss.png)
16 | 
17 | - eval result:
18 | ```
19 |               precision    recall  f1-score   support
20 | 
21 |            0       0.82      0.76      0.79      3358
22 |            1       0.77      0.80      0.79      3226
23 |            2       0.70      0.73      0.72      3208
24 | 
25 |     accuracy                           0.76      9792
26 |    macro avg       0.76      0.76      0.76      9792
27 | weighted avg       0.76      0.76      0.76      9792
28 | ```
29 | 
--------------------------------------------------------------------------------
/transformer_text_Classfication/prepro.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import feature_Block_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | 10 | #import regex 11 | import re 12 | from collections import Counter 13 | 14 | #import tokenize 15 | import jieba 16 | 17 | def make_vocab(fpath, fname): 18 | '''Constructs vocabulary. 19 | 20 | Args: 21 | fpath: A list. Input file paths. 22 | fname: A string. Output file name.
23 | 24 | Writes vocabulary line by line to `preprocessed/fname` 25 | ''' 26 | texts = [] 27 | for path in fpath: 28 | text = [x.strip().split()[1] for x in codecs.open(path, 'r', 'utf-8').readlines()] 29 | texts.extend(text) 30 | 31 | corpus = ''.join(texts) 32 | corpus = re.sub("[\s\p']", "", corpus) 33 | corpus = re.sub('[0-9]+', 'N', corpus) 34 | corpus = re.sub('[a-zA-Z]+', 'α', corpus) 35 | #words = jieba.cut(corpus) 36 | words = list(corpus) 37 | 38 | word2cnt = Counter(words) 39 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 40 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 41 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 42 | for word, cnt in word2cnt.most_common(len(word2cnt)): 43 | fout.write(u"{}\t{}\n".format(word, cnt)) 44 | 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | make_vocab([hp.trainset, hp.testset], "vocabs.txt") 50 | print("Done") 51 | -------------------------------------------------------------------------------- /en-zh_NMT/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | from hyperparams import seq2seq_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | import re 11 | from collections import Counter 12 | 13 | #import tokenize 14 | import jieba 15 | 16 | def make_vocab(fpath, fname, tokenizer = None): 17 | '''Constructs vocabulary. 18 | 19 | Args: 20 | fpath: A string. Input file path. 21 | fname: A string. Output file name. 22 | 23 | Writes vocabulary line by line to `preprocessed/fname` 24 | ''' 25 | #text = codecs.open(fpath, 'r', 'utf-8').read() 26 | text = open(fpath, 'r', encoding = 'utf-8').readlines() 27 | text = [line.strip() for line in text if not line.startswith("<")] 28 | print('length of senteces from path:{} is {}'.format(fpath, len(text))) 29 | text = ' '.join(text) 30 | 31 | if tokenizer == 'jieba': 32 | text = re.sub("[\s\p']", "", text) 33 | words = jieba.cut(text) 34 | elif tokenizer == None: 35 | text = re.sub("[^a-zA-Z]", " ", text) 36 | words = text.split() 37 | else: 38 | raise Exception('Could not find tokenizer...') 39 | 40 | word2cnt = Counter(words) 41 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 42 | with open('preprocessed/{}'.format(fname), 'w', encoding = 'utf-8') as fout: 43 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 44 | for word, cnt in word2cnt.most_common(len(word2cnt)): 45 | fout.write(u"{}\t{}\n".format(word, cnt)) 46 | 47 | 48 | 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | make_vocab(hp.source_train, "en.vocab.tsv") 54 | make_vocab(hp.target_train, "zh.vocab.tsv", tokenizer = 'jieba') 55 | print("Done") 56 | -------------------------------------------------------------------------------- /transformer_jieba/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | from hyperparams import seq2seq_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | import re 11 | from collections import Counter 12 | 13 | #import tokenize 14 | import jieba 15 | 16 | def make_vocab(fpath, fname, tokenizer = None): 17 | '''Constructs vocabulary. 18 | 19 | Args: 20 | fpath: A string. Input file path. 21 | fname: A string. Output file name. 
22 | 23 | Writes vocabulary line by line to `preprocessed/fname` 24 | ''' 25 | #text = codecs.open(fpath, 'r', 'utf-8').read() 26 | text = open(fpath, 'r', encoding = 'utf-8').readlines() 27 | text = [line.strip() for line in text if not line.startswith("<")] 28 | print('length of senteces from path:{} is {}'.format(fpath, len(text))) 29 | text = ' '.join(text) 30 | 31 | if tokenizer == 'jieba': 32 | text = re.sub("[\s\p']", "", text) 33 | words = jieba.cut(text) 34 | elif tokenizer == None: 35 | #text = re.sub("[^a-zA-Z]", " ", text) 36 | words = text.split() 37 | else: 38 | raise Exception('Could not find tokenizer...') 39 | 40 | word2cnt = Counter(words) 41 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 42 | with open('preprocessed/{}'.format(fname), 'w', encoding = 'utf-8') as fout: 43 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 44 | for word, cnt in word2cnt.most_common(len(word2cnt)): 45 | fout.write(u"{}\t{}\n".format(word, cnt)) 46 | 47 | 48 | 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | make_vocab(hp.source_train, "src.vocab.tsv") 54 | make_vocab(hp.target_train, "tgt.vocab.tsv", tokenizer = None) 55 | print("Done") 56 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | seq2seq: 4 | working_directory: ~/transfromer_NN_Block/en-zh_NMT 5 | docker: 6 | - image: circleci/python:3.5 7 | resource_class: middle 8 | parallelism: 4 9 | steps: 10 | - checkout 11 | - run: sudo pip install -r requirements.txt 12 | 13 | text_classfication: 14 | working_directory: ~/transfromer_NN_Block/transformer_text_Classfication 15 | resource_class: middle 16 | parallelism: 4 17 | docker: 18 | - image: circleci/python:3.5 19 | steps: 20 | - checkout 21 | - run: sudo pip install -r requirements.txt 22 | 23 | sentences_entailments: 24 | working_directory: ~/transfromer_NN_Block/transformer_infersent 25 | resource_class: middle 26 | parallelism: 4 27 | docker: 28 | - image: circleci/python:3.5 29 | steps: 30 | - checkout 31 | - run: sudo pip install -r requirements.txt 32 | - run: wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip 33 | - run: python data_prepare.py && python prepro.py 34 | - run: python train.py 35 | - run: python eval.py --task infersent 36 | 37 | transformer_jieba: 38 | working_directory: ~/transfromer_NN_Block/transformer_jieba 39 | docker: 40 | - image: circleci/python:3.5 41 | resource_class: middle 42 | parallelism: 4 43 | steps: 44 | - checkout 45 | - run: sudo pip install -r requirements.txt 46 | - run: python data_pre.py 47 | - run: python prepro.py 48 | - run: python train.py 49 | - run: python eval.py --task infersent 50 | 51 | 52 | 53 | workflows: 54 | version: 2 55 | build_and_test: 56 | jobs: 57 | - seq2seq 58 | - text_classfication -------------------------------------------------------------------------------- /transformer_infersent/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import infersent_Block_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | 10 | #import regex 11 | import re 12 | from collections import Counter 13 | 14 | #import tokenize 15 | import jieba 16 | 17 | def make_vocab(fpath, fname, lan = 'zh'): 18 | 
'''Constructs vocabulary. 19 | 20 | Args: 21 | fpath: A list. Input file paths. 22 | fname: A string. Output file name. 23 | 24 | Writes vocabulary line by line to `preprocessed/fname` 25 | ''' 26 | if lan == 'zh': 27 | texts = [] 28 | for path in fpath: 29 | text = [x.strip() for x in codecs.open(path, 'r', 'utf-8').readlines()] 30 | texts.extend(text) 31 | 32 | 33 | corpus = ''.join(texts) 34 | corpus = re.sub("[\s\p']", "", corpus) 35 | #replace numbers with NUM 36 | corpus = re.sub(r'[0-9]+', 'n', corpus) 37 | corpus = re.sub(r'[a-zA-Z]+', 'α', corpus) 38 | words = jieba.cut(corpus) 39 | elif lan == 'en': 40 | texts = [] 41 | for path in fpath: 42 | texts.extend([x.strip().split('<>', 1)[1] for x in codecs.open(path, 'r', 'utf-8').readlines()]) 43 | corpus = ' '.join(texts) 44 | corpus = re.sub(r"[^a-zA-Z]", " ", corpus) 45 | words = corpus.split() 46 | 47 | word2cnt = Counter(words) 48 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 49 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 50 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 51 | for word, cnt in word2cnt.most_common(len(word2cnt)): 52 | fout.write(u"{}\t{}\n".format(word, cnt)) 53 | 54 | 55 | 56 | 57 | if __name__ == '__main__': 58 | make_vocab([hp.trainset, hp.testset], "vocabs.txt", lan = 'en') 59 | print("Done") 60 | -------------------------------------------------------------------------------- /transformer_RC/prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from hyperparams import rc_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import os 9 | import pandas as pd 10 | 11 | #import regex 12 | import re 13 | from collections import Counter 14 | 15 | #import tokenize 16 | import jieba 17 | 18 | def make_vocab(fpath, fname, lan = 'zh'): 19 | '''Constructs vocabulary. 20 | 21 | Args: 22 | fpath: A list. Input file paths. 23 | fname: A string. Output file name. 
24 | 25 | Writes vocabulary line by line to `preprocessed/fname` 26 | ''' 27 | if lan == 'zh': 28 | texts = [] 29 | for path in fpath: 30 | data = pd.read_csv(path) 31 | q_text, p_text = list(data['question']), \ 32 | list(data['content1']) + list(data['content2']) + list(data['content3']) + \ 33 | list(data['content4']) + list(data['content5']) 34 | 35 | texts = q_text + p_text 36 | 37 | 38 | corpus = ''.join(texts) 39 | corpus = re.sub("[\s\p']", "", corpus) 40 | #replace numbers with NUM 41 | #corpus = re.sub(r'[0-9]+', ' n', corpus) 42 | #corpus = re.sub(r'[a-zA-Z]+', ' α', corpus) 43 | words = jieba.cut(corpus) 44 | elif lan == 'en': 45 | texts = [] 46 | for path in fpath: 47 | texts.extend([x.strip().split('<>', 1)[1] for x in codecs.open(path, 'r', 'utf-8').readlines()]) 48 | corpus = ' '.join(texts) 49 | corpus = re.sub(r"[^a-zA-Z]", " ", corpus) 50 | words = corpus.split() 51 | 52 | word2cnt = Counter(words) 53 | if not os.path.exists('preprocessed'): os.mkdir('preprocessed') 54 | with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout: 55 | fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("", "", "", "")) 56 | for word, cnt in word2cnt.most_common(len(word2cnt)): 57 | fout.write(u"{}\t{}\n".format(word, cnt)) 58 | 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | make_vocab([hp.trainset, hp.testset], "vocabs.txt") 64 | print("Done") 65 | -------------------------------------------------------------------------------- /transformer_infersent/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | class seq2seq_Hyperparams: 6 | '''Hyperparameters''' 7 | # data 8 | source_train = './datasets/zh-en/train.tags.zh-en.en' 9 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 10 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 11 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 12 | 13 | # training 14 | batch_size = 32 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'seq2seq_model_dir' # log directory 17 | 18 | # model 19 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 20 | # Feel free to increase this if you are ambitious. 21 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 22 | hidden_units = 512 # alias = C 23 | num_blocks = 5 # number of encoder/decoder blocks 24 | num_epochs = 20 25 | num_heads = 8 26 | dropout_rate = 0.1 27 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 28 | 29 | 30 | 31 | 32 | class feature_Block_Hyperparams: 33 | '''Hyperparameters''' 34 | # data 35 | trainset = './datasets/trainset.txt' 36 | testset = './datasets/testset.txt' 37 | 38 | 39 | # training 40 | batch_size = 4 # alias = N 41 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 42 | logdir = 'Block_model_dir' # log directory 43 | 44 | # model 45 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 46 | # Feel free to increase this if you are ambitious. 47 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 48 | hidden_units = 512 # alias = C 49 | num_blocks = 5 # number of encoder/decoder blocks 50 | num_epochs = 20 51 | num_heads = 8 52 | dropout_rate = 0.1 53 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
54 | n_class = 2 55 | 56 | 57 | class infersent_Block_Hyperparams: 58 | '''Hyperparameters''' 59 | # data 60 | trainset = './opensrc_dta/train.csv' 61 | testset = './opensrc_dta/test.csv' 62 | 63 | 64 | # training 65 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 66 | 67 | batch_size = 64 # alias = N 68 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 69 | logdir = 'infersent_model_dir' # log directory 70 | 71 | # model 72 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 73 | # Feel free to increase this if you are ambitious. 74 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 75 | hidden_units = 512 # alias = C 76 | num_blocks = 5 # number of encoder/decoder blocks 77 | num_epochs = 20 78 | num_heads = 8 79 | dropout_rate = 0.1 80 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 81 | #n_class = 2 82 | dropout_keep_prob = 0.55 83 | reg_lambda = 0.1 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /transformer_text_Classfication/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from __future__ import print_function 5 | from hyperparams import feature_Block_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import codecs 9 | import re 10 | from jieba import cut 11 | from collections import Counter 12 | 13 | tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} 14 | 15 | 16 | def load_vocabs(): 17 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/vocabs.txt', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] #raw code is hp.mincnt 18 | word2idx = {word: idx for idx, word in enumerate(vocab)} 19 | idx2word = {idx: word for idx, word in enumerate(vocab)} 20 | return word2idx, idx2word 21 | 22 | 23 | 24 | def create_data(corpus, labels): 25 | word2idx, idx2word = load_vocabs() 26 | 27 | 28 | # Index 29 | x_list, y_list, Sources, Targets = [], [], [], [] 30 | for sent, label in zip(corpus, labels): 31 | x = [word2idx.get(word, 1) for word in (sent + u" ").split()[:hp.maxlen]] # 1: OOV, : End of Text 32 | x_list.append(np.array(x)) 33 | 34 | 35 | # Pad 36 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 37 | 38 | for i, x in enumerate(x_list): 39 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 40 | 41 | return X, np.array(labels), corpus, labels 42 | 43 | 44 | 45 | def _refine(line): 46 | line = re.sub("[\s\p']", "", line) 47 | line = re.sub('[0-9]+', 'N', line) 48 | line = re.sub('[a-zA-Z]+', 'α', line) 49 | return ' '.join(list(line)) 50 | 51 | 52 | def load_train_data(tokenizer = None): 53 | if tokenizer == None: 54 | corpus = [line.strip().split() for line in codecs.open(hp.trainset, 'r', 'utf-8').readlines()] 55 | corpus = [line for line in corpus if line[0] in tagging] 56 | texts, labels = [_refine(line[1]) for line in corpus], [tagging[line[0]] for line in corpus] 57 | 58 | X, Y, Sources, labels = create_data(texts, labels) 59 | return X, Y 60 | 61 | def load_test_data(tokenizer = None): 62 | if tokenizer == None: 63 | corpus = [line.strip().split() for line in codecs.open(hp.testset, 'r', 'utf-8').readlines()] 64 | corpus = [line for line in corpus if line[0] in tagging] 65 | texts, labels = [_refine(line[1]) for line in corpus], [tagging[line[0]] for 
line in corpus] 66 | 67 | X, Y, Sources, labels = create_data(texts, labels) 68 | return X, Y, Sources, labels 69 | 70 | 71 | def get_batch_data(): 72 | # Load data 73 | X, Y = load_train_data() 74 | 75 | # calc total batch count 76 | num_batch = len(X) // hp.batch_size 77 | 78 | # Convert to tensor 79 | X = tf.convert_to_tensor(X, tf.int32) 80 | Y = tf.convert_to_tensor(Y, tf.int32) 81 | X, Y 82 | # Create Queues 83 | input_queues = tf.train.slice_input_producer([X, Y]) 84 | 85 | # create batch queues 86 | x, y = tf.train.shuffle_batch(input_queues, 87 | num_threads=8, 88 | batch_size=hp.batch_size, 89 | capacity=hp.batch_size*64, 90 | min_after_dequeue=hp.batch_size*32, 91 | allow_smaller_final_batch=False) 92 | 93 | return x, y, num_batch # (N, T), (N, T), () 94 | 95 | -------------------------------------------------------------------------------- /transformer_jieba/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | import tensorflow as tf 6 | 7 | from hyperparams import seq2seq_Hyperparams as hp 8 | from data_load import get_batch_data, load_en_vocab, load_zh_vocab 9 | from modules import * 10 | import os, codecs 11 | from tqdm import tqdm 12 | 13 | os.sys.path.append('../Models') 14 | from models import vanilla_transformer 15 | 16 | 17 | class Graph(): 18 | def __init__(self, is_training=True): 19 | self.graph = tf.Graph() 20 | with self.graph.as_default(): 21 | if is_training: 22 | self.x, self.y, self.num_batch = get_batch_data() # (N, T) 23 | else: # inference 24 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 25 | self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 26 | 27 | # define decoder inputs 28 | self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 29 | 30 | # Load vocabulary 31 | en2idx, idx2en = load_en_vocab() 32 | zh2idx, idx2zh = load_zh_vocab() 33 | 34 | # initialize transformer 35 | transformer = vanilla_transformer(hp, self.is_training) 36 | self.enc = transformer.encode(self.x, len(en2idx)) 37 | 38 | # Decoder 39 | self.dec = transformer.decode(self.decoder_inputs, self.enc, len(zh2idx), hp.maxlen) 40 | 41 | # Final linear projection 42 | self.logits = tf.layers.dense(self.dec, len(zh2idx)) 43 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 44 | self.istarget = tf.to_float(tf.not_equal(self.y, 0)) 45 | self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget)) 46 | tf.summary.scalar('acc', self.acc) 47 | 48 | if is_training: 49 | # Loss 50 | self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(zh2idx))) 51 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 52 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 53 | 54 | # Training Scheme 55 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 56 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 57 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 58 | 59 | # Summary 60 | tf.summary.scalar('mean_loss', self.mean_loss) 61 | self.merged = tf.summary.merge_all() 62 | 63 | if __name__ == '__main__': 64 | # Load vocabulary 65 | en2idx, idx2en = load_en_vocab() 66 | zh2idx, idx2zh = load_zh_vocab() 67 | 68 | # Construct graph 69 | g = Graph("train"); print("Graph loaded") 70 | 71 | # Start 
session 72 | sv = tf.train.Supervisor(graph=g.graph, 73 | logdir=hp.logdir, 74 | save_model_secs=0) 75 | with sv.managed_session() as sess: 76 | for epoch in range(1, hp.num_epochs+1): 77 | if sv.should_stop(): break 78 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 79 | sess.run(g.train_op) 80 | 81 | gs = sess.run(g.global_step) 82 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 83 | 84 | print("Done") 85 | 86 | 87 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 200 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 256 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 20 26 | num_heads = 8 27 | dropout_rate = 0.5 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | 30 | dropout_keep_prob = 0.55 31 | reg_lambda = 0.1 32 | use_dropout = True 33 | weight_decay = 0.1 34 | 35 | 36 | 37 | 38 | 39 | class seq2seq_Hyperparams: 40 | '''Hyperparameters''' 41 | # data 42 | source_train = './datasets/zh-en/train.tags.zh-en.en' 43 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 44 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 45 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 46 | 47 | # training 48 | batch_size = 32 # alias = N 49 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 50 | logdir = 'seq2seq_model_dir' # log directory 51 | 52 | # model 53 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 54 | # Feel free to increase this if you are ambitious. 55 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 56 | hidden_units = 512 # alias = C 57 | num_blocks = 5 # number of encoder/decoder blocks 58 | num_epochs = 20 59 | num_heads = 8 60 | dropout_rate = 0.1 61 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 62 | 63 | 64 | 65 | 66 | class feature_Block_Hyperparams: 67 | '''Hyperparameters''' 68 | # data 69 | trainset = './datasets/trainset.txt' 70 | testset = './datasets/testset.txt' 71 | 72 | 73 | # training 74 | batch_size = 4 # alias = N 75 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 76 | logdir = 'Block_model_dir' # log directory 77 | 78 | # model 79 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 80 | # Feel free to increase this if you are ambitious. 81 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 82 | hidden_units = 512 # alias = C 83 | num_blocks = 5 # number of encoder/decoder blocks 84 | num_epochs = 20 85 | num_heads = 8 86 | dropout_rate = 0.1 87 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 
88 | n_class = 2 89 | 90 | 91 | class infersent_Block_Hyperparams: 92 | '''Hyperparameters''' 93 | # data 94 | trainset = './opensrc_dta/train.csv' 95 | testset = './opensrc_dta/test.csv' 96 | 97 | 98 | # training 99 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 100 | 101 | batch_size = 64 # alias = N 102 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 103 | logdir = 'infersent_model_dir' # log directory 104 | 105 | # model 106 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 107 | # Feel free to increase this if you are ambitious. 108 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 109 | hidden_units = 512 # alias = C 110 | num_blocks = 5 # number of encoder/decoder blocks 111 | num_epochs = 20 112 | num_heads = 8 113 | dropout_rate = 0.1 114 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 115 | dropout_keep_prob = 0.55 116 | reg_lambda = 0.1 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /transformer_text_Classfication/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import feature_Block_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from sklearn.metrics import classification_report 16 | 17 | 18 | 19 | 20 | def eval(task_name): 21 | # Load graph 22 | g = Graph(is_training=False) 23 | print("Graph loaded") 24 | 25 | # Load data 26 | X, _, Texts, Labels = load_test_data() 27 | 28 | word2idx, idx2word = load_vocabs() 29 | 30 | # Start session 31 | with g.graph.as_default(): 32 | sv = tf.train.Supervisor() 33 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 34 | ## Restore parameters 35 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 36 | print("Restored!") 37 | 38 | ## Get model name 39 | print('Model dir:', hp.logdir) 40 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 41 | print("Model name:", mname) 42 | 43 | ## Inference 44 | if not os.path.exists('results'): os.mkdir('results') 45 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 46 | list_of_refs, hypotheses = [], [] 47 | print("Iterator:", len(X), hp.batch_size) 48 | 49 | predict_label = [] 50 | for i in range(len(X) // hp.batch_size + 1): 51 | print('Step:\t', i, '/', len(X) // hp.batch_size) 52 | ### Get mini-batches 53 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 54 | sentences = Texts[i*hp.batch_size: (i+1)*hp.batch_size] 55 | labels = Labels[i*hp.batch_size: (i+1)*hp.batch_size] 56 | 57 | 58 | preds = sess.run(g.preds, {g.x:x}) 59 | preds = [int(x) for x in preds] 60 | predict_label.extend(preds) 61 | 62 | ### Write to file 63 | for sent, label, pred in zip(sentences, labels, preds): # sentence-wise 64 | #got = " ".join(idx2word[idx] for idx in pred).split("")[0].strip() 65 | fout.write("- sent: " + sent +"\n") 66 | fout.write('- label: {}, -predict: {} \n'.format(label, pred)) 67 | fout.flush() 68 | 69 | # bleu score 70 | if task_name == 'seq2seq': 71 | ref = target.split() 72 | hypothesis = got.split() 73 | if len(ref) > 3 and len(hypothesis) > 
3: 74 | list_of_refs.append([ref]) 75 | hypotheses.append(hypothesis) 76 | 77 | 78 | ## Calculate bleu score 79 | if task_name == 'seq2seq': 80 | score = corpus_bleu(list_of_refs, hypotheses) 81 | fout.write("Bleu Score = " + str(100*score)) 82 | elif task_name == 'classfication': 83 | assert len(Labels) == len(predict_label), 'The length of label and predicts\ 84 | are not alignmentted.' 85 | res = classification_report(Labels, predict_label) 86 | print(res) 87 | fout.write(res + '\n') 88 | 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 92 | parser.add_argument('--task', help='task name(default: classfication)') 93 | 94 | args = parser.parse_args() 95 | task_name = args.task 96 | eval(task_name) 97 | print("Done") 98 | 99 | 100 | -------------------------------------------------------------------------------- /transformer_RC/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 200 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 256 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 20 26 | num_heads = 8 27 | dropout_rate = 0.5 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | #n_class = 2 30 | dropout_keep_prob = 0.55 31 | reg_lambda = 0.1 32 | Passage_fuse = 'bi-rnn' # bi-rnn or Pooling 33 | use_dropout = True 34 | weight_decay = 0.1 35 | 36 | 37 | 38 | 39 | 40 | class seq2seq_Hyperparams: 41 | '''Hyperparameters''' 42 | # data 43 | source_train = './datasets/zh-en/train.tags.zh-en.en' 44 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 45 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 46 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 47 | 48 | # training 49 | batch_size = 32 # alias = N 50 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 51 | logdir = 'seq2seq_model_dir' # log directory 52 | 53 | # model 54 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 55 | # Feel free to increase this if you are ambitious. 56 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 57 | hidden_units = 512 # alias = C 58 | num_blocks = 5 # number of encoder/decoder blocks 59 | num_epochs = 20 60 | num_heads = 8 61 | dropout_rate = 0.1 62 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 63 | 64 | 65 | 66 | 67 | class feature_Block_Hyperparams: 68 | '''Hyperparameters''' 69 | # data 70 | trainset = './datasets/trainset.txt' 71 | testset = './datasets/testset.txt' 72 | 73 | 74 | # training 75 | batch_size = 4 # alias = N 76 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 77 | logdir = 'Block_model_dir' # log directory 78 | 79 | # model 80 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 81 | # Feel free to increase this if you are ambitious. 
82 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 83 | hidden_units = 512 # alias = C 84 | num_blocks = 5 # number of encoder/decoder blocks 85 | num_epochs = 20 86 | num_heads = 8 87 | dropout_rate = 0.1 88 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 89 | n_class = 2 90 | 91 | 92 | class infersent_Block_Hyperparams: 93 | '''Hyperparameters''' 94 | # data 95 | trainset = './opensrc_dta/train.csv' 96 | testset = './opensrc_dta/test.csv' 97 | 98 | 99 | # training 100 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 101 | 102 | batch_size = 64 # alias = N 103 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 104 | logdir = 'infersent_model_dir' # log directory 105 | 106 | # model 107 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 108 | # Feel free to increase this if you are ambitious. 109 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 110 | hidden_units = 512 # alias = C 111 | num_blocks = 5 # number of encoder/decoder blocks 112 | num_epochs = 20 113 | num_heads = 8 114 | dropout_rate = 0.1 115 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 116 | #n_class = 2 117 | dropout_keep_prob = 0.55 118 | reg_lambda = 0.1 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /transformer_RC/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, states = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c = [state.c for state in states] 45 | h = [state.h for state in states] 46 | states = h 47 | else: 48 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 50 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 51 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32 52 | ) 53 | states_fw, states_bw = states 54 | if rnn_type.endswith('lstm'): 55 | c_fw = [state_fw.c for state_fw in states_fw] 56 | h_fw = [state_fw.h for state_fw in states_fw] 57 | c_bw = [state_bw.c for state_bw in states_bw] 58 | h_bw = [state_bw.h for state_bw in states_bw] 59 | states_fw, states_bw = h_fw, h_bw 60 | if concat: 61 | outputs = tf.concat(outputs, 2) 62 | states = tf.concat([states_fw, states_bw], 1) 63 | else: 64 | outputs = outputs[0] + outputs[1] 65 | states = states_fw + states_bw 66 | return outputs, states 67 | 68 | 69 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 70 | """ 71 | Gets the RNN Cell 72 | Args: 73 | rnn_type: 'lstm', 'gru' or 'rnn' 74 | hidden_size: The size of hidden units 75 | layer_num: MultiRNNCell are used if layer_num > 1 76 | dropout_keep_prob: dropout in RNN 77 | Returns: 78 | An RNN Cell 79 | """ 80 | cells = [] 81 | for i in range(layer_num): 82 | if rnn_type.endswith('lstm'): 83 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 84 | elif rnn_type.endswith('gru'): 85 | cell = tc.rnn.GRUCell(num_units=hidden_size) 86 | elif rnn_type.endswith('rnn'): 87 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 88 | else: 89 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 90 | if dropout_keep_prob is not None: 91 | cell = tc.rnn.DropoutWrapper(cell, 92 | input_keep_prob=dropout_keep_prob, 93 | output_keep_prob=dropout_keep_prob) 94 | cells.append(cell) 95 | cells = tc.rnn.MultiRNNCell(cells, state_is_tuple=True) 96 | return cells 97 | 98 | 99 | -------------------------------------------------------------------------------- /transformer_jieba/data_pre.py: -------------------------------------------------------------------------------- 1 | #data preparation 2 | import codecs 3 | import os 4 | import argparse 5 | import jieba 6 | #from hyperparams import Hyperparams as hp 7 | 8 | 9 | def jieba_data_pre(): 10 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 11 | vocabset = f.readlines() 12 | vocabset = [x.strip() for x in vocabset] 13 | #print(vocabset[:10]) 14 | zh = '' 15 | en = '' 16 | 17 | for pair in vocabset: 18 | try: 19 | z, e = pair.strip().split('\t') 20 | zh += z + ' ' 21 | en += e + ' ' 22 | except: 23 | zh += '' 24 | en += '' 25 | 26 | 27 | zh_sent = zh.split('') 28 | en_sent = en.split('') 29 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 30 | 31 | files = [] 32 | for root, dirs, file in os.walk(".", topdown=False): 33 | files.append(file) 34 | #print(files) 35 | 36 | if 'train.tags.zh-en.zh' not in files: 37 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 38 | for i in zh_sent[:int(0.8*len(zh_sent))]: 39 | f.write(i+'\n') 40 | 41 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 42 | 
for i in en_sent[:int(0.8*len(en_sent))]: 43 | f.write(i+'\n') 44 | 45 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 46 | for i in zh_sent[int(0.8*len(zh_sent)):]: 47 | f.write(i+'\n') 48 | 49 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 50 | for i in en_sent[int(0.8*len(en_sent)):]: 51 | f.write(i+'\n') 52 | 53 | 54 | 55 | def text_sum_pre(): 56 | cnt_title_path = './dataset/content-title.txt' 57 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 58 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 59 | 60 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 61 | pad = ['', '', "", ""] 62 | content_vocabs, title_vocabs = {}, {} 63 | 64 | for x in content_set: 65 | vocabs = jieba.cut(x) 66 | for x in vocabs: 67 | if x not in content_vocabs: 68 | content_vocabs[x] = 1 69 | else: 70 | content_vocabs[x] += 1 71 | 72 | for x in sum_set: 73 | vocabs = jieba.cut(x) 74 | for x in vocabs: 75 | if x not in title_vocabs: 76 | title_vocabs[x] = 1 77 | else: 78 | title_vocabs[x] += 1 79 | 80 | #save vocab 81 | 82 | if not 'textSummary' in os.listdir('./preprocessed'): 83 | os.mkdir('./preprocessed/textSummary') 84 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 85 | for token in pad: 86 | f.write(token + '\t' + '1000000000' + '\n') 87 | for token, val in content_vocabs.items(): 88 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 89 | 90 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 91 | for token in pad: 92 | f.write(token + '\t' + '1000000000' + '\n') 93 | for token, val in title_vocabs.items(): 94 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 95 | 96 | 97 | if 'textSummary' in os.listdir('./dataset'): 98 | os._exit(0) 99 | else: 100 | os.mkdir('./dataset/textSummary') 101 | 102 | n = len(sum_set) 103 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 104 | for x in content_set[:int(0.8*n)]: 105 | f.write(x+'\n') 106 | 107 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 108 | for x in sum_set[:int(0.8*n)]: 109 | f.write(x+'\n') 110 | 111 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 112 | for x in content_set[int(0.8*n):]: 113 | f.write(x+'\n') 114 | 115 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 116 | for x in sum_set[int(0.8*n):]: 117 | f.write(x+'\n') 118 | 119 | 120 | def main(): 121 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 122 | parser.add_argument('--task', default = 'jieba', 123 | help='task name(default: tokenize)') 124 | 125 | args = parser.parse_args() 126 | task_name = args.task 127 | 128 | if task_name == 'jieba': jieba_data_pre() 129 | elif task_name == 'textsum': text_sum_pre() 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /en-zh_NMT/data_pre.py: -------------------------------------------------------------------------------- 1 | # encoding = 'utf-8' 2 | #/usr/bin/python3 3 | 4 | 5 | #data preparation 6 | import codecs 7 | import os 8 | import argparse 9 | import jieba 10 | #from hyperparams import Hyperparams as hp 11 | 12 | 13 | def jieba_data_pre(): 14 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 15 | vocabset = f.readlines() 16 | 
vocabset = [x.strip() for x in vocabset] 17 | #print(vocabset[:10]) 18 | zh = '' 19 | en = '' 20 | 21 | for pair in vocabset: 22 | try: 23 | z, e = pair.strip().split('\t') 24 | zh += z + ' ' 25 | en += e + ' ' 26 | except: 27 | zh += '' 28 | en += '' 29 | 30 | 31 | zh_sent = zh.split('') 32 | en_sent = en.split('') 33 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 34 | 35 | files = [] 36 | for root, dirs, file in os.walk(".", topdown=False): 37 | files.append(file) 38 | 39 | if 'train.tags.zh-en.zh' not in files: 40 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 41 | for i in zh_sent[:int(0.8*len(zh_sent))]: 42 | f.write(i+'\n') 43 | 44 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 45 | for i in en_sent[:int(0.8*len(en_sent))]: 46 | f.write(i+'\n') 47 | 48 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 49 | for i in zh_sent[int(0.8*len(zh_sent)):]: 50 | f.write(i+'\n') 51 | 52 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 53 | for i in en_sent[int(0.8*len(en_sent)):]: 54 | f.write(i+'\n') 55 | 56 | 57 | 58 | def text_sum_pre(): 59 | cnt_title_path = './dataset/content-title.txt' 60 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 61 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 62 | 63 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 64 | pad = ['', '', "", ""] 65 | content_vocabs, title_vocabs = {}, {} 66 | 67 | for x in content_set: 68 | vocabs = jieba.cut(x) 69 | for x in vocabs: 70 | if x not in content_vocabs: 71 | content_vocabs[x] = 1 72 | else: 73 | content_vocabs[x] += 1 74 | 75 | for x in sum_set: 76 | vocabs = jieba.cut(x) 77 | for x in vocabs: 78 | if x not in title_vocabs: 79 | title_vocabs[x] = 1 80 | else: 81 | title_vocabs[x] += 1 82 | 83 | #save vocab 84 | 85 | if not 'textSummary' in os.listdir('./preprocessed'): 86 | os.mkdir('./preprocessed/textSummary') 87 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 88 | for token in pad: 89 | f.write(token + '\t' + '1000000000' + '\n') 90 | for token, val in content_vocabs.items(): 91 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 92 | 93 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 94 | for token in pad: 95 | f.write(token + '\t' + '1000000000' + '\n') 96 | for token, val in title_vocabs.items(): 97 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 98 | 99 | 100 | if 'textSummary' in os.listdir('./dataset'): 101 | os._exit(0) 102 | else: 103 | os.mkdir('./dataset/textSummary') 104 | 105 | n = len(sum_set) 106 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 107 | for x in content_set[:int(0.8*n)]: 108 | f.write(x+'\n') 109 | 110 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 111 | for x in sum_set[:int(0.8*n)]: 112 | f.write(x+'\n') 113 | 114 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 115 | for x in content_set[int(0.8*n):]: 116 | f.write(x+'\n') 117 | 118 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 119 | for x in sum_set[int(0.8*n):]: 120 | f.write(x+'\n') 121 | 122 | 123 | def main(): 124 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 125 | parser.add_argument('--task', default = 'jieba', 126 | help='task 
name(default: tokenize)') 127 | 128 | args = parser.parse_args() 129 | task_name = args.task 130 | 131 | if task_name == 'jieba': jieba_data_pre() 132 | elif task_name == 'textsum': text_sum_pre() 133 | 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /transformer_infersent/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | from __future__ import print_function 5 | from hyperparams import infersent_Block_Hyperparams as hp 6 | import tensorflow as tf 7 | import numpy as np 8 | import codecs 9 | import re 10 | from jieba import cut 11 | 12 | 13 | def load_vocabs(): 14 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/vocabs.txt', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] #raw code is hp.mincnt 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | 20 | 21 | def create_data(s1, s2, labels): 22 | word2idx, idx2word = load_vocabs() 23 | 24 | #max token numbers 25 | max_token_num = len(word2idx.keys()) + 100 26 | 27 | # Index 28 | x1_list, x2_list, Sources, Targets = [], [], [], [] 29 | for sent1, sent2 in zip(s1, s2): 30 | x1 = [word2idx.get(word, 1) for word in (sent1 + u" ").split()[:hp.maxlen-5]] # 1: OOV, : End of Text 31 | x2 = [word2idx.get(word, 1) for word in (sent2 + u" ").split()[:hp.maxlen-5]] 32 | 33 | x1_list.append(np.array(x1)) 34 | x2_list.append(np.array(x2)) 35 | print('demo', x1_list[0], x2_list[0], labels[0]) 36 | 37 | # Pad 38 | X1 = np.zeros([len(x1_list), hp.maxlen], np.int32) 39 | X2 = np.zeros([len(x2_list), hp.maxlen], np.int32) 40 | 41 | for i, x in enumerate(x1_list): 42 | X1[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 43 | 44 | for i, x in enumerate(x2_list): 45 | X2[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 46 | 47 | labels = [int(x) for x in labels] 48 | 49 | return X1, X2, np.array(labels) 50 | 51 | 52 | def _refine(line, lan = 'zh'): 53 | #line = re.sub("[^\s\p{Latin}']", "", line) 54 | if lan == 'zh': 55 | line = re.sub("[\s\p']", "", line) 56 | line = re.sub(r'[0-9]+', 'n', line) 57 | line = re.sub(r'[a-zA-Z]+', 'α', line) 58 | line = jieba.cut(line) 59 | return ' '.join(list(line)) 60 | elif lan == 'en': 61 | line = re.sub("[^a-zA-Z]", " ", line) 62 | return line 63 | 64 | else: 65 | raise Exception('Havn\'t specified language!') 66 | return 67 | 68 | 69 | 70 | def load_train_data(tokenizer = None): 71 | corpus = [line.strip().split('<>') for line in codecs.open(hp.trainset, 'r', 'utf-8').readlines()[:100000]] 72 | s1, s2, labels = [_refine(line[1], lan = 'en') for line in corpus], [_refine(line[2], lan = 'en') for line in corpus], \ 73 | [int(line[0]) for line in corpus] 74 | 75 | X1, X2, Label = create_data(s1, s2, labels) 76 | return X1, X2, Label 77 | 78 | 79 | 80 | def load_test_data(tokenizer = None): 81 | corpus = [line.strip().split('<>') for line in codecs.open(hp.testset, 'r', 'utf-8').readlines()] 82 | s1, s2, labels = [_refine(line[1], lan = 'en') for line in corpus], [_refine(line[2], lan = 'en') for line in corpus], \ 83 | [line[0] for line in corpus] 84 | 85 | X1, X2, Label = create_data(s1, s2, labels) 86 | return X1, X2, Label 87 | 88 | 89 | def get_batch_data(): 90 | # Load data 91 | X1, X2, Label = load_train_data() 92 | 93 
| # calc total batch count 94 | num_batch = len(X1) // hp.batch_size 95 | 96 | # Convert to tensor 97 | X1 = tf.convert_to_tensor(X1, tf.int32) 98 | X2 = tf.convert_to_tensor(X2, tf.int32) 99 | Label = tf.convert_to_tensor(Label, tf.int32) 100 | 101 | # Create Queues 102 | input_queues = tf.train.slice_input_producer([X1, X2, Label]) 103 | 104 | # create batch queues 105 | x1, x2, label = tf.train.shuffle_batch(input_queues, 106 | num_threads=8, 107 | batch_size=hp.batch_size, 108 | capacity=hp.batch_size*64, 109 | min_after_dequeue=hp.batch_size*32, 110 | allow_smaller_final_batch=False) 111 | return x1, x2, label, num_batch # (N, T), (N, T), () 112 | 113 | -------------------------------------------------------------------------------- /transformer_text_Classfication/hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | 6 | class rc_Hyperparams: 7 | trainset = './datasets/train_round_0.csv' 8 | testset = './datasets/test_data_r0.csv' 9 | 10 | trainfile = './preprocessed/train.csv' 11 | testfile = './preprocessed/test.csv' 12 | predictfile = './inference_QA.csv' 13 | 14 | batch_size = 64 # alias = N 15 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 16 | logdir = 'rc_model_dir' # log directory 17 | 18 | # model 19 | q_maxlen = 50 20 | p_maxlen = 300 21 | ans_maxlen = 40 22 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 23 | hidden_units = 512 # alias = C 24 | num_blocks = 5 # number of encoder/decoder blocks 25 | num_epochs = 200 26 | num_heads = 8 27 | dropout_rate = 0.33 28 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 29 | #n_class = 2 30 | dropout_keep_prob = 0.33 31 | reg_lambda = 0.1 32 | Passage_fuse = 'bi-rnn' # bi-rnn or Pooling 33 | use_dropout = True 34 | weight_decay = 0.1 35 | 36 | 37 | 38 | 39 | 40 | class seq2seq_Hyperparams: 41 | '''Hyperparameters''' 42 | # data 43 | source_train = './datasets/zh-en/train.tags.zh-en.en' 44 | target_train = './datasets/zh-en/train.tags.zh-en.zh' 45 | source_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.en.xml' 46 | target_test = './datasets/zh-en/IWSLT15.TED.tst2011.zh-en.zh.xml' 47 | 48 | # training 49 | batch_size = 32 # alias = N 50 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 51 | logdir = 'seq2seq_model_dir' # log directory 52 | 53 | # model 54 | maxlen = 100 # Maximum number of words in a sentence. alias = T. 55 | # Feel free to increase this if you are ambitious. 56 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 57 | hidden_units = 512 # alias = C 58 | num_blocks = 5 # number of encoder/decoder blocks 59 | num_epochs = 20 60 | num_heads = 8 61 | dropout_rate = 0.1 62 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 63 | 64 | 65 | 66 | 67 | class feature_Block_Hyperparams: 68 | '''Hyperparameters''' 69 | # data 70 | trainset = './datasets/cnews.train.txt' 71 | testset = './datasets/cnews.test.txt' 72 | tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} 73 | 74 | # training 75 | batch_size = 4 # alias = N 76 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 77 | logdir = 'Block_model_dir' # log directory 78 | 79 | # model 80 | maxlen = 500 # Maximum number of words in a sentence. alias = T. 81 | # Feel free to increase this if you are ambitious. 
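# Note: the create_data helpers in this repo (see e.g. transformer_infersent/data_load.py)
# truncate each sentence to maxlen-5 tokens and then zero-pad it back up to maxlen, so maxlen
# is the fixed, padded sequence length the model actually sees; raising it mainly adds padding
# for short inputs and increases memory per batch.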
82 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 83 | hidden_units = 512 # alias = C 84 | num_blocks = 5 # number of encoder/decoder blocks 85 | num_epochs = 20 86 | num_heads = 8 87 | dropout_rate = 0.1 88 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 89 | n_class = 10 90 | 91 | 92 | class infersent_Block_Hyperparams: 93 | '''Hyperparameters''' 94 | # data 95 | trainset = './opensrc_dta/train.csv' 96 | testset = './opensrc_dta/test.csv' 97 | 98 | 99 | # training 100 | relations = {'entailment': '0', 'contradiction': '1', 'neutral': '2'} 101 | 102 | batch_size = 64 # alias = N 103 | lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step. 104 | logdir = 'infersent_model_dir' # log directory 105 | 106 | # model 107 | maxlen = 24 # Maximum number of words in a sentence. alias = T. 108 | # Feel free to increase this if you are ambitious. 109 | min_cnt = 3 # words whose occurred less than min_cnt are encoded as . 110 | hidden_units = 512 # alias = C 111 | num_blocks = 5 # number of encoder/decoder blocks 112 | num_epochs = 20 113 | num_heads = 8 114 | dropout_rate = 0.1 115 | sinusoid = False # If True, use sinusoid. If false, positional embedding. 116 | #n_class = 2 117 | dropout_keep_prob = 0.55 118 | reg_lambda = 0.1 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /transformer_infersent/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import infersent_Block_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from sklearn.metrics import classification_report 16 | 17 | 18 | 19 | 20 | def eval(task_name): 21 | # Load graph 22 | g = Graph(is_training=False) 23 | print("Graph loaded") 24 | 25 | # Load data 26 | #X, _, Texts, Labels = load_test_data() 27 | s1, s2, raw_labels = load_test_data() 28 | raw_labels = [int(x) for x in raw_labels] 29 | 30 | word2idx, idx2word = load_vocabs() 31 | 32 | # Start session 33 | with g.graph.as_default(): 34 | sv = tf.train.Supervisor() 35 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 36 | ## Restore parameters 37 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 38 | print("Restored!") 39 | 40 | ## Get model name 41 | print('Model dir:', hp.logdir) 42 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 43 | print("Model name:", mname) 44 | 45 | ## Inference 46 | if not os.path.exists('results'): os.mkdir('results') 47 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 48 | #list_of_refs, hypotheses = [], [] 49 | #print("Iterator: {} / {}".format(len(s1), hp.batch_size)) 50 | 51 | test_labels, predict_label = [], [] 52 | for i in range(len(s1) // hp.batch_size): 53 | print("Iterator: {} / {}".format(i, len(s1)//hp.batch_size)) 54 | ### Get mini-batches 55 | x1 = s1[i*hp.batch_size: (i+1)*hp.batch_size] 56 | x2 = s2[i*hp.batch_size: (i+1)*hp.batch_size] 57 | #sentences = Texts[i*hp.batch_size: (i+1)*hp.batch_size] 58 | labels = raw_labels[i*hp.batch_size: (i+1)*hp.batch_size] 59 | test_labels.extend([int(x) for x in labels]) 60 | 61 | preds 
= sess.run(g.preds, {g.x1:x1, g.x2:x2}) 62 | predict_label.extend([int(x) for x in preds]) 63 | assert len(preds) == len(labels), 'not alignmented...' 64 | 65 | 66 | ### Write to file 67 | #for sent, label, pred in zip(sentences, labels, preds): # sentence-wise 68 | for label, pred in zip(labels, preds): 69 | #got = " ".join(idx2word[idx] for idx in pred).split("")[0].strip() 70 | 71 | 72 | # bleu score 73 | if task_name == 'seq2seq': 74 | ref = target.split() 75 | hypothesis = got.split() 76 | if len(ref) > 3 and len(hypothesis) > 3: 77 | list_of_refs.append([ref]) 78 | hypotheses.append(hypothesis) 79 | 80 | 81 | ## Calculate bleu score 82 | if task_name == 'seq2seq': 83 | score = corpus_bleu(list_of_refs, hypotheses) 84 | fout.write("Bleu Score = " + str(100*score)) 85 | elif task_name == 'classfication' or task_name == 'infersent': 86 | assert len(test_labels) == len(predict_label), 'The length of label and predicts\ 87 | are not alignmentted.' 88 | res = classification_report(test_labels, predict_label) 89 | print(res) 90 | fout.write(res + '\n') 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 95 | parser.add_argument('--task', help='task name(default: infersent)') 96 | 97 | args = parser.parse_args() 98 | task_name = args.task 99 | eval(task_name) 100 | print("Done") 101 | 102 | 103 | -------------------------------------------------------------------------------- /transformer_jieba/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import seq2seq_Hyperparams as hp 11 | from data_load import load_en_vocab, load_zh_vocab, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | 14 | import argparse 15 | from modules import bleu 16 | import math 17 | from modules import cut 18 | 19 | 20 | 21 | 22 | def eval(task_name): 23 | # Load graph 24 | g = Graph(is_training=False) 25 | print("Graph loaded") 26 | 27 | # Load data 28 | X, Sources, Targets = load_test_data() 29 | #print(X, Sources, Targets) 30 | en2idx, idx2en = load_en_vocab() 31 | zh2idx, idx2zh = load_zh_vocab() 32 | 33 | 34 | # Start session 35 | with g.graph.as_default(): 36 | sv = tf.train.Supervisor() 37 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 38 | ## Restore parameters 39 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 40 | print("Restored!") 41 | 42 | ## Get model name 43 | print('Model dir:', hp.logdir) 44 | mname = '{}'.format(''.join(hp.source_test.split('/')[-1].split('.', 3)[:-1])) + open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 45 | print("Model name:", mname) 46 | 47 | ## Inference 48 | if not os.path.exists('results'): os.mkdir('results') 49 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 50 | list_of_refs, hypotheses, scores = [], [], [] 51 | print("Iterator:", len(X), hp.batch_size) 52 | for i in range(len(X) // hp.batch_size): 53 | print('Step:\t', i) 54 | ### Get mini-batches 55 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 56 | sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size] 57 | targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size] 58 | 59 | ### Autoregressive inference 60 | preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) 61 | for j in range(hp.maxlen): 62 | _preds = sess.run(g.preds, {g.x: 
x, g.y: preds}) 63 | preds[:, j] = _preds[:, j] 64 | 65 | 66 | ### Write to file 67 | for source, target, pred in zip(sources, targets, preds): # sentence-wise 68 | #print('Inspecting:', source, target, pred) 69 | #got = " ".join(idx2zh[idx] for idx in pred).split("。", 2)[0].strip() + ' 。' 70 | #got = ''.join(idx2zh[idx] for idx in pred).split('。')[0].strip() 71 | got = ' '.join(idx2zh[idx] for idx in pred).split('')[0] 72 | if task_name == 'jieba': 73 | fout.write("- source: " + source +"\n") 74 | fout.write("- expected: " + ' '.join(cut(source, target)) + "\n") 75 | fout.write("- got: " + ' '.join(cut(source, got)) + "\n\n") 76 | fout.flush() 77 | else: 78 | fout.write("- source: " + source +"\n") 79 | fout.write("- expected: " + target + "\n") 80 | fout.write("- got: " + got + "\n\n") 81 | fout.flush() 82 | 83 | 84 | # accumlate accuracty 85 | ref = cut(source, target) 86 | hypothesis = cut(source, got) 87 | acc = len([x for x in hypothesis if x in ref])/len(ref) 88 | scores.append(min(1, acc)) 89 | 90 | 91 | 92 | ## Calculate bleu score 93 | #score = corpus_bleu(list_of_refs, hypotheses) 94 | fout.write("Tokenization Accuracy = " + str(100*(sum(scores)/len(scores)))) 95 | 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 99 | parser.add_argument('--task', help='task name(default: seq2seq)') 100 | 101 | args = parser.parse_args() 102 | task_name = args.task 103 | eval(task_name) 104 | print("Done") 105 | 106 | 107 | -------------------------------------------------------------------------------- /en-zh_NMT/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import seq2seq_Hyperparams as hp 11 | from data_load import load_en_vocab, load_zh_vocab, load_test_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | from modules import bleu 16 | import math 17 | 18 | 19 | 20 | 21 | def eval(task_name): 22 | # Load graph 23 | g = Graph(is_training=False) 24 | print("Graph loaded") 25 | 26 | # Load data 27 | X, Sources, Targets = load_test_data() 28 | en2idx, idx2en = load_en_vocab() 29 | zh2idx, idx2zh = load_zh_vocab() 30 | 31 | 32 | # Start session 33 | with g.graph.as_default(): 34 | sv = tf.train.Supervisor() 35 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 36 | ## Restore parameters 37 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 38 | print("Restored!") 39 | 40 | ## Get model name 41 | print('Model dir:', hp.logdir) 42 | mname = '{}'.format(''.join(hp.source_test.split('/')[-1].split('.', 3)[:-1])) + open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 43 | print("Model name:", mname) 44 | 45 | ## Inference 46 | if not os.path.exists('results'): os.mkdir('results') 47 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 48 | list_of_refs, hypotheses, scores = [], [], [] 49 | print("Iterator:", len(X), hp.batch_size) 50 | for i in range(len(X) // hp.batch_size): 51 | print('Step:\t', i) 52 | ### Get mini-batches 53 | x = X[i*hp.batch_size: (i+1)*hp.batch_size] 54 | sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size] 55 | targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size] 56 | 57 | ### Autoregressive inference 58 | preds = np.zeros((hp.batch_size, hp.maxlen), 
np.int32) 59 | for j in range(hp.maxlen): 60 | _preds = sess.run(g.preds, {g.x: x, g.y: preds}) 61 | preds[:, j] = _preds[:, j] 62 | 63 | 64 | ### Write to file 65 | for source, target, pred in zip(sources, targets, preds): # sentence-wise 66 | #print('Inspecting:', source, target, pred) 67 | got = " ".join(idx2zh[idx] for idx in pred).split("。", 2)[0].strip() + ' 。' 68 | #got = ''.join(idx2zh[idx] for idx in pred).split('。')[0].strip() 69 | if task_name == 'jieba': 70 | fout.write("- source: " + source +"\n") 71 | if len(got) < len(target): got += target[len(got):] 72 | fout.write("- expected: " + cut(source, target) + "\n") 73 | fout.write("- got: " + cut(source, got) + "\n\n") 74 | fout.flush() 75 | else: 76 | fout.write("- source: " + source +"\n") 77 | fout.write("- expected: " + target + "\n") 78 | fout.write("- got: " + got + "\n\n") 79 | fout.flush() 80 | 81 | # bleu score- BLEU-2 82 | ref = target.split() 83 | hypothesis = got.split() 84 | print(ref, '\n', hypothesis) 85 | if len(ref) > 2 and len(hypothesis) > 2: 86 | scores.append(bleu(hypothesis, ref, 2)) 87 | #list_of_refs.append([ref]) 88 | #hypotheses.append(hypothesis) 89 | 90 | 91 | ## Calculate bleu score 92 | #score = corpus_bleu(list_of_refs, hypotheses) 93 | fout.write("Bleu Score = " + str(100*(sum(scores)/len(scores)))) 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 98 | parser.add_argument('--task', help='task name(default: seq2seq)') 99 | 100 | args = parser.parse_args() 101 | task_name = args.task 102 | eval(task_name) 103 | print("Done") 104 | 105 | 106 | -------------------------------------------------------------------------------- /transformer_jieba/data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python3 3 | 4 | from hyperparams import seq2seq_Hyperparams as hp 5 | import tensorflow as tf 6 | import numpy as np 7 | import codecs 8 | import re 9 | import jieba 10 | from bs4 import BeautifulSoup as bs 11 | 12 | 13 | def load_en_vocab(): 14 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/src.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | def load_zh_vocab(): 20 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/tgt.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 21 | word2idx = {word: idx for idx, word in enumerate(vocab)} 22 | idx2word = {idx: word for idx, word in enumerate(vocab)} 23 | return word2idx, idx2word 24 | 25 | def create_data(source_sents, target_sents): 26 | en2idx, idx2en = load_en_vocab() 27 | zh2idx, idx2zh = load_zh_vocab() 28 | #max token numbers 29 | max_token_num = max(len(en2idx.keys()), len(zh2idx.keys())) + 100 30 | 31 | # Index 32 | x_list, y_list, Sources, Targets = [], [], [], [] 33 | for source_sent, target_sent in zip(source_sents, target_sents): 34 | #the default source senteces is english and target sentences is chinese 35 | x = [en2idx.get(word, 1) for word in source_sent.split()[:hp.maxlen-5] + [u" "]] 36 | y = [zh2idx.get(word, 1) for word in target_sent.split()[:hp.maxlen-5] + [u" "]] 37 | 38 | x_list.append(np.array(x)) 39 | y_list.append(np.array(y)) 40 | Sources.append(source_sent) 41 | Targets.append(target_sent) 42 | print('Demo: 
{}->\n{}'.format(Sources[0], Targets[0])) 43 | 44 | # Pad 45 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 46 | Y = np.zeros([len(y_list), hp.maxlen], np.int32) 47 | for i, (x, y) in enumerate(zip(x_list, y_list)): 48 | #print(x, y, hp.maxlen, len(x), len(y)) 49 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 50 | Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0)) 51 | 52 | return X, Y, Sources, Targets 53 | 54 | 55 | def refine(line, tokenizer): 56 | if tokenizer == 'jieba': 57 | line = re.sub("[\s\p']", "", line) 58 | return ' '.join(jieba.cut(line)) 59 | elif tokenizer == 'en': 60 | line = re.sub("[^a-zA-Z]", " ", line) 61 | return line 62 | else: 63 | raise Exception('Could not find tokenizer...') 64 | 65 | def load_train_data(): 66 | en_sents = [line.strip() \ 67 | for line in open(hp.source_train, 'r', encoding = 'utf-8').read().split("\n") \ 68 | if not line.startswith('<')] 69 | zh_sents = [line.strip() \ 70 | for line in open(hp.target_train, 'r', encoding = 'utf-8').read().split("\n") \ 71 | if not line.startswith('<')] 72 | 73 | X, Y, Sources, Targets = create_data(en_sents, zh_sents) 74 | return X, Y 75 | 76 | def load_test_data(): 77 | def _parser(text): 78 | return [x.text for x in bs(text).find_all('seg')] 79 | 80 | 81 | en_sents = [line.strip() \ 82 | for line in open(hp.source_test, 'r', encoding = 'utf-8').read().split("\n") \ 83 | if not line.startswith('=hp.min_cnt] 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | def load_zh_vocab(): 20 | vocab = [line.split()[0] for line in codecs.open('./preprocessed/zh.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt] 21 | word2idx = {word: idx for idx, word in enumerate(vocab)} 22 | idx2word = {idx: word for idx, word in enumerate(vocab)} 23 | return word2idx, idx2word 24 | 25 | def create_data(source_sents, target_sents): 26 | en2idx, idx2en = load_en_vocab() 27 | zh2idx, idx2zh = load_zh_vocab() 28 | #max token numbers 29 | max_token_num = max(len(en2idx.keys()), len(zh2idx.keys())) + 100 30 | 31 | # Index 32 | x_list, y_list, Sources, Targets = [], [], [], [] 33 | for source_sent, target_sent in zip(source_sents, target_sents): 34 | #the default source senteces is english and target sentences is chinese 35 | x = [en2idx.get(word, max_token_num) for word in source_sent.split()[:hp.maxlen-5] + [u" "]] 36 | y = [zh2idx.get(word, max_token_num) for word in target_sent.split()[:hp.maxlen-5] + [u" "]] 37 | 38 | x_list.append(np.array(x)) 39 | y_list.append(np.array(y)) 40 | Sources.append(source_sent) 41 | Targets.append(target_sent) 42 | print('Inspect data: {}->\n{}'.format(Sources[0], Targets[0])) 43 | 44 | # Pad 45 | X = np.zeros([len(x_list), hp.maxlen], np.int32) 46 | Y = np.zeros([len(y_list), hp.maxlen], np.int32) 47 | for i, (x, y) in enumerate(zip(x_list, y_list)): 48 | #print(x, y, hp.maxlen, len(x), len(y)) 49 | X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0)) 50 | Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0)) 51 | 52 | return X, Y, Sources, Targets 53 | 54 | 55 | def refine(line, tokenizer): 56 | if tokenizer == 'jieba': 57 | line = re.sub("[\s\p']", "", line) 58 | return ' '.join(jieba.cut(line)) 59 | elif tokenizer == 'en': 60 | line = re.sub("[^a-zA-Z]", " ", line) 61 | return line 62 | else: 63 | raise Exception('Could not find 
tokenizer...') 64 | 65 | def load_train_data(): 66 | en_sents = [refine(line, tokenizer = 'en') \ 67 | for line in open(hp.source_train, 'r', encoding = 'utf-8').read().split("\n") \ 68 | if not line.startswith('<')] 69 | zh_sents = [refine(line, tokenizer = 'jieba') \ 70 | for line in open(hp.target_train, 'r', encoding = 'utf-8').read().split("\n") \ 71 | if not line.startswith('<')] 72 | 73 | X, Y, Sources, Targets = create_data(en_sents, zh_sents) 74 | return X, Y 75 | 76 | def load_test_data(): 77 | def _parser(text): 78 | return [x.text for x in bs(text).find_all('seg')] 79 | 80 | ''' 81 | en_sents = [refine(line, tokenizer = 'en') \ 82 | for line in open(hp.source_test, 'r', encoding = 'utf-8').read().split("\n") \ 83 | if line.startswith('=hp.min_cnt] #raw code is hp.mincnt 15 | word2idx = {word: idx for idx, word in enumerate(vocab)} 16 | idx2word = {idx: word for idx, word in enumerate(vocab)} 17 | return word2idx, idx2word 18 | 19 | 20 | 21 | def create_data(s1, s2, answer_span): 22 | """ 23 | the default s1 is the question and s2 is the content 24 | """ 25 | word2idx, idx2word = load_vocabs() 26 | 27 | 28 | # Index 29 | x1_list, x2_list, q_lens, p_lens, s_labels, e_labels, Questions, Contents = \ 30 | [], [], [], [], [], [], [], [] 31 | 32 | for sent1, sent2, span in zip(s1, s2, answer_span): 33 | x1 = [word2idx.get(word, 1) for word in (sent1 + u" ").split()[:hp.q_maxlen-5]] # 1: OOV, : End of Text 34 | x2 = [word2idx.get(word, 1) for word in (sent2 + u" ").split()[:hp.p_maxlen-5]] 35 | 36 | x1_list.append(np.array(x1)) 37 | x2_list.append(np.array(x2)) 38 | 39 | q_lens.append(len(x1)) 40 | p_lens.append(len(x2)) 41 | 42 | s_labels.append(span[0]) 43 | e_labels.append(span[1]) 44 | 45 | print('Demo:', x1_list[0], x2_list[0], q_lens[0], p_lens[0], s_labels[0], e_labels[0]) 46 | 47 | # Pad 48 | X1 = np.zeros([len(x1_list), hp.q_maxlen], np.int32) 49 | X2 = np.zeros([len(x2_list), hp.p_maxlen], np.int32) 50 | 51 | for i, x in enumerate(x1_list): 52 | X1[i] = np.lib.pad(x, [0, hp.q_maxlen-len(x)], 'constant', constant_values=(0, 0)) 53 | 54 | for i, x in enumerate(x2_list): 55 | X2[i] = np.lib.pad(x, [0, hp.p_maxlen-len(x)], 'constant', constant_values=(0, 0)) 56 | 57 | 58 | 59 | return X1, X2, q_lens, p_lens, s_labels, e_labels 60 | 61 | 62 | def _refine(line, lan = 'zh'): 63 | if lan == 'zh': 64 | line = re.sub("[\s\p']", "", line) 65 | #line = re.sub(r'[0-9]+', ' n', line) 66 | #line = re.sub(r'[a-zA-Z]+', ' α', line) 67 | line = jieba.cut(line) 68 | return ' '.join(list(line)) 69 | elif lan == 'en': 70 | line = re.sub("[^a-zA-Z]", " ", line) 71 | return line 72 | 73 | else: 74 | raise Exception('Havn\'t specified language!') 75 | return 76 | 77 | 78 | 79 | def load_train_data(tokenizer = None): 80 | train_data = pd.read_csv(hp.trainfile) 81 | questions, contents, answer_spans = list(train_data['question']), list(train_data['content']), \ 82 | list(train_data['answer_span']) 83 | 84 | questions, contents = [_refine(line) for line in questions], [_refine(line) for line in contents] 85 | 86 | answer_spans = [eval(line) for line in answer_spans] 87 | 88 | X1, X2, q_lens, p_lens, start_labels, end_labels = create_data(questions, contents, answer_spans) 89 | return X1, X2, q_lens, p_lens, start_labels, end_labels 90 | 91 | 92 | 93 | def load_test_data(tokenizer = None): 94 | test_data = pd.read_csv(hp.testfile) 95 | questions, contents, answer_spans = list(test_data['question']), list(test_data['content']), \ 96 | list(test_data['answer_span']) 97 | 98 | answer_spans = [eval(line) 
for line in answer_spans] 99 | 100 | questions, contents = [_refine(line) for line in questions], [_refine(line) for line in contents] 101 | 102 | 103 | X1, X2, q_lens, p_lens, start_labels, end_labels = create_data(questions, contents, answer_spans) 104 | return X1, X2, q_lens, p_lens, start_labels, end_labels 105 | 106 | 107 | def get_batch_data(): 108 | # Load data 109 | X1, X2, q_lens, p_lens, start_labels, end_labels = load_train_data() 110 | 111 | # calc total batch count 112 | num_batch = len(X1) // hp.batch_size 113 | 114 | # Convert to tensor 115 | X1 = tf.convert_to_tensor(X1, tf.int32) 116 | X2 = tf.convert_to_tensor(X2, tf.int32) 117 | q_lens = tf.convert_to_tensor(q_lens, tf.int32) 118 | p_lens = tf.convert_to_tensor(p_lens, tf.int32) 119 | start_labels = tf.convert_to_tensor(start_labels, tf.int32) 120 | end_labels = tf.convert_to_tensor(end_labels, tf.int32) 121 | 122 | 123 | # Create Queues 124 | input_queues = tf.train.slice_input_producer([X1, X2, q_lens, p_lens, start_labels, end_labels]) 125 | 126 | # create batch queues 127 | q, p, q_length, p_length, start_pos, end_pos = tf.train.shuffle_batch(input_queues, 128 | num_threads=8, 129 | batch_size=hp.batch_size, 130 | capacity=hp.batch_size*64, 131 | min_after_dequeue=hp.batch_size*32, 132 | allow_smaller_final_batch=False) 133 | 134 | 135 | return q, p, q_length, p_length, start_pos, end_pos, num_batch # (N, T), (N, T), () 136 | 137 | -------------------------------------------------------------------------------- /transformer_text_Classfication/data_pre.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | #data preparation 5 | import codecs 6 | import os 7 | import argparse 8 | import jieba 9 | from hyperparams import feature_Block_Hyperparams as hp 10 | from sklearn.externals import joblib 11 | import re 12 | 13 | def jieba_data_pre(): 14 | with codecs.open('./dataset/train.txt', 'r', encoding = 'utf-8') as f: 15 | vocabset = f.readlines() 16 | vocabset = [x.strip() for x in vocabset] 17 | #print(vocabset[:10]) 18 | zh = '' 19 | en = '' 20 | 21 | for pair in vocabset: 22 | try: 23 | z, e = pair.strip().split('\t') 24 | zh += z + ' ' 25 | en += e + ' ' 26 | except: 27 | zh += '' 28 | en += '' 29 | 30 | 31 | zh_sent = zh.split('') 32 | en_sent = en.split('') 33 | assert len(zh_sent) == len(en_sent), 'length of source and target not comliable' 34 | 35 | files = [] 36 | for root, dirs, file in os.walk(".", topdown=False): 37 | files.append(file) 38 | #print(files) 39 | 40 | if 'train.tags.zh-en.zh' not in files: 41 | with codecs.open('./dataset/train.tags.src-tgt.src', 'w', 'utf-8') as f: 42 | for i in zh_sent[:int(0.8*len(zh_sent))]: 43 | f.write(i+'\n') 44 | 45 | with codecs.open('./dataset/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 46 | for i in en_sent[:int(0.8*len(en_sent))]: 47 | f.write(i+'\n') 48 | 49 | with codecs.open('./dataset/test.tags.src-tgt.src', 'w', 'utf-8') as f: 50 | for i in zh_sent[int(0.8*len(zh_sent)):]: 51 | f.write(i+'\n') 52 | 53 | with codecs.open('./dataset/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 54 | for i in en_sent[int(0.8*len(en_sent)):]: 55 | f.write(i+'\n') 56 | 57 | 58 | 59 | def text_sum_pre(): 60 | cnt_title_path = './dataset/content-title.txt' 61 | cnt_title_pair = [x.strip().split() for x in open(cnt_title_path).readlines()] 62 | cnt_title_pair = [x for x in cnt_title_pair if len(x) == 2] 63 | 64 | content_set, sum_set = [x[0] for x in cnt_title_pair], [x[1] for x in cnt_title_pair] 65 | 
pad = ['', '', "", ""] 66 | content_vocabs, title_vocabs = {}, {} 67 | 68 | for x in content_set: 69 | vocabs = jieba.cut(x) 70 | for x in vocabs: 71 | if x not in content_vocabs: 72 | content_vocabs[x] = 1 73 | else: 74 | content_vocabs[x] += 1 75 | 76 | for x in sum_set: 77 | vocabs = jieba.cut(x) 78 | for x in vocabs: 79 | if x not in title_vocabs: 80 | title_vocabs[x] = 1 81 | else: 82 | title_vocabs[x] += 1 83 | 84 | #save vocab 85 | 86 | if not 'textSummary' in os.listdir('./preprocessed'): 87 | os.mkdir('./preprocessed/textSummary') 88 | with codecs.open('./preprocessed/textSummary/src.vocab.tsv', 'w', 'utf-8') as f: 89 | for token in pad: 90 | f.write(token + '\t' + '1000000000' + '\n') 91 | for token, val in content_vocabs.items(): 92 | f.write(token + '\t' + str(content_vocabs[token]) + '\n') 93 | 94 | with codecs.open('./preprocessed/textSummary/tgt.vocab.tsv', 'w', 'utf-8') as f: 95 | for token in pad: 96 | f.write(token + '\t' + '1000000000' + '\n') 97 | for token, val in title_vocabs.items(): 98 | f.write(token + '\t' + str(title_vocabs[token]) + '\n') 99 | 100 | 101 | if 'textSummary' in os.listdir('./dataset'): 102 | os._exit(0) 103 | else: 104 | os.mkdir('./dataset/textSummary') 105 | 106 | n = len(sum_set) 107 | with codecs.open('./dataset/textSummary/train.tags.src-tgt.src', 'w', 'utf-8') as f: 108 | for x in content_set[:int(0.8*n)]: 109 | f.write(x+'\n') 110 | 111 | with codecs.open('./dataset/textSummary/train.tags.tgt-src.tgt', 'w', 'utf-8') as f: 112 | for x in sum_set[:int(0.8*n)]: 113 | f.write(x+'\n') 114 | 115 | with codecs.open('./dataset/textSummary/test.tags.src-tgt.src', 'w', 'utf-8') as f: 116 | for x in content_set[int(0.8*n):]: 117 | f.write(x+'\n') 118 | 119 | with codecs.open('./dataset/textSummary/test.tags.tgt-src.tgt', 'w', 'utf-8') as f: 120 | for x in sum_set[int(0.8*n):]: 121 | f.write(x+'\n') 122 | 123 | 124 | def main(): 125 | parser = argparse.ArgumentParser(description='Choice the task you want to run.') 126 | parser.add_argument('--task', default = 'jieba', 127 | help='task name(default: tokenize)') 128 | 129 | args = parser.parse_args() 130 | task_name = args.task 131 | 132 | if task_name == 'jieba': jieba_data_pre() 133 | elif task_name == 'textsum': text_sum_pre() 134 | 135 | 136 | if __name__ == '__main__': 137 | import pandas as pd 138 | import random 139 | adspath = ['./datasets/ADs_detection.csv', './datasets/20190723_ads_annotation.csv'] 140 | 141 | df = pd.read_csv(adspath.pop()) 142 | while adspath: 143 | df = pd.concat([df, pd.read_csv(adspath.pop())], axis = 0) 144 | 145 | ads = [re.sub("[\s\p']", "", x)+'\t'+'1' for x in df['text']] 146 | asr = [x.strip() for x in joblib.load('./datasets/corpus.json')['content'] if len(x)>3] 147 | no_ads = random.sample(asr, min(len(ads)*1, len(asr))) 148 | no_ads = [re.sub("[\s\p']", "", x)+'\t'+'0' for x in no_ads] 149 | 150 | corpus = ads[:] 151 | corpus.extend(no_ads) 152 | 153 | random.shuffle(corpus) 154 | sep = int(0.9*len(corpus)) 155 | with open('./datasets/trainset.txt', 'w') as f1: 156 | for line in corpus[:sep]: 157 | f1.write(line+'\n') 158 | 159 | with open('./datasets/testset.txt', 'w') as f2: 160 | for line in corpus[sep:]: 161 | f2.write(line+'\n') 162 | 163 | print('Done!') 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # transfromer_NN_Block 2 | We are doing this to implemented transformer as a neural network building block 
to tackle several tasks in NLP research; this repo follows the original formulation of [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 3 | 4 | [![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/fooSynaptic/transfromer_NN_Block) 5 | 6 | This repo covers **several** tasks: 7 | - [Seq2seq text generation: we apply the Transformer to a classical NLP problem, Chinese word segmentation.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_jieba) 8 | - [Neural machine translation: Chinese-English translation on the WIT3 datasets.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/en-zh_NMT) 9 | - [An encoder-only language-model architecture for text classification.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_text_Classfication) 10 | - [Sentence entailment (natural language inference) experiments on the Stanford SNLI dataset.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_infersent) 11 | - [An updated reading comprehension task.](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_RC) 12 | 13 | 14 | 15 | # INSTALL ENV: 16 | Please run `pip install -r requirements.txt` first. 17 | 18 | 19 | # ***First - the encoder-decoder architecture*** 20 | # train 21 | - The aim is to train a sequence labeling model with the **Transformer**. We follow the 22 | conventional sentence tokenization scheme - **B/E/S/M** (marking the beginning of a word, the end of a word, a single-character word, and the middle of a word, respectively). 23 | 24 | - We use labeled Chinese documents to train the model. The raw data is in the `./transformer_jieba/dataset` dir, or you can use `./transformer_jieba/prepro.py` to preprocess the raw data. 25 | 26 | - Run `python train.py` to train the model. 27 | 28 | 29 | # eval 30 | - Run `python eval.py`; we achieved a BLEU score of nearly 80. 31 | 32 | 33 | # ***Second - zh-en NMT*** 34 | - The train and test data come from the `Web Inventory of Transcribed and Translated Talks` (**WIT3**); we train an English-Chinese translation model ([data source](https://wit3.fbk.eu/mt.php?release=2015-01)). 35 | - Test result: 36 | ![NMT result](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/NMT_res_BLEU.png) 37 | 38 | 39 | 40 | 41 | # ***Third - the transformer feature extraction block*** 42 | - You can find the code in `./transformer_text_Classfication`; the preprocessing, training and evaluation scripts live in that path, and their usage mirrors the encoder-decoder architecture above. 43 | - The Chinese corpus was downloaded from [THUCTC (THU Chinese Text Classification)](http://thuctc.thunlp.org/), and we obtain a macro avg f1-score that is better by over 0.05; the metric is computed as shown in the sketch below.
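The per-class table reported further down is the output of scikit-learn's `classification_report`, which is what the evaluation scripts in this repo print; the `macro avg` f1 line of that table is the number used for the comparison above. A minimal, self-contained sketch (the `y_true` / `y_pred` lists here are made-up placeholders, not real predictions):

```python
from sklearn.metrics import classification_report, f1_score

# Placeholder labels only - in this repo the true/predicted class ids are
# collected batch by batch inside eval.py before the report is printed.
y_true = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1]
y_pred = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 2]

# Per-class precision/recall/f1 table, same layout as the result block below.
print(classification_report(y_true, y_pred))

# Single macro-averaged f1 number (unweighted mean over the 10 classes).
macro_f1 = f1_score(y_true, y_pred, average="macro")
print("macro avg f1-score:", round(macro_f1, 2))
```

Macro averaging weights every class equally regardless of its support, which makes it a fair summary number here since the test set is balanced (1000 examples per class, as in the report below).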
44 | - ***Our model is very raw and shallow(only 8 multi-head attention projection and final linear projection) and without pre-trained embedding, you can explore performance with our code.*** 45 | 46 | # result of chinese sentences classfication(char-level) 47 | ` tagging = {'时尚':0, '教育':1, '时政':2, '体育':3, '游戏':4, '家居':5, '科技':6, '房产':7, '财经':8, '娱乐':9} ` 48 | ``` 49 | precision recall f1-score support 50 | 51 | 0 0.91 0.95 0.93 1000 52 | 1 0.96 0.77 0.85 1000 53 | 2 0.92 0.93 0.92 1000 54 | 3 0.95 0.93 0.94 1000 55 | 4 0.86 0.91 0.88 1000 56 | 5 0.83 0.47 0.60 1000 57 | 6 0.86 0.85 0.86 1000 58 | 7 0.64 0.87 0.74 1000 59 | 8 0.79 0.91 0.85 1000 60 | 9 0.88 0.91 0.89 1000 61 | 62 | accuracy 0.85 10000 63 | macro avg 0.86 0.85 0.85 10000 64 | weighted avg 0.86 0.85 0.85 10000 65 | 66 | Done 67 | ``` 68 | [***We Also implemented a sentences entailment inference task with transformer***](https://github.com/fooSynaptic/transfromer_NN_Block/tree/master/transformer_infersent) 69 | --- 70 | **Data source** [standord SNLI](https://nlp.stanford.edu/projects/snli/snli_1.0.zip) 71 | 72 | - *Download source data and unzip* : `wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip && unzip snli_1.0.zip` 73 | - *preprocess data*: `python data_prepare.py && python prepro.py` 74 | - *train*: run `python train.py` 75 | - *eval*: run `python eval.py --task infersent` 76 | 77 | Experiment result: 78 | - train accuracy: 79 | ![train accuracy](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_with_SNLI_accuracy.png) 80 | 81 | - train loss: 82 | ![train loss](https://github.com/fooSynaptic/transfromer_NN_Block/blob/master/images/infersent_train_SNLI_loss.png) 83 | 84 | 85 | - eval result: 86 | ``` 87 | precision recall f1-score support 88 | 89 | 0 0.82 0.76 0.79 3358 90 | 1 0.77 0.80 0.79 3226 91 | 2 0.70 0.73 0.72 3208 92 | 93 | accuracy 0.76 9792 94 | macro avg 0.76 0.76 0.76 9792 95 | weighted avg 0.76 0.76 0.76 9792 96 | ``` 97 | 98 | 99 | # Ref 100 | 101 | - https://github.com/Kyubyong/transformer 102 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 103 | -------------------------------------------------------------------------------- /transformer_RC/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module implements the core layer of Match-LSTM and BiDAF 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 25 | """ 26 | Implements the Match-LSTM attention cell 27 | """ 28 | def __init__(self, num_units, context_to_attend): 29 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 30 | self.context_to_attend = context_to_attend 31 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 32 | num_outputs=self._num_units, 33 | activation_fn=None) 34 | 35 | def __call__(self, inputs, state, scope=None): 36 | (c_prev, h_prev) = state 37 | with tf.variable_scope(scope or type(self).__name__): 38 | ref_vector = tf.concat([inputs, h_prev], -1) 39 | G = tf.tanh(self.fc_context 40 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 41 | num_outputs=self._num_units, 42 | activation_fn=None), 1)) 43 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 44 | scores = tf.nn.softmax(logits, 1) 45 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 46 | new_inputs = tf.concat([inputs, attended_context, 47 | inputs - attended_context, inputs * attended_context], 48 | -1) 49 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 50 | 51 | 52 | class MatchLSTMLayer(object): 53 | """ 54 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 55 | """ 56 | def __init__(self, hidden_size): 57 | self.hidden_size = hidden_size 58 | 59 | def match(self, passage_encodes, question_encodes, p_length, q_length): 60 | """ 61 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 62 | """ 63 | with tf.variable_scope('match_lstm'): 64 | cell_fw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 65 | cell_bw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 66 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 67 | inputs=passage_encodes, 68 | sequence_length=p_length, 69 | dtype=tf.float32) 70 | match_outputs = tf.concat(outputs, 2) 71 | state_fw, state_bw = state 72 | c_fw, h_fw = state_fw 73 | c_bw, h_bw = state_bw 74 | match_state = tf.concat([h_fw, h_bw], 1) 75 | return match_outputs, match_state 76 | 77 | 78 | class AttentionFlowMatchLayer(object): 79 | """ 80 | Implements the Attention Flow layer, 81 | which computes Context-to-question Attention and question-to-context Attention 82 | """ 83 | def __init__(self, hidden_size): 84 | self.hidden_size = hidden_size 85 | 86 | def match(self, passage_encodes, question_encodes, p_length, q_length): 87 | """ 88 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 89 | """ 90 | #p_encodes = (batch_size, p_length, hidden_size),\ 91 | # q_encodes = (batch_size, q_length, hidden_size) 92 | with tf.variable_scope('bidaf'): 93 | sim_matrix = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 94 | #sim_matrix = (batch_size, p_length, q_length) 95 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix, -1), question_encodes) 96 | #c2q_atten_weight = (batch_size, p_length, hidden_size) 97 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix, 2), 1), -1) 98 | # b = (batch_size, 1, p_length) 99 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 100 | [1, tf.shape(passage_encodes)[1], 1]) 101 | # q2c_atten_weight = (batch_size, 1, 
p_length) @ (batch_size, p_length, hidden_size) \ 102 | # = (batch_size, 1, hidden_size) 103 | # (tile) => (batch_size, p_length, hidden_size) 104 | 105 | assert tf.shape(question2context_attn) == context2question_attn, \ 106 | print("Dimension not fixed to cancate.") 107 | concat_outputs = tf.concat([passage_encodes, context2question_attn, 108 | passage_encodes * context2question_attn, 109 | passage_encodes * question2context_attn], -1) 110 | 111 | 112 | return concat_outputs, None 113 | -------------------------------------------------------------------------------- /Models/models.py: -------------------------------------------------------------------------------- 1 | # encoding = utf-8 2 | # /usr/bin/python3 3 | import tensorflow as tf 4 | from modules import * 5 | 6 | 7 | class vanilla_transformer(): 8 | def __init__(self, hp, is_training): 9 | self.hp = hp 10 | self.train = is_training 11 | 12 | def encode(self, Input, Vocabs_length): 13 | with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): 14 | ## Embedding 15 | enc = embedding(Input, 16 | vocab_size=Vocabs_length, 17 | num_units=self.hp.hidden_units, 18 | scale=True, 19 | scope="enc_embed") 20 | 21 | ## Positional Encoding 22 | if self.hp.sinusoid: 23 | enc += positional_encoding(Input, 24 | num_units=self.hp.hidden_units, 25 | zero_pad=False, 26 | scale=False, 27 | scope="enc_pe") 28 | else: 29 | enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(Input)[1]), 0), [tf.shape(Input)[0], 1]), 30 | vocab_size=Vocabs_length, 31 | num_units=self.hp.hidden_units, 32 | zero_pad=False, 33 | scale=False, 34 | scope="enc_pe") 35 | 36 | ## Dropout 37 | enc = tf.layers.dropout(enc, 38 | rate=self.hp.dropout_rate, 39 | training=tf.convert_to_tensor(self.train)) 40 | 41 | ## Blocks 42 | for i in range(self.hp.num_blocks): 43 | with tf.variable_scope("num_blocks", reuse = tf.AUTO_REUSE): 44 | ### Multihead Attention 45 | enc = multihead_attention(queries=enc, 46 | keys=enc, 47 | num_units=self.hp.hidden_units, 48 | num_heads=self.hp.num_heads, 49 | dropout_rate=self.hp.dropout_rate, 50 | is_training=self.train, 51 | causality=False) 52 | 53 | ### Feed Forward 54 | enc = feedforward(enc, num_units=[4*self.hp.hidden_units, self.hp.hidden_units]) 55 | state = enc 56 | return state 57 | 58 | 59 | def decode(self, decoder_inputs, key_states, Vocabs_length, decode_length): 60 | with tf.variable_scope("decoder", reuse = tf.AUTO_REUSE): 61 | ## Embedding 62 | self.dec = embedding(decoder_inputs, 63 | vocab_size=Vocabs_length, 64 | num_units=self.hp.hidden_units, 65 | scale=True, 66 | scope="dec_embed") 67 | 68 | ## Positional Encoding 69 | if self.hp.sinusoid: 70 | self.dec += positional_encoding(decoder_inputs, 71 | vocab_size=decode_length, 72 | num_units=self.hp.hidden_units, 73 | zero_pad=False, 74 | scale=False, 75 | scope="dec_pe") 76 | else: 77 | self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(decoder_inputs)[1]), 0), [tf.shape(decoder_inputs)[0], 1]), 78 | vocab_size=decode_length, 79 | num_units=self.hp.hidden_units, 80 | zero_pad=False, 81 | scale=False, 82 | scope="dec_pe") 83 | 84 | ## Dropout 85 | self.dec = tf.layers.dropout(self.dec, 86 | rate=self.hp.dropout_rate, 87 | training=tf.convert_to_tensor(self.train)) 88 | 89 | ## Blocks 90 | for i in range(self.hp.num_blocks): 91 | with tf.variable_scope("num_blocks_{}".format(i)): 92 | ## Multihead Attention ( self-attention) 93 | self.dec = multihead_attention(queries=self.dec, 94 | keys=self.dec, 95 | num_units=self.hp.hidden_units, 96 | num_heads=self.hp.num_heads, 97 
| dropout_rate=self.hp.dropout_rate, 98 | is_training=self.train, 99 | causality=True, 100 | scope="self_attention") 101 | 102 | 103 | ## Multihead Attention ( vanilla attention) 104 | self.dec = multihead_attention(queries=self.dec, 105 | keys=key_states, 106 | num_units=self.hp.hidden_units, 107 | num_heads=self.hp.num_heads, 108 | dropout_rate=self.hp.dropout_rate, 109 | is_training=self.train, 110 | causality=False, 111 | scope="vanilla_attention") 112 | 113 | 114 | ## Feed Forward 115 | self.dec = feedforward(self.dec, num_units=[4*self.hp.hidden_units, self.hp.hidden_units]) 116 | 117 | output_state = self.dec 118 | return output_state 119 | -------------------------------------------------------------------------------- /transformer_RC/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import codecs 5 | import os 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from hyperparams import rc_Hyperparams as hp 11 | from data_load import load_vocabs, load_train_data, load_test_data, create_data 12 | from train import Graph 13 | #from nltk.translate.bleu_score import corpus_bleu 14 | import argparse 15 | #from sklearn.metrics import classification_report 16 | #from utils import compute_bleu_rouge 17 | import pandas as pd 18 | from modules import bleu 19 | 20 | 21 | def find_best_answer_for_passage(start_probs, end_probs, passage_len=None): 22 | """ 23 | Finds the best answer with the maximum start_prob * end_prob from a single passage 24 | """ 25 | if passage_len is None: 26 | passage_len = len(start_probs) 27 | else: 28 | passage_len = min(len(start_probs), passage_len) 29 | 30 | best_start, best_end, max_prob = -1, -1, 0 31 | 32 | for start_idx in range(passage_len): 33 | #within the span of answer limit 34 | for ans_len in range(hp.ans_maxlen): 35 | end_idx = start_idx + ans_len 36 | if end_idx >= passage_len: 37 | continue 38 | 39 | prob = start_probs[start_idx] * end_probs[end_idx] 40 | if prob > max_prob: 41 | best_start = start_idx 42 | best_end = end_idx 43 | max_prob = prob 44 | return (best_start, best_end), max_prob 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | def eval(task_name): 53 | # Load graph 54 | g = Graph(is_training=False) 55 | print("Graph loaded") 56 | 57 | # Load data 58 | test_data = pd.read_csv(hp.testfile) 59 | questions, contents, q_lens, p_lens, start_pos, end_pos = load_test_data() 60 | raw_passages = list(test_data['content']) 61 | reference_answers = list(test_data['answer']) 62 | 63 | 64 | word2idx, idx2word = load_vocabs() 65 | 66 | # Start session 67 | with g.graph.as_default(): 68 | sv = tf.train.Supervisor() 69 | with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 70 | ## Restore parameters 71 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 72 | print("Restored!") 73 | 74 | ## Get model name 75 | print('Model dir:', hp.logdir) 76 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name 77 | print("Model name:", mname) 78 | 79 | ## Inference 80 | if not os.path.exists('results'): os.mkdir('results') 81 | with codecs.open("results/" + mname, "w", "utf-8") as fout: 82 | 83 | pred_answers, ref_answers = [], [] 84 | pred_dict, ref_dict = {}, {} 85 | ques_id = 0 86 | eval_dict = {'bleu_1':[], 'bleu_2':[], 'bleu_3':[], 'bleu_4':[]} 87 | 88 | for i in range(len(questions) // hp.batch_size): 89 | print("Iterator: {} / {}".format(i, len(questions)//hp.batch_size)) 90 | 91 | ### Get 
mini-batches 92 | q = questions[i*hp.batch_size: (i+1)*hp.batch_size] 93 | p = contents[i*hp.batch_size: (i+1)*hp.batch_size] 94 | q_length = q_lens[i*hp.batch_size: (i+1)*hp.batch_size] 95 | p_length = p_lens[i*hp.batch_size: (i+1)*hp.batch_size] 96 | s_pos = start_pos[i*hp.batch_size: (i+1)*hp.batch_size] 97 | e_pos = end_pos[i*hp.batch_size: (i+1)*hp.batch_size] 98 | passages = raw_passages[i*hp.batch_size: (i+1)*hp.batch_size] 99 | ref_answers = reference_answers[i*hp.batch_size: (i+1)*hp.batch_size] 100 | 101 | feed_dict = {g.q: q, 102 | g.p: p, 103 | g.q_length: q_length, 104 | g.p_length: p_length, 105 | g.start_label: s_pos, 106 | g.end_label: e_pos} 107 | 108 | start_probs, end_probs = sess.run([g.start_probs, g.end_probs], feed_dict) 109 | 110 | 111 | ### Write to file 112 | for start_prob, end_prob, passage, ref in zip(start_probs, end_probs, passages, ref_answers): 113 | pred_span, prob = find_best_answer_for_passage(start_prob, end_prob) 114 | pred_answer = passage[pred_span[0]: pred_span[1]+1] 115 | 116 | if not len(pred_answer) > 0: continue 117 | 118 | pred_dict[str(ques_id)] = [pred_answer] 119 | ref_dict[str(ques_id)] = [ref] 120 | ques_id += 1 121 | 122 | fout.write('-ref: '+ ref) 123 | fout.write("-pred: "+ pred_answer) 124 | 125 | b1, b2, b3, b4 = bleu(list(pred_answer), list(ref), 1), \ 126 | bleu(list(pred_answer), list(ref), 2), \ 127 | bleu(list(pred_answer), list(ref), 3), \ 128 | bleu(list(pred_answer), list(ref), 4) 129 | 130 | 131 | eval_dict['bleu_1'].append(b1) 132 | eval_dict['bleu_2'].append(b2) 133 | eval_dict['bleu_3'].append(b3) 134 | eval_dict['bleu_2'].append(b2) 135 | 136 | for metric in eval_dict: 137 | fout.write(metric + '\t' + str(np.mean(eval_dict[metric])) + '\n') 138 | print(metric + '\t' + str(np.mean(eval_dict[metric]))) 139 | 140 | if __name__ == '__main__': 141 | parser = argparse.ArgumentParser(description='Choice the task you want to eval.') 142 | parser.add_argument('--task', help='task name(default: RC)') 143 | 144 | args = parser.parse_args() 145 | task_name = args.task 146 | eval(task_name) 147 | print("Done") 148 | 149 | 150 | -------------------------------------------------------------------------------- /transformer_infersent/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import infersent_Block_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | 12 | os.sys.path.append('../Models') 13 | from models import vanilla_transformer 14 | 15 | 16 | class Graph(): 17 | def __init__(self, is_training=True): 18 | self.graph = tf.Graph() 19 | with self.graph.as_default(): 20 | if is_training: 21 | self.x1, self.x2, self.y, self.num_batch = get_batch_data() 22 | #self.x, self.label, self.num_batch = get_batch_data() # (N, T) 23 | #self.y = tf.one_hot(self.label, depth = hp.n_class) 24 | 25 | else: # inference 26 | self.x1 = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 27 | self.x2 = tf.placeholder(tf.int32, shape = (None, hp.maxlen)) 28 | #self.label = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 29 | #self.y = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 30 | #self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 31 | 32 | self.l2_loss = tf.constant(0.0) 33 | # define decoder inputs 34 | #for sentence relationship learning task we want to encoder sent1 to e1, then 
decoder(e1 + sent2) 35 | #to get a more sementic relationship across corpus 36 | self.decoder_inputs = tf.concat((tf.ones_like(self.x2[:, :1])*2, self.x2[:, :-1]), -1) # 2: 37 | 38 | # Load vocabulary 39 | word2idx, idx2word = load_vocabs() 40 | 41 | 42 | # initialize transformer 43 | transformer = vanilla_transformer(hp, self.is_training) 44 | 45 | #encode 46 | self.encode1, self.encode2 = transformer.encode(self.x1, len(word2idx)), \ 47 | transformer.encode(self.x2, len(word2idx)) 48 | 49 | #concated 50 | self.enc = tf.divide(tf.add(self.encode1, encode2), 2) 51 | self.enc = normalize(self.enc) 52 | 53 | #for sentence relationship learning task we want to encoder sent1 to e1, then decoder(e1 + sent2) 54 | #to get a more sementic relationship across corpus 55 | 56 | # Decoder 57 | self.dec = transformer.decode(self.decoder_inputs, self.enc, len(word2idx), hp.p_maxlen) 58 | 59 | 60 | self.logits = tf.add(self.enc, tf.multiply(self.enc, self.dec)) 61 | #self.logits = self.enc 62 | 63 | #self.logits = tf.layers.dense(self.logits, 64, activation = 'tanh') 64 | self.logits = tf.layers.flatten(self.logits) 65 | #self.logits = tf.reshape(self.logits, [64, -1]) 66 | self.h_drop = tf.nn.dropout(self.logits, hp.dropout_keep_prob) 67 | 68 | with tf.name_scope("output_logit"): 69 | W = tf.get_variable( 70 | "W", 71 | shape=[hp.maxlen * hp.hidden_units, len(hp.relations)], 72 | initializer=tf.contrib.layers.xavier_initializer()) 73 | 74 | b = tf.Variable(tf.constant(0.1, shape=[len(hp.relations)]), name="b") 75 | self.l2_loss += tf.nn.l2_loss(W) 76 | self.l2_loss += tf.nn.l2_loss(b) 77 | self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logit") 78 | #self.preds = tf.argmax(self.scores, 1, name="predictions") 79 | 80 | self.preds = tf.to_int32(tf.argmax(self.logits, dimension = -1)) 81 | 82 | 83 | if is_training: 84 | self.y_hotting = tf.one_hot(self.y, depth = len(hp.relations)) 85 | 86 | #Accuracy 87 | self.cpl = tf.equal(tf.convert_to_tensor(self.y, tf.int32), self.preds) 88 | self.cpl = tf.to_int32(self.cpl) 89 | self.acc = tf.reduce_sum(self.cpl) / tf.to_int32(tf.reduce_sum(self.y_hotting)) 90 | tf.summary.scalar('acc', self.acc) 91 | 92 | # Loss 93 | #self.y_smoothed = label_smoothing(self.y_hotting) 94 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_hotting) 95 | self.mean_loss = (tf.reduce_sum(self.loss) + self.l2_loss*hp.reg_lambda)/tf.reduce_sum(self.y_hotting) 96 | 97 | # Training Scheme 98 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 99 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 100 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 101 | 102 | # Summary 103 | tf.summary.scalar('mean_loss', self.mean_loss) 104 | self.merged = tf.summary.merge_all() 105 | 106 | 107 | if __name__ == '__main__': 108 | # Load vocabulary 109 | word2idx, idx2word = load_vocabs() 110 | 111 | # Construct graph 112 | g = Graph("train"); print("Graph loaded") 113 | 114 | # Start session 115 | sv = tf.train.Supervisor(graph=g.graph, 116 | logdir=hp.logdir, 117 | save_model_secs=0) 118 | with sv.managed_session() as sess: 119 | with open('acc_mean_loss.rec', 'w') as rec: 120 | for epoch in range(1, hp.num_epochs+1): 121 | if sv.should_stop(): break 122 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 123 | sess.run(g.train_op) 124 | acc, los = sess.run(g.acc), sess.run(g.mean_loss) 125 | #print(acc, los) 126 | 
rec.write('{}\t{}\n'.format(acc, los)) 127 | #print(sess.run(g.preds), sess.run(g.y)) 128 | #print(sess.run(tf.equal(tf.convert_to_tensor(g.y, tf.int32), g.preds))) 129 | 130 | gs = sess.run(g.global_step) 131 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 132 | 133 | print("Done") 134 | 135 | 136 | -------------------------------------------------------------------------------- /transformer_text_Classfication/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import feature_Block_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | 12 | 13 | class Graph(): 14 | def __init__(self, is_training=True): 15 | self.graph = tf.Graph() 16 | with self.graph.as_default(): 17 | if is_training: 18 | self.x, self.label, self.num_batch = get_batch_data() # (N, T) 19 | self.y = tf.one_hot(self.label, depth = hp.n_class) 20 | else: # inference 21 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 22 | self.label = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 23 | #self.y = tf.placeholder(tf.int32, shape = (None, hp.n_class)) 24 | #self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 25 | 26 | # define decoder inputs 27 | #self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 28 | 29 | # Load vocabulary 30 | word2idx, idx2word = load_vocabs() 31 | 32 | 33 | # Encoder 34 | with tf.variable_scope("encoder"): 35 | ## Embedding 36 | self.enc = embedding(self.x, 37 | vocab_size=len(word2idx), 38 | num_units=hp.hidden_units, 39 | scale=True, 40 | scope="enc_embed") 41 | 42 | ## Positional Encoding 43 | if hp.sinusoid: 44 | self.enc += positional_encoding(self.x, 45 | num_units=hp.hidden_units, 46 | zero_pad=False, 47 | scale=False, 48 | scope="enc_pe") 49 | else: 50 | self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), 51 | vocab_size=hp.maxlen, 52 | num_units=hp.hidden_units, 53 | zero_pad=False, 54 | scale=False, 55 | scope="enc_pe") 56 | 57 | 58 | ## Dropout 59 | self.enc = tf.layers.dropout(self.enc, 60 | rate=hp.dropout_rate, 61 | training=tf.convert_to_tensor(is_training)) 62 | 63 | ## Blocks 64 | for i in range(hp.num_blocks): 65 | with tf.variable_scope("num_blocks_{}".format(i)): 66 | ### Multihead Attention 67 | self.enc = multihead_attention(queries=self.enc, 68 | keys=self.enc, 69 | num_units=hp.hidden_units, 70 | num_heads=hp.num_heads, 71 | dropout_rate=hp.dropout_rate, 72 | is_training=is_training, 73 | causality=False) 74 | 75 | ### Feed Forward 76 | self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units]) 77 | 78 | 79 | 80 | # Final linear projection 81 | #print(self.enc.shape) #4, 500, 512 82 | self.enc = tf.reduce_sum(self.enc, axis=2) #4, 500 83 | self.enc = tf.layers.batch_normalization(self.enc, True) 84 | self.logits = tf.layers.dense(self.enc, hp.n_class) #4, 2 85 | #print(self.logits.shape) 86 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 87 | 88 | 89 | if is_training: 90 | #Accuracy 91 | self.cpl = tf.equal(tf.convert_to_tensor(self.label, tf.int32), self.preds) 92 | self.cpl = tf.to_int32(self.cpl) 93 | self.acc = tf.reduce_sum(self.cpl) / tf.reduce_sum(tf.to_int32(self.y)) 94 | tf.summary.scalar('acc', self.acc) 95 | 96 | # Loss 97 | self.y_smoothed = 
label_smoothing(self.y) 98 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 99 | self.mean_loss = tf.reduce_sum(self.loss)/tf.reduce_sum(self.y) 100 | #self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 101 | 102 | 103 | # Training Scheme 104 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 105 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 106 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 107 | 108 | # Summary 109 | tf.summary.scalar('mean_loss', self.mean_loss) 110 | self.merged = tf.summary.merge_all() 111 | 112 | if __name__ == '__main__': 113 | # Load vocabulary 114 | word2idx, idx2word = load_vocabs() 115 | 116 | # Construct graph 117 | g = Graph("train"); print("Graph loaded") 118 | 119 | # Start session 120 | sv = tf.train.Supervisor(graph=g.graph, 121 | logdir=hp.logdir, 122 | save_model_secs=0) 123 | with sv.managed_session() as sess: 124 | with open("acc_loss_rec.log", 'w') as f: 125 | for epoch in range(1, hp.num_epochs+1): 126 | if sv.should_stop(): break 127 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 128 | sess.run(g.train_op) 129 | acc, loss = sess.run([g.acc, g.mean_loss]) 130 | f.write('{}\t{}\n'.format(acc, loss)) 131 | gs = sess.run(g.global_step) 132 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 133 | 134 | print("Done") 135 | 136 | 137 | -------------------------------------------------------------------------------- /transformer_RC/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | 6 | from hyperparams import rc_Hyperparams as hp 7 | from data_load import get_batch_data, load_vocabs 8 | from modules import * 9 | import os, codecs 10 | from tqdm import tqdm 11 | from models import vanilla_transformer 12 | 13 | # custom wrapper 14 | from layers.basic_rnn import rnn 15 | from layers.match_layer import MatchLSTMLayer 16 | from layers.match_layer import AttentionFlowMatchLayer 17 | from layers.pointer_net import PointerNetDecoder 18 | 19 | os.sys.path.append('../Models') 20 | from models import vanilla_transformer 21 | 22 | 23 | 24 | class Graph(): 25 | def __init__(self, is_training=True): 26 | self.graph = tf.Graph() 27 | with self.graph.as_default(): 28 | if is_training: 29 | self.q, self.p, self.q_length, self.p_length, \ 30 | self.start_label, self.end_label, self.num_batch = get_batch_data() 31 | self.dropout_keep_prob = hp.dropout_keep_prob 32 | 33 | else: # inference 34 | self.q = tf.placeholder(tf.int32, [None, hp.q_maxlen]) 35 | self.p = tf.placeholder(tf.int32, [None, hp.p_maxlen]) 36 | self.q_length = tf.placeholder(tf.int32, [None]) 37 | self.p_length = tf.placeholder(tf.int32, [None]) 38 | self.start_label = tf.placeholder(tf.int32, [None]) 39 | self.end_label = tf.placeholder(tf.int32, [None]) 40 | 41 | self.dropout_keep_prob = hp.dropout_keep_prob 42 | self.l2_loss = tf.constant(0.0) 43 | # define decoder input 44 | self.decoder_inputs = tf.concat((tf.ones_like(self.p[:, :1])*2, self.p[:, :-1]), -1) # 2: 45 | 46 | # Load vocabulary 47 | word2idx, idx2word = load_vocabs() 48 | 49 | # initialize transformer 50 | transformer = vanilla_transformer(hp, self.is_training) 51 | ### encode 52 | self.q_encodes, self.p_encodes = transformer.encode(self.q, 
len(word2idx)), \ 53 | transformer.encode(self.q, len(word2idx)) 54 | 55 | #concated features to attend p with q 56 | # first pad q_encodes to the length of p_encodes 57 | pad_dim = hp.p_maxlen - hp.q_maxlen 58 | pad_ = tf.zeros([tf.shape(self.q_encodes)[0], pad_dim, hp.hidden_units], dtype = self.q_encodes.dtype) 59 | self.padded_q_encodes = tf.concat([self.q_encodes, pad_,], 1) 60 | #normalization 61 | self.padded_q_encodes = normalize(self.padded_q_encodes) 62 | 63 | # Decoder 64 | self.dec = transformer.decode(self.decoder_inputs, self.padded_q_encodes, len(word2idx), hp.p_maxlen) 65 | 66 | # fix paragraph tensor with self.dec 67 | self.p_encodes = self.dec 68 | 69 | """ 70 | The core of RC model, get the question-aware passage encoding 71 | """ 72 | match_layer = AttentionFlowMatchLayer(hp.hidden_units) 73 | self.match_p_encodes, _ = match_layer.match(self.p_encodes, self.q_encodes, 74 | self.p_length, self.q_length) 75 | 76 | # pooling or bi-rnn to fuision passage encodes 77 | if hp.Passage_fuse == 'Pooling': 78 | #pooling layer 79 | self.match_p_encodes = \ 80 | tf.keras.layers.MaxPool1D(pool_size=4, strides=None, padding='valid')\ 81 | (self.match_p_encodes) 82 | 83 | self.match_p_encodes = tf.reshape(self.match_p_encodes, [-1, hp.p_maxlen, hp.hidden_units]) 84 | #normalization 85 | self.match_p_encodes = tf.layers.batch_normalization(self.match_p_encodes) 86 | if hp.use_dropout: 87 | self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob) 88 | elif hp.Passage_fuse == 'bi-rnn': 89 | self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length, 90 | hp.hidden_units, layer_num=1, concat = False) 91 | if hp.use_dropout: 92 | self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob) 93 | 94 | 95 | decoder = PointerNetDecoder(hp.hidden_units) 96 | self.start_probs, self.end_probs = decoder.decode(self.match_p_encodes, 97 | self.q_encodes) 98 | 99 | 100 | if is_training: 101 | self.start_loss = self.sparse_nll_loss(probs=self.start_probs, labels=self.start_label) 102 | self.end_loss = self.sparse_nll_loss(probs=self.end_probs, labels=self.end_label) 103 | self.all_params = tf.trainable_variables() 104 | self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss)) 105 | if hp.weight_decay > 0: 106 | with tf.variable_scope('l2_loss'): 107 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params]) 108 | self.loss += hp.weight_decay * l2_loss 109 | 110 | 111 | 112 | # Training Scheme 113 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 114 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 115 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 116 | 117 | # Summary 118 | tf.summary.scalar('mean_loss', self.loss) 119 | self.merged = tf.summary.merge_all() 120 | 121 | 122 | 123 | def sparse_nll_loss(self, probs, labels, epsilon=1e-9, scope=None): 124 | """ 125 | negative log likelyhood loss 126 | """ 127 | with tf.name_scope(scope, "log_loss"): 128 | labels = tf.one_hot(labels, tf.shape(probs)[1], axis=1) 129 | losses = - tf.reduce_sum(labels * tf.log(probs + epsilon), 1) 130 | return losses 131 | 132 | 133 | if __name__ == '__main__': 134 | # Load vocabulary 135 | word2idx, idx2word = load_vocabs() 136 | 137 | # Construct graph 138 | g = Graph("train"); print("Graph loaded") 139 | 140 | # Start session 141 | sv = tf.train.Supervisor(graph=g.graph, 142 | logdir=hp.logdir, 143 | save_model_secs=0) 144 | with 
sv.managed_session() as sess: 145 | with open('acc_mean_loss.rec', 'w') as rec: 146 | for epoch in range(1, hp.num_epochs+1): 147 | if sv.should_stop(): break 148 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 149 | sess.run(g.train_op) 150 | #acc, los = sess.run(g.acc), sess.run(g.mean_loss) 151 | los = sess.run(g.loss) 152 | if not los > float('-inf'): 153 | print("loss: ",los) 154 | gs = sess.run(g.global_step) 155 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 156 | break 157 | 158 | rec.write('epochs {}\tstep {}\t{}\t{}\n'.format(epoch, step, 'Loss:', los)) 159 | 160 | gs = sess.run(g.global_step) 161 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 162 | 163 | print("Done") 164 | 165 | 166 | -------------------------------------------------------------------------------- /transformer_RC/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Pointer Network for selecting answer spans, as described in: 19 | https://openreview.net/pdf?id=B1-q5Pqxl 20 | """ 21 | 22 | import tensorflow as tf 23 | import tensorflow.contrib as tc 24 | 25 | 26 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 27 | """ 28 | Implements a dynamic rnn that can store scores in the pointer network, 29 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 30 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 31 | store the scores directly in the hidden unit. 
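A minimal usage sketch (it mirrors PointerNetDecoder.decode further below; `batch_size`, `hidden_size`, `passage_vectors` and `init_state` are illustrative placeholders, not arguments of this function):

        cell = PointerNetLSTMCell(hidden_size, passage_vectors)   # emits per-position scores at each step
        fake_inputs = tf.zeros([batch_size, 2, 1])                # two decode steps: answer start, answer end
        sequence_len = tf.tile([2], [batch_size])
        outputs, state = custom_dynamic_rnn(cell, fake_inputs, sequence_len, init_state)
        # outputs: (batch_size, 2, passage_len) -- the score logits written by the cell at each step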
32 | Args: 33 | cell: RNN cell 34 | inputs: the input sequence to rnn 35 | inputs_len: valid length 36 | initial_state: initial_state of the cell 37 | Returns: 38 | outputs and state 39 | """ 40 | batch_size, max_time = tf.shape(inputs)[0], tf.shape(inputs)[1] 41 | 42 | 43 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 44 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 45 | 46 | # record cells 47 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 48 | 49 | # iter timesteps 50 | t0 = tf.constant(0, dtype=tf.int32) 51 | if initial_state is not None: 52 | 53 | # initial state 54 | s0 = initial_state 55 | else: 56 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 57 | # 58 | f0 = tf.zeros([batch_size], dtype=tf.bool) 59 | 60 | def loop_fn(t, prev_s, emit_ta, finished): 61 | """ 62 | the loop function of rnn 63 | """ 64 | cur_x = inputs_ta.read(t) 65 | # use pre cell state and current input to predict the scores and current state 66 | ### dimension of scores: (batchsize, hiddensize) equal to cur_x 67 | ### the score is the logit of each position at each sample 68 | 69 | ### current state is a tuple (hidden state, cell state) 70 | scores, cur_state = cell(cur_x, prev_s) 71 | 72 | # copy through 73 | scores = tf.where(finished, tf.zeros_like(scores), scores) 74 | 75 | if isinstance(cell, tc.rnn.LSTMCell): 76 | cur_c, cur_h = cur_state 77 | prev_c, prev_h = prev_s 78 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 79 | tf.where(finished, prev_h, cur_h)) 80 | else: 81 | cur_state = tf.where(finished, prev_s, cur_state) 82 | 83 | ### store the logit scores of each step 84 | emit_ta = emit_ta.write(t, scores) 85 | finished = tf.greater_equal(t + 1, inputs_len) 86 | return [t + 1, cur_state, emit_ta, finished] 87 | 88 | _, state, emit_ta, _ = tf.while_loop( 89 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 90 | body=loop_fn, 91 | loop_vars=(t0, s0, emit_ta, f0), 92 | parallel_iterations=32, 93 | swap_memory=False) 94 | 95 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 96 | return outputs, state 97 | 98 | 99 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 100 | """ 101 | Applies attend pooling to a set of vectors according to a reference vector. 
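A minimal usage sketch (this is how PointerNetDecoder.decode below builds its initial state; the names are illustrative placeholders):

        ref_vector = tf.Variable(tf.random_normal([1, hidden_size]))          # trainable reference vector
        pooled = attend_pooling(question_vectors, ref_vector, hidden_size)    # (batch, q_len, C) -> (batch, C)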
102 | Args: 103 | pooling_vectors: the vectors to pool 104 | ref_vector: the reference vector 105 | hidden_size: the hidden size for attention function 106 | scope: score name 107 | Returns: 108 | the pooled vector 109 | pooling to vector with one dimension 110 | """ 111 | with tf.variable_scope(scope or 'attend_pooling'): 112 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 113 | activation_fn=None, biases_initializer=None) 114 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 115 | num_outputs=hidden_size, 116 | activation_fn=None)) 117 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 118 | scores = tf.nn.softmax(logits, 1) 119 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 120 | return pooled_vector 121 | 122 | 123 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 124 | """ 125 | Implements the Pointer Network Cell 126 | """ 127 | def __init__(self, num_units, context_to_point): 128 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 129 | self.context_to_point = context_to_point 130 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 131 | num_outputs=self._num_units, 132 | activation_fn=None) 133 | 134 | def __call__(self, inputs, state, scope=None): 135 | (c_prev, m_prev) = state 136 | with tf.variable_scope(scope or type(self).__name__): 137 | U = tf.tanh(self.fc_context 138 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 139 | num_outputs=self._num_units, 140 | activation_fn=None), 141 | 1)) 142 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 143 | scores = tf.nn.softmax(logits, 1) 144 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 145 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 146 | return tf.squeeze(scores, -1), lstm_state 147 | 148 | 149 | class PointerNetDecoder(object): 150 | """ 151 | Implements the Pointer Network 152 | """ 153 | def __init__(self, hidden_size): 154 | self.hidden_size = hidden_size 155 | 156 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 157 | """ 158 | Use Pointer Network to compute the probabilities of each position 159 | to be start and end of the answer 160 | Args: 161 | passage_vectors: the encoded passage vectors 162 | question_vectors: the encoded question vectors 163 | init_with_question: if set to be true, 164 | we will use the question_vectors to init the state of Pointer Network 165 | Returns: 166 | the probs of evary position to be start and end of the answer 167 | """ 168 | with tf.variable_scope('pn_decoder'): 169 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 170 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 171 | if init_with_question: 172 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 173 | trainable=True, name="random_attn_vector") 174 | pooled_question_rep = tc.layers.fully_connected( 175 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 176 | num_outputs=self.hidden_size, activation_fn=None 177 | ) 178 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 179 | else: 180 | init_state = None 181 | with tf.variable_scope('fw'): 182 | fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 183 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, init_state) 184 | with tf.variable_scope('bw'): 185 | bw_cell = 
PointerNetLSTMCell(self.hidden_size, passage_vectors) 186 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 187 | 188 | # the start prob and end prob of each position 189 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 190 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 191 | return start_prob, end_prob 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /en-zh_NMT/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | 5 | import tensorflow as tf 6 | 7 | from hyperparams import seq2seq_Hyperparams as hp 8 | from data_load import get_batch_data, load_en_vocab, load_zh_vocab 9 | from modules import * 10 | import os, codecs 11 | from tqdm import tqdm 12 | 13 | class Graph(): 14 | def __init__(self, is_training=True): 15 | self.graph = tf.Graph() 16 | with self.graph.as_default(): 17 | if is_training: 18 | self.x, self.y, self.num_batch = get_batch_data() # (N, T) 19 | else: # inference 20 | self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 21 | self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) 22 | 23 | # define decoder inputs 24 | self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2: 25 | 26 | # Load vocabulary 27 | en2idx, idx2en = load_en_vocab() 28 | zh2idx, idx2zh = load_zh_vocab() 29 | 30 | # Encoder 31 | with tf.variable_scope("encoder"): 32 | ## Embedding 33 | self.enc = embedding(self.x, 34 | vocab_size=len(en2idx), 35 | num_units=hp.hidden_units, 36 | scale=True, 37 | scope="enc_embed") 38 | 39 | ## Positional Encoding 40 | if hp.sinusoid: 41 | self.enc += positional_encoding(self.x, 42 | num_units=hp.hidden_units, 43 | zero_pad=False, 44 | scale=False, 45 | scope="enc_pe") 46 | else: 47 | self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), 48 | vocab_size=hp.maxlen, 49 | num_units=hp.hidden_units, 50 | zero_pad=False, 51 | scale=False, 52 | scope="enc_pe") 53 | 54 | 55 | ## Dropout 56 | self.enc = tf.layers.dropout(self.enc, 57 | rate=hp.dropout_rate, 58 | training=tf.convert_to_tensor(is_training)) 59 | 60 | ## Blocks 61 | for i in range(hp.num_blocks): 62 | with tf.variable_scope("num_blocks_{}".format(i)): 63 | ### Multihead Attention 64 | self.enc = multihead_attention(queries=self.enc, 65 | keys=self.enc, 66 | num_units=hp.hidden_units, 67 | num_heads=hp.num_heads, 68 | dropout_rate=hp.dropout_rate, 69 | is_training=is_training, 70 | causality=False) 71 | 72 | ### Feed Forward 73 | self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units]) 74 | 75 | # Decoder 76 | with tf.variable_scope("decoder"): 77 | ## Embedding 78 | self.dec = embedding(self.decoder_inputs, 79 | vocab_size=len(zh2idx), 80 | num_units=hp.hidden_units, 81 | scale=True, 82 | scope="dec_embed") 83 | 84 | ## Positional Encoding 85 | if hp.sinusoid: 86 | self.dec += positional_encoding(self.decoder_inputs, 87 | vocab_size=hp.maxlen, 88 | num_units=hp.hidden_units, 89 | zero_pad=False, 90 | scale=False, 91 | scope="dec_pe") 92 | else: 93 | self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), 94 | vocab_size=hp.maxlen, 95 | num_units=hp.hidden_units, 96 | zero_pad=False, 97 | scale=False, 98 | scope="dec_pe") 99 | 100 | ## Dropout 101 | self.dec = tf.layers.dropout(self.dec, 102 | 
rate=hp.dropout_rate, 103 | training=tf.convert_to_tensor(is_training)) 104 | 105 | ## Blocks 106 | for i in range(hp.num_blocks): 107 | with tf.variable_scope("num_blocks_{}".format(i)): 108 | ## Multihead Attention ( self-attention) 109 | self.dec = multihead_attention(queries=self.dec, 110 | keys=self.dec, 111 | num_units=hp.hidden_units, 112 | num_heads=hp.num_heads, 113 | dropout_rate=hp.dropout_rate, 114 | is_training=is_training, 115 | causality=True, 116 | scope="self_attention") 117 | 118 | ## Multihead Attention ( vanilla attention) 119 | self.dec = multihead_attention(queries=self.dec, 120 | keys=self.enc, 121 | num_units=hp.hidden_units, 122 | num_heads=hp.num_heads, 123 | dropout_rate=hp.dropout_rate, 124 | is_training=is_training, 125 | causality=False, 126 | scope="vanilla_attention") 127 | 128 | ## Feed Forward 129 | self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units]) 130 | 131 | # Final linear projection 132 | self.logits = tf.layers.dense(self.dec, len(zh2idx)) 133 | self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) 134 | self.istarget = tf.to_float(tf.not_equal(self.y, 0)) 135 | self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget)) 136 | tf.summary.scalar('acc', self.acc) 137 | 138 | if is_training: 139 | # Loss 140 | self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(zh2idx))) 141 | self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed) 142 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget)) 143 | 144 | # Training Scheme 145 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 146 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 147 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 148 | 149 | # Summary 150 | tf.summary.scalar('mean_loss', self.mean_loss) 151 | self.merged = tf.summary.merge_all() 152 | 153 | if __name__ == '__main__': 154 | # Load vocabulary 155 | en2idx, idx2en = load_en_vocab() 156 | zh2idx, idx2zh = load_zh_vocab() 157 | 158 | # Construct graph 159 | g = Graph("train"); print("Graph loaded") 160 | 161 | # Start session 162 | sv = tf.train.Supervisor(graph=g.graph, 163 | logdir=hp.logdir, 164 | save_model_secs=0) 165 | with sv.managed_session() as sess: 166 | for epoch in range(1, hp.num_epochs+1): 167 | if sv.should_stop(): break 168 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 169 | sess.run(g.train_op) 170 | 171 | gs = sess.run(g.global_step) 172 | sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs)) 173 | 174 | print("Done") 175 | 176 | 177 | -------------------------------------------------------------------------------- /en-zh_NMT/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import collections 7 | import math 8 | 9 | 10 | 11 | def normalize(inputs, 12 | epsilon = 1e-8, 13 | scope="ln", 14 | reuse=None): 15 | '''Applies layer normalization. 16 | 17 | Args: 18 | inputs: A tensor with 2 or more dimensions, where the first dimension has 19 | `batch_size`. 20 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 21 | scope: Optional scope for `variable_scope`. 
22 | reuse: Boolean, whether to reuse the weights of a previous layer 23 | by the same name. 24 | 25 | Returns: 26 | A tensor with the same shape and data dtype as `inputs`. 27 | ''' 28 | with tf.variable_scope(scope, reuse=reuse): 29 | inputs_shape = inputs.get_shape() 30 | params_shape = inputs_shape[-1:] 31 | 32 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 33 | beta= tf.Variable(tf.zeros(params_shape)) 34 | gamma = tf.Variable(tf.ones(params_shape)) 35 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 36 | outputs = gamma * normalized + beta 37 | 38 | return outputs 39 | 40 | def embedding(inputs, 41 | vocab_size, 42 | num_units, 43 | zero_pad=True, 44 | scale=True, 45 | scope="embedding", 46 | reuse=None): 47 | '''Embeds a given tensor. 48 | 49 | Args: 50 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 51 | to be looked up in `lookup table`. 52 | vocab_size: An int. Vocabulary size. 53 | num_units: An int. Number of embedding hidden units. 54 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 55 | should be constant zeros. 56 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 57 | scope: Optional scope for `variable_scope`. 58 | reuse: Boolean, whether to reuse the weights of a previous layer 59 | by the same name. 60 | 61 | Returns: 62 | A `Tensor` with one more rank than inputs's. The last dimensionality 63 | should be `num_units`. 64 | 65 | For example, 66 | 67 | ``` 68 | import tensorflow as tf 69 | 70 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 71 | outputs = embedding(inputs, 6, 2, zero_pad=True) 72 | with tf.Session() as sess: 73 | sess.run(tf.global_variables_initializer()) 74 | print sess.run(outputs) 75 | >> 76 | [[[ 0. 0. ] 77 | [ 0.09754146 0.67385566] 78 | [ 0.37864095 -0.35689294]] 79 | 80 | [[-1.01329422 -1.09939694] 81 | [ 0.7521342 0.38203377] 82 | [-0.04973143 -0.06210355]]] 83 | ``` 84 | 85 | ``` 86 | import tensorflow as tf 87 | 88 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 89 | outputs = embedding(inputs, 6, 2, zero_pad=False) 90 | with tf.Session() as sess: 91 | sess.run(tf.global_variables_initializer()) 92 | print sess.run(outputs) 93 | >> 94 | [[[-0.19172323 -0.39159766] 95 | [-0.43212751 -0.66207761] 96 | [ 1.03452027 -0.26704335]] 97 | 98 | [[-0.11634696 -0.35983452] 99 | [ 0.50208133 0.53509563] 100 | [ 1.22204471 -0.96587461]]] 101 | ``` 102 | ''' 103 | with tf.variable_scope(scope, reuse=reuse): 104 | lookup_table = tf.get_variable('lookup_table', 105 | dtype=tf.float32, 106 | shape=[vocab_size, num_units], 107 | initializer=tf.contrib.layers.xavier_initializer()) 108 | if zero_pad: 109 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 110 | lookup_table[1:, :]), 0) 111 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 112 | 113 | if scale: 114 | outputs = outputs * (num_units ** 0.5) 115 | 116 | return outputs 117 | 118 | 119 | def positional_encoding(inputs, 120 | num_units, 121 | zero_pad=True, 122 | scale=True, 123 | scope="positional_encoding", 124 | reuse=None): 125 | '''Sinusoidal Positional_Encoding. 126 | 127 | Args: 128 | inputs: A 2d Tensor with shape of (N, T). 129 | num_units: Output dimensionality 130 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 131 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 132 | scope: Optional scope for `variable_scope`. 
133 | reuse: Boolean, whether to reuse the weights of a previous layer 134 | by the same name. 135 | 136 | Returns: 137 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 138 | ''' 139 | 140 | N, T = inputs.get_shape().as_list() 141 | with tf.variable_scope(scope, reuse=reuse): 142 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 143 | 144 | # First part of the PE function: sin and cos argument 145 | position_enc = np.array([ 146 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 147 | for pos in range(T)]) 148 | 149 | # Second part, apply the cosine to even columns and sin to odds. 150 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 151 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 152 | 153 | # Convert to a tensor 154 | lookup_table = tf.convert_to_tensor(position_enc) 155 | 156 | if zero_pad: 157 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 158 | lookup_table[1:, :]), 0) 159 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 160 | 161 | if scale: 162 | outputs = outputs * num_units**0.5 163 | 164 | return outputs 165 | 166 | 167 | 168 | def multihead_attention(queries, 169 | keys, 170 | num_units=None, 171 | num_heads=8, 172 | dropout_rate=0, 173 | is_training=True, 174 | causality=False, 175 | scope="multihead_attention", 176 | reuse=None): 177 | '''Applies multihead attention. 178 | 179 | Args: 180 | queries: A 3d tensor with shape of [N, T_q, C_q]. 181 | keys: A 3d tensor with shape of [N, T_k, C_k]. 182 | num_units: A scalar. Attention size. 183 | dropout_rate: A floating point number. 184 | is_training: Boolean. Controller of mechanism for dropout. 185 | causality: Boolean. If true, units that reference the future are masked. 186 | num_heads: An int. Number of heads. 187 | scope: Optional scope for `variable_scope`. 188 | reuse: Boolean, whether to reuse the weights of a previous layer 189 | by the same name. 
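For example (an illustrative sketch of the encoder-style self-attention call made in train.py; the shapes and hyperparameter values shown are placeholders, not requirements of this function),

    ```
    import tensorflow as tf

    enc = tf.random_normal([2, 10, 512])        # (N, T, C) embedded inputs
    enc = multihead_attention(queries=enc,
                              keys=enc,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=False)  # -> (2, 10, 512)
    ```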
190 | 191 | Returns 192 | A 3d tensor with shape of (N, T_q, C) 193 | ''' 194 | with tf.variable_scope(scope, reuse=reuse): 195 | # Set the fall back option for num_units 196 | if num_units is None: 197 | num_units = queries.get_shape().as_list[-1] 198 | 199 | # Linear projections 200 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 201 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 203 | 204 | # Split and concat 205 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 206 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 208 | 209 | # Multiplication 210 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 211 | 212 | # Scale 213 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 214 | 215 | # Key Masking 216 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 217 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 218 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 219 | 220 | paddings = tf.ones_like(outputs)*(-2**32+1) 221 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 222 | 223 | # Causality = Future blinding 224 | if causality: 225 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 226 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 227 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 228 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 229 | 230 | paddings = tf.ones_like(masks)*(-2**32+1) 231 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 232 | 233 | # Activation 234 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 235 | 236 | # Query Masking 237 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 238 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 239 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 240 | outputs *= query_masks # broadcasting. (N, T_q, C) 241 | 242 | # Dropouts 243 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 244 | 245 | # Weighted sum 246 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 247 | 248 | # Restore shape 249 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 250 | 251 | # Residual connection 252 | outputs += queries 253 | 254 | # Normalize 255 | outputs = normalize(outputs) # (N, T_q, C) 256 | 257 | return outputs 258 | 259 | def feedforward(inputs, 260 | num_units=[2048, 512], 261 | scope="multihead_attention", 262 | reuse=None): 263 | '''Point-wise feed forward net. 264 | 265 | Args: 266 | inputs: A 3d tensor with shape of [N, T, C]. 267 | num_units: A list of two integers. 268 | scope: Optional scope for `variable_scope`. 269 | reuse: Boolean, whether to reuse the weights of a previous layer 270 | by the same name. 
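For example (a minimal sketch; the second entry of num_units must equal the channel size C of `inputs`, because of the residual connection at the end),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 10, 512])           # (N, T, C)
    y = feedforward(x, num_units=[4*512, 512])   # 2048-unit inner layer, projected back to C=512
    ```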
271 | 272 | Returns: 273 | A 3d tensor with the same shape and dtype as inputs 274 | ''' 275 | with tf.variable_scope(scope, reuse=reuse): 276 | # Inner layer 277 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, 278 | "activation": tf.nn.relu, "use_bias": True} 279 | outputs = tf.layers.conv1d(**params) 280 | 281 | # Readout layer 282 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, 283 | "activation": None, "use_bias": True} 284 | outputs = tf.layers.conv1d(**params) 285 | 286 | # Residual connection 287 | outputs += inputs 288 | 289 | # Normalize 290 | outputs = normalize(outputs) 291 | 292 | return outputs 293 | 294 | def label_smoothing(inputs, epsilon=0.1): 295 | '''Applies label smoothing. See https://arxiv.org/abs/1512.00567. 296 | 297 | Args: 298 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 299 | epsilon: Smoothing rate. 300 | 301 | For example, 302 | 303 | ``` 304 | import tensorflow as tf 305 | inputs = tf.convert_to_tensor([[[0, 0, 1], 306 | [0, 1, 0], 307 | [1, 0, 0]], 308 | 309 | [[1, 0, 0], 310 | [1, 0, 0], 311 | [0, 1, 0]]], tf.float32) 312 | 313 | outputs = label_smoothing(inputs) 314 | 315 | with tf.Session() as sess: 316 | print(sess.run([outputs])) 317 | 318 | >> 319 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 320 | [ 0.03333334, 0.93333334, 0.03333334], 321 | [ 0.93333334, 0.03333334, 0.03333334]], 322 | 323 | [[ 0.93333334, 0.03333334, 0.03333334], 324 | [ 0.93333334, 0.03333334, 0.03333334], 325 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 326 | ``` 327 | ''' 328 | K = inputs.get_shape().as_list()[-1] # number of channels 329 | return ((1-epsilon) * inputs) + (epsilon / K) 330 | 331 | 332 | def bleu(pred_tokens, label_tokens, k): 333 | len_pred, len_label = len(pred_tokens), len(label_tokens) 334 | score = math.exp(min(0, 1 - len_label / len_pred)) 335 | for n in range(1, k + 1): 336 | num_matches, label_subs = 0, collections.defaultdict(int) 337 | for i in range(len_label - n + 1): 338 | label_subs[''.join(label_tokens[i: i + n])] += 1 339 | for i in range(len_pred - n + 1): 340 | if label_subs[''.join(pred_tokens[i: i + n])] > 0: 341 | num_matches += 1 342 | label_subs[''.join(pred_tokens[i: i + n])] -= 1 343 | score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n)) 344 | return score 345 | 346 | 347 | 348 | -------------------------------------------------------------------------------- /transformer_infersent/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | 9 | 10 | def normalize(inputs, 11 | epsilon = 1e-8, 12 | scope="ln", 13 | reuse=None): 14 | '''Applies layer normalization. 15 | 16 | Args: 17 | inputs: A tensor with 2 or more dimensions, where the first dimension has 18 | `batch_size`. 19 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 20 | scope: Optional scope for `variable_scope`. 21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A tensor with the same shape and data dtype as `inputs`. 
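For example (a minimal sketch; the shape is arbitrary as long as the last axis is the feature axis),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 5, 512])
    y = normalize(x)    # layer-normalized over the last axis; same shape and dtype as x
    ```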
26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | inputs_shape = inputs.get_shape() 29 | params_shape = inputs_shape[-1:] 30 | 31 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 32 | beta= tf.Variable(tf.zeros(params_shape)) 33 | gamma = tf.Variable(tf.ones(params_shape)) 34 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 35 | outputs = gamma * normalized + beta 36 | 37 | return outputs 38 | 39 | def embedding(inputs, 40 | vocab_size, 41 | num_units, 42 | zero_pad=True, 43 | scale=True, 44 | scope="embedding", 45 | reuse=None): 46 | '''Embeds a given tensor. 47 | 48 | Args: 49 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 50 | to be looked up in `lookup table`. 51 | vocab_size: An int. Vocabulary size. 52 | num_units: An int. Number of embedding hidden units. 53 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 54 | should be constant zeros. 55 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 56 | scope: Optional scope for `variable_scope`. 57 | reuse: Boolean, whether to reuse the weights of a previous layer 58 | by the same name. 59 | 60 | Returns: 61 | A `Tensor` with one more rank than inputs's. The last dimensionality 62 | should be `num_units`. 63 | 64 | For example, 65 | 66 | ``` 67 | import tensorflow as tf 68 | 69 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 70 | outputs = embedding(inputs, 6, 2, zero_pad=True) 71 | with tf.Session() as sess: 72 | sess.run(tf.global_variables_initializer()) 73 | print sess.run(outputs) 74 | >> 75 | [[[ 0. 0. ] 76 | [ 0.09754146 0.67385566] 77 | [ 0.37864095 -0.35689294]] 78 | 79 | [[-1.01329422 -1.09939694] 80 | [ 0.7521342 0.38203377] 81 | [-0.04973143 -0.06210355]]] 82 | ``` 83 | 84 | ``` 85 | import tensorflow as tf 86 | 87 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 88 | outputs = embedding(inputs, 6, 2, zero_pad=False) 89 | with tf.Session() as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | print sess.run(outputs) 92 | >> 93 | [[[-0.19172323 -0.39159766] 94 | [-0.43212751 -0.66207761] 95 | [ 1.03452027 -0.26704335]] 96 | 97 | [[-0.11634696 -0.35983452] 98 | [ 0.50208133 0.53509563] 99 | [ 1.22204471 -0.96587461]]] 100 | ``` 101 | ''' 102 | with tf.variable_scope(scope, reuse=reuse): 103 | lookup_table = tf.get_variable('lookup_table', 104 | dtype=tf.float32, 105 | shape=[vocab_size, num_units], 106 | initializer=tf.contrib.layers.xavier_initializer()) 107 | if zero_pad: 108 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 109 | lookup_table[1:, :]), 0) 110 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 111 | 112 | if scale: 113 | outputs = outputs * (num_units ** 0.5) 114 | 115 | return outputs 116 | 117 | 118 | def positional_encoding(inputs, 119 | num_units, 120 | zero_pad=True, 121 | scale=True, 122 | scope="positional_encoding", 123 | reuse=None): 124 | '''Sinusoidal Positional_Encoding. 125 | 126 | Args: 127 | inputs: A 2d Tensor with shape of (N, T). 128 | num_units: Output dimensionality 129 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 130 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 131 | scope: Optional scope for `variable_scope`. 132 | reuse: Boolean, whether to reuse the weights of a previous layer 133 | by the same name. 
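For example (an illustrative sketch matching how the training graphs call it, with zero_pad=False and scale=False; only the static shape of `inputs` is used, its values are ignored),

    ```
    import tensorflow as tf

    inputs = tf.zeros([2, 10], tf.int32)     # (N, T) token ids
    pe = positional_encoding(inputs, num_units=512, zero_pad=False, scale=False)
    # pe: (2, 10, 512) sinusoidal position encodings
    ```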
134 | 135 | Returns: 136 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 137 | ''' 138 | 139 | N, T = inputs.get_shape().as_list() 140 | with tf.variable_scope(scope, reuse=reuse): 141 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 142 | 143 | # First part of the PE function: sin and cos argument 144 | position_enc = np.array([ 145 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 146 | for pos in range(T)]) 147 | 148 | # Second part, apply the cosine to even columns and sin to odds. 149 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 150 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 151 | 152 | # Convert to a tensor 153 | lookup_table = tf.convert_to_tensor(position_enc) 154 | 155 | if zero_pad: 156 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 157 | lookup_table[1:, :]), 0) 158 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 159 | 160 | if scale: 161 | outputs = outputs * num_units**0.5 162 | 163 | return outputs 164 | 165 | 166 | 167 | def multihead_attention(queries, 168 | keys, 169 | num_units=None, 170 | num_heads=8, 171 | dropout_rate=0, 172 | is_training=True, 173 | causality=False, 174 | scope="multihead_attention", 175 | reuse=None): 176 | '''Applies multihead attention. 177 | 178 | Args: 179 | queries: A 3d tensor with shape of [N, T_q, C_q]. 180 | keys: A 3d tensor with shape of [N, T_k, C_k]. 181 | num_units: A scalar. Attention size. 182 | dropout_rate: A floating point number. 183 | is_training: Boolean. Controller of mechanism for dropout. 184 | causality: Boolean. If true, units that reference the future are masked. 185 | num_heads: An int. Number of heads. 186 | scope: Optional scope for `variable_scope`. 187 | reuse: Boolean, whether to reuse the weights of a previous layer 188 | by the same name. 
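For example (an illustrative sketch of a decoder-style self-attention call with future positions masked; shapes and hyperparameter values are placeholders),

    ```
    import tensorflow as tf

    dec = tf.random_normal([2, 10, 512])        # (N, T_q, C)
    dec = multihead_attention(queries=dec,
                              keys=dec,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=True,   # each position attends only to itself and earlier positions
                              scope="self_attention")
    ```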
189 | 190 | Returns 191 | A 3d tensor with shape of (N, T_q, C) 192 | ''' 193 | with tf.variable_scope(scope, reuse=reuse): 194 | # Set the fall back option for num_units 195 | if num_units is None: 196 | num_units = queries.get_shape().as_list[-1] 197 | 198 | # Linear projections 199 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 200 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 201 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | 203 | # Split and concat 204 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 205 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 206 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | 208 | # Multiplication 209 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 210 | 211 | # Scale 212 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 213 | 214 | # Key Masking 215 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 216 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 217 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 218 | 219 | paddings = tf.ones_like(outputs)*(-2**32+1) 220 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 221 | 222 | # Causality = Future blinding 223 | if causality: 224 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 225 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 226 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 227 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 228 | 229 | paddings = tf.ones_like(masks)*(-2**32+1) 230 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 231 | 232 | # Activation 233 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 234 | 235 | # Query Masking 236 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 237 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 238 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 239 | outputs *= query_masks # broadcasting. (N, T_q, C) 240 | 241 | # Dropouts 242 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 243 | 244 | # Weighted sum 245 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 246 | 247 | # Restore shape 248 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 249 | 250 | # Residual connection 251 | outputs += queries 252 | 253 | # Normalize 254 | outputs = normalize(outputs) # (N, T_q, C) 255 | 256 | return outputs 257 | 258 | def feedforward(inputs, 259 | num_units=[2048, 512], 260 | scope="multihead_attention", 261 | reuse=None): 262 | '''Point-wise feed forward net. 263 | 264 | Args: 265 | inputs: A 3d tensor with shape of [N, T, C]. 266 | num_units: A list of two integers. 267 | scope: Optional scope for `variable_scope`. 268 | reuse: Boolean, whether to reuse the weights of a previous layer 269 | by the same name. 
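For example (a minimal sketch; because the convolutions use kernel_size=1, the same two-layer transform is applied independently at every time step),

    ```
    import tensorflow as tf

    x = tf.random_normal([2, 10, 512])
    y = feedforward(x, num_units=[2048, 512])   # position-wise; output shape (2, 10, 512)
    ```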
270 | 271 | Returns: 272 | A 3d tensor with the same shape and dtype as inputs 273 | ''' 274 | with tf.variable_scope(scope, reuse=reuse): 275 | # Inner layer 276 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1, 277 | "activation": tf.nn.relu, "use_bias": True} 278 | outputs = tf.layers.conv1d(**params) 279 | 280 | # Readout layer 281 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1, 282 | "activation": None, "use_bias": True} 283 | outputs = tf.layers.conv1d(**params) 284 | 285 | # Residual connection 286 | outputs += inputs 287 | 288 | # Normalize 289 | outputs = normalize(outputs) 290 | 291 | return outputs 292 | 293 | def label_smoothing(inputs, epsilon=0.1): 294 | '''Applies label smoothing. See https://arxiv.org/abs/1512.00567. 295 | 296 | Args: 297 | inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary. 298 | epsilon: Smoothing rate. 299 | 300 | For example, 301 | 302 | ``` 303 | import tensorflow as tf 304 | inputs = tf.convert_to_tensor([[[0, 0, 1], 305 | [0, 1, 0], 306 | [1, 0, 0]], 307 | 308 | [[1, 0, 0], 309 | [1, 0, 0], 310 | [0, 1, 0]]], tf.float32) 311 | 312 | outputs = label_smoothing(inputs) 313 | 314 | with tf.Session() as sess: 315 | print(sess.run([outputs])) 316 | 317 | >> 318 | [array([[[ 0.03333334, 0.03333334, 0.93333334], 319 | [ 0.03333334, 0.93333334, 0.03333334], 320 | [ 0.93333334, 0.03333334, 0.03333334]], 321 | 322 | [[ 0.93333334, 0.03333334, 0.03333334], 323 | [ 0.93333334, 0.03333334, 0.03333334], 324 | [ 0.03333334, 0.93333334, 0.03333334]]], dtype=float32)] 325 | ``` 326 | ''' 327 | K = inputs.get_shape().as_list()[-1] # number of channels 328 | return ((1-epsilon) * inputs) + (epsilon / K) 329 | 330 | 331 | 332 | 333 | def BME_cut(seq, label): 334 | ''' 335 | Tokenization with sequence tagging of /B/E/S/M 336 | represent the word begin/end/single word/in the middle respectively. 337 | Args: 338 | inputs: seq:str, label:str. 339 | output:List. 340 | 341 | Examples: 342 | >>> BME_cut('l i k e m e','B M M E B E') 343 | like me 344 | ''' 345 | if isinstance(seq, str): 346 | seq = seq.split() 347 | if isinstance(label, str): 348 | label = label.split() 349 | 350 | seq = seq + ['PAD']*(len(label) - len(seq)) 351 | assert len(seq) == len(label), "seq label is not compliable...{}, {}".format(seq, label) 352 | tokens = [] 353 | i = 0 354 | while i < len(seq): 355 | if label[i] == 'S': 356 | tokens.append(seq[i]) 357 | elif label[i] == 'B': 358 | tmp = seq[i] 359 | while i+1 < len(seq) and label[i+1] == 'M': 360 | tmp += seq[i+1] 361 | i += 1 362 | if not i+1 < len(seq): break 363 | #print(label[i+1], seq[i+1]) 364 | if label[i+1] == 'E': 365 | tmp += seq[i+1] 366 | tokens.append(tmp) 367 | i += 1 368 | return ' '.join(tokens) -------------------------------------------------------------------------------- /transformer_text_Classfication/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python3 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | 9 | 10 | def normalize(inputs, 11 | epsilon = 1e-8, 12 | scope="ln", 13 | reuse=None): 14 | '''Applies layer normalization. 15 | 16 | Args: 17 | inputs: A tensor with 2 or more dimensions, where the first dimension has 18 | `batch_size`. 19 | epsilon: A floating number. A very small number for preventing ZeroDivision Error. 20 | scope: Optional scope for `variable_scope`. 
21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A tensor with the same shape and data dtype as `inputs`. 26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | inputs_shape = inputs.get_shape() 29 | params_shape = inputs_shape[-1:] 30 | 31 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 32 | beta= tf.Variable(tf.zeros(params_shape)) 33 | gamma = tf.Variable(tf.ones(params_shape)) 34 | normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) ) 35 | outputs = gamma * normalized + beta 36 | 37 | return outputs 38 | 39 | def embedding(inputs, 40 | vocab_size, 41 | num_units, 42 | zero_pad=True, 43 | scale=True, 44 | scope="embedding", 45 | reuse=None): 46 | '''Embeds a given tensor. 47 | 48 | Args: 49 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 50 | to be looked up in `lookup table`. 51 | vocab_size: An int. Vocabulary size. 52 | num_units: An int. Number of embedding hidden units. 53 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 54 | should be constant zeros. 55 | scale: A boolean. If True. the outputs is multiplied by sqrt num_units. 56 | scope: Optional scope for `variable_scope`. 57 | reuse: Boolean, whether to reuse the weights of a previous layer 58 | by the same name. 59 | 60 | Returns: 61 | A `Tensor` with one more rank than inputs's. The last dimensionality 62 | should be `num_units`. 63 | 64 | For example, 65 | 66 | ``` 67 | import tensorflow as tf 68 | 69 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 70 | outputs = embedding(inputs, 6, 2, zero_pad=True) 71 | with tf.Session() as sess: 72 | sess.run(tf.global_variables_initializer()) 73 | print sess.run(outputs) 74 | >> 75 | [[[ 0. 0. ] 76 | [ 0.09754146 0.67385566] 77 | [ 0.37864095 -0.35689294]] 78 | 79 | [[-1.01329422 -1.09939694] 80 | [ 0.7521342 0.38203377] 81 | [-0.04973143 -0.06210355]]] 82 | ``` 83 | 84 | ``` 85 | import tensorflow as tf 86 | 87 | inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3))) 88 | outputs = embedding(inputs, 6, 2, zero_pad=False) 89 | with tf.Session() as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | print sess.run(outputs) 92 | >> 93 | [[[-0.19172323 -0.39159766] 94 | [-0.43212751 -0.66207761] 95 | [ 1.03452027 -0.26704335]] 96 | 97 | [[-0.11634696 -0.35983452] 98 | [ 0.50208133 0.53509563] 99 | [ 1.22204471 -0.96587461]]] 100 | ``` 101 | ''' 102 | with tf.variable_scope(scope, reuse=reuse): 103 | lookup_table = tf.get_variable('lookup_table', 104 | dtype=tf.float32, 105 | shape=[vocab_size, num_units], 106 | initializer=tf.contrib.layers.xavier_initializer()) 107 | if zero_pad: 108 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 109 | lookup_table[1:, :]), 0) 110 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 111 | 112 | if scale: 113 | outputs = outputs * (num_units ** 0.5) 114 | 115 | return outputs 116 | 117 | 118 | def positional_encoding(inputs, 119 | num_units, 120 | zero_pad=True, 121 | scale=True, 122 | scope="positional_encoding", 123 | reuse=None): 124 | '''Sinusoidal Positional_Encoding. 125 | 126 | Args: 127 | inputs: A 2d Tensor with shape of (N, T). 128 | num_units: Output dimensionality 129 | zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero 130 | scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) 131 | scope: Optional scope for `variable_scope`. 
132 | reuse: Boolean, whether to reuse the weights of a previous layer 133 | by the same name. 134 | 135 | Returns: 136 | A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' 137 | ''' 138 | 139 | N, T = inputs.get_shape().as_list() 140 | with tf.variable_scope(scope, reuse=reuse): 141 | position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1]) 142 | 143 | # First part of the PE function: sin and cos argument 144 | position_enc = np.array([ 145 | [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] 146 | for pos in range(T)]) 147 | 148 | # Second part, apply the cosine to even columns and sin to odds. 149 | position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i 150 | position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 151 | 152 | # Convert to a tensor 153 | lookup_table = tf.convert_to_tensor(position_enc) 154 | 155 | if zero_pad: 156 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 157 | lookup_table[1:, :]), 0) 158 | outputs = tf.nn.embedding_lookup(lookup_table, position_ind) 159 | 160 | if scale: 161 | outputs = outputs * num_units**0.5 162 | 163 | return outputs 164 | 165 | 166 | 167 | def multihead_attention(queries, 168 | keys, 169 | num_units=None, 170 | num_heads=8, 171 | dropout_rate=0, 172 | is_training=True, 173 | causality=False, 174 | scope="multihead_attention", 175 | reuse=None): 176 | '''Applies multihead attention. 177 | 178 | Args: 179 | queries: A 3d tensor with shape of [N, T_q, C_q]. 180 | keys: A 3d tensor with shape of [N, T_k, C_k]. 181 | num_units: A scalar. Attention size. 182 | dropout_rate: A floating point number. 183 | is_training: Boolean. Controller of mechanism for dropout. 184 | causality: Boolean. If true, units that reference the future are masked. 185 | num_heads: An int. Number of heads. 186 | scope: Optional scope for `variable_scope`. 187 | reuse: Boolean, whether to reuse the weights of a previous layer 188 | by the same name. 
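For example (an illustrative sketch of vanilla cross-attention, where queries and keys come from different sequences and may have different lengths, as in the seq2seq decoder elsewhere in this repo),

    ```
    import tensorflow as tf

    dec = tf.random_normal([2, 8, 512])     # (N, T_q, C)
    enc = tf.random_normal([2, 10, 512])    # (N, T_k, C)
    out = multihead_attention(queries=dec,
                              keys=enc,
                              num_units=512,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True,
                              causality=False,
                              scope="vanilla_attention")   # -> (2, 8, 512)
    ```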
189 | 190 | Returns 191 | A 3d tensor with shape of (N, T_q, C) 192 | ''' 193 | with tf.variable_scope(scope, reuse=reuse): 194 | # Set the fall back option for num_units 195 | if num_units is None: 196 | num_units = queries.get_shape().as_list[-1] 197 | 198 | # Linear projections 199 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C) 200 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 201 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C) 202 | 203 | # Split and concat 204 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 205 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 206 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 207 | 208 | # Multiplication 209 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) 210 | 211 | # Scale 212 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 213 | 214 | # Key Masking 215 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 216 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 217 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 218 | 219 | paddings = tf.ones_like(outputs)*(-2**32+1) 220 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 221 | 222 | # Causality = Future blinding 223 | if causality: 224 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 225 | #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 226 | tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() 227 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 228 | 229 | paddings = tf.ones_like(masks)*(-2**32+1) 230 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 231 | 232 | # Activation 233 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 234 | 235 | # Query Masking 236 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 237 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 238 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 239 | outputs *= query_masks # broadcasting. (N, T_q, C) 240 | 241 | # Dropouts 242 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 243 | 244 | # Weighted sum 245 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 246 | 247 | # Restore shape 248 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C) 249 | 250 | # Residual connection 251 | outputs += queries 252 | 253 | # Normalize 254 | outputs = normalize(outputs) # (N, T_q, C) 255 | 256 | return outputs 257 | 258 | def feedforward(inputs, 259 | num_units=[2048, 512], 260 | scope="multihead_attention", 261 | reuse=None): 262 | '''Point-wise feed forward net. 263 | 264 | Args: 265 | inputs: A 3d tensor with shape of [N, T, C]. 266 | num_units: A list of two integers. 267 | scope: Optional scope for `variable_scope`. 268 | reuse: Boolean, whether to reuse the weights of a previous layer 269 | by the same name. 
270 | 
271 |     Returns:
272 |       A 3d tensor with the same shape and dtype as inputs
273 |     '''
274 |     with tf.variable_scope(scope, reuse=reuse):
275 |         # Inner layer
276 |         params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
277 |                   "activation": tf.nn.relu, "use_bias": True}
278 |         outputs = tf.layers.conv1d(**params)
279 | 
280 |         # Readout layer
281 |         params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
282 |                   "activation": None, "use_bias": True}
283 |         outputs = tf.layers.conv1d(**params)
284 | 
285 |         # Residual connection
286 |         outputs += inputs
287 | 
288 |         # Normalize
289 |         outputs = normalize(outputs)
290 | 
291 |         return outputs
292 | 
293 | def label_smoothing(inputs, epsilon=0.1):
294 |     '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
295 | 
296 |     Args:
297 |       inputs: A 3d tensor with shape of [N, T, V], where V is the vocabulary size.
298 |       epsilon: Smoothing rate.
299 | 
300 |     For example,
301 | 
302 |     ```
303 |     import tensorflow as tf
304 |     inputs = tf.convert_to_tensor([[[0, 0, 1],
305 |                                     [0, 1, 0],
306 |                                     [1, 0, 0]],
307 | 
308 |                                    [[1, 0, 0],
309 |                                     [1, 0, 0],
310 |                                     [0, 1, 0]]], tf.float32)
311 | 
312 |     outputs = label_smoothing(inputs)
313 | 
314 |     with tf.Session() as sess:
315 |         print(sess.run([outputs]))
316 | 
317 |     >>
318 |     [array([[[ 0.03333334,  0.03333334,  0.93333334],
319 |              [ 0.03333334,  0.93333334,  0.03333334],
320 |              [ 0.93333334,  0.03333334,  0.03333334]],
321 | 
322 |             [[ 0.93333334,  0.03333334,  0.03333334],
323 |              [ 0.93333334,  0.03333334,  0.03333334],
324 |              [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
325 |     ```
326 |     '''
327 |     K = inputs.get_shape().as_list()[-1]  # number of channels
328 |     return ((1-epsilon) * inputs) + (epsilon / K)
329 | 
330 | 
331 | 
332 | 
333 | def BME_cut(seq, label):
334 |     '''Recovers tokens from a B/M/E/S tag sequence, where the tags mark the
335 |     beginning, middle and end of a multi-character word, or a single-character
336 |     word, respectively.
337 |     Args:
338 |         seq: str (space-separated characters) or list. label: str or list of tags.
339 |     Returns: a string of space-separated tokens.
340 | 
341 |     Examples:
342 |     >>> BME_cut('l i k e m e', 'B M M E B E')
343 |     'like me'
344 |     '''
345 |     if isinstance(seq, str):
346 |         seq = seq.split()
347 |     if isinstance(label, str):
348 |         label = label.split()
349 | 
350 |     seq = seq + ['PAD']*(len(label) - len(seq))
351 |     assert len(seq) == len(label), "seq and label lengths do not match...{}, {}".format(seq, label)
352 |     tokens = []
353 |     i = 0
354 |     while i < len(seq):
355 |         if label[i] == 'S':
356 |             tokens.append(seq[i])
357 |         elif label[i] == 'B':
358 |             tmp = seq[i]
359 |             while i+1 < len(seq) and label[i+1] == 'M':
360 |                 tmp += seq[i+1]
361 |                 i += 1
362 |             if not i+1 < len(seq): break  # ran off the end without seeing an 'E' tag
363 |             # label[i+1] is now either 'E' (close the word) or the start of the next tag
364 |             if label[i+1] == 'E':
365 |                 tmp += seq[i+1]
366 |                 tokens.append(tmp)
367 |         i += 1
368 |     return ' '.join(tokens)
--------------------------------------------------------------------------------
/transformer_RC/modules.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python3
3 | 
4 | import tensorflow as tf
5 | import numpy as np
6 | import math
7 | import collections  # used by bleu() below
8 | 
9 | 
10 | def normalize(inputs,
11 |               epsilon=1e-8,
12 |               scope="ln",
13 |               reuse=None):
14 |     '''Applies layer normalization.
15 | 
16 |     Args:
17 |       inputs: A tensor with 2 or more dimensions, where the first dimension is
18 |         `batch_size`.
19 |       epsilon: A small float added to the variance to avoid division by zero.
20 |       scope: Optional scope for `variable_scope`.
21 |       reuse: Boolean, whether to reuse the weights of a previous layer
22 |         by the same name.
23 | 
24 |     Returns:
25 |       A tensor with the same shape and dtype as `inputs`.
26 |     '''
27 |     with tf.variable_scope(scope, reuse=reuse):
28 |         inputs_shape = inputs.get_shape()
29 |         params_shape = inputs_shape[-1:]
30 | 
31 |         mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
32 |         beta = tf.Variable(tf.zeros(params_shape))
33 |         gamma = tf.Variable(tf.ones(params_shape))
34 |         normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
35 |         outputs = gamma * normalized + beta
36 | 
37 |         return outputs
38 | 
39 | 
40 | def embedding(inputs,
41 |               vocab_size,
42 |               num_units,
43 |               pretrained_embedding=None,
44 |               zero_pad=True,
45 |               scale=True,
46 |               scope="embedding",
47 |               reuse=None):
48 |     '''Embeds a given tensor.
49 | 
50 |     Args:
51 |       inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up.
52 |       vocab_size: An int. Vocabulary size.
53 |       num_units: An int. Number of embedding hidden units.
54 |       pretrained_embedding: Optional pre-trained embedding matrix of shape [vocab_size, d].
55 |       zero_pad: A boolean. If True, all the values of the first row (id 0)
56 |         should be constant zeros.
57 |       scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
58 |       scope: Optional scope for `variable_scope`.
59 |       reuse: Boolean, whether to reuse the weights of a previous layer
60 |         by the same name.
61 |     '''
62 |     with tf.variable_scope(scope, reuse=reuse):
63 |         if pretrained_embedding is not None:
64 |             if pretrained_embedding.shape[-1] != num_units:  # static width check at graph-construction time
65 |                 pre_emb = tf.convert_to_tensor(pretrained_embedding, tf.float32)
66 |                 fusion_emb = tf.layers.dense(pre_emb, num_units, activation=tf.nn.tanh)
67 |                 fusion_emb = normalize(fusion_emb)
68 | 
69 |                 lookup_table = fusion_emb
70 |             else:
71 |                 lookup_table = tf.convert_to_tensor(pretrained_embedding, tf.float32)
72 |         else:
73 |             lookup_table = tf.get_variable('lookup_table',
74 |                                            dtype=tf.float32,
75 |                                            shape=[vocab_size, num_units],
76 |                                            initializer=tf.contrib.layers.xavier_initializer())
77 |         if zero_pad:
78 |             lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
79 |                                       lookup_table[1:, :]), 0)
80 | 
81 | 
82 | 
83 |         outputs = tf.nn.embedding_lookup(lookup_table, inputs)
84 | 
85 |         if scale:
86 |             outputs = outputs * (num_units ** 0.5)
87 | 
88 |         return outputs
89 | 
90 | 
91 | def positional_encoding(inputs,
92 |                         num_units,
93 |                         zero_pad=True,
94 |                         scale=True,
95 |                         scope="positional_encoding",
96 |                         reuse=None):
97 |     '''Sinusoidal positional encoding.
98 | 
99 |     Args:
100 |       inputs: A 2d Tensor with shape of (N, T).
101 |       num_units: Output dimensionality.
102 |       zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zeros.
103 |       scale: Boolean. If True, the output will be multiplied by sqrt(num_units) (see the paper for details).
104 |       scope: Optional scope for `variable_scope`.
105 |       reuse: Boolean, whether to reuse the weights of a previous layer
106 |         by the same name.
107 | 
108 |     Returns:
109 |       A `Tensor` with one more rank than `inputs`, whose last dimension is `num_units`.
110 |     '''
111 | 
112 |     N, T = inputs.get_shape().as_list()
113 |     with tf.variable_scope(scope, reuse=reuse):
114 |         position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])
115 | 
116 |         # First part of the PE function: sin and cos argument
117 |         position_enc = np.array([
118 |             [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)]
119 |             for pos in range(T)])
120 | 
121 |         # Second part, apply the cosine to even columns and sin to odds.
122 |         position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
123 |         position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
124 | 
125 |         # Convert to a float32 tensor so it can be combined with the token embeddings
126 |         lookup_table = tf.convert_to_tensor(position_enc, tf.float32)
127 | 
128 |         if zero_pad:
129 |             lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
130 |                                       lookup_table[1:, :]), 0)
131 |         outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
132 | 
133 |         if scale:
134 |             outputs = outputs * num_units**0.5
135 | 
136 |         return outputs
137 | 
138 | 
139 | 
140 | def multihead_attention(queries,
141 |                         keys,
142 |                         num_units=None,
143 |                         num_heads=8,
144 |                         dropout_rate=0,
145 |                         is_training=True,
146 |                         causality=False,
147 |                         scope="multihead_attention",
148 |                         reuse=None):
149 |     '''Applies multihead attention.
150 | 
151 |     Args:
152 |       queries: A 3d tensor with shape of [N, T_q, C_q].
153 |       keys: A 3d tensor with shape of [N, T_k, C_k].
154 |       num_units: A scalar. Attention size.
155 |       dropout_rate: A floating point number.
156 |       is_training: Boolean. If True, dropout is applied.
157 |       causality: Boolean. If True, units that reference the future are masked.
158 |       num_heads: An int. Number of heads.
159 |       scope: Optional scope for `variable_scope`.
160 |       reuse: Boolean, whether to reuse the weights of a previous layer
161 |         by the same name.
162 | 
163 |     Returns:
164 |       A 3d tensor with shape of (N, T_q, C).
165 |     '''
166 |     with tf.variable_scope(scope, reuse=reuse):
167 |         # Set the fallback option for num_units
168 |         if num_units is None:
169 |             num_units = queries.get_shape().as_list()[-1]
170 | 
171 |         # Linear projections
172 |         Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
173 |         K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
174 |         V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
175 | 
176 |         # Split and concat
177 |         Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
178 |         K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
179 |         V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
180 | 
181 |         # Multiplication
182 |         # (h*N, T_q, C/h) @ (h*N, C/h, T_k)
183 |         outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
184 | 
185 |         # Scale
186 |         outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
187 | 
188 |         # Key Masking
189 |         key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
190 |         key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
191 |         key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)
192 | 
193 |         paddings = tf.ones_like(outputs)*(-2**32+1)
194 |         outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
195 | 
196 |         # Causality = Future blinding
197 |         if causality:
198 |             diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
199 |             #tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
200 |             tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
201 |             masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)
202 | 
203 |             paddings = tf.ones_like(masks)*(-2**32+1)
204 |             outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
205 | 
206 |         # Activation
207 |         outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)
208 | 
209 |         # Query Masking
210 |         query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
211 |         query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
212 |         query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
213 |         outputs *= query_masks  # (h*N, T_q, T_k)
214 | 
215 |         # Dropouts
216 |         outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
217 | 
218 |         # Weighted sum
219 |         outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)
220 | 
221 |         # Restore shape
222 |         outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)
223 | 
224 |         # Residual connection
225 |         outputs += queries
226 | 
227 |         # Normalize
228 |         outputs = normalize(outputs)  # (N, T_q, C)
229 | 
230 |         return outputs
231 | 
232 | def feedforward(inputs,
233 |                 num_units=[2048, 512],
234 |                 scope="feedforward",
235 |                 reuse=None):
236 |     '''Point-wise feed forward net.
237 | 
238 |     Args:
239 |       inputs: A 3d tensor with shape of [N, T, C].
240 |       num_units: A list of two integers.
241 |       scope: Optional scope for `variable_scope`.
242 |       reuse: Boolean, whether to reuse the weights of a previous layer
243 |         by the same name.
244 | 
245 |     Returns:
246 |       A 3d tensor with the same shape and dtype as inputs
247 |     '''
248 |     with tf.variable_scope(scope, reuse=reuse):
249 |         # Inner layer
250 |         params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
251 |                   "activation": tf.nn.relu, "use_bias": True}
252 |         outputs = tf.layers.conv1d(**params)
253 | 
254 |         # Readout layer
255 |         params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
256 |                   "activation": None, "use_bias": True}
257 |         outputs = tf.layers.conv1d(**params)
258 | 
259 |         # Residual connection
260 |         outputs += inputs
261 | 
262 |         # Normalize
263 |         outputs = normalize(outputs)
264 | 
265 |         return outputs
266 | 
267 | def label_smoothing(inputs, epsilon=0.1):
268 |     '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
269 | 
270 |     Args:
271 |       inputs: A 3d tensor with shape of [N, T, V], where V is the vocabulary size.
272 |       epsilon: Smoothing rate.
273 | 
274 |     For example,
275 | 
276 |     ```
277 |     import tensorflow as tf
278 |     inputs = tf.convert_to_tensor([[[0, 0, 1],
279 |                                     [0, 1, 0],
280 |                                     [1, 0, 0]],
281 | 
282 |                                    [[1, 0, 0],
283 |                                     [1, 0, 0],
284 |                                     [0, 1, 0]]], tf.float32)
285 | 
286 |     outputs = label_smoothing(inputs)
287 | 
288 |     with tf.Session() as sess:
289 |         print(sess.run([outputs]))
290 | 
291 |     >>
292 |     [array([[[ 0.03333334,  0.03333334,  0.93333334],
293 |              [ 0.03333334,  0.93333334,  0.03333334],
294 |              [ 0.93333334,  0.03333334,  0.03333334]],
295 | 
296 |             [[ 0.93333334,  0.03333334,  0.03333334],
297 |              [ 0.93333334,  0.03333334,  0.03333334],
298 |              [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
299 |     ```
300 |     '''
301 |     K = inputs.get_shape().as_list()[-1]  # number of channels
302 |     return ((1-epsilon) * inputs) + (epsilon / K)
303 | 
304 | 
305 | 
306 | 
307 | def BME_cut(seq, label):
308 |     '''Recovers tokens from a B/M/E/S tag sequence, where the tags mark the
309 |     beginning, middle and end of a multi-character word, or a single-character
310 |     word, respectively.
311 |     Args:
312 |         seq: str (space-separated characters) or list. label: str or list of tags.
313 |     Returns: a string of space-separated tokens.
314 | 
315 |     Examples:
316 |     >>> BME_cut('l i k e m e', 'B M M E B E')
317 |     'like me'
318 |     '''
319 |     if isinstance(seq, str):
320 |         seq = seq.split()
321 |     if isinstance(label, str):
322 |         label = label.split()
323 | 
324 |     seq = seq + ['PAD']*(len(label) - len(seq))
325 |     assert len(seq) == len(label), "seq and label lengths do not match...{}, {}".format(seq, label)
326 |     tokens = []
327 |     i = 0
328 |     while i < len(seq):
329 |         if label[i] == 'S':
330 |             tokens.append(seq[i])
331 |         elif label[i] == 'B':
332 |             tmp = seq[i]
333 |             while i+1 < len(seq) and label[i+1] == 'M':
334 |                 tmp += seq[i+1]
335 |                 i += 1
336 |             if not i+1 < len(seq): break  # ran off the end without seeing an 'E' tag
337 |             # label[i+1] is now either 'E' (close the word) or the start of the next tag
338 |             if label[i+1] == 'E':
339 |                 tmp += seq[i+1]
340 |                 tokens.append(tmp)
341 |         i += 1
342 |     return ' '.join(tokens)
343 | 
344 | 
345 | 
346 | 
347 | 
348 | def bleu(pred_tokens, label_tokens, k):
349 |     """Hand-rolled BLEU score over n-grams of length 1..k."""
350 |     len_pred, len_label = len(pred_tokens), len(label_tokens)
351 |     score = math.exp(min(0, 1 - len_label / len_pred))  # brevity penalty
352 |     for n in range(1, k + 1):
353 |         num_matches, label_subs = 0, collections.defaultdict(int)
354 |         for i in range(len_label - n + 1):
355 |             label_subs[''.join(label_tokens[i: i + n])] += 1
356 |         for i in range(len_pred - n + 1):
357 |             if label_subs[''.join(pred_tokens[i: i + n])] > 0:
358 |                 num_matches += 1
359 |                 label_subs[''.join(pred_tokens[i: i + n])] -= 1
360 |         score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))  # clipped n-gram precision, weight 0.5**n
361 |     return score
362 | 
363 | 
--------------------------------------------------------------------------------
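`BME_cut` and `bleu` above are plain-Python helpers, so they can be sanity-checked without building a TensorFlow graph. A minimal sketch, assuming it is run from inside `transformer_RC/` so that `modules` resolves to the file above (importing the module still pulls in TensorFlow for the other functions):

```
# Sanity checks for the pure-Python helpers in transformer_RC/modules.py.
from modules import BME_cut, bleu

# Recover words from a character sequence and its B/M/E/S tag sequence.
assert BME_cut('l i k e m e', 'B M M E B E') == 'like me'

# BLEU over n-grams up to length 2: an exact match scores 1.0 ...
assert bleu(['like', 'me'], ['like', 'me'], k=2) == 1.0

# ... while partial overlap is discounted by both the n-gram precisions
# and the brevity penalty (roughly 0.49 for this pair).
print(bleu(['the', 'cat', 'sat'], ['the', 'cat', 'is', 'sitting'], k=2))
```

Note that the n-gram keys are built with `''.join(...)`, so this `bleu` is best suited to character-level tokens such as those produced by `BME_cut`; with longer tokens, concatenation can merge two distinct n-grams into the same key.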