├── .gitignore
├── rc
    ├── rnet
    │   ├── code
    │   │   ├── __init__.py
    │   │   ├── rnns
    │   │   │   ├── __init__.py
    │   │   │   ├── lrn.py
    │   │   │   ├── atr.py
    │   │   │   ├── cell.py
    │   │   │   ├── gru.py
    │   │   │   ├── rnn.py
    │   │   │   ├── sru.py
    │   │   │   └── lstm.py
    │   │   ├── download.sh
    │   │   └── evaluate-v1.1.py
    │   ├── config.py
    │   ├── test_lrn.sh
    │   └── train_lrn.sh
    ├── elmo_rnet
    │   ├── config.py
    │   ├── code
    │   │   ├── bilm
    │   │   │   ├── __init__.py
    │   │   │   └── elmo.py
    │   │   ├── rnns
    │   │   │   ├── __init__.py
    │   │   │   ├── lrn.py
    │   │   │   ├── atr.py
    │   │   │   ├── cell.py
    │   │   │   ├── gru.py
    │   │   │   ├── rnn.py
    │   │   │   ├── sru.py
    │   │   │   └── lstm.py
    │   │   ├── download.sh
    │   │   ├── cycle.py
    │   │   └── evaluate-v1.1.py
    │   ├── train_lrn.sh
    │   └── test_lrn.sh
    └── README.md
├── doc
    ├── code
    │   ├── utils
    │   │   ├── __init__.py
    │   │   ├── recorder.py
    │   │   ├── thread.py
    │   │   ├── initializer.py
    │   │   ├── saver.py
    │   │   └── cycle.py
    │   ├── bert
    │   │   ├── __init__.py
    │   │   ├── load.py
    │   │   ├── tokenizer.py
    │   │   └── vocab.py
    │   ├── lrs
    │   │   ├── vanillalr.py
    │   │   ├── epochlr.py
    │   │   ├── noamlr.py
    │   │   ├── lr.py
    │   │   ├── __init__.py
    │   │   ├── scorelr.py
    │   │   └── gnmtplr.py
    │   ├── rnns
    │   │   ├── __init__.py
    │   │   ├── lrn.py
    │   │   ├── atr.py
    │   │   ├── cell.py
    │   │   ├── gru.py
    │   │   ├── sru.py
    │   │   └── lstm.py
    │   ├── tasks.py
    │   ├── evalu.py
    │   └── vocab.py
    ├── config.py
    └── README.md
├── nli
    ├── code
    │   ├── utils
    │   │   ├── __init__.py
    │   │   ├── recorder.py
    │   │   ├── thread.py
    │   │   ├── initializer.py
    │   │   ├── saver.py
    │   │   └── cycle.py
    │   ├── bert
    │   │   ├── __init__.py
    │   │   ├── load.py
    │   │   ├── tokenizer.py
    │   │   └── vocab.py
    │   ├── lrs
    │   │   ├── vanillalr.py
    │   │   ├── epochlr.py
    │   │   ├── noamlr.py
    │   │   ├── lr.py
    │   │   ├── __init__.py
    │   │   ├── scorelr.py
    │   │   └── gnmtplr.py
    │   ├── scripts
    │   │   └── convert_to_plain.py
    │   ├── rnns
    │   │   ├── __init__.py
    │   │   ├── lrn.py
    │   │   ├── atr.py
    │   │   ├── cell.py
    │   │   ├── gru.py
    │   │   ├── sru.py
    │   │   └── lstm.py
    │   ├── evalu.py
    │   └── vocab.py
    ├── config.py
    ├── config_bert.py
    └── README.md
├── figures
    ├── ls_mem.png
    ├── memory.png
    └── san_corr.png
├── ner
    ├── code
    │   ├── scripts
    │   │   └── get_test_score.py
    │   ├── requirements.txt
    │   ├── ner_glove.py
    │   ├── callbacks.py
    │   ├── trainer.py
    │   └── tagger.py
    └── README.md
├── lm
    ├── code
    │   ├── locked_dropout.py
    │   ├── get_data.sh
    │   ├── embed_regularize.py
    │   ├── utils.py
    │   ├── generate.py
    │   ├── weight_drop.py
    │   └── data.py
    └── README.md
├── LICENSE
├── mt
    └── README.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
--------------------------------------------------------------------------------
/rc/rnet/code/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rc/rnet/config.py:
--------------------------------------------------------------------------------
1 | code/config.py
--------------------------------------------------------------------------------
/rc/elmo_rnet/config.py:
--------------------------------------------------------------------------------
1 | code/config.py
--------------------------------------------------------------------------------
/doc/code/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | 
--------------------------------------------------------------------------------
/nli/code/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | 
--------------------------------------------------------------------------------
/figures/ls_mem.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/ls_mem.png -------------------------------------------------------------------------------- /figures/memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/memory.png -------------------------------------------------------------------------------- /figures/san_corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/san_corr.png -------------------------------------------------------------------------------- /doc/code/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | from .bert import * 5 | from .load import * 6 | from .tokenizer import * 7 | -------------------------------------------------------------------------------- /nli/code/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | from .bert import * 5 | from .load import * 6 | from .tokenizer import * 7 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/bilm/__init__.py: -------------------------------------------------------------------------------- 1 | # Elmo Interface 2 | # Deep contextualized word representations 3 | 4 | from .data import Batcher, TokenBatcher 5 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \ 6 | dump_bilm_embeddings 7 | from .elmo import weight_layers 8 | 9 | -------------------------------------------------------------------------------- /rc/rnet/test_lrn.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_ROOT=XXX 4 | export PATH=$CUDA_ROOT/bin:$PATH 5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 6 | 7 | export CUDA_VISIBLE_DEVICES=0 8 | 9 | export name=log_lrn 10 | 11 | python config.py --mode test --cell lrn 12 | 13 | -------------------------------------------------------------------------------- /rc/rnet/train_lrn.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_ROOT=XXX 4 | export PATH=$CUDA_ROOT/bin:$PATH 5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 6 | 7 | export CUDA_VISIBLE_DEVICES=0 8 | 9 | export name=log_lrn 10 | 11 | python config.py --mode train --cell lrn 12 | 13 | -------------------------------------------------------------------------------- /rc/elmo_rnet/train_lrn.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | export CUDA_ROOT=XXX 4 | export PATH=$CUDA_ROOT/bin:$PATH 5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 6 | 7 | export CUDA_VISIBLE_DEVICES=0 8 | 9 | export name=log_lrn 10 | 11 | python config.py --mode train --cell lrn 12 | 13 | -------------------------------------------------------------------------------- /rc/elmo_rnet/test_lrn.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export CUDA_ROOT=XXX 4 | export PATH=$CUDA_ROOT/bin:$PATH 5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 6 | 7 | export CUDA_VISIBLE_DEVICES=0 8 | 9 | export name=log_lrn 10 | 11 | python config.py --mode test --cell lrn --batch_size 8 12 | 13 | -------------------------------------------------------------------------------- /doc/code/lrs/vanillalr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class VanillaLR(lr.Lr): 12 | """Very basic learning rate, constant learning rate""" 13 | def __init__(self, 14 | init_lr, 15 | name="vanilla_lr" 16 | ): 17 | super(VanillaLR, self).__init__(init_lr, name=name) 18 | -------------------------------------------------------------------------------- /nli/code/lrs/vanillalr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class VanillaLR(lr.Lr): 12 | """Very basic learning rate, constant learning rate""" 13 | def __init__(self, 14 | init_lr, 15 | name="vanilla_lr" 16 | ): 17 | super(VanillaLR, self).__init__(init_lr, name=name) 18 | -------------------------------------------------------------------------------- /ner/code/scripts/get_test_score.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | import sys 4 | import numpy as np 5 | 6 | 7 | def extract_dev_test_score(fname): 8 | test_score = float(open(fname, 'rU').readlines()[-1].strip()) 9 | 10 | return test_score 11 | 12 | 13 | cell_type = sys.argv[1] 14 | exp_dirs = sys.argv[2:] 15 | 16 | scores = [] 17 | for exp_dir in exp_dirs: 18 | test_score = extract_dev_test_score("{}/log.{}".format(exp_dir, cell_type)) 19 | scores.append(test_score) 20 | 21 | print(np.mean(scores), np.std(scores)) 22 | -------------------------------------------------------------------------------- /lm/code/locked_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | class LockedDropout(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, dropout=0.5): 10 | if not self.training or not dropout: 11 | return x 12 | m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) 13 | # mask = Variable(m, requires_grad=False) / (1 - dropout) 14 | mask = Variable(m.div_(1 - dropout), requires_grad=False) 15 | mask = mask.expand_as(x) 16 | return mask * x 17 | -------------------------------------------------------------------------------- /ner/code/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0rc1 2 | bleach==1.5.0 3 | boto==2.48.0 4 | bz2file==0.98 5 | certifi==2017.11.5 6 | chardet==3.0.4 7 | enum34==1.1.6 8 | gensim==3.1.0 9 | h5py==2.7.1 10 | html5lib==0.9999999 11 | idna==2.6 12 | Keras==2.2.0 13 | m2r==0.1.12 14 | Markdown==2.6.9 15 | numpy==1.13.3 16 | protobuf==3.5.1 17 | python-dateutil==2.6.0 18 | pytz==2017.2 19 | PyYAML==4.2b1 20 | requests==2.21.0 21 | scikit-learn==0.19.1 22 | scipy==1.0.0 23 | seqeval==0.0.3 24 | six==1.11.0 25 | 
smart-open==1.5.3 26 | tensorboard==1.8.0 27 | tensorflow>=1.12.1 28 | Theano==0.9.0 29 | urllib3>=1.24.2 30 | Werkzeug>=0.15.3 31 | allennlp==0.7.1 32 | -------------------------------------------------------------------------------- /nli/code/scripts/convert_to_plain.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | import sys 4 | 5 | data = open(sys.argv[1], 'r') 6 | data.readline() 7 | 8 | out_l = open(sys.argv[2]+".l", 'w') 9 | out_p = open(sys.argv[2]+".p", 'w') 10 | out_q = open(sys.argv[2]+".q", 'w') 11 | 12 | label = {'entailment': 0, 13 | 'neutral': 1, 14 | 'contradiction': 2} 15 | 16 | for line in data: 17 | l, p, q = line.strip().split('\t')[:3] 18 | if l not in label: 19 | continue 20 | out_l.write(str(label[l]) + '\n') 21 | out_p.write(p.replace('( ', '').replace(' )', '') + '\n') 22 | out_q.write(q.replace('( ', '').replace(' )', '') + '\n') 23 | 24 | out_l.close() 25 | out_p.close() 26 | out_q.close() 27 | -------------------------------------------------------------------------------- /lm/code/get_data.sh: -------------------------------------------------------------------------------- 1 | mkdir data 2 | cd data 3 | 4 | echo "- Downloading Penn Treebank (PTB)" 5 | mkdir -p penn 6 | cd penn 7 | URL="https://raw.githubusercontent.com/lanpa/tensorboard-pytorch-examples/master/word_language_model/data/penn" 8 | wget --quiet --continue $URL/train.txt 9 | wget --quiet --continue $URL/valid.txt 10 | wget --quiet --continue $URL/test.txt 11 | cd .. 12 | 13 | echo "- Downloading WikiText-2 (WT2)" 14 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 15 | unzip -q wikitext-2-v1.zip 16 | cd wikitext-2 17 | mv wiki.train.tokens train.txt 18 | mv wiki.valid.tokens valid.txt 19 | mv wiki.test.tokens test.txt 20 | 21 | echo "---" 22 | echo "Happy language modeling :)" 23 | 24 | cd .. 
25 | -------------------------------------------------------------------------------- /doc/code/lrs/epochlr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class EpochDecayLr(lr.Lr): 12 | """Decay the learning rate after each epoch""" 13 | def __init__(self, 14 | init_lr, 15 | decay=0.5, # learning rate decay rate 16 | name="epoch_decay_lr" 17 | ): 18 | super(EpochDecayLr, self).__init__(init_lr, name=name) 19 | 20 | self.decay = decay 21 | 22 | def after_epoch(self, eidx=None): 23 | if eidx is None: 24 | self.lrate = self.init_lrate * self.decay 25 | else: 26 | self.lrate = self.init_lrate * self.decay ** int(eidx) 27 | -------------------------------------------------------------------------------- /nli/code/lrs/epochlr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class EpochDecayLr(lr.Lr): 12 | """Decay the learning rate after each epoch""" 13 | def __init__(self, 14 | init_lr, 15 | decay=0.5, # learning rate decay rate 16 | name="epoch_decay_lr" 17 | ): 18 | super(EpochDecayLr, self).__init__(init_lr, name=name) 19 | 20 | self.decay = decay 21 | 22 | def after_epoch(self, eidx=None): 23 | if eidx is None: 24 | self.lrate = self.init_lrate * self.decay 25 | else: 26 | self.lrate = self.init_lrate * self.decay ** int(eidx) 27 | -------------------------------------------------------------------------------- /doc/code/utils/recorder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import json 8 | import tensorflow as tf 9 | 10 | 11 | class Recorder(object): 12 | """To save training processes, inspired by Nematus""" 13 | 14 | def load_from_json(self, file_name): 15 | tf.logging.info("Loading recoder file from {}".format(file_name)) 16 | record = json.load(open(file_name, 'rb')) 17 | record = dict((key.encode("UTF-8"), value) for (key, value) in record.items()) 18 | self.__dict__.update(record) 19 | 20 | def save_to_json(self, file_name): 21 | tf.logging.info("Saving recorder file into {}".format(file_name)) 22 | json.dump(self.__dict__, open(file_name, 'wb'), indent=2) 23 | -------------------------------------------------------------------------------- /nli/code/utils/recorder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import json 8 | import tensorflow as tf 9 | 10 | 11 | class Recorder(object): 12 | """To save training processes, inspired by Nematus""" 13 | 14 | def load_from_json(self, file_name): 15 | tf.logging.info("Loading recoder file from {}".format(file_name)) 16 | record = json.load(open(file_name, 'rb')) 17 | record = dict((key.encode("UTF-8"), value) for (key, value) in record.items()) 18 | self.__dict__.update(record) 19 | 20 | def save_to_json(self, file_name): 21 | tf.logging.info("Saving recorder file into {}".format(file_name)) 22 | 
json.dump(self.__dict__, open(file_name, 'wb'), indent=2) 23 | -------------------------------------------------------------------------------- /doc/code/rnns/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from rnns import gru, lstm, atr, sru, lrn 4 | 5 | 6 | def get_cell(cell_name, hidden_size, ln=False, scope=None): 7 | """Convert the cell_name into cell instance.""" 8 | cell_name = cell_name.lower() 9 | 10 | if cell_name == "gru": 11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru") 12 | elif cell_name == "lstm": 13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm") 14 | elif cell_name == "atr": 15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr") 16 | elif cell_name == "sru": 17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru") 18 | elif cell_name == "lrn": 19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn") 20 | else: 21 | raise NotImplementedError( 22 | "{} is not supported".format(cell_name)) 23 | -------------------------------------------------------------------------------- /nli/code/rnns/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from rnns import gru, lstm, atr, sru, lrn 4 | 5 | 6 | def get_cell(cell_name, hidden_size, ln=False, scope=None): 7 | """Convert the cell_name into cell instance.""" 8 | cell_name = cell_name.lower() 9 | 10 | if cell_name == "gru": 11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru") 12 | elif cell_name == "lstm": 13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm") 14 | elif cell_name == "atr": 15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr") 16 | elif cell_name == "sru": 17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru") 18 | elif cell_name == "lrn": 19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn") 20 | else: 21 | raise NotImplementedError( 22 | "{} is not supported".format(cell_name)) 23 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from rnns import gru, lstm, atr, sru, lrn 4 | 5 | 6 | def get_cell(cell_name, hidden_size, ln=False, scope=None): 7 | """Convert the cell_name into cell instance.""" 8 | cell_name = cell_name.lower() 9 | 10 | if cell_name == "gru": 11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru") 12 | elif cell_name == "lstm": 13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm") 14 | elif cell_name == "atr": 15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr") 16 | elif cell_name == "sru": 17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru") 18 | elif cell_name == "lrn": 19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn") 20 | else: 21 | raise NotImplementedError( 22 | "{} is not supported".format(cell_name)) 23 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from rnns import gru, lstm, atr, sru, lrn 4 | 5 | 6 | def get_cell(cell_name, hidden_size, ln=False, scope=None): 7 | """Convert the cell_name into cell instance.""" 8 | cell_name = cell_name.lower() 9 | 10 | if cell_name == "gru": 11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru") 12 | elif cell_name == "lstm": 
13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm") 14 | elif cell_name == "atr": 15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr") 16 | elif cell_name == "sru": 17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru") 18 | elif cell_name == "lrn": 19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn") 20 | else: 21 | raise NotImplementedError( 22 | "{} is not supported".format(cell_name)) 23 | -------------------------------------------------------------------------------- /doc/code/utils/thread.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import threading 8 | 9 | 10 | class threadsafe_iter: 11 | """Takes an iterator/generator and makes it thread-safe by 12 | serializing call to the `next` method of given iterator/generator. 13 | """ 14 | 15 | def __init__(self, it): 16 | self.it = it 17 | self.lock = threading.Lock() 18 | 19 | def __iter__(self): 20 | return self 21 | 22 | def __next__(self): 23 | return self.next() 24 | 25 | def next(self): 26 | with self.lock: 27 | return next(self.it) 28 | 29 | 30 | def threadsafe_generator(f): 31 | """A decorator that takes a generator function and makes it thread-safe. 32 | """ 33 | 34 | def g(*a, **kw): 35 | return threadsafe_iter(f(*a, **kw)) 36 | 37 | return g -------------------------------------------------------------------------------- /nli/code/utils/thread.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import threading 8 | 9 | 10 | class threadsafe_iter: 11 | """Takes an iterator/generator and makes it thread-safe by 12 | serializing call to the `next` method of given iterator/generator. 13 | """ 14 | 15 | def __init__(self, it): 16 | self.it = it 17 | self.lock = threading.Lock() 18 | 19 | def __iter__(self): 20 | return self 21 | 22 | def __next__(self): 23 | return self.next() 24 | 25 | def next(self): 26 | with self.lock: 27 | return next(self.it) 28 | 29 | 30 | def threadsafe_generator(f): 31 | """A decorator that takes a generator function and makes it thread-safe. 
32 | """ 33 | 34 | def g(*a, **kw): 35 | return threadsafe_iter(f(*a, **kw)) 36 | 37 | return g -------------------------------------------------------------------------------- /rc/rnet/code/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Download SQuAD 4 | SQUAD_DIR=~/data/squad 5 | mkdir -p $SQUAD_DIR 6 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json 7 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json 8 | 9 | # Download GloVe 10 | GLOVE_DIR=~/data/glove 11 | mkdir -p $GLOVE_DIR 12 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O $GLOVE_DIR/glove.840B.300d.zip 13 | unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR 14 | 15 | # Download Glove Character Embedding 16 | # wget https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt -O $GLOVE_DIR/glove.840B.300d-char.txt 17 | 18 | # Download fasttext 19 | # FASTTEXT_DIR=~/data/fasttext 20 | # mkdir -p $FASTTEXT_DIR 21 | # wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip -O $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip 22 | # unzip $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip -d $FASTTEXT_DIR 23 | 24 | # Download Spacy language models 25 | python3 -m spacy download en 26 | -------------------------------------------------------------------------------- /doc/code/lrs/noamlr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | 9 | from lrs import lr 10 | 11 | 12 | class NoamDecayLr(lr.Lr): 13 | """Decay the learning rate during each training step, follows Transformer""" 14 | def __init__(self, 15 | init_lr, # initial learning rate 16 | warmup_steps, # warmup step 17 | hidden_size, # model hidden size 18 | name="noam_decay_lr" # model name, no use 19 | ): 20 | super(NoamDecayLr, self).__init__(init_lr, name=name) 21 | 22 | self.warmup_steps = warmup_steps 23 | self.hidden_size = hidden_size 24 | 25 | def step(self, step): 26 | step = float(step) 27 | warmup_steps = float(self.warmup_steps) 28 | 29 | multiplier = float(self.hidden_size) ** -0.5 30 | decay = multiplier * np.minimum((step + 1) * (warmup_steps ** -1.5), 31 | (step + 1) ** -0.5) 32 | self.lrate = self.init_lrate * decay 33 | -------------------------------------------------------------------------------- /nli/code/lrs/noamlr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | 9 | from lrs import lr 10 | 11 | 12 | class NoamDecayLr(lr.Lr): 13 | """Decay the learning rate during each training step, follows Transformer""" 14 | def __init__(self, 15 | init_lr, # initial learning rate 16 | warmup_steps, # warmup step 17 | hidden_size, # model hidden size 18 | name="noam_decay_lr" # model name, no use 19 | ): 20 | super(NoamDecayLr, self).__init__(init_lr, name=name) 21 | 22 | self.warmup_steps = warmup_steps 23 | self.hidden_size = hidden_size 24 | 25 | def step(self, step): 26 | step = float(step) 27 | warmup_steps = float(self.warmup_steps) 28 | 29 | multiplier = float(self.hidden_size) ** -0.5 30 | decay = multiplier * 
np.minimum((step + 1) * (warmup_steps ** -1.5), 31 | (step + 1) ** -0.5) 32 | self.lrate = self.init_lrate * decay 33 | -------------------------------------------------------------------------------- /ner/code/ner_glove.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from wrapper import Sequence 4 | from utils import load_data_and_labels, load_glove 5 | 6 | 7 | if __name__ == '__main__': 8 | DATA_ROOT = os.path.join(os.path.dirname(__file__), os.environ["data_dir"]) 9 | EMBEDDING_PATH = os.path.join(os.path.dirname(__file__), os.environ["glove_dir"]) 10 | 11 | train_path = os.path.join(DATA_ROOT, 'train.txt') 12 | valid_path = os.path.join(DATA_ROOT, 'valid.txt') 13 | test_path = os.path.join(DATA_ROOT, 'test.txt') 14 | 15 | print('Loading data...') 16 | x_train, y_train = load_data_and_labels(train_path) 17 | x_valid, y_valid = load_data_and_labels(valid_path) 18 | x_test, y_test = load_data_and_labels(test_path) 19 | print(len(x_train), 'train sequences') 20 | print(len(x_valid), 'valid sequences') 21 | print(len(x_test), 'test sequences') 22 | 23 | embeddings = load_glove(EMBEDDING_PATH) 24 | 25 | # Use pre-trained word embeddings 26 | model = Sequence(cell_type=os.environ['cell_type'], embeddings=embeddings, initial_vocab=embeddings.keys()) 27 | # print(model.trainable_weights) 28 | 29 | model.fit(x_train, y_train, x_valid, y_valid, epochs=30) 30 | 31 | print('Testing the model...') 32 | print(model.score(x_test, y_test)) 33 | -------------------------------------------------------------------------------- /doc/code/lrs/lr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | # This is an abstract class that deals with 9 | # different learning rate decay strategy 10 | # Generally, we decay the learning rate with GPU computation 11 | # However, in this paper, we simply decay the learning rate 12 | # at CPU level, and feed the decayed lr into GPU for 13 | # optimization 14 | class Lr(object): 15 | def __init__(self, 16 | init_lrate, # initial learning rate 17 | name="lr", # learning rate name, no use 18 | ): 19 | self.name = name 20 | self.init_lrate = init_lrate # just record the init learning rate 21 | self.lrate = init_lrate # active learning rate, change with training 22 | 23 | # suppose the eidx starts from 1 24 | def before_epoch(self, eidx=None): 25 | pass 26 | 27 | def after_epoch(self, eidx=None): 28 | pass 29 | 30 | def step(self, step): 31 | pass 32 | 33 | def after_eval(self, eval_score): 34 | pass 35 | 36 | def get_lr(self): 37 | """Return the learning rate whenever you want""" 38 | return self.lrate 39 | -------------------------------------------------------------------------------- /nli/code/lrs/lr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | # This is an abstract class that deals with 9 | # different learning rate decay strategy 10 | # Generally, we decay the learning rate with GPU computation 11 | # However, in this paper, we simply decay the learning rate 12 | # at CPU level, and feed the decayed lr into GPU for 13 | # optimization 14 | class Lr(object): 15 | def __init__(self, 16 | init_lrate, # initial learning rate 17 | 
name="lr", # learning rate name, no use 18 | ): 19 | self.name = name 20 | self.init_lrate = init_lrate # just record the init learning rate 21 | self.lrate = init_lrate # active learning rate, change with training 22 | 23 | # suppose the eidx starts from 1 24 | def before_epoch(self, eidx=None): 25 | pass 26 | 27 | def after_epoch(self, eidx=None): 28 | pass 29 | 30 | def step(self, step): 31 | pass 32 | 33 | def after_eval(self, eval_score): 34 | pass 35 | 36 | def get_lr(self): 37 | """Return the learning rate whenever you want""" 38 | return self.lrate 39 | -------------------------------------------------------------------------------- /lm/code/embed_regularize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 8 | if dropout: 9 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout) 10 | mask = Variable(mask) 11 | masked_embed_weight = mask * embed.weight 12 | else: 13 | masked_embed_weight = embed.weight 14 | if scale: 15 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 16 | 17 | padding_idx = embed.padding_idx 18 | if padding_idx is None: 19 | padding_idx = -1 20 | X = F.embedding(words, masked_embed_weight, 21 | padding_idx, embed.max_norm, embed.norm_type, 22 | embed.scale_grad_by_freq, embed.sparse 23 | ) 24 | return X 25 | 26 | if __name__ == '__main__': 27 | V = 50 28 | h = 4 29 | bptt = 10 30 | batch_size = 2 31 | 32 | embed = torch.nn.Embedding(V, h) 33 | 34 | words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt)) 35 | words = torch.LongTensor(words) 36 | words = Variable(words) 37 | 38 | origX = embed(words) 39 | X = embedded_dropout(embed, words) 40 | 41 | print(origX) 42 | print(X) 43 | -------------------------------------------------------------------------------- /doc/code/utils/initializer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def get_initializer(params): 11 | if params.initializer == "uniform": 12 | max_val = params.initializer_gain 13 | return tf.random_uniform_initializer(-max_val, max_val) 14 | elif params.initializer == "normal": 15 | return tf.random_normal_initializer(0.0, params.initializer_gain) 16 | elif params.initializer == "normal_unit_scaling": 17 | return tf.variance_scaling_initializer(params.initializer_gain, 18 | mode="fan_avg", 19 | distribution="normal") 20 | elif params.initializer == "uniform_unit_scaling": 21 | return tf.variance_scaling_initializer(params.initializer_gain, 22 | mode="fan_avg", 23 | distribution="uniform") 24 | else: 25 | tf.logging.warn("Unrecognized initializer: %s" % params.initializer) 26 | tf.logging.warn("Return to default initializer: glorot_uniform_initializer") 27 | return tf.glorot_uniform_initializer() 28 | -------------------------------------------------------------------------------- /nli/code/utils/initializer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import 
print_function 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def get_initializer(params): 11 | if params.initializer == "uniform": 12 | max_val = params.initializer_gain 13 | return tf.random_uniform_initializer(-max_val, max_val) 14 | elif params.initializer == "normal": 15 | return tf.random_normal_initializer(0.0, params.initializer_gain) 16 | elif params.initializer == "normal_unit_scaling": 17 | return tf.variance_scaling_initializer(params.initializer_gain, 18 | mode="fan_avg", 19 | distribution="normal") 20 | elif params.initializer == "uniform_unit_scaling": 21 | return tf.variance_scaling_initializer(params.initializer_gain, 22 | mode="fan_avg", 23 | distribution="uniform") 24 | else: 25 | tf.logging.warn("Unrecognized initializer: %s" % params.initializer) 26 | tf.logging.warn("Return to default initializer: glorot_uniform_initializer") 27 | return tf.glorot_uniform_initializer() 28 | -------------------------------------------------------------------------------- /doc/code/lrs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from lrs import vanillalr, noamlr, scorelr, gnmtplr, epochlr 4 | 5 | 6 | def get_lr(params): 7 | 8 | strategy = params.lrate_strategy.lower() 9 | 10 | if strategy == "noam": 11 | return noamlr.NoamDecayLr( 12 | params.lrate, 13 | params.warmup_steps, 14 | params.hidden_size 15 | ) 16 | elif strategy == "gnmt+": 17 | return gnmtplr.GNMTPDecayLr( 18 | params.lrate, 19 | params.warmup_steps, 20 | params.nstable, 21 | params.lrdecay_start, 22 | params.lrdecay_end 23 | ) 24 | elif strategy == "epoch": 25 | return epochlr.EpochDecayLr( 26 | params.lrate, 27 | params.lrate_decay, 28 | ) 29 | elif strategy == "score": 30 | return scorelr.ScoreDecayLr( 31 | params.lrate, 32 | history_scores=[v[1] for v in params.recorder.valid_script_scores], 33 | decay=params.lrate_decay, 34 | patience=params.lrate_patience, 35 | ) 36 | elif strategy == "vanilla": 37 | return vanillalr.VanillaLR( 38 | params.lrate, 39 | ) 40 | else: 41 | raise NotImplementedError( 42 | "{} is not supported".format(strategy)) -------------------------------------------------------------------------------- /nli/code/lrs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from lrs import vanillalr, noamlr, scorelr, gnmtplr, epochlr 4 | 5 | 6 | def get_lr(params): 7 | 8 | strategy = params.lrate_strategy.lower() 9 | 10 | if strategy == "noam": 11 | return noamlr.NoamDecayLr( 12 | params.lrate, 13 | params.warmup_steps, 14 | params.hidden_size 15 | ) 16 | elif strategy == "gnmt+": 17 | return gnmtplr.GNMTPDecayLr( 18 | params.lrate, 19 | params.warmup_steps, 20 | params.nstable, 21 | params.lrdecay_start, 22 | params.lrdecay_end 23 | ) 24 | elif strategy == "epoch": 25 | return epochlr.EpochDecayLr( 26 | params.lrate, 27 | params.lrate_decay, 28 | ) 29 | elif strategy == "score": 30 | return scorelr.ScoreDecayLr( 31 | params.lrate, 32 | history_scores=[v[1] for v in params.recorder.valid_script_scores], 33 | decay=params.lrate_decay, 34 | patience=params.lrate_patience, 35 | ) 36 | elif strategy == "vanilla": 37 | return vanillalr.VanillaLR( 38 | params.lrate, 39 | ) 40 | else: 41 | raise NotImplementedError( 42 | "{} is not supported".format(strategy)) -------------------------------------------------------------------------------- /doc/code/bert/load.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 
from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import json 9 | import tensorflow as tf 10 | 11 | from .vocab import Vocab 12 | 13 | 14 | def load_vocab(model_dir): 15 | vocab = Vocab( 16 | vocab_file=os.path.join(model_dir, 'vocab.txt') 17 | ) 18 | return vocab 19 | 20 | 21 | def load_config(model_dir): 22 | with tf.gfile.GFile( 23 | os.path.join(model_dir, 'bert_config.json'), 24 | "r" 25 | ) as reader: 26 | text = reader.read() 27 | return json.loads(text) 28 | 29 | 30 | def load_model(session, model_dir): 31 | tf.logging.warn("Starting Loading BERT Pre-trained Model") 32 | ops = [] 33 | reader = tf.train.load_checkpoint( 34 | os.path.join(model_dir, "bert_model.ckpt") 35 | ) 36 | 37 | for var in tf.global_variables(): 38 | name = var.op.name 39 | name = name[name.find('/bert/')+1:] 40 | 41 | if reader.has_tensor(name) and 'Adam' not in name: 42 | tf.logging.info('{} **Good**'.format(name)) 43 | ops.append( 44 | tf.assign(var, reader.get_tensor(name))) 45 | else: 46 | tf.logging.warn("{} --Bad--".format(name)) 47 | restore_op = tf.group(*ops, name="restore_global_vars") 48 | session.run(restore_op) 49 | -------------------------------------------------------------------------------- /nli/code/bert/load.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import json 9 | import tensorflow as tf 10 | 11 | from .vocab import Vocab 12 | 13 | 14 | def load_vocab(model_dir): 15 | vocab = Vocab( 16 | vocab_file=os.path.join(model_dir, 'vocab.txt') 17 | ) 18 | return vocab 19 | 20 | 21 | def load_config(model_dir): 22 | with tf.gfile.GFile( 23 | os.path.join(model_dir, 'bert_config.json'), 24 | "r" 25 | ) as reader: 26 | text = reader.read() 27 | return json.loads(text) 28 | 29 | 30 | def load_model(session, model_dir): 31 | tf.logging.warn("Starting Loading BERT Pre-trained Model") 32 | ops = [] 33 | reader = tf.train.load_checkpoint( 34 | os.path.join(model_dir, "bert_model.ckpt") 35 | ) 36 | 37 | for var in tf.global_variables(): 38 | name = var.op.name 39 | name = name[name.find('/bert/')+1:] 40 | 41 | if reader.has_tensor(name) and 'Adam' not in name: 42 | tf.logging.info('{} **Good**'.format(name)) 43 | ops.append( 44 | tf.assign(var, reader.get_tensor(name))) 45 | else: 46 | tf.logging.warn("{} --Bad--".format(name)) 47 | restore_op = tf.group(*ops, name="restore_global_vars") 48 | session.run(restore_op) 49 | -------------------------------------------------------------------------------- /doc/code/lrs/scorelr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class ScoreDecayLr(lr.Lr): 12 | """Decay the learning rate after each evaluation""" 13 | def __init__(self, 14 | init_lr, 15 | history_scores=None, # evaluation history metric scores, such as BLEU 16 | decay=0.5, # learning rate decay rate 17 | patience=1, # decay after this number of bad counter 18 | name="score_decay_lr" # model name, no use 19 | ): 20 | super(ScoreDecayLr, self).__init__(init_lr, name=name) 21 | 22 | self.decay = decay 23 | self.patience = patience 24 | self.bad_counter = 0 25 | self.best_score = -1e9 26 | 
27 | if history_scores is not None: 28 | for score in history_scores: 29 | self.after_eval(score[1]) 30 | 31 | def after_eval(self, eval_score): 32 | if eval_score > self.best_score: 33 | self.best_score = eval_score 34 | self.bad_counter = 0 35 | else: 36 | self.bad_counter += 1 37 | if self.bad_counter >= self.patience: 38 | self.lrate = self.lrate * self.decay 39 | 40 | self.bad_counter = 0 41 | -------------------------------------------------------------------------------- /nli/code/lrs/scorelr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | from lrs import lr 9 | 10 | 11 | class ScoreDecayLr(lr.Lr): 12 | """Decay the learning rate after each evaluation""" 13 | def __init__(self, 14 | init_lr, 15 | history_scores=None, # evaluation history metric scores, such as BLEU 16 | decay=0.5, # learning rate decay rate 17 | patience=1, # decay after this number of bad counter 18 | name="score_decay_lr" # model name, no use 19 | ): 20 | super(ScoreDecayLr, self).__init__(init_lr, name=name) 21 | 22 | self.decay = decay 23 | self.patience = patience 24 | self.bad_counter = 0 25 | self.best_score = -1e9 26 | 27 | if history_scores is not None: 28 | for score in history_scores: 29 | self.after_eval(score[1]) 30 | 31 | def after_eval(self, eval_score): 32 | if eval_score > self.best_score: 33 | self.best_score = eval_score 34 | self.bad_counter = 0 35 | else: 36 | self.bad_counter += 1 37 | if self.bad_counter >= self.patience: 38 | self.lrate = self.lrate * self.decay 39 | 40 | self.bad_counter = 0 41 | -------------------------------------------------------------------------------- /ner/code/callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom callbacks. 3 | """ 4 | import numpy as np 5 | from keras.callbacks import Callback 6 | from seqeval.metrics import f1_score, classification_report 7 | 8 | 9 | class F1score(Callback): 10 | 11 | def __init__(self, seq, preprocessor=None, name="callback"): 12 | super(F1score, self).__init__() 13 | self.seq = seq 14 | self.p = preprocessor 15 | self.name = name 16 | 17 | def get_lengths(self, y_true): 18 | lengths = [] 19 | for y in np.argmax(y_true, -1): 20 | try: 21 | i = list(y).index(0) 22 | except ValueError: 23 | i = len(y) 24 | lengths.append(i) 25 | 26 | return lengths 27 | 28 | def on_epoch_end(self, epoch, logs={}): 29 | label_true = [] 30 | label_pred = [] 31 | for i in range(len(self.seq)): 32 | x_true, y_true = self.seq[i] 33 | lengths = self.get_lengths(y_true) 34 | y_pred = self.model.predict_on_batch(x_true) 35 | 36 | y_true = self.p.inverse_transform(y_true, lengths) 37 | y_pred = self.p.inverse_transform(y_pred, lengths) 38 | 39 | label_true.extend(y_true) 40 | label_pred.extend(y_pred) 41 | 42 | score = f1_score(label_true, label_pred) 43 | print('{} - f1: {:04.2f}'.format(self.name, score * 100)) 44 | print(classification_report(label_true, label_pred)) 45 | logs['f1'] = score 46 | -------------------------------------------------------------------------------- /doc/code/bert/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | import os 9 | import argparse 10 | 11 | from . 
import tokenization 12 | 13 | 14 | def load_tokenizer(params): 15 | tokenization.validate_case_matches_checkpoint( 16 | params.lower, 17 | os.path.join(params.bert_dir, 'bert_model.ckpt') 18 | ) 19 | tokenizer = tokenization.FullTokenizer( 20 | vocab_file=os.path.join(params.bert_dir, 'vocab.txt'), 21 | do_lower_case=params.lower 22 | ) 23 | return tokenizer 24 | 25 | 26 | def tokenize(params): 27 | tokenizer = load_tokenizer(params) 28 | 29 | with open(params.output, 'w') as writer: 30 | with open(params.input, 'r') as reader: 31 | for line in reader: 32 | writer.write(' '.join(tokenizer.tokenize(line.strip())).encode('utf8') + "\n") 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser('Vocabulary Preparison') 37 | parser.add_argument('--lower', action='store_true', help='whether lowercase the model') 38 | parser.add_argument('--bert_dir', type=str, help='the pre-trained model directory') 39 | parser.add_argument('input', type=str, help='the input un-tokenized file') 40 | parser.add_argument('output', type=str, help='the output tokenized file') 41 | 42 | args = parser.parse_args() 43 | 44 | tokenize(args) 45 | 46 | print("Finishing!") 47 | -------------------------------------------------------------------------------- /nli/code/bert/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | import os 9 | import argparse 10 | 11 | from . import tokenization 12 | 13 | 14 | def load_tokenizer(params): 15 | tokenization.validate_case_matches_checkpoint( 16 | params.lower, 17 | os.path.join(params.bert_dir, 'bert_model.ckpt') 18 | ) 19 | tokenizer = tokenization.FullTokenizer( 20 | vocab_file=os.path.join(params.bert_dir, 'vocab.txt'), 21 | do_lower_case=params.bert_lower 22 | ) 23 | return tokenizer 24 | 25 | 26 | def tokenize(params): 27 | tokenizer = load_tokenizer(params) 28 | 29 | with open(params.output, 'w') as writer: 30 | with open(params.input, 'r') as reader: 31 | for line in reader: 32 | writer.write(' '.join(tokenizer.tokenize(line.strip())).encode('utf8') + "\n") 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser('Vocabulary Preparison') 37 | parser.add_argument('--lower', action='store_true', help='whether lowercase the model') 38 | parser.add_argument('--bert_dir', type=str, help='the pre-trained model directory') 39 | parser.add_argument('input', type=str, help='the input un-tokenized file') 40 | parser.add_argument('output', type=str, help='the output tokenized file') 41 | 42 | args = parser.parse_args() 43 | 44 | tokenize(args) 45 | 46 | print("Finishing!") 47 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Download SQuAD 4 | SQUAD_DIR=~/data/squad 5 | mkdir -p $SQUAD_DIR 6 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json 7 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json 8 | 9 | # Download GloVe 10 | GLOVE_DIR=~/data/glove 11 | mkdir -p $GLOVE_DIR 12 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O $GLOVE_DIR/glove.840B.300d.zip 13 | unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR 14 | 15 | # Download Glove Character Embedding 16 | # 
wget https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt -O $GLOVE_DIR/glove.840B.300d-char.txt 17 | 18 | # Download fasttext 19 | # FASTTEXT_DIR=~/data/fasttext 20 | # mkdir -p $FASTTEXT_DIR 21 | # wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip -O $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip 22 | # unzip $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip -d $FASTTEXT_DIR 23 | 24 | # Download Elmo 25 | ELMO_DIR=~/data/elmo 26 | wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 -O $ELMO_DIR/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 27 | wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json -O $ELMO_DIR/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json 28 | 29 | # Download Spacy language models 30 | python3 -m spacy download en 31 | -------------------------------------------------------------------------------- /doc/code/rnns/lrn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lrn(cell.Cell): 14 | """The Lightweight Recurrent Network.""" 15 | 16 | def __init__(self, d, ln=False, scope='lrn'): 17 | super(lrn, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "lrn")): 26 | h = linear(x, self.d * 3, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # p,q,r/x: the current input state 33 | """ 34 | p, q, r = W x 35 | i = sigmoid(p + h_) 36 | f = sigmoid(q - h_) 37 | h = i * r + f * h_ 38 | """ 39 | if isinstance(x, (list, tuple)): 40 | x = x[0] 41 | 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "lrn")): 44 | p, q, r = tf.split(x, 3, -1) 45 | 46 | i = tf.sigmoid(p + h_) 47 | f = tf.sigmoid(q - h_) 48 | 49 | h = tf.tanh(i * r + f * h_) 50 | 51 | return h 52 | -------------------------------------------------------------------------------- /nli/code/rnns/lrn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lrn(cell.Cell): 14 | """The Lightweight Recurrent Neural Network.""" 15 | 16 | def __init__(self, d, ln=False, scope='lrn'): 17 | super(lrn, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "lrn")): 26 | h = linear(x, self.d * 3, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 
32 | # p,q,r/x: the current input state 33 | """ 34 | p, q, r = W x 35 | i = sigmoid(p + h_) 36 | f = sigmoid(q - h_) 37 | h = i * r + f * h_ 38 | """ 39 | if isinstance(x, (list, tuple)): 40 | x = x[0] 41 | 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "lrn")): 44 | p, q, r = tf.split(x, 3, -1) 45 | 46 | i = tf.sigmoid(p + h_) 47 | f = tf.sigmoid(q - h_) 48 | 49 | h = tf.tanh(i * r + f * h_) 50 | 51 | return h 52 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/lrn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lrn(cell.Cell): 14 | """The Lightweight Recurrent Network.""" 15 | 16 | def __init__(self, d, ln=False, scope='lrn'): 17 | super(lrn, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "lrn")): 26 | h = linear(x, self.d * 3, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # p,q,r/x: the current input state 33 | """ 34 | p, q, r = W x 35 | i = sigmoid(p + h_) 36 | f = sigmoid(q - h_) 37 | h = i * r + f * h_ 38 | """ 39 | if isinstance(x, (list, tuple)): 40 | x = x[0] 41 | 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "lrn")): 44 | p, q, r = tf.split(x, 3, -1) 45 | 46 | i = tf.sigmoid(p + h_) 47 | f = tf.sigmoid(q - h_) 48 | 49 | h = tf.tanh(i * r + f * h_) 50 | 51 | return h 52 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/lrn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lrn(cell.Cell): 14 | """The Lightweight Recurrent Network.""" 15 | 16 | def __init__(self, d, ln=False, scope='lrn'): 17 | super(lrn, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "lrn")): 26 | h = linear(x, self.d * 3, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # p,q,r/x: the current input state 33 | """ 34 | p, q, r = W x 35 | i = sigmoid(p + h_) 36 | f = sigmoid(q - h_) 37 | h = i * r + f * h_ 38 | """ 39 | if isinstance(x, (list, tuple)): 40 | x = x[0] 41 | 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "lrn")): 44 | p, q, r = tf.split(x, 3, -1) 45 | 46 | i = tf.sigmoid(p + h_) 47 | f = tf.sigmoid(q - h_) 48 | 49 | h = tf.tanh(i * r + f * h_) 50 | 51 | return h 52 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Biao Zhang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /doc/code/lrs/gnmtplr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | 9 | from lrs import lr 10 | 11 | 12 | class GNMTPDecayLr(lr.Lr): 13 | """Decay the learning rate during each training step, follows GNMT+""" 14 | def __init__(self, 15 | init_lr, # initial learning rate 16 | warmup_steps, # warmup step 17 | nstable, # number of replica 18 | lrdecay_start, # start of learning rate decay 19 | lrdecay_end, # end of learning rate decay 20 | name="gnmtp_decay_lr" # model name, no use 21 | ): 22 | super(GNMTPDecayLr, self).__init__(init_lr, name=name) 23 | 24 | self.warmup_steps = warmup_steps 25 | self.nstable = nstable 26 | self.lrdecay_start = lrdecay_start 27 | self.lrdecay_end = lrdecay_end 28 | 29 | if nstable < 1: 30 | raise Exception("Stabled Lrate Value should " 31 | "greater than 0, but is {}".format(nstable)) 32 | 33 | def step(self, step): 34 | t = float(step) 35 | p = float(self.warmup_steps) 36 | n = float(self.nstable) 37 | s = float(self.lrdecay_start) 38 | e = float(self.lrdecay_end) 39 | 40 | decay = np.minimum(1. 
+ t * (n - 1) / (n * p), n) 41 | decay = np.minimum(decay, n * (2 * n) ** ((s - n * t) / (e - s))) 42 | 43 | self.lrate = self.init_lrate * decay 44 | -------------------------------------------------------------------------------- /nli/code/lrs/gnmtplr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | 9 | from lrs import lr 10 | 11 | 12 | class GNMTPDecayLr(lr.Lr): 13 | """Decay the learning rate during each training step, follows GNMT+""" 14 | def __init__(self, 15 | init_lr, # initial learning rate 16 | warmup_steps, # warmup step 17 | nstable, # number of replica 18 | lrdecay_start, # start of learning rate decay 19 | lrdecay_end, # end of learning rate decay 20 | name="gnmtp_decay_lr" # model name, no use 21 | ): 22 | super(GNMTPDecayLr, self).__init__(init_lr, name=name) 23 | 24 | self.warmup_steps = warmup_steps 25 | self.nstable = nstable 26 | self.lrdecay_start = lrdecay_start 27 | self.lrdecay_end = lrdecay_end 28 | 29 | if nstable < 1: 30 | raise Exception("Stabled Lrate Value should " 31 | "greater than 0, but is {}".format(nstable)) 32 | 33 | def step(self, step): 34 | t = float(step) 35 | p = float(self.warmup_steps) 36 | n = float(self.nstable) 37 | s = float(self.lrdecay_start) 38 | e = float(self.lrdecay_end) 39 | 40 | decay = np.minimum(1. + t * (n - 1) / (n * p), n) 41 | decay = np.minimum(decay, n * (2 * n) ** ((s - n * t) / (e - s))) 42 | 43 | self.lrate = self.init_lrate * decay 44 | -------------------------------------------------------------------------------- /doc/code/rnns/atr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class atr(cell.Cell): 14 | """The Addition-Subtraction Twin-Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='atr'): 17 | super(atr, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "atr")): 26 | h = linear(x, self.d, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # x: the current input state 33 | """ 34 | p = W x 35 | q = U h_ 36 | i = sigmoid(p + q) 37 | f = sigmoid(p - q) 38 | h = i * p + f * h_ 39 | """ 40 | if isinstance(x, (list, tuple)): 41 | x = x[0] 42 | 43 | with tf.variable_scope( 44 | "cell_{}".format(self.scope or "atr")): 45 | q = linear(h_, self.d, 46 | ln=self.ln, scope="hide_h") 47 | p = x 48 | 49 | f = tf.sigmoid(p - q) 50 | i = tf.sigmoid(p + q) 51 | 52 | h = tf.tanh(i * p + f * h_) 53 | 54 | return h 55 | -------------------------------------------------------------------------------- /nli/code/rnns/atr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | 
from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class atr(cell.Cell): 14 | """The Addition-Subtraction Twin-Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='atr'): 17 | super(atr, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "atr")): 26 | h = linear(x, self.d, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # x: the current input state 33 | """ 34 | p = W x 35 | q = U h_ 36 | i = sigmoid(p + q) 37 | f = sigmoid(p - q) 38 | h = i * p + f * h_ 39 | """ 40 | if isinstance(x, (list, tuple)): 41 | x = x[0] 42 | 43 | with tf.variable_scope( 44 | "cell_{}".format(self.scope or "atr")): 45 | q = linear(h_, self.d, 46 | ln=self.ln, scope="hide_h") 47 | p = x 48 | 49 | f = tf.sigmoid(p - q) 50 | i = tf.sigmoid(p + q) 51 | 52 | h = tf.tanh(i * p + f * h_) 53 | 54 | return h 55 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/atr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class atr(cell.Cell): 14 | """The Addition-Subtraction Twin-Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='atr'): 17 | super(atr, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "atr")): 26 | h = linear(x, self.d, 27 | bias=True, ln=self.ln, scope="hide_x") 28 | return (h, ) 29 | 30 | def __call__(self, h_, x): 31 | # h_: the previous hidden state 32 | # x: the current input state 33 | """ 34 | p = W x 35 | q = U h_ 36 | i = sigmoid(p + q) 37 | f = sigmoid(p - q) 38 | h = i * p + f * h_ 39 | """ 40 | if isinstance(x, (list, tuple)): 41 | x = x[0] 42 | 43 | with tf.variable_scope( 44 | "cell_{}".format(self.scope or "atr")): 45 | q = linear(h_, self.d, 46 | ln=self.ln, scope="hide_h") 47 | p = x 48 | 49 | f = tf.sigmoid(p - q) 50 | i = tf.sigmoid(p + q) 51 | 52 | h = tf.tanh(i * p + f * h_) 53 | 54 | return h 55 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/atr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class atr(cell.Cell): 14 | """The Addition-Subtraction Twin-Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, twin=False, scope='atr'): 17 | super(atr, self).__init__(d, ln=ln, scope=scope) 18 | 19 | self.twin = twin 20 | 21 | def get_init_state(self, shape=None, x=None, scope=None): 22 | return self._get_init_state( 23 | self.d, shape=shape, x=x, scope=scope) 24 | 
25 | def fetch_states(self, x): 26 | with tf.variable_scope( 27 | "fetch_state_{}".format(self.scope or "atr")): 28 | h = linear(x, self.d, 29 | bias=True, ln=self.ln, scope="hide_x") 30 | return (h, ) 31 | 32 | def __call__(self, h_, x): 33 | # h_: the previous hidden state 34 | # x: the current input state 35 | """ 36 | p = W x 37 | q = U h_ 38 | i = sigmoid(p + q) 39 | f = sigmoid(p - q) 40 | h = i * p + f * h_ 41 | """ 42 | if isinstance(x, (list, tuple)): 43 | x = x[0] 44 | 45 | with tf.variable_scope( 46 | "cell_{}".format(self.scope or "atr")): 47 | q = linear(h_, self.d, 48 | ln=self.ln, scope="hide_h") 49 | p = x 50 | 51 | f = tf.sigmoid(p - q) 52 | i = tf.sigmoid(p + q) 53 | 54 | h = tf.tanh(i * p + f * h_) 55 | 56 | return h 57 | -------------------------------------------------------------------------------- /doc/code/rnns/cell.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import abc 8 | import tensorflow as tf 9 | from func import linear 10 | 11 | 12 | # This is an abstract class that deals with 13 | # recurrent cells, e.g. GRU, LSTM, ATR 14 | class Cell(object): 15 | def __init__(self, 16 | d, # hidden state dimension 17 | ln=False, # whether use layer normalization 18 | scope=None, # the name scope for this cell 19 | ): 20 | self.d = d 21 | self.scope = scope 22 | self.ln = ln 23 | 24 | def _get_init_state(self, d, shape=None, x=None, scope=None): 25 | # gen init state vector 26 | # if no evidence x is provided, use zero initialization 27 | if x is None: 28 | assert shape is not None, "you should provide shape" 29 | if not isinstance(shape, (tuple, list)): 30 | shape = [shape] 31 | shape = shape + [d] 32 | return tf.zeros(shape, tf.float32) 33 | else: 34 | return linear( 35 | x, d, bias=True, ln=self.ln, 36 | scope="{}_init".format(scope or self.scope) 37 | ) 38 | 39 | def get_hidden(self, x): 40 | return x 41 | 42 | @abc.abstractmethod 43 | def get_init_state(self, shape=None, x=None, scope=None): 44 | raise NotImplementedError("Not Supported") 45 | 46 | @abc.abstractmethod 47 | def __call__(self, h_, x): 48 | raise NotImplementedError("Not Supported") 49 | 50 | @abc.abstractmethod 51 | def fetch_states(self, x): 52 | raise NotImplementedError("Not Supported") 53 | -------------------------------------------------------------------------------- /nli/code/rnns/cell.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import abc 8 | import tensorflow as tf 9 | from func import linear 10 | 11 | 12 | # This is an abstract class that deals with 13 | # recurrent cells, e.g. 
GRU, LSTM, ATR 14 | class Cell(object): 15 | def __init__(self, 16 | d, # hidden state dimension 17 | ln=False, # whether use layer normalization 18 | scope=None, # the name scope for this cell 19 | ): 20 | self.d = d 21 | self.scope = scope 22 | self.ln = ln 23 | 24 | def _get_init_state(self, d, shape=None, x=None, scope=None): 25 | # gen init state vector 26 | # if no evidence x is provided, use zero initialization 27 | if x is None: 28 | assert shape is not None, "you should provide shape" 29 | if not isinstance(shape, (tuple, list)): 30 | shape = [shape] 31 | shape = shape + [d] 32 | return tf.zeros(shape, tf.float32) 33 | else: 34 | return linear( 35 | x, d, bias=True, ln=self.ln, 36 | scope="{}_init".format(scope or self.scope) 37 | ) 38 | 39 | def get_hidden(self, x): 40 | return x 41 | 42 | @abc.abstractmethod 43 | def get_init_state(self, shape=None, x=None, scope=None): 44 | raise NotImplementedError("Not Supported") 45 | 46 | @abc.abstractmethod 47 | def __call__(self, h_, x): 48 | raise NotImplementedError("Not Supported") 49 | 50 | @abc.abstractmethod 51 | def fetch_states(self, x): 52 | raise NotImplementedError("Not Supported") 53 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/cell.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import abc 8 | import tensorflow as tf 9 | from func import linear 10 | 11 | 12 | # This is an abstract class that deals with 13 | # recurrent cells, e.g. GRU, LSTM, ATR 14 | class Cell(object): 15 | def __init__(self, 16 | d, # hidden state dimension 17 | ln=False, # whether use layer normalization 18 | scope=None, # the name scope for this cell 19 | ): 20 | self.d = d 21 | self.scope = scope 22 | self.ln = ln 23 | 24 | def _get_init_state(self, d, shape=None, x=None, scope=None): 25 | # gen init state vector 26 | # if no evidence x is provided, use zero initialization 27 | if x is None: 28 | assert shape is not None, "you should provide shape" 29 | if not isinstance(shape, (tuple, list)): 30 | shape = [shape] 31 | shape = shape + [d] 32 | return tf.zeros(shape, tf.float32) 33 | else: 34 | return linear( 35 | x, d, bias=True, ln=self.ln, 36 | scope="{}_init".format(scope or self.scope) 37 | ) 38 | 39 | def get_hidden(self, x): 40 | return x 41 | 42 | @abc.abstractmethod 43 | def get_init_state(self, shape=None, x=None, scope=None): 44 | raise NotImplementedError("Not Supported") 45 | 46 | @abc.abstractmethod 47 | def __call__(self, h_, x): 48 | raise NotImplementedError("Not Supported") 49 | 50 | @abc.abstractmethod 51 | def fetch_states(self, x): 52 | raise NotImplementedError("Not Supported") 53 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/cell.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import abc 8 | import tensorflow as tf 9 | from func import linear 10 | 11 | 12 | # This is an abstract class that deals with 13 | # recurrent cells, e.g. 
GRU, LSTM, ATR 14 | class Cell(object): 15 | def __init__(self, 16 | d, # hidden state dimension 17 | ln=False, # whether use layer normalization 18 | scope=None, # the name scope for this cell 19 | ): 20 | self.d = d 21 | self.scope = scope 22 | self.ln = ln 23 | 24 | def _get_init_state(self, d, shape=None, x=None, scope=None): 25 | # gen init state vector 26 | # if no evidence x is provided, use zero initialization 27 | if x is None: 28 | assert shape is not None, "you should provide shape" 29 | if not isinstance(shape, (tuple, list)): 30 | shape = [shape] 31 | shape = shape + [d] 32 | return tf.zeros(shape, tf.float32) 33 | else: 34 | return linear( 35 | x, d, bias=True, ln=self.ln, 36 | scope="{}_init".format(scope or self.scope) 37 | ) 38 | 39 | def get_hidden(self, x): 40 | return x 41 | 42 | @abc.abstractmethod 43 | def get_init_state(self, shape=None, x=None, scope=None): 44 | raise NotImplementedError("Not Supported") 45 | 46 | @abc.abstractmethod 47 | def __call__(self, h_, x): 48 | raise NotImplementedError("Not Supported") 49 | 50 | @abc.abstractmethod 51 | def fetch_states(self, x): 52 | raise NotImplementedError("Not Supported") 53 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/cycle.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def session_run(monitored_session, args): 11 | # Call raw TF session directly 12 | return monitored_session._tf_sess().run(args) 13 | 14 | 15 | def zero_variables(variables, name=None): 16 | ops = [] 17 | 18 | for var in variables: 19 | with tf.device(var.device): 20 | op = var.assign(tf.zeros(var.shape.as_list())) 21 | ops.append(op) 22 | 23 | return tf.group(*ops, name=name or "zero_op") 24 | 25 | 26 | def replicate_variables(variables, device=None): 27 | new_vars = [] 28 | 29 | for var in variables: 30 | device = device or var.device 31 | with tf.device(device): 32 | name = "replicate/" + var.name.split(":")[0] 33 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()), 34 | name=name, trainable=False)) 35 | 36 | return new_vars 37 | 38 | 39 | def collect_gradients(gradients, variables): 40 | ops = [] 41 | 42 | for grad, var in zip(gradients, variables): 43 | if isinstance(grad, tf.Tensor): 44 | ops.append(tf.assign_add(var, grad)) 45 | else: 46 | ops.append(tf.scatter_add(var, grad.indices, grad.values)) 47 | 48 | return tf.group(*ops) 49 | 50 | 51 | def scale_gradients(gradients, scale): 52 | scaled_gradients = [] 53 | 54 | for grad in gradients: 55 | if isinstance(grad, tf.IndexedSlices): 56 | slices = tf.IndexedSlices(scale * grad.values, grad.indices) 57 | scaled_gradients.append(slices) 58 | else: 59 | scaled_gradients.append(scale * grad) 60 | 61 | return tuple(scaled_gradients) -------------------------------------------------------------------------------- /doc/code/rnns/gru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class gru(cell.Cell): 14 | """The Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='gru'): 17 | super(gru, 
self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "gru")): 26 | g = linear(x, self.d * 2, 27 | bias=True, ln=self.ln, scope="gate_x") 28 | h = linear(x, self.d, 29 | bias=True, ln=self.ln, scope="hide_x") 30 | return g, h 31 | 32 | def __call__(self, h_, x): 33 | # h_: the previous hidden state 34 | # x_g/x: the current input state for gate 35 | # x_h/x: the current input state for hidden 36 | """ 37 | z = sigmoid(h_, x) 38 | r = sigmoid(h_, x) 39 | h' = tanh(x, r * h_) 40 | h = z * h_ + (1. - z) * h' 41 | """ 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "gru")): 44 | x_g, x_h = x 45 | 46 | h_g = linear(h_, self.d * 2, 47 | ln=self.ln, scope="gate_h") 48 | z, r = tf.split( 49 | tf.sigmoid(x_g + h_g), 2, -1) 50 | 51 | h_h = linear(h_ * r, self.d, 52 | ln=self.ln, scope="hide_h") 53 | h = tf.tanh(x_h + h_h) 54 | 55 | h = z * h_ + (1. - z) * h 56 | 57 | return h 58 | -------------------------------------------------------------------------------- /nli/code/rnns/gru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class gru(cell.Cell): 14 | """The Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='gru'): 17 | super(gru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "gru")): 26 | g = linear(x, self.d * 2, 27 | bias=True, ln=self.ln, scope="gate_x") 28 | h = linear(x, self.d, 29 | bias=True, ln=self.ln, scope="hide_x") 30 | return g, h 31 | 32 | def __call__(self, h_, x): 33 | # h_: the previous hidden state 34 | # x_g/x: the current input state for gate 35 | # x_h/x: the current input state for hidden 36 | """ 37 | z = sigmoid(h_, x) 38 | r = sigmoid(h_, x) 39 | h' = tanh(x, r * h_) 40 | h = z * h_ + (1. - z) * h' 41 | """ 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "gru")): 44 | x_g, x_h = x 45 | 46 | h_g = linear(h_, self.d * 2, 47 | ln=self.ln, scope="gate_h") 48 | z, r = tf.split( 49 | tf.sigmoid(x_g + h_g), 2, -1) 50 | 51 | h_h = linear(h_ * r, self.d, 52 | ln=self.ln, scope="hide_h") 53 | h = tf.tanh(x_h + h_h) 54 | 55 | h = z * h_ + (1. 
- z) * h 56 | 57 | return h 58 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/gru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class gru(cell.Cell): 14 | """The Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='gru'): 17 | super(gru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "gru")): 26 | g = linear(x, self.d * 2, 27 | bias=True, ln=self.ln, scope="gate_x") 28 | h = linear(x, self.d, 29 | bias=True, ln=self.ln, scope="hide_x") 30 | return g, h 31 | 32 | def __call__(self, h_, x): 33 | # h_: the previous hidden state 34 | # x_g/x: the current input state for gate 35 | # x_h/x: the current input state for hidden 36 | """ 37 | z = sigmoid(h_, x) 38 | r = sigmoid(h_, x) 39 | h' = tanh(x, r * h_) 40 | h = z * h_ + (1. - z) * h' 41 | """ 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "gru")): 44 | x_g, x_h = x 45 | 46 | h_g = linear(h_, self.d * 2, 47 | ln=self.ln, scope="gate_h") 48 | z, r = tf.split( 49 | tf.sigmoid(x_g + h_g), 2, -1) 50 | 51 | h_h = linear(h_ * r, self.d, 52 | ln=self.ln, scope="hide_h") 53 | h = tf.tanh(x_h + h_h) 54 | 55 | h = z * h_ + (1. - z) * h 56 | 57 | return h 58 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/gru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class gru(cell.Cell): 14 | """The Gated Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='gru'): 17 | super(gru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d, shape=shape, x=x, scope=scope) 22 | 23 | def fetch_states(self, x): 24 | with tf.variable_scope( 25 | "fetch_state_{}".format(self.scope or "gru")): 26 | g = linear(x, self.d * 2, 27 | bias=True, ln=self.ln, scope="gate_x") 28 | h = linear(x, self.d, 29 | bias=True, ln=self.ln, scope="hide_x") 30 | return g, h 31 | 32 | def __call__(self, h_, x): 33 | # h_: the previous hidden state 34 | # x_g/x: the current input state for gate 35 | # x_h/x: the current input state for hidden 36 | """ 37 | z = sigmoid(h_, x) 38 | r = sigmoid(h_, x) 39 | h' = tanh(x, r * h_) 40 | h = z * h_ + (1. - z) * h' 41 | """ 42 | with tf.variable_scope( 43 | "cell_{}".format(self.scope or "gru")): 44 | x_g, x_h = x 45 | 46 | h_g = linear(h_, self.d * 2, 47 | ln=self.ln, scope="gate_h") 48 | z, r = tf.split( 49 | tf.sigmoid(x_g + h_g), 2, -1) 50 | 51 | h_h = linear(h_ * r, self.d, 52 | ln=self.ln, scope="hide_h") 53 | h = tf.tanh(x_h + h_h) 54 | 55 | h = z * h_ + (1. 
- z) * h 56 | 57 | return h 58 | -------------------------------------------------------------------------------- /rc/README.md: -------------------------------------------------------------------------------- 1 | ## Reading Comprehension 2 | 3 | 4 | We use [SQUAD v1](https://rajpurkar.github.io/SQuAD-explorer/) for experiments and adopt the 5 | [RNET model](https://www.aclweb.org/anthology/papers/P/P17/P17-1018/). 6 | Main experimental results are summarized below. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
| Model | #Params | Base | +Elmo |
| --- | --- | --- | --- |
| rnet | - | 71.1/79.5 | -/- |
| LSTM | 2.67M | 70.46/78.98 | 75.17/82.79 |
| GRU | 2.31M | 70.41/79.15 | 75.81/83.12 |
| ATR | 1.59M | 69.73/78.70 | 75.06/82.76 |
| SRU | 2.44M | 69.27/78.41 | 74.56/82.50 |
| LRN | 2.14M | 70.11/78.83 | 76.14/83.83 |
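The EM/F1 pairs above are computed with the official SQuAD v1.1 scorer shipped as `code/evaluate-v1.1.py` (see "How to Run?" below). A minimal sketch of the invocation, with placeholder file names for the dev set and the model's prediction dump:

```bash
# Placeholders: dev-v1.1.json is the official SQuAD v1.1 dev set;
# predictions.json is a JSON dict mapping question ids to predicted answer strings.
python code/evaluate-v1.1.py dev-v1.1.json predictions.json
# prints a JSON dict such as {"exact_match": ..., "f1": ...}
```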
52 | 53 | Exact match/F1-score. 54 | 55 | ## Requirement 56 | tensorflow >= 1.8.1 57 | 58 | ## How to Run? 59 | 60 | - download and preprocess dataset 61 | 62 | - see [R-Net](https://github.com/HKUST-KnowComp/R-Net) about the preprocessing of datasets 63 | - Basically, you need the following datasets: squad v1.1, GloVe, Elmo and convert raw datasets into the required data format. 64 | 65 | - no hyperparameters are tuned, we keep them all in default. 66 | 67 | - training and evaluation 68 | 69 | Please see the `train_lrn.sh` and `test_lrn.sh` scripts in `rnet` (Base) and `elmo_rnet` (Base+Elmo). 70 | 71 | For reporting final EM/F1 score, we used the `evaluate-v1.1.py` script. 72 | 73 | ## Credits 74 | 75 | Source code structure is adapted from [R-Net](https://github.com/HKUST-KnowComp/R-Net). -------------------------------------------------------------------------------- /lm/code/utils.py: -------------------------------------------------------------------------------- 1 | import os, shutil 2 | import torch 3 | from torch.autograd import Variable 4 | 5 | def repackage_hidden(h): 6 | """Wraps hidden states in new Variables, to detach them from their history.""" 7 | if isinstance(h, tuple) or isinstance(h, list): 8 | return tuple(repackage_hidden(v) for v in h) 9 | else: 10 | return h.detach() 11 | 12 | def batchify(data, bsz, args): 13 | # Work out how cleanly we can divide the dataset into bsz parts. 14 | nbatch = data.size(0) // bsz 15 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 16 | data = data.narrow(0, 0, nbatch * bsz) 17 | # Evenly divide the data across the bsz batches. 18 | data = data.view(bsz, -1).t().contiguous() 19 | print(data.size()) 20 | if args.cuda: 21 | data = data.cuda() 22 | return data 23 | 24 | def get_batch(source, i, args, seq_len=None): 25 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 26 | data = Variable(source[i:i+seq_len]) 27 | # target = Variable(source[i+1:i+1+seq_len].view(-1)) 28 | target = Variable(source[i+1:i+1+seq_len]) 29 | return data, target 30 | 31 | def create_exp_dir(path, scripts_to_save=None): 32 | if not os.path.exists(path): 33 | os.mkdir(path) 34 | 35 | print('Experiment dir : {}'.format(path)) 36 | if scripts_to_save is not None: 37 | os.mkdir(os.path.join(path, 'scripts')) 38 | for script in scripts_to_save: 39 | dst_file = os.path.join(path, 'scripts', os.path.basename(script)) 40 | shutil.copyfile(script, dst_file) 41 | 42 | def save_checkpoint(model, optimizer, path, finetune=False): 43 | if finetune: 44 | torch.save(model, os.path.join(path, 'finetune_model.pt')) 45 | torch.save(optimizer.state_dict(), os.path.join(path, 'finetune_optimizer.pt')) 46 | else: 47 | torch.save(model, os.path.join(path, 'model.pt')) 48 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer.pt')) 49 | -------------------------------------------------------------------------------- /doc/code/bert/vocab.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | class Vocab(object): 9 | def __init__(self, vocab_file=None): 10 | self.word2id = {} 11 | self.id2word = {} 12 | self.word2count = {} 13 | 14 | self.pad_sym = "[PAD]" 15 | self.cls_sym = "[CLS]" 16 | self.sep_sym = "[SEP]" 17 | self.unk_sym = "[UNK]" 18 | 19 | if vocab_file is not None: 20 | self.load_vocab(vocab_file) 21 | 22 | def insert(self, token): 23 | if 
token not in self.word2id: 24 | index = len(self.word2id) 25 | self.word2id[token] = index 26 | self.id2word[index] = token 27 | 28 | self.word2count[token] = 0 29 | self.word2count[token] += 1 30 | 31 | @property 32 | def size(self): 33 | return len(self.word2id) 34 | 35 | def load_vocab(self, vocab_file): 36 | with open(vocab_file, 'r') as reader: 37 | for token in reader: 38 | self.insert(token.strip()) 39 | 40 | def get_token(self, id): 41 | if id in self.id2word: 42 | return self.id2word[id] 43 | return self.unk_sym 44 | 45 | def get_id(self, token): 46 | if token in self.word2id: 47 | return self.word2id[token] 48 | return self.word2id[self.unk_sym] 49 | 50 | def save_vocab(self, vocab_file): 51 | with open(vocab_file, 'w') as writer: 52 | for id in range(self.size): 53 | writer.write(self.id2word[id] + "\n") 54 | 55 | def to_id(self, tokens): 56 | return [self.get_id(token) for token in tokens] 57 | 58 | def to_tokens(self, ids): 59 | return [self.get_token(id) for id in ids] 60 | 61 | @property 62 | def pad(self): 63 | return self.get_id(self.pad_sym) 64 | 65 | @property 66 | def cls(self): 67 | return self.get_id(self.cls_sym) 68 | 69 | @property 70 | def sep(self): 71 | return self.get_id(self.sep_sym) 72 | -------------------------------------------------------------------------------- /nli/code/bert/vocab.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | class Vocab(object): 9 | def __init__(self, vocab_file=None): 10 | self.word2id = {} 11 | self.id2word = {} 12 | self.word2count = {} 13 | 14 | self.pad_sym = "[PAD]" 15 | self.cls_sym = "[CLS]" 16 | self.sep_sym = "[SEP]" 17 | self.unk_sym = "[UNK]" 18 | 19 | if vocab_file is not None: 20 | self.load_vocab(vocab_file) 21 | 22 | def insert(self, token): 23 | if token not in self.word2id: 24 | index = len(self.word2id) 25 | self.word2id[token] = index 26 | self.id2word[index] = token 27 | 28 | self.word2count[token] = 0 29 | self.word2count[token] += 1 30 | 31 | @property 32 | def size(self): 33 | return len(self.word2id) 34 | 35 | def load_vocab(self, vocab_file): 36 | with open(vocab_file, 'r') as reader: 37 | for token in reader: 38 | self.insert(token.strip()) 39 | 40 | def get_token(self, id): 41 | if id in self.id2word: 42 | return self.id2word[id] 43 | return self.unk_sym 44 | 45 | def get_id(self, token): 46 | if token in self.word2id: 47 | return self.word2id[token] 48 | return self.word2id[self.unk_sym] 49 | 50 | def save_vocab(self, vocab_file): 51 | with open(vocab_file, 'w') as writer: 52 | for id in range(self.size): 53 | writer.write(self.id2word[id] + "\n") 54 | 55 | def to_id(self, tokens): 56 | return [self.get_id(token) for token in tokens] 57 | 58 | def to_tokens(self, ids): 59 | return [self.get_token(id) for id in ids] 60 | 61 | @property 62 | def pad(self): 63 | return self.get_id(self.pad_sym) 64 | 65 | @property 66 | def cls(self): 67 | return self.get_id(self.cls_sym) 68 | 69 | @property 70 | def sep(self): 71 | return self.get_id(self.sep_sym) 72 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 
| from rnns import get_cell 10 | 11 | 12 | def rnn(cell_name, x, d, mask=None, ln=False, init_state=None, sm=True): 13 | """Self implemented RNN procedure, supporting mask trick""" 14 | # cell_name: gru, lstm or atr 15 | # x: input sequence embedding matrix, [batch, seq_len, dim] 16 | # d: hidden dimension for rnn 17 | # mask: mask matrix, [batch, seq_len] 18 | # ln: whether use layer normalization 19 | # init_state: the initial hidden states, for cache purpose 20 | # sm: whether apply swap memory during rnn scan 21 | # dp: variational dropout 22 | 23 | in_shape = tf.shape(x) 24 | batch_size, time_steps = in_shape[0], in_shape[1] 25 | 26 | cell = get_cell(cell_name, d, ln=ln) 27 | 28 | if init_state is None: 29 | init_state = cell.get_init_state(shape=[batch_size]) 30 | if mask is None: 31 | mask = tf.ones([batch_size, time_steps], tf.float32) 32 | 33 | # prepare projected input 34 | cache_inputs = cell.fetch_states(x) 35 | cache_inputs = [tf.transpose(v, [1, 0, 2]) 36 | for v in list(cache_inputs)] 37 | mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2]) 38 | 39 | def _step_fn(prev, x): 40 | t, h_ = prev 41 | m = x[-1] 42 | v = x[:-1] 43 | 44 | h = cell(h_, v) 45 | h = m * h + (1. - m) * h_ 46 | 47 | return t + 1, h 48 | 49 | time = tf.constant(0, dtype=tf.int32, name="time") 50 | step_states = (time, init_state) 51 | step_vars = cache_inputs + [mask_ta] 52 | 53 | outputs = tf.scan(_step_fn, 54 | step_vars, 55 | initializer=step_states, 56 | parallel_iterations=32, 57 | swap_memory=sm) 58 | 59 | output_ta = outputs[1] 60 | output_state = outputs[1][-1] 61 | 62 | outputs = tf.transpose(output_ta, [1, 0, 2]) 63 | 64 | return (outputs, output_state), \ 65 | (cell.get_hidden(outputs), cell.get_hidden(output_state)) 66 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from rnns import get_cell 10 | 11 | 12 | def rnn(cell_name, x, d, mask=None, ln=False, init_state=None, sm=True): 13 | """Self implemented RNN procedure, supporting mask trick""" 14 | # cell_name: gru, lstm or atr 15 | # x: input sequence embedding matrix, [batch, seq_len, dim] 16 | # d: hidden dimension for rnn 17 | # mask: mask matrix, [batch, seq_len] 18 | # ln: whether use layer normalization 19 | # init_state: the initial hidden states, for cache purpose 20 | # sm: whether apply swap memory during rnn scan 21 | # dp: variational dropout 22 | 23 | in_shape = tf.shape(x) 24 | batch_size, time_steps = in_shape[0], in_shape[1] 25 | 26 | cell = get_cell(cell_name, d, ln=ln) 27 | 28 | if init_state is None: 29 | init_state = cell.get_init_state(shape=[batch_size]) 30 | if mask is None: 31 | mask = tf.ones([batch_size, time_steps], tf.float32) 32 | 33 | # prepare projected input 34 | cache_inputs = cell.fetch_states(x) 35 | cache_inputs = [tf.transpose(v, [1, 0, 2]) 36 | for v in list(cache_inputs)] 37 | mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2]) 38 | 39 | def _step_fn(prev, x): 40 | t, h_ = prev 41 | m = x[-1] 42 | v = x[:-1] 43 | 44 | h = cell(h_, v) 45 | h = m * h + (1. 
- m) * h_ 46 | 47 | return t + 1, h 48 | 49 | time = tf.constant(0, dtype=tf.int32, name="time") 50 | step_states = (time, init_state) 51 | step_vars = cache_inputs + [mask_ta] 52 | 53 | outputs = tf.scan(_step_fn, 54 | step_vars, 55 | initializer=step_states, 56 | parallel_iterations=32, 57 | swap_memory=sm) 58 | 59 | output_ta = outputs[1] 60 | output_state = outputs[1][-1] 61 | 62 | outputs = tf.transpose(output_ta, [1, 0, 2]) 63 | 64 | return (outputs, output_state), \ 65 | (cell.get_hidden(outputs), cell.get_hidden(output_state)) 66 | -------------------------------------------------------------------------------- /ner/code/trainer.py: -------------------------------------------------------------------------------- 1 | """Training-related module. 2 | """ 3 | from callbacks import F1score 4 | from utils import NERSequence 5 | 6 | 7 | class Trainer(object): 8 | """A trainer that train the model. 9 | 10 | Attributes: 11 | _model: Model. 12 | _preprocessor: Transformer. Preprocessing data for feature extraction. 13 | """ 14 | 15 | def __init__(self, model, preprocessor=None): 16 | self._model = model 17 | self._preprocessor = preprocessor 18 | 19 | def train(self, x_train, y_train, x_valid=None, y_valid=None, 20 | epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True): 21 | """Trains the model for a fixed number of epochs (iterations on a dataset). 22 | 23 | Args: 24 | x_train: list of training data. 25 | y_train: list of training target (label) data. 26 | x_valid: list of validation data. 27 | y_valid: list of validation target (label) data. 28 | batch_size: Integer. 29 | Number of samples per gradient update. 30 | If unspecified, `batch_size` will default to 32. 31 | epochs: Integer. Number of epochs to train the model. 32 | verbose: Integer. 0, 1, or 2. Verbosity mode. 33 | 0 = silent, 1 = progress bar, 2 = one line per epoch. 34 | callbacks: List of `keras.callbacks.Callback` instances. 35 | List of callbacks to apply during training. 36 | shuffle: Boolean (whether to shuffle the training data 37 | before each epoch). `shuffle` will default to True. 
38 | """ 39 | 40 | train_seq = NERSequence(x_train, y_train, batch_size, self._preprocessor.transform) 41 | 42 | if x_valid and y_valid: 43 | valid_seq = NERSequence(x_valid, y_valid, batch_size, self._preprocessor.transform) 44 | f1 = F1score(valid_seq, preprocessor=self._preprocessor) 45 | callbacks = [f1] + callbacks if callbacks else [f1] 46 | 47 | self._model.fit_generator(generator=train_seq, 48 | epochs=epochs, 49 | callbacks=callbacks, 50 | verbose=verbose, 51 | shuffle=shuffle) 52 | -------------------------------------------------------------------------------- /doc/code/rnns/sru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class sru(cell.Cell): 14 | """The Simple Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='sru'): 17 | super(sru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "sru")): 29 | h = linear(x, self.d * 4, 30 | bias=True, ln=self.ln, scope="hide_x") 31 | return (h, ) 32 | 33 | def __call__(self, h_, x): 34 | # h_: the concatenation of previous hidden state 35 | # and memory cell state 36 | # x_r/x: the current input state for r gate 37 | # x_f/x: the current input state for f gate 38 | # x_c/x: the current input state for candidate cell 39 | # x_h/x: the current input state for hidden output 40 | # we increase this because we do not assume that 41 | # the input dimension equals the output dimension 42 | """ 43 | f = sigmoid(Wx, vf * c_) 44 | c = f * c_ + (1 - f) * Wx 45 | r = sigmoid(Wx, vr * c_) 46 | h = r * c + (1 - r) * Ux 47 | """ 48 | if isinstance(x, (list, tuple)): 49 | x = x[0] 50 | 51 | with tf.variable_scope( 52 | "cell_{}".format(self.scope or "sru")): 53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1) 54 | h_, c_ = tf.split(h_, 2, -1) 55 | 56 | v_f = tf.get_variable("v_f", [1, self.d]) 57 | v_r = tf.get_variable("v_r", [1, self.d]) 58 | 59 | f = tf.sigmoid(x_f + v_f * c_) 60 | c = f * c_ + (1. - f) * x_c 61 | r = tf.sigmoid(x_r + v_r * c_) 62 | h = r * c + (1. 
- r) * x_h 63 | 64 | return tf.concat([h, c], -1) 65 | -------------------------------------------------------------------------------- /nli/code/rnns/sru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class sru(cell.Cell): 14 | """The Simple Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='sru'): 17 | super(sru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "sru")): 29 | h = linear(x, self.d * 4, 30 | bias=True, ln=self.ln, scope="hide_x") 31 | return (h, ) 32 | 33 | def __call__(self, h_, x): 34 | # h_: the concatenation of previous hidden state 35 | # and memory cell state 36 | # x_r/x: the current input state for r gate 37 | # x_f/x: the current input state for f gate 38 | # x_c/x: the current input state for candidate cell 39 | # x_h/x: the current input state for hidden output 40 | # we increase this because we do not assume that 41 | # the input dimension equals the output dimension 42 | """ 43 | f = sigmoid(Wx, vf * c_) 44 | c = f * c_ + (1 - f) * Wx 45 | r = sigmoid(Wx, vr * c_) 46 | h = r * c + (1 - r) * Ux 47 | """ 48 | if isinstance(x, (list, tuple)): 49 | x = x[0] 50 | 51 | with tf.variable_scope( 52 | "cell_{}".format(self.scope or "sru")): 53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1) 54 | h_, c_ = tf.split(h_, 2, -1) 55 | 56 | v_f = tf.get_variable("v_f", [1, self.d]) 57 | v_r = tf.get_variable("v_r", [1, self.d]) 58 | 59 | f = tf.sigmoid(x_f + v_f * c_) 60 | c = f * c_ + (1. - f) * x_c 61 | r = tf.sigmoid(x_r + v_r * c_) 62 | h = r * c + (1. 
- r) * x_h 63 | 64 | return tf.concat([h, c], -1) 65 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/sru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class sru(cell.Cell): 14 | """The Simple Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='sru'): 17 | super(sru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "sru")): 29 | h = linear(x, self.d * 4, 30 | bias=True, ln=self.ln, scope="hide_x") 31 | return (h, ) 32 | 33 | def __call__(self, h_, x): 34 | # h_: the concatenation of previous hidden state 35 | # and memory cell state 36 | # x_r/x: the current input state for r gate 37 | # x_f/x: the current input state for f gate 38 | # x_c/x: the current input state for candidate cell 39 | # x_h/x: the current input state for hidden output 40 | # we increase this because we do not assume that 41 | # the input dimension equals the output dimension 42 | """ 43 | f = sigmoid(Wx, vf * c_) 44 | c = f * c_ + (1 - f) * Wx 45 | r = sigmoid(Wx, vr * c_) 46 | h = r * c + (1 - r) * Ux 47 | """ 48 | if isinstance(x, (list, tuple)): 49 | x = x[0] 50 | 51 | with tf.variable_scope( 52 | "cell_{}".format(self.scope or "sru")): 53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1) 54 | h_, c_ = tf.split(h_, 2, -1) 55 | 56 | v_f = tf.get_variable("v_f", [1, self.d]) 57 | v_r = tf.get_variable("v_r", [1, self.d]) 58 | 59 | f = tf.sigmoid(x_f + v_f * c_) 60 | c = f * c_ + (1. - f) * x_c 61 | r = tf.sigmoid(x_r + v_r * c_) 62 | h = r * c + (1. 
- r) * x_h 63 | 64 | return tf.concat([h, c], -1) 65 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/sru.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class sru(cell.Cell): 14 | """The Simple Recurrent Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='sru'): 17 | super(sru, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "sru")): 29 | h = linear(x, self.d * 4, 30 | bias=True, ln=self.ln, scope="hide_x") 31 | return (h, ) 32 | 33 | def __call__(self, h_, x): 34 | # h_: the concatenation of previous hidden state 35 | # and memory cell state 36 | # x_r/x: the current input state for r gate 37 | # x_f/x: the current input state for f gate 38 | # x_c/x: the current input state for candidate cell 39 | # x_h/x: the current input state for hidden output 40 | # we increase this because we do not assume that 41 | # the input dimension equals the output dimension 42 | """ 43 | f = sigmoid(Wx, vf * c_) 44 | c = f * c_ + (1 - f) * Wx 45 | r = sigmoid(Wx, vr * c_) 46 | h = r * c + (1 - r) * Ux 47 | """ 48 | if isinstance(x, (list, tuple)): 49 | x = x[0] 50 | 51 | with tf.variable_scope( 52 | "cell_{}".format(self.scope or "sru")): 53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1) 54 | h_, c_ = tf.split(h_, 2, -1) 55 | 56 | v_f = tf.get_variable("v_f", [1, self.d]) 57 | v_r = tf.get_variable("v_r", [1, self.d]) 58 | 59 | f = tf.sigmoid(x_f + v_f * c_) 60 | c = f * c_ + (1. - f) * x_c 61 | r = tf.sigmoid(x_r + v_r * c_) 62 | h = r * c + (1. - r) * x_h 63 | 64 | return tf.concat([h, c], -1) 65 | -------------------------------------------------------------------------------- /ner/README.md: -------------------------------------------------------------------------------- 1 | ## Named Entity Recognition 2 | 3 | 4 | We employ the birnn plus CRF architecture as [Lample et al. 2016](https://www.aclweb.org/anthology/N16-1030), and 5 | experiment on CoNLL-2003 English NER data. 6 | Main experimental results are summarized below. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 |
| Model | #Params | NER |
| --- | --- | --- |
| Lample et al. 2016 | - | 90.94 |
| LSTM | 245K | 89.61 |
| GRU | 192K | 89.35 |
| ATR | 87K | 88.46 |
| SRU | 161K | 88.89 |
| LRN | 129K | 88.56 |
45 | 46 | F1-score. 47 | 48 | ## Requirement 49 | see [requirements.txt](code/requirements.txt) for full list. 50 | 51 | ## How to Run? 52 | 53 | - download and preprocess dataset 54 | 55 | - download the conll2003 dataset from [anago](https://github.com/Hironsan/anago/tree/master/data) (in data folder). 56 | - download the Glove-6B-100d pre-trained word embedding from: http://nlp.stanford.edu/data/glove.6B.zip 57 | 58 | - no hyperparameters are tuned, we keep them all in default. 59 | 60 | - training and evaluation 61 | 62 | the running procedure is as follows: 63 | ``` 64 | export CUDA_ROOT=XXX 65 | export PATH=$CUDA_ROOT/bin:$PATH 66 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 67 | 68 | export CUDA_VISIBLE_DEVICES=0 69 | 70 | export data_dir=path-of/conll2003/en/ner 71 | export glove_dir=path-of/glove.6B/glove.6B.100d.txt 72 | 73 | RUN_EXP=5 74 | rnn=lrn 75 | 76 | for i in $(seq 1 $RUN_EXP); do 77 | exp_dir=exp$i 78 | mkdir $exp_dir 79 | cd $exp_dir 80 | 81 | export cell_type=$rnn 82 | python3 ner_glove.py --cell lrn >& log.lrn 83 | 84 | cd ../ 85 | done 86 | 87 | python scripts/get_test_score.py $rnn exp* >& score.$rnn 88 | ``` 89 | Results are reported over 5 runs. 90 | 91 | ## Credits 92 | 93 | Source code structure is adapted from [annago](https://github.com/Hironsan/anago/tree/master/). -------------------------------------------------------------------------------- /doc/code/rnns/lstm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lstm(cell.Cell): 14 | """The Long-Short Term Memory Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='lstm'): 17 | super(lstm, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "lstm")): 29 | g = linear(x, self.d * 3, 30 | bias=True, ln=self.ln, scope="gate_x") 31 | c = linear(x, self.d, 32 | bias=True, ln=self.ln, scope="hide_x") 33 | return g, c 34 | 35 | def __call__(self, h_, x): 36 | # h_: the concatenation of previous hidden state 37 | # and memory cell state 38 | # x_i/x: the current input state for input gate 39 | # x_f/x: the current input state for forget gate 40 | # x_o/x: the current input state for output gate 41 | # x_c/x: the current input state for candidate cell 42 | """ 43 | f = sigmoid(h_, x) 44 | i = sigmoid(h_, x) 45 | o = sigmoid(h_, x) 46 | c' = tanh(h_, x) 47 | c = f * c_ + i * c' 48 | h = o * tanh(c) 49 | """ 50 | with tf.variable_scope( 51 | "cell_{}".format(self.scope or "lstm")): 52 | x_g, x_c = x 53 | h_, c_ = tf.split(h_, 2, -1) 54 | 55 | h_g = linear(h_, self.d * 3, 56 | ln=self.ln, scope="gate_h") 57 | i, f, o = tf.split( 58 | tf.sigmoid(x_g + h_g), 3, -1) 59 | 60 | h_c = linear(h_, self.d, 61 | ln=self.ln, scope="hide_h") 62 | h_c = tf.tanh(x_c + h_c) 63 | 64 | c = i * h_c + f * c_ 65 | 66 | h = o * tf.tanh(c) 67 | 68 | return tf.concat([h, c], -1) 69 | -------------------------------------------------------------------------------- /nli/code/rnns/lstm.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lstm(cell.Cell): 14 | """The Long-Short Term Memory Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='lstm'): 17 | super(lstm, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "lstm")): 29 | g = linear(x, self.d * 3, 30 | bias=True, ln=self.ln, scope="gate_x") 31 | c = linear(x, self.d, 32 | bias=True, ln=self.ln, scope="hide_x") 33 | return g, c 34 | 35 | def __call__(self, h_, x): 36 | # h_: the concatenation of previous hidden state 37 | # and memory cell state 38 | # x_i/x: the current input state for input gate 39 | # x_f/x: the current input state for forget gate 40 | # x_o/x: the current input state for output gate 41 | # x_c/x: the current input state for candidate cell 42 | """ 43 | f = sigmoid(h_, x) 44 | i = sigmoid(h_, x) 45 | o = sigmoid(h_, x) 46 | c' = tanh(h_, x) 47 | c = f * c_ + i * c' 48 | h = o * tanh(c) 49 | """ 50 | with tf.variable_scope( 51 | "cell_{}".format(self.scope or "lstm")): 52 | x_g, x_c = x 53 | h_, c_ = tf.split(h_, 2, -1) 54 | 55 | h_g = linear(h_, self.d * 3, 56 | ln=self.ln, scope="gate_h") 57 | i, f, o = tf.split( 58 | tf.sigmoid(x_g + h_g), 3, -1) 59 | 60 | h_c = linear(h_, self.d, 61 | ln=self.ln, scope="hide_h") 62 | h_c = tf.tanh(x_c + h_c) 63 | 64 | c = i * h_c + f * c_ 65 | 66 | h = o * tf.tanh(c) 67 | 68 | return tf.concat([h, c], -1) 69 | -------------------------------------------------------------------------------- /rc/rnet/code/rnns/lstm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lstm(cell.Cell): 14 | """The Long-Short Term Memory Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='lstm'): 17 | super(lstm, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "lstm")): 29 | g = linear(x, self.d * 3, 30 | bias=True, ln=self.ln, scope="gate_x") 31 | c = linear(x, self.d, 32 | bias=True, ln=self.ln, scope="hide_x") 33 | return g, c 34 | 35 | def __call__(self, h_, x): 36 | # h_: the concatenation of previous hidden state 37 | # and memory cell state 38 | # x_i/x: the current input state for input gate 39 | # x_f/x: the current input state for forget gate 40 | # x_o/x: the current input state for output gate 41 | # x_c/x: the current input state for candidate cell 42 | """ 43 | f = sigmoid(h_, x) 44 | i = sigmoid(h_, x) 45 | o = sigmoid(h_, x) 46 | c' = tanh(h_, x) 47 | c 
= f * c_ + i * c' 48 | h = o * tanh(c) 49 | """ 50 | with tf.variable_scope( 51 | "cell_{}".format(self.scope or "lstm")): 52 | x_g, x_c = x 53 | h_, c_ = tf.split(h_, 2, -1) 54 | 55 | h_g = linear(h_, self.d * 3, 56 | ln=self.ln, scope="gate_h") 57 | i, f, o = tf.split( 58 | tf.sigmoid(x_g + h_g), 3, -1) 59 | 60 | h_c = linear(h_, self.d, 61 | ln=self.ln, scope="hide_h") 62 | h_c = tf.tanh(x_c + h_c) 63 | 64 | c = i * h_c + f * c_ 65 | 66 | h = o * tf.tanh(c) 67 | 68 | return tf.concat([h, c], -1) 69 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/rnns/lstm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from func import linear 10 | from rnns import cell as cell 11 | 12 | 13 | class lstm(cell.Cell): 14 | """The Long-Short Term Memory Unit.""" 15 | 16 | def __init__(self, d, ln=False, scope='lstm'): 17 | super(lstm, self).__init__(d, ln=ln, scope=scope) 18 | 19 | def get_init_state(self, shape=None, x=None, scope=None): 20 | return self._get_init_state( 21 | self.d * 2, shape=shape, x=x, scope=scope) 22 | 23 | def get_hidden(self, x): 24 | return tf.split(x, 2, -1)[0] 25 | 26 | def fetch_states(self, x): 27 | with tf.variable_scope( 28 | "fetch_state_{}".format(self.scope or "lstm")): 29 | g = linear(x, self.d * 3, 30 | bias=True, ln=self.ln, scope="gate_x") 31 | c = linear(x, self.d, 32 | bias=True, ln=self.ln, scope="hide_x") 33 | return g, c 34 | 35 | def __call__(self, h_, x): 36 | # h_: the concatenation of previous hidden state 37 | # and memory cell state 38 | # x_i/x: the current input state for input gate 39 | # x_f/x: the current input state for forget gate 40 | # x_o/x: the current input state for output gate 41 | # x_c/x: the current input state for candidate cell 42 | """ 43 | f = sigmoid(h_, x) 44 | i = sigmoid(h_, x) 45 | o = sigmoid(h_, x) 46 | c' = tanh(h_, x) 47 | c = f * c_ + i * c' 48 | h = o * tanh(c) 49 | """ 50 | with tf.variable_scope( 51 | "cell_{}".format(self.scope or "lstm")): 52 | x_g, x_c = x 53 | h_, c_ = tf.split(h_, 2, -1) 54 | 55 | h_g = linear(h_, self.d * 3, 56 | ln=self.ln, scope="gate_h") 57 | i, f, o = tf.split( 58 | tf.sigmoid(x_g + h_g), 3, -1) 59 | 60 | h_c = linear(h_, self.d, 61 | ln=self.ln, scope="hide_h") 62 | h_c = tf.tanh(x_c + h_c) 63 | 64 | c = i * h_c + f * c_ 65 | 66 | h = o * tf.tanh(c) 67 | 68 | return tf.concat([h, c], -1) 69 | -------------------------------------------------------------------------------- /lm/code/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Penn Tree Bank 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | import data 14 | 15 | parser = argparse.ArgumentParser(description='PyTorch PTB Language Model') 16 | 17 | # Model parameters. 
18 | parser.add_argument('--data', type=str, default='./penn', 19 | help='location of the data corpus') 20 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 21 | help='model checkpoint to use') 22 | parser.add_argument('--outf', type=str, default='generated.txt', 23 | help='output file for generated text') 24 | parser.add_argument('--words', type=int, default='1000', 25 | help='number of words to generate') 26 | parser.add_argument('--seed', type=int, default=1111, 27 | help='random seed') 28 | parser.add_argument('--cuda', action='store_true', 29 | help='use CUDA') 30 | parser.add_argument('--temperature', type=float, default=1.0, 31 | help='temperature - higher will increase diversity') 32 | parser.add_argument('--log-interval', type=int, default=100, 33 | help='reporting interval') 34 | args = parser.parse_args() 35 | 36 | # Set the random seed manually for reproducibility. 37 | torch.manual_seed(args.seed) 38 | if torch.cuda.is_available(): 39 | if not args.cuda: 40 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 41 | else: 42 | torch.cuda.manual_seed(args.seed) 43 | 44 | if args.temperature < 1e-3: 45 | parser.error("--temperature has to be greater or equal 1e-3") 46 | 47 | with open(args.checkpoint, 'rb') as f: 48 | model = torch.load(f) 49 | model.eval() 50 | 51 | if args.cuda: 52 | model.cuda() 53 | else: 54 | model.cpu() 55 | 56 | corpus = data.Corpus(args.data) 57 | ntokens = len(corpus.dictionary) 58 | hidden = model.init_hidden(1) 59 | input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) 60 | if args.cuda: 61 | input.data = input.data.cuda() 62 | 63 | with open(args.outf, 'w') as outf: 64 | for i in range(args.words): 65 | output, hidden = model(input, hidden, return_prob=True) 66 | word_weights = output.squeeze().data.div(args.temperature).exp().cpu() 67 | word_idx = torch.multinomial(word_weights, 1)[0] 68 | input.data.fill_(word_idx) 69 | word = corpus.dictionary.idx2word[word_idx] 70 | 71 | outf.write(word + ('\n' if i % 20 == 19 else ' ')) 72 | 73 | if i % args.log_interval == 0: 74 | print('| Generated {}/{} words'.format(i, args.words)) 75 | -------------------------------------------------------------------------------- /mt/README.md: -------------------------------------------------------------------------------- 1 | ## Machine Translation 2 | 3 | 4 | Main source code will be available at [zero](https://github.com/bzhangGo/zero) (might require some time, 31/05/2019). 5 | The used NMT structure is in `deepnmt.py`. 6 | 7 | 8 | Main experimental results are summarized below. 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
| Model | #Params | BLEU | Train | Decode |
| --- | --- | --- | --- | --- |
| GNMT | - | 24.61 | - | - |
| GRU | 206M | 26.28 | 2.67 | 45.35 |
| ATR | 122M | 25.70 | 1.33 | 34.40 |
| SRU | 170M | 25.91 | 1.34 | 42.84 |
| LRN | 143M | 26.26 | 0.99 | 36.50 |
| oLRN | 164M | 26.73 | 1.15 | 40.19 |
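The BLEU column is defined below as case-insensitive tokenized BLEU on WMT14 English-German. The repository does not pin a specific scoring command; one conventional choice (an assumption here, not something stated in this repo) is Moses' `multi-bleu.perl`, e.g.:

```bash
# Assumed tooling: Moses multi-bleu.perl; -lc gives the case-insensitive variant.
# hyp.tok and ref.tok are placeholder tokenized hypothesis/reference files.
perl multi-bleu.perl -lc ref.tok < hyp.tok
```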
61 | 62 | *Train*: time in seconds per training batch measured from 0.2k training steps. 63 | *Decode*: time in milliseconds used to decode one sentence measured on newstest2014 dataset. 64 | *BLEU*: case-insensitive tokenized BLEU score on WMT14 English-German translation task. 65 | 66 | ## oLRN structure 67 | 68 | 69 | 70 | Unlike LRN, oLRN employs an additional output gate, inspired by LSTM, to handle output information flow. 71 | This additional gate also help avoid hidden state explosion when linear activation is applied. 72 | 73 | ## How to Run? 74 | 75 | Training and evaluation, please refer to project [zero](https://github.com/bzhangGo/zero). -------------------------------------------------------------------------------- /rc/rnet/code/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /lm/code/weight_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Parameter 3 | from functools import wraps 4 | 5 | class WeightDrop(torch.nn.Module): 6 | def __init__(self, module, weights, dropout=0, variational=False): 7 | super(WeightDrop, self).__init__() 8 | self.module = module 9 | self.weights = weights 10 | self.dropout = dropout 11 | self.variational = variational 12 | self._setup() 13 | 14 | def widget_demagnetizer_y2k_edition(*args, **kwargs): 15 | # We need to replace flatten_parameters with a nothing function 16 | # It must be a function rather than a lambda as otherwise pickling explodes 17 | # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION! 
18 | # (╯°□°)╯︵ ┻━┻ 19 | return 20 | 21 | def _setup(self): 22 | # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN 23 | if issubclass(type(self.module), torch.nn.RNNBase): 24 | self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition 25 | 26 | for name_w in self.weights: 27 | if not hasattr(self.module, name_w): 28 | continue 29 | print('Applying weight drop of {} to {}'.format(self.dropout, name_w)) 30 | w = getattr(self.module, name_w) 31 | del self.module._parameters[name_w] 32 | self.module.register_parameter(name_w + '_raw', Parameter(w.data)) 33 | 34 | def _setweights(self): 35 | for name_w in self.weights: 36 | if not hasattr(self.module, name_w): 37 | continue 38 | 39 | raw_w = getattr(self.module, name_w + '_raw') 40 | w = None 41 | if self.variational: 42 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) 43 | if raw_w.is_cuda: mask = mask.cuda() 44 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) 45 | w = mask.expand_as(raw_w) * raw_w 46 | else: 47 | w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training) 48 | setattr(self.module, name_w, w) 49 | 50 | def forward(self, *args): 51 | self._setweights() 52 | return self.module.forward(*args) 53 | 54 | if __name__ == '__main__': 55 | import torch 56 | from weight_drop import WeightDrop 57 | 58 | # Input is (seq, batch, input) 59 | x = torch.autograd.Variable(torch.randn(2, 1, 10)).cuda() 60 | h0 = None 61 | 62 | ### 63 | 64 | print('Testing WeightDrop') 65 | print('=-=-=-=-=-=-=-=-=-=') 66 | 67 | ### 68 | 69 | print('Testing WeightDrop with Linear') 70 | 71 | lin = WeightDrop(torch.nn.Linear(10, 10), ['weight'], dropout=0.9) 72 | lin.cuda() 73 | run1 = [x.sum() for x in lin(x).data] 74 | run2 = [x.sum() for x in lin(x).data] 75 | 76 | print('All items should be different') 77 | print('Run 1:', run1) 78 | print('Run 2:', run2) 79 | 80 | assert run1[0] != run2[0] 81 | assert run1[1] != run2[1] 82 | 83 | print('---') 84 | 85 | ### 86 | 87 | print('Testing WeightDrop with LSTM') 88 | 89 | wdrnn = WeightDrop(torch.nn.LSTM(10, 10), ['weight_hh_l0'], dropout=0.9) 90 | wdrnn.cuda() 91 | 92 | run1 = [x.sum() for x in wdrnn(x, h0)[0].data] 93 | run2 = [x.sum() for x in wdrnn(x, h0)[0].data] 94 | 95 | print('First timesteps should be equal, all others should differ') 96 | print('Run 1:', run1) 97 | print('Run 2:', run2) 98 | 99 | # First time step, not influenced by hidden to hidden weights, should be equal 100 | assert run1[0] == run2[0] 101 | # Second step should not 102 | assert run1[1] != run2[1] 103 | 104 | print('---') 105 | -------------------------------------------------------------------------------- /doc/config.py: -------------------------------------------------------------------------------- 1 | dict( 2 | # lrate decay 3 | # select strategy: noam, gnmt+, epoch, score and vanilla 4 | lrate_strategy="epoch", 5 | # learning decay rate 6 | lrate_decay=0.5, 7 | # weight decay for L2 loss 8 | weight_decay=3e-5, 9 | 10 | # early stopping 11 | estop_patience=100, 12 | 13 | # initialization 14 | # type of initializer 15 | initializer="uniform", 16 | # initializer range control 17 | initializer_gain=0.08, 18 | 19 | # parameters for rnnsearch 20 | # encoder and decoder hidden size 21 | hidden_size=64, 22 | # source and target embedding size 23 | embed_size=300, 24 | # character embedding size 25 | char_embed_size=32, 26 | # dropout value 27 | dropout=0.1, 28 | # word random dropout 29 | word_dropout=0.1, 30 | # label smoothing value 
31 | label_smooth=0.1, 32 | # gru, lstm, sru or atr 33 | cell="atr", 34 | # whether use layer normalization, it will be slow 35 | layer_norm=False, 36 | # notice that when opening the swap memory switch 37 | # you can train reasonably larger batch on condition 38 | # that your system will use much more cpu memory 39 | swap_memory=True, 40 | 41 | # whether use character embedding 42 | use_char=True, 43 | # whether lowercase word 44 | lower=False, 45 | 46 | # task name 47 | task="amafull", 48 | 49 | model_name="InferNet", 50 | 51 | # constant batch size at 'batch' mode for batch-based batching 52 | batch_size=64, 53 | token_size=2000, 54 | batch_or_token='batch', 55 | # batch size for decoding, i.e. number of source sentences decoded at the same time 56 | eval_batch_size=64, 57 | # whether shuffle batches during training 58 | shuffle_batch=True, 59 | # whether use multiprocessing deal with data reading, default true 60 | data_multiprocessing=True, 61 | 62 | # word vocabulary 63 | word_vocab_file="", 64 | # char vocabulary 65 | char_vocab_file="", 66 | # pretrained word embedding 67 | pretrain_word_embedding_file="path-of/glove.840B.300d.txt", 68 | # dataset path file 69 | data_path="path-of/data", 70 | # output directory 71 | output_dir="train", 72 | # output during testing 73 | test_output="", 74 | 75 | # adam optimizer hyperparameters 76 | beta1=0.9, 77 | beta2=0.999, 78 | epsilon=1e-8, 79 | # gradient clipping value 80 | clip_grad_norm=5.0, 81 | # initial learning rate 82 | lrate=1e-3, 83 | 84 | # allowed maximum sentence length 85 | max_len=400, 86 | # maximum word length 87 | max_w_len=25, 88 | # maximum sentence number 89 | max_p_num=10, 90 | # hierarchy neural network 91 | enable_hierarchy=False, 92 | 93 | # maximum epochs 94 | epoches=6, 95 | # the effective batch size is: batch/token size * update_cycle 96 | # sequential update cycle 97 | update_cycle=1, 98 | # the number of gpus 99 | gpus=[0], 100 | # whether enable ema 101 | ema_decay=0.9999, 102 | 103 | # print information every disp_freq training steps 104 | disp_freq=10, 105 | # evaluate on the development file every eval_freq steps 106 | eval_freq=10000, 107 | # save the model parameters every save_freq steps 108 | save_freq=5000, 109 | # saved checkpoint number 110 | checkpoints=5, 111 | # the maximum training steps, program with stop if epoches or max_training_steps is metted 112 | max_training_steps=1000000000, 113 | 114 | # bert configuration 115 | # did not use in practice, efficiency is an important issue 116 | bert=None, 117 | bert_dir="path-of/cased_L-12_H-768_A-12/", 118 | tune_bert=False, 119 | enable_bert=False, 120 | use_bert_single=True, 121 | 122 | # number of threads for threaded reading, seems useless 123 | nthreads=3, 124 | # buffer size controls the number of sentences readed in one time, 125 | buffer_size=100000, 126 | # a unique queue in multi-thread reading process 127 | max_queue_size=100, 128 | # random control, not so well for tensorflow. 
129 | random_seed=1234, 130 | # whether or not train from checkpoint 131 | train_continue=True, 132 | ) 133 | -------------------------------------------------------------------------------- /nli/config.py: -------------------------------------------------------------------------------- 1 | dict( 2 | # lrate decay 3 | # select strategy: noam, gnmt+, epoch, score and vanilla 4 | lrate_strategy="epoch", 5 | # learning decay rate 6 | lrate_decay=0.5, 7 | # weight decay for L2 loss 8 | weight_decay=3e-5, 9 | 10 | # early stopping 11 | estop_patience=100, 12 | 13 | # initialization 14 | # type of initializer 15 | initializer="uniform", 16 | # initializer range control 17 | initializer_gain=0.08, 18 | 19 | # parameters for rnnsearch 20 | # encoder and decoder hidden size 21 | hidden_size=300, 22 | # source and target embedding size 23 | embed_size=300, 24 | # label number 25 | label_size=3, 26 | # number of layers 27 | char_embed_size=64, 28 | # dropout value 29 | dropout=0.3, 30 | # label smoothing value 31 | label_smooth=0.1, 32 | # gru, lstm, sru or atr 33 | cell="atr", 34 | # whether use layer normalization, it will be slow 35 | layer_norm=False, 36 | # notice that when opening the swap memory switch 37 | # you can train reasonably larger batch on condition 38 | # that your system will use much more cpu memory 39 | swap_memory=True, 40 | 41 | # bert configuration 42 | bert=None, 43 | bert_dir="path-to-bert/cased_L-12_H-768_A-12", 44 | tune_bert=False, 45 | enable_bert=False, 46 | use_bert_single=True, 47 | 48 | # whether use character embedding 49 | use_char=True, 50 | # whether lowercase word 51 | lower=False, 52 | bert_lower=False, 53 | 54 | model_name="nlinet", 55 | 56 | # constant batch size at 'batch' mode for batch-based batching 57 | batch_size=128, 58 | token_size=2000, 59 | batch_or_token='batch', 60 | # batch size for decoding, i.e. 
number of source sentences decoded at the same time 61 | eval_batch_size=64, 62 | # whether shuffle batches during training 63 | shuffle_batch=True, 64 | # whether use multiprocessing deal with data reading, default true 65 | data_multiprocessing=True, 66 | 67 | # word vocabulary 68 | word_vocab_file="path-of/word_vocab", 69 | # char vocabulary 70 | char_vocab_file="path-of/char_vocab", 71 | # pretrained word embedding 72 | pretrain_word_embedding_file="path-of/word_vocab.npz", 73 | # train file 74 | train_file=["path-of/train.p", "path-of/train.q", "path-of/train.l"], 75 | # dev file 76 | dev_file=["path-of/dev.p", "path-of/dev.q", "path-of/dev.l"], 77 | # test file 78 | test_file=["path-of/test.p", "path-of/test.q", "path-of/test.l"], 79 | # output directory 80 | output_dir="train", 81 | # output during testing 82 | test_output="", 83 | 84 | # adam optimizer hyperparameters 85 | beta1=0.9, 86 | beta2=0.999, 87 | epsilon=1e-8, 88 | # gradient clipping value 89 | clip_grad_norm=5.0, 90 | # initial learning rate 91 | lrate=1e-3, 92 | 93 | # allowed maximum sentence length 94 | max_len=100, 95 | # maximum word length 96 | max_w_len=25, 97 | 98 | # maximum epochs 99 | epoches=10, 100 | # the effective batch size is: batch/token size * update_cycle 101 | # sequential update cycle 102 | update_cycle=1, 103 | # the number of gpus 104 | gpus=[0], 105 | # whether enable ema 106 | ema_decay=0.9999, 107 | 108 | # print information every disp_freq training steps 109 | disp_freq=10, 110 | # evaluate on the development file every eval_freq steps 111 | eval_freq=1000, 112 | # save the model parameters every save_freq steps 113 | save_freq=1000, 114 | # saved checkpoint number 115 | checkpoints=5, 116 | # the maximum training steps, program with stop if epoches or max_training_steps is metted 117 | max_training_steps=100000, 118 | 119 | # number of threads for threaded reading, seems useless 120 | nthreads=6, 121 | # buffer size controls the number of sentences readed in one time, 122 | buffer_size=20000, 123 | # a unique queue in multi-thread reading process 124 | max_queue_size=100, 125 | # random control, not so well for tensorflow. 
126 | random_seed=1234, 127 | # whether or not train from checkpoint 128 | train_continue=True, 129 | ) 130 | -------------------------------------------------------------------------------- /nli/config_bert.py: -------------------------------------------------------------------------------- 1 | dict( 2 | # lrate decay 3 | # select strategy: noam, gnmt+, epoch, score and vanilla 4 | lrate_strategy="vanilla", 5 | # learning decay rate 6 | lrate_decay=0.5, 7 | # weight decay for L2 loss 8 | weight_decay=3e-5, 9 | 10 | # early stopping 11 | estop_patience=100, 12 | 13 | # initialization 14 | # type of initializer 15 | initializer="uniform", 16 | # initializer range control 17 | initializer_gain=0.08, 18 | 19 | # parameters for rnnsearch 20 | # encoder and decoder hidden size 21 | hidden_size=300, 22 | # source and target embedding size 23 | embed_size=300, 24 | # label number 25 | label_size=3, 26 | # number of layers 27 | char_embed_size=64, 28 | # dropout value 29 | dropout=0.3, 30 | # label smoothing value 31 | label_smooth=0.1, 32 | # gru, lstm, sru or atr 33 | cell="atr", 34 | # whether use layer normalization, it will be slow 35 | layer_norm=False, 36 | # notice that when opening the swap memory switch 37 | # you can train reasonably larger batch on condition 38 | # that your system will use much more cpu memory 39 | swap_memory=True, 40 | 41 | # bert configuration 42 | bert=None, 43 | bert_dir="path-to-bert/cased_L-12_H-768_A-12", 44 | tune_bert=True, 45 | enable_bert=True, 46 | use_bert_single=True, 47 | 48 | # whether use character embedding 49 | use_char=True, 50 | # whether lowercase word 51 | lower=False, 52 | bert_lower=False, 53 | 54 | model_name="nlinet", 55 | 56 | # constant batch size at 'batch' mode for batch-based batching 57 | batch_size=32, 58 | token_size=2000, 59 | batch_or_token='batch', 60 | # batch size for decoding, i.e. 
number of source sentences decoded at the same time 61 | eval_batch_size=32, 62 | # whether shuffle batches during training 63 | shuffle_batch=True, 64 | # whether use multiprocessing deal with data reading, default true 65 | data_multiprocessing=True, 66 | 67 | # word vocabulary 68 | word_vocab_file="path-of/word_vocab", 69 | # char vocabulary 70 | char_vocab_file="path-of/char_vocab", 71 | # pretrained word embedding 72 | pretrain_word_embedding_file="path-of/word_vocab.npz", 73 | # train file 74 | train_file=["path-of/train.p", "path-of/train.q", "path-of/train.l"], 75 | # dev file 76 | dev_file=["path-of/dev.p", "path-of/dev.q", "path-of/dev.l"], 77 | # test file 78 | test_file=["path-of/test.p", "path-of/test.q", "path-of/test.l"], 79 | # output directory 80 | output_dir="train", 81 | # output during testing 82 | test_output="", 83 | 84 | # adam optimizer hyperparameters 85 | beta1=0.9, 86 | beta2=0.999, 87 | epsilon=1e-8, 88 | # gradient clipping value 89 | clip_grad_norm=5.0, 90 | # initial learning rate 91 | lrate=2e-5, 92 | 93 | # allowed maximum sentence length 94 | max_len=100, 95 | # maximum word length 96 | max_w_len=25, 97 | 98 | # maximum epochs 99 | epoches=5, 100 | # the effective batch size is: batch/token size * update_cycle 101 | # sequential update cycle 102 | update_cycle=1, 103 | # the number of gpus 104 | gpus=[0], 105 | # whether enable ema 106 | ema_decay=0.9999, 107 | 108 | # print information every disp_freq training steps 109 | disp_freq=10, 110 | # evaluate on the development file every eval_freq steps 111 | eval_freq=1000, 112 | # save the model parameters every save_freq steps 113 | save_freq=1000, 114 | # saved checkpoint number 115 | checkpoints=5, 116 | # the maximum training steps, program with stop if epoches or max_training_steps is metted 117 | max_training_steps=100000, 118 | 119 | # number of threads for threaded reading, seems useless 120 | nthreads=6, 121 | # buffer size controls the number of sentences readed in one time, 122 | buffer_size=20000, 123 | # a unique queue in multi-thread reading process 124 | max_queue_size=100, 125 | # random control, not so well for tensorflow. 126 | random_seed=1234, 127 | # whether or not train from checkpoint 128 | train_continue=True, 129 | ) 130 | -------------------------------------------------------------------------------- /ner/code/tagger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model API. 3 | """ 4 | import numpy as np 5 | from seqeval.metrics.sequence_labeling import get_entities 6 | 7 | 8 | class Tagger(object): 9 | """A model API that tags input sentence. 10 | 11 | Attributes: 12 | model: Model. 13 | preprocessor: Transformer. Preprocessing data for feature extraction. 14 | tokenizer: Tokenize input sentence. Default tokenizer is `str.split`. 15 | """ 16 | 17 | def __init__(self, model, preprocessor, tokenizer=str.split): 18 | self.model = model 19 | self.preprocessor = preprocessor 20 | self.tokenizer = tokenizer 21 | 22 | def predict_proba(self, text): 23 | """Probability estimates. 24 | 25 | The returned estimates for all classes are ordered by the 26 | label of classes. 27 | 28 | Args: 29 | text : string, the input text. 
30 | 31 | Returns: 32 | y : array-like, shape = [num_words, num_classes] 33 | Returns the probability of the word for each class in the model, 34 | """ 35 | assert isinstance(text, str) 36 | 37 | words = self.tokenizer(text) 38 | X = self.preprocessor.transform([words]) 39 | y = self.model.predict(X) 40 | y = y[0] # reduce batch dimension. 41 | 42 | return y 43 | 44 | def _get_prob(self, pred): 45 | prob = np.max(pred, -1) 46 | 47 | return prob 48 | 49 | def _get_tags(self, pred): 50 | tags = self.preprocessor.inverse_transform([pred]) 51 | tags = tags[0] # reduce batch dimension 52 | 53 | return tags 54 | 55 | def _build_response(self, sent, tags, prob): 56 | words = self.tokenizer(sent) 57 | res = { 58 | 'words': words, 59 | 'entities': [ 60 | 61 | ] 62 | } 63 | chunks = get_entities(tags) 64 | 65 | for chunk_type, chunk_start, chunk_end in chunks: 66 | chunk_end += 1 67 | entity = { 68 | 'text': ' '.join(words[chunk_start: chunk_end]), 69 | 'type': chunk_type, 70 | 'score': float(np.average(prob[chunk_start: chunk_end])), 71 | 'beginOffset': chunk_start, 72 | 'endOffset': chunk_end 73 | } 74 | res['entities'].append(entity) 75 | 76 | return res 77 | 78 | def analyze(self, text): 79 | """Analyze text and return pretty format. 80 | 81 | Args: 82 | text: string, the input text. 83 | 84 | Returns: 85 | res: dict. 86 | 87 | Examples: 88 | >>> text = 'President Obama is speaking at the White House.' 89 | >>> model.analyze(text) 90 | { 91 | "words": [ 92 | "President", 93 | "Obama", 94 | "is", 95 | "speaking", 96 | "at", 97 | "the", 98 | "White", 99 | "House." 100 | ], 101 | "entities": [ 102 | { 103 | "beginOffset": 1, 104 | "endOffset": 2, 105 | "score": 1, 106 | "text": "Obama", 107 | "type": "PER" 108 | }, 109 | { 110 | "beginOffset": 6, 111 | "endOffset": 8, 112 | "score": 1, 113 | "text": "White House.", 114 | "type": "ORG" 115 | } 116 | ] 117 | } 118 | """ 119 | pred = self.predict_proba(text) 120 | tags = self._get_tags(pred) 121 | prob = self._get_prob(pred) 122 | res = self._build_response(text, tags, prob) 123 | 124 | return res 125 | 126 | def predict(self, text): 127 | """Predict using the model. 128 | 129 | Args: 130 | text: string, the input text. 131 | 132 | Returns: 133 | tags: list, shape = (num_words,) 134 | Returns predicted values. 135 | """ 136 | pred = self.predict_proba(text) 137 | tags = self._get_tags(pred) 138 | 139 | return tags 140 | -------------------------------------------------------------------------------- /lm/README.md: -------------------------------------------------------------------------------- 1 | ## Language Modeling 2 | 3 | 4 | We do experiments on PTB and WT2 dataset, and use the mixture of softmax model [MoS](https://arxiv.org/abs/1711.03953). 5 | Main experimental results are summarized below. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
| Model | #Params | PTB Base | PTB +Finetune | PTB +Dynamic | WT2 Base | WT2 +Finetune | WT2 +Dynamic |
|-------|---------|----------|---------------|--------------|----------|---------------|--------------|
| Yang et al. (2018) | 22M | 55.97 | 54.44 | 47.69 | 63.33 | 61.45 | 40.68 |
| LSTM (this work)   | 22M | 63.78 | 62.12 | 53.11 | 69.78 | 68.68 | 44.60 |
| GRU (this work)    | 17M | 69.09 | 67.61 | 60.21 | 73.37 | 73.05 | 49.77 |
| ATR (this work)    | 9M  | 66.24 | 65.86 | 58.29 | 75.36 | 73.35 | 48.65 |
| SRU (this work)    | 13M | 69.64 | 65.29 | 60.97 | 85.15 | 84.97 | 57.97 |
| LRN (this work)    | 11M | 61.26 | 61.00 | 54.45 | 69.91 | 68.86 | 46.97 |
84 | 85 | Test perplexity. 86 | 87 | ## Requirement 88 | PyTorch >= 0.4.1 89 | 90 | ## How to Run? 91 | - download and preprocess dataset 92 | 93 | - see [MoS](https://github.com/zihangdai/mos) about the preprocessing of datasets 94 | 95 | - training and evaluation 96 | 97 | - training 98 | ``` 99 | #! /bin/bash 100 | 101 | export CUDA_VISIBLE_DEVICES=0 102 | 103 | # for PTB 104 | python3 main.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 10.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB --single_gpu --model lrn 105 | # for WT2 106 | python3 main.py --epochs 1000 --data path-of/wikitext-2 --save WT2 --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --nhidlast 650 --emsize 300 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn 107 | ``` 108 | 109 | - finetuning 110 | ``` 111 | # for PTB 112 | python3 finetune.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 15.0 --epoch 1000 --nhid 960 --emsize 280 --n_experts 15 --save PTB-XXX --single_gpu --model lrn 113 | # for WT2 114 | python3 finetune.py --epochs 1000 --data path-of/wikitext-2 --save WT2-XXX --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --emsize 300 --batch_size 15 --lr 20.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn 115 | ``` 116 | 117 | - dynamic evaluation 118 | ``` 119 | # for PTB 120 | python3 dynamiceval.py --model PTB-XXX/finetune_model.pt --data path-of/penn --lamb 0.075 --gpu 0 121 | # for WT2 122 | python3 dynamiceval.py --data path-of/wikitext-2 --model WT2-XXX/finetune_model.pt --epsilon 0.002 --gpu 0 123 | ``` 124 | 125 | - general evaluation 126 | ``` 127 | # for PTB 128 | python3 evaluate.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 10.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB-XXX --single_gpu --model lrn 129 | # for WT2 130 | python3 evaluate.py --epochs 1000 --data path-of/wikitext-2 --save WT2-XXX --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --nhidlast 650 --emsize 300 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn 131 | ``` 132 | 133 | ## Credits 134 | 135 | Source code structure is adapted from [MoS](https://github.com/zihangdai/mos). 
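All results in the table above are test perplexities, i.e. the exponential of the average per-token negative log-likelihood (in nats) on the test set; the Base / +Finetune / +Dynamic columns correspond to the training, finetuning and dynamic-evaluation stages described above. As a quick reference, here is a tiny self-contained sketch of that conversion (illustrative only; the `perplexity` helper below is not part of this codebase):
```
import math

def perplexity(total_nll_nats, n_tokens):
    # perplexity = exp(average negative log-likelihood per token)
    return math.exp(total_nll_nats / n_tokens)

# e.g. an average test loss of ~4.115 nats/token corresponds to a perplexity of ~61.3,
# roughly the PTB range reported above
print(perplexity(4.115 * 1000, 1000))
```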
-------------------------------------------------------------------------------- /lm/code/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from collections import Counter 5 | 6 | 7 | class Dictionary(object): 8 | def __init__(self): 9 | self.word2idx = {} 10 | self.idx2word = [] 11 | self.counter = Counter() 12 | self.total = 0 13 | 14 | def add_word(self, word): 15 | if word not in self.word2idx: 16 | self.idx2word.append(word) 17 | self.word2idx[word] = len(self.idx2word) - 1 18 | token_id = self.word2idx[word] 19 | self.counter[token_id] += 1 20 | self.total += 1 21 | return self.word2idx[word] 22 | 23 | def __len__(self): 24 | return len(self.idx2word) 25 | 26 | 27 | class Corpus(object): 28 | def __init__(self, path): 29 | self.dictionary = Dictionary() 30 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 31 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 32 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 33 | 34 | def tokenize(self, path): 35 | """Tokenizes a text file.""" 36 | assert os.path.exists(path) 37 | # Add words to the dictionary 38 | with open(path, 'r', encoding='utf-8') as f: 39 | tokens = 0 40 | for line in f: 41 | words = line.split() + ['<eos>'] 42 | tokens += len(words) 43 | for word in words: 44 | self.dictionary.add_word(word) 45 | 46 | # Tokenize file content 47 | with open(path, 'r', encoding='utf-8') as f: 48 | ids = torch.LongTensor(tokens) 49 | token = 0 50 | for line in f: 51 | words = line.split() + ['<eos>'] 52 | for word in words: 53 | ids[token] = self.dictionary.word2idx[word] 54 | token += 1 55 | 56 | return ids 57 | 58 | class SentCorpus(object): 59 | def __init__(self, path): 60 | self.dictionary = Dictionary() 61 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 62 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 63 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 64 | 65 | def tokenize(self, path): 66 | """Tokenizes a text file.""" 67 | assert os.path.exists(path) 68 | # Add words to the dictionary 69 | with open(path, 'r', encoding='utf-8') as f: 70 | tokens = 0 71 | for line in f: 72 | words = line.split() + ['<eos>'] 73 | tokens += len(words) 74 | for word in words: 75 | self.dictionary.add_word(word) 76 | 77 | # Tokenize file content 78 | sents = [] 79 | with open(path, 'r', encoding='utf-8') as f: 80 | for line in f: 81 | if not line: 82 | continue 83 | words = line.split() + ['<eos>'] 84 | sent = torch.LongTensor(len(words)) 85 | for i, word in enumerate(words): 86 | sent[i] = self.dictionary.word2idx[word] 87 | sents.append(sent) 88 | 89 | return sents 90 | 91 | class BatchSentLoader(object): 92 | def __init__(self, sents, batch_size, pad_id=0, cuda=False, volatile=False): 93 | self.sents = sents 94 | self.batch_size = batch_size 95 | self.sort_sents = sorted(sents, key=lambda x: x.size(0)) 96 | self.cuda = cuda 97 | self.volatile = volatile 98 | self.pad_id = pad_id 99 | 100 | def __next__(self): 101 | if self.idx >= len(self.sort_sents): 102 | raise StopIteration 103 | 104 | batch_size = min(self.batch_size, len(self.sort_sents)-self.idx) 105 | batch = self.sort_sents[self.idx:self.idx+batch_size] 106 | max_len = max([s.size(0) for s in batch]) 107 | tensor = torch.LongTensor(max_len, batch_size).fill_(self.pad_id) 108 | for i in range(len(batch)): 109 | s = batch[i] 110 | tensor[:s.size(0),i].copy_(s) 111 | if self.cuda: 112 | tensor = tensor.cuda() 113 | 114 | self.idx += batch_size 115 | 116 | return
tensor 117 | 118 | next = __next__ 119 | 120 | def __iter__(self): 121 | self.idx = 0 122 | return self 123 | 124 | if __name__ == '__main__': 125 | corpus = SentCorpus('../penn') 126 | loader = BatchSentLoader(corpus.test, 10) 127 | for i, d in enumerate(loader): 128 | print(i, d.size()) 129 | -------------------------------------------------------------------------------- /doc/code/utils/saver.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import tensorflow as tf 9 | 10 | 11 | class Saver(object): 12 | def __init__(self, 13 | checkpoints=5, # save the latest number of checkpoints 14 | output_dir=None # the output directory 15 | ): 16 | if output_dir is None: 17 | output_dir = "./output" 18 | self.output_dir = output_dir 19 | self.output_best_dir = os.path.join(output_dir, "best") 20 | 21 | self.saver = tf.train.Saver( 22 | max_to_keep=checkpoints 23 | ) 24 | self.best_saver = tf.train.Saver( 25 | max_to_keep=1 26 | ) 27 | self.best_score = -1 28 | self.score_record = tf.gfile.Open( 29 | os.path.join(self.output_best_dir, "metric.log"), 30 | mode="a+" 31 | ) 32 | 33 | def save(self, session, step, metric_score=None): 34 | if not tf.gfile.Exists(self.output_dir): 35 | tf.gfile.MkDir(self.output_dir) 36 | if not tf.gfile.Exists(self.output_best_dir): 37 | tf.gfile.MkDir(self.output_best_dir) 38 | 39 | self.saver.save(session, 40 | os.path.join(self.output_dir, "model"), 41 | global_step=step) 42 | 43 | def _move(path, new_path): 44 | if tf.gfile.Exists(path): 45 | if tf.gfile.Exists(new_path): 46 | tf.gfile.Remove(new_path) 47 | tf.gfile.Copy(path, new_path) 48 | 49 | if metric_score is not None and metric_score > self.best_score: 50 | self.best_score = metric_score 51 | self.best_saver.save( 52 | session, os.path.join(self.output_best_dir, "model")) 53 | 54 | _move(os.path.join(self.output_dir, "param.json"), 55 | os.path.join(self.output_best_dir, "param.json")) 56 | _move(os.path.join(self.output_dir, "record.json"), 57 | os.path.join(self.output_best_dir, "record.json")) 58 | 59 | # this recorder only record best scores 60 | self.score_record.write("Steps {}, Metric Score {}\n" 61 | .format(step, metric_score)) 62 | 63 | self.score_record.flush() 64 | 65 | def restore(self, session, path=None): 66 | if path is not None and tf.gfile.Exists(path): 67 | check_dir = path 68 | else: 69 | check_dir = self.output_dir 70 | 71 | checkpoint = os.path.join(check_dir, "checkpoint") 72 | if not tf.gfile.Exists(checkpoint): 73 | tf.logging.warn("No Existing Model detected") 74 | else: 75 | latest_checkpoint = tf.gfile.Open(checkpoint).readline() 76 | model_name = latest_checkpoint.strip().split(":")[1].strip() 77 | model_name = model_name[1:-1] # remove "" 78 | model_path = os.path.join(check_dir, model_name) 79 | model_path = os.path.abspath(model_path) 80 | if not tf.gfile.Exists(model_path+".meta"): 81 | tf.logging.error("model '{}' does not exists" 82 | .format(model_path)) 83 | else: 84 | try: 85 | self.saver.restore(session, model_path) 86 | except tf.errors.NotFoundError: 87 | # In this case, we simply assume that the cycle part 88 | # is mismatched, where the replicas are missing. 89 | # This would happen if you switch from un-cycle mode 90 | # to cycle mode. 
91 | tf.logging.warn("Starting Backup Restore") 92 | ops = [] 93 | reader = tf.train.load_checkpoint(model_path) 94 | for var in tf.global_variables(): 95 | name = var.op.name 96 | 97 | if reader.has_tensor(name): 98 | tf.logging.info('{} get initialization from {}' 99 | .format(name, name)) 100 | ops.append( 101 | tf.assign(var, reader.get_tensor(name))) 102 | else: 103 | tf.logging.warn("{} is missed".format(name)) 104 | restore_op = tf.group(*ops, name="restore_global_vars") 105 | session.run(restore_op) 106 | -------------------------------------------------------------------------------- /nli/code/utils/saver.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import tensorflow as tf 9 | 10 | 11 | class Saver(object): 12 | def __init__(self, 13 | checkpoints=5, # save the latest number of checkpoints 14 | output_dir=None # the output directory 15 | ): 16 | if output_dir is None: 17 | output_dir = "./output" 18 | self.output_dir = output_dir 19 | self.output_best_dir = os.path.join(output_dir, "best") 20 | 21 | self.saver = tf.train.Saver( 22 | max_to_keep=checkpoints 23 | ) 24 | self.best_saver = tf.train.Saver( 25 | max_to_keep=1 26 | ) 27 | self.best_score = -1 28 | self.score_record = tf.gfile.Open( 29 | os.path.join(self.output_best_dir, "metric.log"), 30 | mode="a+" 31 | ) 32 | 33 | def save(self, session, step, metric_score=None): 34 | if not tf.gfile.Exists(self.output_dir): 35 | tf.gfile.MkDir(self.output_dir) 36 | if not tf.gfile.Exists(self.output_best_dir): 37 | tf.gfile.MkDir(self.output_best_dir) 38 | 39 | self.saver.save(session, 40 | os.path.join(self.output_dir, "model"), 41 | global_step=step) 42 | 43 | def _move(path, new_path): 44 | if tf.gfile.Exists(path): 45 | if tf.gfile.Exists(new_path): 46 | tf.gfile.Remove(new_path) 47 | tf.gfile.Copy(path, new_path) 48 | 49 | if metric_score is not None and metric_score > self.best_score: 50 | self.best_score = metric_score 51 | self.best_saver.save( 52 | session, os.path.join(self.output_best_dir, "model")) 53 | 54 | _move(os.path.join(self.output_dir, "param.json"), 55 | os.path.join(self.output_best_dir, "param.json")) 56 | _move(os.path.join(self.output_dir, "record.json"), 57 | os.path.join(self.output_best_dir, "record.json")) 58 | 59 | # this recorder only record best scores 60 | self.score_record.write("Steps {}, Metric Score {}\n" 61 | .format(step, metric_score)) 62 | 63 | self.score_record.flush() 64 | 65 | def restore(self, session, path=None): 66 | if path is not None and tf.gfile.Exists(path): 67 | check_dir = path 68 | else: 69 | check_dir = self.output_dir 70 | 71 | checkpoint = os.path.join(check_dir, "checkpoint") 72 | if not tf.gfile.Exists(checkpoint): 73 | tf.logging.warn("No Existing Model detected") 74 | else: 75 | latest_checkpoint = tf.gfile.Open(checkpoint).readline() 76 | model_name = latest_checkpoint.strip().split(":")[1].strip() 77 | model_name = model_name[1:-1] # remove "" 78 | model_path = os.path.join(check_dir, model_name) 79 | model_path = os.path.abspath(model_path) 80 | if not tf.gfile.Exists(model_path+".meta"): 81 | tf.logging.error("model '{}' does not exists" 82 | .format(model_path)) 83 | else: 84 | try: 85 | self.saver.restore(session, model_path) 86 | except tf.errors.NotFoundError: 87 | # In this case, we simply assume that the cycle part 88 | # is mismatched, where the replicas are 
missing. 89 | # This would happen if you switch from un-cycle mode 90 | # to cycle mode. 91 | tf.logging.warn("Starting Backup Restore") 92 | ops = [] 93 | reader = tf.train.load_checkpoint(model_path) 94 | for var in tf.global_variables(): 95 | name = var.op.name 96 | 97 | if reader.has_tensor(name): 98 | tf.logging.info('{} get initialization from {}' 99 | .format(name, name)) 100 | ops.append( 101 | tf.assign(var, reader.get_tensor(name))) 102 | else: 103 | tf.logging.warn("{} is missed".format(name)) 104 | restore_op = tf.group(*ops, name="restore_global_vars") 105 | session.run(restore_op) 106 | -------------------------------------------------------------------------------- /rc/elmo_rnet/code/bilm/elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | def _l2_regularizer(weights): 33 | if l2_coef is not None: 34 | return l2_coef * tf.reduce_sum(tf.square(weights)) 35 | else: 36 | return 0.0 37 | 38 | # Get ops for computing LM embeddings and mask 39 | lm_embeddings = bilm_ops['lm_embeddings'] 40 | mask = bilm_ops['mask'] 41 | 42 | # Disable the first embedding layer 43 | # lm_embeddings = lm_embeddings[:, 1:, :, :] 44 | 45 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 46 | lm_dim = int(lm_embeddings.get_shape()[3]) 47 | 48 | with tf.control_dependencies([lm_embeddings, mask]): 49 | # Cast the mask and broadcast for layer use. 
50 | mask_float = tf.cast(mask, 'float32') 51 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 52 | 53 | def _do_ln(x): 54 | # do layer normalization excluding the mask 55 | x_masked = x * broadcast_mask 56 | N = tf.reduce_sum(mask_float) * lm_dim 57 | mean = tf.reduce_sum(x_masked) / N 58 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2 59 | ) / N 60 | return tf.nn.batch_normalization( 61 | x, mean, variance, None, None, 1E-12 62 | ) 63 | 64 | if use_top_only: 65 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 66 | # just the top layer 67 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 68 | # no regularization 69 | reg = 0.0 70 | else: 71 | W = tf.get_variable( 72 | '{}_ELMo_W'.format(name), 73 | shape=(n_lm_layers, ), 74 | initializer=tf.zeros_initializer, 75 | regularizer=_l2_regularizer, 76 | trainable=True, 77 | ) 78 | 79 | # normalize the weights 80 | normed_weights = tf.split( 81 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 82 | ) 83 | # split LM layers 84 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 85 | 86 | # compute the weighted, normalized LM activations 87 | pieces = [] 88 | for w, t in zip(normed_weights, layers): 89 | if do_layer_norm: 90 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 91 | else: 92 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 93 | sum_pieces = tf.add_n(pieces) 94 | 95 | # get the regularizer 96 | reg = [ 97 | r for r in tf.get_collection( 98 | tf.GraphKeys.REGULARIZATION_LOSSES) 99 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 100 | ] 101 | if len(reg) != 1: 102 | pass 103 | 104 | # scale the weighted sum by gamma 105 | gamma = tf.get_variable( 106 | '{}_ELMo_gamma'.format(name), 107 | shape=(1, ), 108 | initializer=tf.ones_initializer, 109 | regularizer=None, 110 | trainable=True, 111 | ) 112 | weighted_lm_layers = sum_pieces * gamma 113 | 114 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 115 | 116 | return ret 117 | 118 | -------------------------------------------------------------------------------- /doc/code/tasks.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import abc 9 | import csv 10 | import numpy as np 11 | 12 | 13 | def get_task(params, is_training): 14 | name = params.task.lower() 15 | 16 | if name == "amafull": 17 | return AMAFull(params.data_path, is_training) 18 | elif name == "amapolar": 19 | return AMAPolar(params.data_path, is_training) 20 | elif name == "yahoo": 21 | return YaHoo(params.data_path, is_training) 22 | elif name == "yelpfull": 23 | return YelpFull(params.data_path, is_training) 24 | elif name == "yelppolar": 25 | return YelpPolar(params.data_path, is_training) 26 | else: 27 | raise NotImplementedError("Not Supported: {}".format(name)) 28 | 29 | 30 | class Task(object): 31 | def __init__(self, data_path, is_training=False): 32 | self.data_path = data_path 33 | self.is_training = is_training 34 | 35 | self.trainset = [] 36 | self.devset = [] 37 | self.testset = [] 38 | 39 | if self.is_training: 40 | self._read_all_train_dev_data() 41 | self._read_all_test_data() 42 | 43 | def _clean_text(self, text_in): 44 | return text_in.replace('\\"', '"').replace('\\n', ' ') 45 | 46 | def _read_all_train_dev_data(self): 47 | train_data_path = os.path.join(self.data_path, "train.csv") 48 | 49 | dataset = [] 50 | with open(train_data_path) 
as tfile: 51 | reader = csv.reader(tfile, delimiter=",") 52 | 53 | for sample in reader: 54 | dataset.append(sample) 55 | 56 | np.random.shuffle(dataset) 57 | 58 | # split the dataset with 90% and 10% 59 | dev_size = int(len(dataset) * 0.1) 60 | 61 | self.devset = dataset[:dev_size] 62 | self.trainset = dataset[dev_size:] 63 | 64 | def _read_all_test_data(self): 65 | test_data_path = os.path.join(self.data_path, "test.csv") 66 | 67 | self.testset = [] 68 | with open(test_data_path) as tfile: 69 | reader = csv.reader(tfile, delimiter=",") 70 | 71 | for sample in reader: 72 | self.testset.append(sample) 73 | 74 | def _data_iter(self, iterator): 75 | for sample in iterator: 76 | label = int(sample[0]) - 1 77 | document = ' '.join(sample[1:]) 78 | 79 | document = self._clean_text(document) 80 | 81 | yield (label, document) 82 | 83 | def get_train_data(self): 84 | np.random.shuffle(self.trainset) 85 | for sample in self._data_iter(self.trainset): 86 | yield sample 87 | 88 | def get_dev_data(self): 89 | for sample in self._data_iter(self.devset): 90 | yield sample 91 | 92 | def get_test_data(self): 93 | for sample in self._data_iter(self.testset): 94 | yield sample 95 | 96 | @abc.abstractmethod 97 | def get_label_size(self): 98 | raise NotImplementedError("Not Supported") 99 | 100 | 101 | # amazon_review_full_csv 102 | class AMAFull(Task): 103 | def __init__(self, data_path, is_training=False): 104 | data_path = os.path.join(data_path, "amazon_review_full_csv") 105 | super(AMAFull, self).__init__(data_path, is_training) 106 | 107 | def get_label_size(self): 108 | return 5 109 | 110 | 111 | # amazon_review_polarity_csv 112 | class AMAPolar(Task): 113 | def __init__(self, data_path, is_training=False): 114 | data_path = os.path.join(data_path, "amazon_review_polarity_csv") 115 | super(AMAPolar, self).__init__(data_path, is_training) 116 | 117 | def get_label_size(self): 118 | return 2 119 | 120 | 121 | # yahoo_answers_csv 122 | class YaHoo(Task): 123 | def __init__(self, data_path, is_training=False): 124 | data_path = os.path.join(data_path, "yahoo_answers_csv") 125 | super(YaHoo, self).__init__(data_path, is_training) 126 | 127 | def get_label_size(self): 128 | return 10 129 | 130 | 131 | # yelp_review_full_csv 132 | class YelpFull(Task): 133 | def __init__(self, data_path, is_training=False): 134 | data_path = os.path.join(data_path, "yelp_review_full_csv") 135 | super(YelpFull, self).__init__(data_path, is_training) 136 | 137 | def get_label_size(self): 138 | return 5 139 | 140 | 141 | # yelp_review_polarity_csv 142 | class YelpPolar(Task): 143 | def __init__(self, data_path, is_training=False): 144 | data_path = os.path.join(data_path, "yelp_review_polarity_csv") 145 | super(YelpPolar, self).__init__(data_path, is_training) 146 | 147 | def get_label_size(self): 148 | return 2 149 | -------------------------------------------------------------------------------- /doc/code/evalu.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import time 8 | import json 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | from utils import queuer 13 | 14 | 15 | def decoding(sprobs, samples, params, mask=None): 16 | """Generate decoded sequence from seqs""" 17 | if mask is None: 18 | mask = [1.] 
* len(sprobs) 19 | 20 | flat_sprobs = [] 21 | for _sprobs, _m in zip(sprobs, mask): 22 | if _m < 1.: 23 | continue 24 | 25 | for start_prob in _sprobs: 26 | flat_sprobs.append(start_prob) 27 | 28 | assert len(flat_sprobs) == len(samples), 'Decoding length mismatch!' 29 | 30 | results = [] 31 | 32 | for (idx, sample), pred in zip(samples, flat_sprobs): 33 | gold_label = sample['label_id'] 34 | pred_label = pred 35 | 36 | results.append({ 37 | 'pred_answer': pred_label, 38 | 'sample_id': idx, 39 | 'gold_answer': gold_label 40 | }) 41 | 42 | return results 43 | 44 | 45 | def predict(session, features, 46 | out_pred, dataset, params, train="test"): 47 | """Performing decoding with exising information""" 48 | results = [] 49 | 50 | batcher = dataset.batcher(params.eval_batch_size, 51 | buffer_size=params.buffer_size, 52 | shuffle=False, train=train) 53 | eval_queue = queuer.EnQueuer(batcher, 54 | multiprocessing=params.data_multiprocessing, 55 | random_seed=params.random_seed) 56 | eval_queue.start(workers=params.nthreads, 57 | max_queue_size=params.max_queue_size) 58 | 59 | def _predict_one_batch(data_on_gpu): 60 | feed_dicts = {} 61 | flat_raw_data = [] 62 | for fidx, data in enumerate(data_on_gpu): 63 | # define feed_dict 64 | feed_dict = { 65 | features[fidx]["t"]: data['token_ids'], 66 | features[fidx]["l"]: data['l_id'], 67 | } 68 | if params.use_char: 69 | feed_dict[features[fidx]["c"]] = data['char_ids'] 70 | 71 | if params.enable_bert: 72 | feed_dict[features[fidx]["s"]] = data['subword_ids'] 73 | feed_dict[features[fidx]["sb"]] = data['subword_back'] 74 | 75 | feed_dicts.update(feed_dict) 76 | flat_raw_data.extend(data['raw']) 77 | 78 | # pick up valid outputs 79 | data_size = len(data_on_gpu) 80 | valid_out_pred = out_pred[:data_size] 81 | 82 | decode_spred = session.run( 83 | valid_out_pred, feed_dict=feed_dicts) 84 | 85 | predictions = decoding( 86 | decode_spred, flat_raw_data, params 87 | ) 88 | 89 | return predictions 90 | 91 | very_begin_time = time.time() 92 | data_on_gpu = [] 93 | for bidx, data in enumerate(eval_queue.get()): 94 | 95 | data_on_gpu.append(data) 96 | # use multiple gpus, and data samples is not enough 97 | if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus): 98 | continue 99 | 100 | start_time = time.time() 101 | predictions = _predict_one_batch(data_on_gpu) 102 | data_on_gpu = [] 103 | results.extend(predictions) 104 | 105 | tf.logging.info( 106 | "Decoding Batch {} using {:.3f} s, translating {} " 107 | "sentences using {:.3f} s in total".format( 108 | bidx, time.time() - start_time, 109 | len(results), time.time() - very_begin_time 110 | ) 111 | ) 112 | 113 | eval_queue.stop() 114 | 115 | if len(data_on_gpu) > 0: 116 | start_time = time.time() 117 | predictions = _predict_one_batch(data_on_gpu) 118 | results.extend(predictions) 119 | 120 | tf.logging.info( 121 | "Decoding Batch {} using {:.3f} s, translating {} " 122 | "sentences using {:.3f} s in total".format( 123 | 'final', time.time() - start_time, 124 | len(results), time.time() - very_begin_time 125 | ) 126 | ) 127 | 128 | results = sorted(results, key=lambda x: x['sample_id']) 129 | 130 | golds = [result['gold_answer'] for result in results] 131 | preds = [result['pred_answer'] for result in results] 132 | 133 | score = np.sum(np.asarray(golds) == np.asarray(preds)) * 100. 
/ len(golds) 134 | 135 | return results, score 136 | 137 | 138 | def dump_predictions(results, output): 139 | """save translation""" 140 | with tf.gfile.Open(output, 'w') as writer: 141 | for sample in results: 142 | sample['pred_answer'] = sample['pred_answer'] 143 | writer.write(json.dumps(sample) + "\n") 144 | tf.logging.info("Saving translations into {}".format(output)) 145 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | ## Document Classification 2 | 3 | One concern with LRN is that simplifying the recurrent component might weaken modeling capacity, in particular the ability 4 | to capture long-range dependencies. We address this concern with experiments on document classification. 5 | 6 | We choose: 7 | - Amazon Review Polarity (AmaPolar, 2 labels, 3.6M/0.4M for training/testing) 8 | - Amazon Review Full (AmaFull, 5 labels, 3M/0.65M for training/testing) 9 | - Yahoo! Answers (Yahoo, 10 labels, 1.4M/60K for training/testing) 10 | - Yelp Review Polarity (YelpPolar, 2 labels, 0.56M/38K for training/testing) 11 | 12 | The datasets come from [Zhang et al. (2015)](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf). 13 | We use a bidirectional RNN model followed by an attentive pooling layer. Char and GloVe embeddings are used for word representation. 14 | Main experimental results are summarized below. 15 |
| Model | #Params | AmaPolar ERR | AmaPolar Time | Yahoo ERR | Yahoo Time | AmaFull ERR | AmaFull Time | YelpPolar ERR | YelpPolar Time |
|-------|---------|--------------|---------------|-----------|------------|-------------|--------------|---------------|----------------|
| Zhang et al. (2015) | -    | 6.10 | -     | 29.16 | -     | 40.57 | -     | 5.26 | -     |
| LSTM (this work)    | 227K | 4.37 | 0.947 | 24.62 | 1.332 | 37.22 | 1.003 | 3.58 | 1.362 |
| GRU (this work)     | 176K | 4.39 | 0.948 | 24.68 | 1.242 | 37.20 | 0.982 | 3.47 | 1.230 |
| ATR (this work)     | 74K  | 4.78 | 0.867 | 25.33 | 1.117 | 38.54 | 0.836 | 4.00 | 1.124 |
| SRU (this work)     | 194K | 4.95 | 0.919 | 24.78 | 1.394 | 38.23 | 0.907 | 3.99 | 1.310 |
| LRN (this work)     | 151K | 4.98 | 0.731 | 25.07 | 1.038 | 38.42 | 0.788 | 3.98 | 1.022 |
109 | 110 | *Time*: time in seconds per training batch measured from 1k training steps. 111 | 112 | ## Requirement 113 | tensorflow >= 1.8.1 114 | 115 | ## How to Run? 116 | 117 | - Download and preprocess the dataset 118 | 119 | - The dataset link: https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M 120 | - Prepare embedding and vocabulary 121 | 122 | Download the [pre-trained GloVe embedding](http://nlp.stanford.edu/data/glove.840B.300d.zip). 123 | Generate the vocabulary for each task as follows: 124 | ``` 125 | task=(amafull amapolar yahoo yelppolar) 126 | python code/run.py --mode vocab --config config.py --parameters=task="${task}",output_dir="${task}_vocab" 127 | ``` 128 | 129 | 130 | - Training and evaluation 131 | 132 | - Train the model as follows: 133 | ``` 134 | # configure your cuda library if necessary 135 | export CUDA_ROOT=XXX 136 | export PATH=$CUDA_ROOT/bin:$PATH 137 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 138 | 139 | task=(amafull amapolar yahoo yelppolar) 140 | python code/run.py --mode train --config config.py --parameters=task="${task}",output_dir="${task}_train",gpus=[1],word_vocab_file="${task}_vocab/vocab.word",char_vocab_file="${task}_vocab/vocab.char",enable_hierarchy=False,nthreads=2,enable_bert=False,cell="lrn",swap_memory=False 141 | ``` 142 | Other hyperparameter settings are available in the given config.py. 143 | 144 | - Test the model as follows: 145 | ``` 146 | task=(amafull amapolar yahoo yelppolar) 147 | python code/run.py --mode test --config config.py --parameters=task="${task}",output_dir="${task}_train/best",gpus=[0],word_vocab_file="${task}_vocab/vocab.word",char_vocab_file="${task}_vocab/vocab.char",enable_hierarchy=False,nthreads=2,enable_bert=False,cell="lrn",swap_memory=False,train_continue=False,test_output=${task}.out.txt 148 | ``` 149 | 150 | ## Credits 151 | 152 | Source code structure is adapted from [zero](https://github.com/bzhangGo/zero). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lrn 2 | Source code for "A Lightweight Recurrent Network for Sequence Modeling" 3 | 4 | 5 | ## Model Architecture 6 | In our new paper, we propose the lightweight recurrent network (LRN), which combines the strengths of 7 | [ATR](https://arxiv.org/abs/1810.12546) and [SRU](https://arxiv.org/abs/1709.02755). 8 | 9 | * ATR helps reduce model parameters and avoids additional free parameters for gate calculation, through its twin-gate 10 | mechanism. 11 | * SRU follows the [QRNN](https://arxiv.org/abs/1611.01576) and moves all recurrent computations outside the recurrence. 12 | 13 | Based on the above units, we propose [LRN](xxx): 14 | 15 | 16 | 17 | where g(·) is an activation function, *tanh* or *identity*. Wq, Wk and Wv 18 | are model parameters. The matrix computation (as well as potential layer normalization) can be shifted outside the 19 | recurrence. Therefore, the whole model runs fast. 20 | 21 | When applying the twin-gate mechanism, the output values in **h**t might suffer from an explosion issue 22 | and could grow towards infinity. This is why we add the activation function. An alternative solution 23 | would be using layer normalization, which forces activation values to be stable.
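To make the point about shifting the matrix computation outside the recurrence concrete, below is a minimal NumPy sketch of this computation pattern. It is an illustration only, not the repository's implementation: the projections Q, K and V are computed for all timesteps with three batched matrix products, and the loop that remains is purely element-wise. The exact LRN gate equations (which projection feeds which gate, and the signs) are given in the paper and in `code/rnns/lrn.py`; the two gating lines below are placeholders in the ATR twin-gate style and should be read as an assumption.
```
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def run_cell(x, Wq, Wk, Wv, g=np.tanh):
    # x: [T, d_in]; Wq/Wk/Wv: [d_in, d]
    Q, K, V = x @ Wq, x @ Wk, x @ Wv      # all matrix products happen outside the loop
    h = np.zeros(Wv.shape[1])
    states = []
    for t in range(x.shape[0]):           # only cheap element-wise work inside the recurrence
        i_t = sigmoid(K[t] + h)           # assumed input gate (placeholder form)
        f_t = sigmoid(Q[t] - h)           # assumed forget gate (placeholder form)
        h = g(i_t * V[t] + f_t * h)       # g is tanh or identity, as described above
        states.append(h)
    return np.stack(states)

x = np.random.randn(5, 8)
Wq, Wk, Wv = np.random.randn(8, 16), np.random.randn(8, 16), np.random.randn(8, 16)
print(run_cell(x, Wq, Wk, Wv).shape)      # (5, 16)
```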
24 | 25 | ## Structure Analysis 26 | One way to understand the model is to unfold the LRN structure along input tokens: 27 | 28 | 29 | 30 | The above structure, which is also observed by [Zhang et al.](https://arxiv.org/abs/1810.12546), [Lee et al.](https://arxiv.org/abs/1705.07393), 31 | among others, endows the RNN model with multiple interpretations. We provide two as follows: 32 | 33 | * *Relation with Self Attention Networks* 34 | 35 | 36 | Informally, LRN assembles forget gates from step *t* to step *k+1* in order to query the key (input gate). The resulting 37 | weight is assigned to the corresponding value representation and contributes to the final hidden representation (a short numerical sketch of this unrolling is given at the end of this README). 38 | 39 | Do the learned weights make sense? We run a classification experiment on the AmaPolar task with a unidirectional linear-LRN. 40 | The final hidden state is fed into the classifier. One example below shows the learned weights. The term *great* gains 41 | a large weight, which decays slowly and contributes to the final *positive* decision. 42 | 43 | 44 | * *Long-term and Short-term Memory* 45 | 46 | 47 | Another view of the unfolded structure is that different gates form different memory mechanisms. The input gate acts as 48 | a short-term memory and indicates how much information is activated for the current token. The forget gates form a forget 49 | chain that controls how to erase meaningless past information. 50 | 51 | ## Experiments 52 | 53 | We ran experiments on six different tasks: 54 | * [Natural Language Inference](nli) 55 | * [Document Classification](doc) 56 | * [Machine Translation](mt) 57 | * [Reading Comprehension](rc) 58 | * [Named Entity Recognition](ner) 59 | * [Language Modeling](lm) 60 | 61 | 62 | ## Citation 63 | 64 | Please cite the following paper: 65 | > Biao Zhang; Rico Sennrich (2019). *A Lightweight Recurrent Network for Sequence Modeling*. 66 | In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. Florence, Italy. 67 | 68 | ``` 69 | @inproceedings{zhang-sennrich:2019:ACL, 70 | address = "Florence, Italy", 71 | author = "Zhang, Biao and Sennrich, Rico", 72 | booktitle = "{Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}", 73 | publisher = "Association for Computational Linguistics", 74 | title = "{A Lightweight Recurrent Network for Sequence Modeling}", 75 | year = "2019" 76 | } 77 | ``` 78 | 79 | ## Contact 80 | 81 | For any further comments or questions about LRN, please email Biao Zhang.
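Footnote to the structure analysis above: for the identity-activation (linear) case, unrolling a recurrence of the form h_t = i_t ⊙ v_t + f_t ⊙ h_{t-1} gives exactly the attention-like weighting described in the self-attention interpretation, with the weight on step k being i_k scaled by the product of forget gates from k+1 to t. The snippet below checks this equivalence numerically; the gates and values are random stand-ins rather than actual LRN activations.
```
import numpy as np

rng = np.random.default_rng(0)
T, d = 6, 4
i = rng.uniform(0, 1, (T, d))    # stand-in input gates
f = rng.uniform(0, 1, (T, d))    # stand-in forget gates
v = rng.normal(size=(T, d))      # stand-in value vectors

# recurrent form (identity activation): h_t = i_t * v_t + f_t * h_{t-1}
h = np.zeros(d)
for t in range(T):
    h = i[t] * v[t] + f[t] * h

# unrolled form: h_T = sum_k (prod_{j>k} f_j) * i_k * v_k
h_unrolled = sum(np.prod(f[k + 1:], axis=0) * i[k] * v[k] for k in range(T))

print(np.allclose(h, h_unrolled))  # True
```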
-------------------------------------------------------------------------------- /doc/code/utils/cycle.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def _zero_variables(variables, name=None): 11 | ops = [] 12 | 13 | for var in variables: 14 | with tf.device(var.device): 15 | op = var.assign(tf.zeros(var.shape.as_list())) 16 | ops.append(op) 17 | 18 | return tf.group(*ops, name=name or "zero_variables") 19 | 20 | 21 | def _replicate_variables(variables, device=None, suffix="Replica"): 22 | new_vars = [] 23 | 24 | for var in variables: 25 | device = device or var.device 26 | with tf.device(device): 27 | name = var.op.name + "/{}".format(suffix) 28 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()), 29 | name=name, trainable=False)) 30 | 31 | return new_vars 32 | 33 | 34 | def _collect_gradients(gradients, variables): 35 | ops = [] 36 | 37 | for grad, var in zip(gradients, variables): 38 | if isinstance(grad, tf.Tensor): 39 | ops.append(tf.assign_add(var, grad)) 40 | else: 41 | ops.append(tf.scatter_add(var, grad.indices, grad.values)) 42 | 43 | return tf.group(*ops, name="collect_gradients") 44 | 45 | 46 | def create_train_op(named_scalars, grads_and_vars, optimizer, global_step, params): 47 | gradients = [item[0] for item in grads_and_vars] 48 | variables = [item[1] for item in grads_and_vars] 49 | 50 | if params.update_cycle == 1: 51 | zero_variables_op = tf.no_op("zero_variables") 52 | collect_op = tf.no_op("collect_op") 53 | else: 54 | named_vars = {} 55 | for name in named_scalars: 56 | named_var = tf.Variable(tf.zeros([]), 57 | name="{}/CTrainOpReplica".format(name), 58 | trainable=False) 59 | named_vars[name] = named_var 60 | count_var = tf.Variable(tf.zeros([]), name="count/CTrainOpReplica", 61 | trainable=False) 62 | slot_variables = _replicate_variables(variables, suffix="CTrainOpReplica") 63 | zero_variables_op = _zero_variables( 64 | slot_variables + [count_var] + named_vars.values()) 65 | 66 | collect_ops = [] 67 | # collect gradients 68 | collect_grads_op = _collect_gradients(gradients, slot_variables) 69 | collect_ops.append(collect_grads_op) 70 | 71 | # collect other scalars 72 | for name in named_scalars: 73 | scalar = named_scalars[name] 74 | named_var = named_vars[name] 75 | collect_op = tf.assign_add(named_var, scalar) 76 | collect_ops.append(collect_op) 77 | # collect counting variable 78 | collect_count_op = tf.assign_add(count_var, 1.0) 79 | collect_ops.append(collect_count_op) 80 | 81 | collect_op = tf.group(*collect_ops, name="collect_op") 82 | scale = 1.0 / (tf.to_float(count_var) + 1.0) 83 | gradients = [scale * (g + s) 84 | for (g, s) in zip(gradients, slot_variables)] 85 | 86 | for name in named_scalars: 87 | named_scalars[name] = scale * ( 88 | named_scalars[name] + named_vars[name]) 89 | 90 | global_norm = tf.global_norm(gradients) 91 | 92 | # Gradient clipping 93 | if isinstance(params.clip_grad_norm or None, float): 94 | gradients, _ = tf.clip_by_global_norm(gradients, 95 | params.clip_grad_norm, 96 | use_norm=global_norm) 97 | 98 | # Update variables 99 | grads_and_vars = list(zip(gradients, variables)) 100 | train_op = optimizer.apply_gradients(grads_and_vars, global_step) 101 | 102 | ops = { 103 | "zero_op": zero_variables_op, 104 | "collect_op": collect_op, 105 | "train_op": train_op 106 | } 107 | 108 | # apply ema 109 | 
if params.ema_decay > 0.: 110 | tf.logging.info('Using Exp Moving Average to train the model with decay {}.'.format(params.ema_decay)) 111 | ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay, num_updates=global_step) 112 | ema_op = ema.apply(variables) 113 | with tf.control_dependencies([ops['train_op']]): 114 | ops['train_op'] = tf.group(ema_op) 115 | bck_vars = _replicate_variables(variables, suffix="CTrainOpBackUpReplica") 116 | 117 | ops['ema_backup_op'] = tf.group(*(tf.assign(bck, var.read_value()) 118 | for bck, var in zip(bck_vars, variables))) 119 | ops['ema_restore_op'] = tf.group(*(tf.assign(var, bck.read_value()) 120 | for bck, var in zip(bck_vars, variables))) 121 | ops['ema_assign_op'] = tf.group(*(tf.assign(var, ema.average(var).read_value()) 122 | for var in variables)) 123 | 124 | ret = named_scalars 125 | ret.update({ 126 | "gradient_norm": global_norm, 127 | }) 128 | 129 | return ret, ops 130 | -------------------------------------------------------------------------------- /nli/code/utils/cycle.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def _zero_variables(variables, name=None): 11 | ops = [] 12 | 13 | for var in variables: 14 | with tf.device(var.device): 15 | op = var.assign(tf.zeros(var.shape.as_list())) 16 | ops.append(op) 17 | 18 | return tf.group(*ops, name=name or "zero_variables") 19 | 20 | 21 | def _replicate_variables(variables, device=None, suffix="Replica"): 22 | new_vars = [] 23 | 24 | for var in variables: 25 | device = device or var.device 26 | with tf.device(device): 27 | name = var.op.name + "/{}".format(suffix) 28 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()), 29 | name=name, trainable=False)) 30 | 31 | return new_vars 32 | 33 | 34 | def _collect_gradients(gradients, variables): 35 | ops = [] 36 | 37 | for grad, var in zip(gradients, variables): 38 | if isinstance(grad, tf.Tensor): 39 | ops.append(tf.assign_add(var, grad)) 40 | else: 41 | ops.append(tf.scatter_add(var, grad.indices, grad.values)) 42 | 43 | return tf.group(*ops, name="collect_gradients") 44 | 45 | 46 | def create_train_op(named_scalars, grads_and_vars, optimizer, global_step, params): 47 | gradients = [item[0] for item in grads_and_vars] 48 | variables = [item[1] for item in grads_and_vars] 49 | 50 | if params.update_cycle == 1: 51 | zero_variables_op = tf.no_op("zero_variables") 52 | collect_op = tf.no_op("collect_op") 53 | else: 54 | named_vars = {} 55 | for name in named_scalars: 56 | named_var = tf.Variable(tf.zeros([]), 57 | name="{}/CTrainOpReplica".format(name), 58 | trainable=False) 59 | named_vars[name] = named_var 60 | count_var = tf.Variable(tf.zeros([]), name="count/CTrainOpReplica", 61 | trainable=False) 62 | slot_variables = _replicate_variables(variables, suffix="CTrainOpReplica") 63 | zero_variables_op = _zero_variables( 64 | slot_variables + [count_var] + named_vars.values()) 65 | 66 | collect_ops = [] 67 | # collect gradients 68 | collect_grads_op = _collect_gradients(gradients, slot_variables) 69 | collect_ops.append(collect_grads_op) 70 | 71 | # collect other scalars 72 | for name in named_scalars: 73 | scalar = named_scalars[name] 74 | named_var = named_vars[name] 75 | collect_op = tf.assign_add(named_var, scalar) 76 | collect_ops.append(collect_op) 77 | # collect counting variable 78 | collect_count_op = 
tf.assign_add(count_var, 1.0) 79 | collect_ops.append(collect_count_op) 80 | 81 | collect_op = tf.group(*collect_ops, name="collect_op") 82 | scale = 1.0 / (tf.to_float(count_var) + 1.0) 83 | gradients = [scale * (g + s) 84 | for (g, s) in zip(gradients, slot_variables)] 85 | 86 | for name in named_scalars: 87 | named_scalars[name] = scale * ( 88 | named_scalars[name] + named_vars[name]) 89 | 90 | global_norm = tf.global_norm(gradients) 91 | 92 | # Gradient clipping 93 | if isinstance(params.clip_grad_norm or None, float): 94 | gradients, _ = tf.clip_by_global_norm(gradients, 95 | params.clip_grad_norm, 96 | use_norm=global_norm) 97 | 98 | # Update variables 99 | grads_and_vars = list(zip(gradients, variables)) 100 | train_op = optimizer.apply_gradients(grads_and_vars, global_step) 101 | 102 | ops = { 103 | "zero_op": zero_variables_op, 104 | "collect_op": collect_op, 105 | "train_op": train_op 106 | } 107 | 108 | # apply ema 109 | if params.ema_decay > 0.: 110 | tf.logging.info('Using Exp Moving Average to train the model with decay {}.'.format(params.ema_decay)) 111 | ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay, num_updates=global_step) 112 | ema_op = ema.apply(variables) 113 | with tf.control_dependencies([ops['train_op']]): 114 | ops['train_op'] = tf.group(ema_op) 115 | bck_vars = _replicate_variables(variables, suffix="CTrainOpBackUpReplica") 116 | 117 | ops['ema_backup_op'] = tf.group(*(tf.assign(bck, var.read_value()) 118 | for bck, var in zip(bck_vars, variables))) 119 | ops['ema_restore_op'] = tf.group(*(tf.assign(var, bck.read_value()) 120 | for bck, var in zip(bck_vars, variables))) 121 | ops['ema_assign_op'] = tf.group(*(tf.assign(var, ema.average(var).read_value()) 122 | for var in variables)) 123 | 124 | ret = named_scalars 125 | ret.update({ 126 | "gradient_norm": global_norm, 127 | }) 128 | 129 | return ret, ops 130 | -------------------------------------------------------------------------------- /nli/code/evalu.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import time 8 | import json 9 | import tensorflow as tf 10 | 11 | from utils import queuer 12 | 13 | 14 | def decoding(sprobs, samples, params, mask=None): 15 | """Generate decoded sequence from seqs""" 16 | if mask is None: 17 | mask = [1.] * len(sprobs) 18 | 19 | flat_sprobs = [] 20 | for _sprobs, _m in zip(sprobs, mask): 21 | if _m < 1.: continue 22 | 23 | for start_prob in _sprobs: 24 | flat_sprobs.append(start_prob) 25 | 26 | assert len(flat_sprobs) == len(samples), 'Decoding length mismatch!' 
27 | 28 | results = [] 29 | 30 | for (idx, sample), pred in zip(samples, flat_sprobs): 31 | gold_label = sample[0] 32 | pred_label = pred 33 | 34 | results.append({ 35 | 'pred_answer': int(pred_label), 36 | 'sample_id': idx, 37 | 'gold_answer': gold_label 38 | }) 39 | 40 | return results 41 | 42 | 43 | def predict(session, features, 44 | out_pred, dataset, params, train=True): 45 | """Performing decoding with existing information""" 46 | results = [] 47 | 48 | batcher = dataset.batcher(params.eval_batch_size, 49 | buffer_size=params.buffer_size, 50 | shuffle=False, train=train) 51 | eval_queue = queuer.EnQueuer(batcher, 52 | multiprocessing=params.data_multiprocessing, 53 | random_seed=params.random_seed) 54 | eval_queue.start(workers=params.nthreads, 55 | max_queue_size=params.max_queue_size) 56 | 57 | def _predict_one_batch(data_on_gpu): 58 | feed_dicts = {} 59 | flat_raw_data = [] 60 | for fidx, data in enumerate(data_on_gpu): 61 | # define feed_dict 62 | feed_dict = { 63 | features[fidx]["p"]: data['p_token_ids'], 64 | features[fidx]["h"]: data['h_token_ids'], 65 | features[fidx]["l"]: data['l_id'], 66 | } 67 | if params.use_char: 68 | feed_dict[features[fidx]["pc"]] = data['p_char_ids'] 69 | feed_dict[features[fidx]["hc"]] = data['h_char_ids'] 70 | 71 | if params.enable_bert: 72 | feed_dict[features[fidx]["ps"]] = data['p_subword_ids'] 73 | feed_dict[features[fidx]["hs"]] = data['h_subword_ids'] 74 | feed_dict[features[fidx]["pb"]] = data['p_subword_back'] 75 | feed_dict[features[fidx]["hb"]] = data['h_subword_back'] 76 | 77 | feed_dicts.update(feed_dict) 78 | flat_raw_data.extend(data['raw']) 79 | 80 | # pick up valid outputs 81 | data_size = len(data_on_gpu) 82 | valid_out_pred = out_pred[:data_size] 83 | 84 | decode_spred = session.run( 85 | valid_out_pred, feed_dict=feed_dicts) 86 | 87 | predictions = decoding( 88 | decode_spred, flat_raw_data, params 89 | ) 90 | 91 | return predictions 92 | 93 | very_begin_time = time.time() 94 | data_on_gpu = [] 95 | for bidx, data in enumerate(eval_queue.get()): 96 | 97 | data_on_gpu.append(data) 98 | # when using multiple gpus, wait until enough data samples are collected 99 | if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus): 100 | continue 101 | 102 | start_time = time.time() 103 | predictions = _predict_one_batch(data_on_gpu) 104 | data_on_gpu = [] 105 | results.extend(predictions) 106 | 107 | tf.logging.info( 108 | "Decoding Batch {} using {:.3f} s, translating {} " 109 | "sentences using {:.3f} s in total".format( 110 | bidx, time.time() - start_time, 111 | len(results), time.time() - very_begin_time 112 | ) 113 | ) 114 | 115 | eval_queue.stop() 116 | 117 | if len(data_on_gpu) > 0: 118 | start_time = time.time() 119 | predictions = _predict_one_batch(data_on_gpu) 120 | results.extend(predictions) 121 | 122 | tf.logging.info( 123 | "Decoding Batch {} using {:.3f} s, translating {} " 124 | "sentences using {:.3f} s in total".format( 125 | 'final', time.time() - start_time, 126 | len(results), time.time() - very_begin_time 127 | ) 128 | ) 129 | 130 | return results 131 | 132 | 133 | def eval_metric(results, params): 134 | """Accuracy evaluation""" 135 | 136 | crr_cnt, total_cnt = 0, 0 137 | 138 | for result in results: 139 | total_cnt += 1 140 | 141 | p = result['pred_answer'] 142 | g = result['gold_answer'] 143 | 144 | if p == g: 145 | crr_cnt += 1 146 | 147 | return crr_cnt * 100.
/ total_cnt 148 | 149 | 150 | def dump_predictions(results, output): 151 | """save predictions""" 152 | with tf.gfile.Open(output, 'w') as writer: 153 | for sample in results: 154 | writer.write(json.dumps(sample) + "\n") 155 | tf.logging.info("Saving predictions into {}".format(output)) 156 | -------------------------------------------------------------------------------- /nli/code/vocab.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import json 8 | import argparse 9 | import numpy as np 10 | 11 | 12 | class Vocab(object): 13 | def __init__(self, lower=False, vocab_file=None): 14 | self.word2id = {} 15 | self.id2word = {} 16 | self.word2count = {} 17 | 18 | self.pad_sym = "<pad>" 19 | self.eos_sym = "<eos>" 20 | self.unk_sym = "<unk>" 21 | 22 | self.lower = lower 23 | 24 | self.insert(self.pad_sym) 25 | self.insert(self.unk_sym) 26 | self.insert(self.eos_sym) 27 | 28 | if vocab_file is not None: 29 | self.load_vocab(vocab_file) 30 | 31 | def insert(self, token): 32 | token = token if not self.lower else token.lower() 33 | if token not in self.word2id: 34 | index = len(self.word2id) 35 | self.word2id[token] = index 36 | self.id2word[index] = token 37 | 38 | self.word2count[token] = 0 39 | self.word2count[token] += 1 40 | 41 | def size(self): 42 | return len(self.word2id) 43 | 44 | def load_vocab(self, vocab_file): 45 | with open(vocab_file, 'r') as reader: 46 | for token in reader: 47 | self.insert(token.strip()) 48 | 49 | def get_token(self, id): 50 | if id in self.id2word: 51 | return self.id2word[id] 52 | return self.unk_sym 53 | 54 | def get_id(self, token): 55 | token = token if not self.lower else token.lower() 56 | if token in self.word2id: 57 | return self.word2id[token] 58 | return self.word2id[self.unk_sym] 59 | 60 | def sort_vocab(self, least_freq=-1): 61 | sorted_word2count = sorted( 62 | self.word2count.items(), key=lambda x: - x[1]) 63 | self.word2id, self.id2word, self.word2count = {}, {}, {} 64 | self.insert(self.pad_sym) 65 | self.insert(self.unk_sym) 66 | self.insert(self.eos_sym) 67 | for word, freq in sorted_word2count: 68 | if least_freq > 0: 69 | if freq <= least_freq: 70 | continue 71 | self.insert(word) 72 | 73 | def save_vocab(self, vocab_file): 74 | with open(vocab_file, 'w') as writer: 75 | for id in range(self.size()): 76 | writer.write(self.id2word[id].encode("utf-8") + "\n") 77 | 78 | def to_id(self, tokens, append_eos=True): 79 | if not append_eos: 80 | return [self.get_id(token) for token in tokens] 81 | else: 82 | return [self.get_id(token) for token in 83 | tokens + [self.eos_sym]] 84 | 85 | def to_tokens(self, ids): 86 | return [self.get_token(id) for id in ids] 87 | 88 | def eos(self): 89 | return self.get_id(self.eos_sym) 90 | 91 | def pad(self): 92 | return self.get_id(self.pad_sym) 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser('Vocabulary Preparation') 97 | parser.add_argument('--char', action='store_true', help='build char-level vocabulary') 98 | parser.add_argument('--lower', action='store_true', help='lower-case datasets') 99 | parser.add_argument('--embeddings', type=str, default='no', help='pre-trained word embedding path') 100 | parser.add_argument('inputs', type=str, help='the input file path, separate with comma') 101 | parser.add_argument('output', type=str, help='the output file name') 102 | 103 | args = parser.parse_args() 104 | 105 | vocab = Vocab(lower=args.lower) 106 | for data_file in args.inputs.split(','): 107 | with open(data_file, 'r') as reader: 108 | for text in reader: 109 | tokens = text.strip().split() 110 | 111 | for token in tokens: 112 | if not args.char: 113 | vocab.insert(token) 114 | else: 115 | for char in list(token): 116 | vocab.insert(char) 117 | 118 | vocab.sort_vocab(least_freq=3 if args.char else -1) 119 | 120 | # process the vocabulary with pretrained-embeddings 121 | if args.embeddings != "no": 122 | embed_tokens = {} 123 | embed_size = None 124 | with open(args.embeddings, 'r') as reader: 125 | for line in reader: 126 | segs = line.strip().split(' ') 127 | 128 | token = segs[0] 129 | # Not used in our training data, pass 130 | if token not in vocab.word2id: 131 | continue 132 | embed_tokens[token] = list(map(float, segs[1:])) 133 | 134 | if embed_size is None: 135 | embed_size = len(segs) - 1 136 | 137 | vocab = Vocab(lower=args.lower) 138 | for token in embed_tokens: 139 | vocab.insert(token) 140 | 141 | # load embeddings 142 | embeddings = np.zeros([len(embed_tokens), embed_size]) 143 | for token in embed_tokens: 144 | # 3: the special symbols 145 | embeddings[vocab.get_id(token) - 3] = embed_tokens[token] 146 | np.savez(args.output + ".npz", data=embeddings) 147 | 148 | vocab.save_vocab(args.output) 149 | 150 | print("Loading {} tokens from {}".format(vocab.size(), args.inputs)) 151 | -------------------------------------------------------------------------------- /doc/code/vocab.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import argparse 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | import sys 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | from bert.tokenization import BasicTokenizer as Tokenizer 17 | 18 | 19 | class Vocab(object): 20 | def __init__(self, lower=False, vocab_file=None): 21 | self.lower = lower 22 | 23 | self.word2id = {} 24 | self.id2word = {} 25 | self.word2count = {} 26 | 27 | self.pad_sym = "<pad>" 28 | self.eos_sym = "<eos>" 29 | self.unk_sym = "<unk>" 30 | 31 | self.clean() 32 | 33 | self.pretrained_embedding = None 34 | 35 | if vocab_file is not None: 36 | self.load_vocab(vocab_file) 37 | 38 | if os.path.exists(vocab_file + ".npz"): 39 | pretrain_embedding = np.load(vocab_file + ".npz")['data'] 40 | self.pretrained_embedding = pretrain_embedding 41 | 42 | def clean(self): 43 | self.word2id = {} 44 | self.id2word = {} 45 | self.word2count = {} 46 | 47 | self.insert(self.pad_sym) 48 | self.insert(self.unk_sym) 49 | self.insert(self.eos_sym) 50 | 51 | def insert(self, token): 52 | token = token if not self.lower else token.lower() 53 | if token not in self.word2id: 54 | index = len(self.word2id) 55 | self.word2id[token] = index 56 | self.id2word[index] = token 57 | 58 | self.word2count[token] = 0 59 | self.word2count[token] += 1 60 | 61 | def size(self): 62 | return len(self.word2id) 63 | 64 | def load_vocab(self, vocab_file): 65 | with open(vocab_file, 'r') as reader: 66 | for token in reader: 67 | self.insert(token.strip()) 68 | 69 | def get_token(self, id): 70 | if id in self.id2word: 71 | return self.id2word[id] 72 | return self.unk_sym 73 | 74 | def get_id(self, token): 75 | token = token if not self.lower else token.lower() 76 | if token in self.word2id: 77 | return self.word2id[token] 78 | return self.word2id[self.unk_sym] 79 | 80 | def
sort_vocab(self, least_freq=-1): 81 | sorted_word2count = sorted( 82 | self.word2count.items(), key=lambda x: - x[1]) 83 | self.clean() 84 | for word, freq in sorted_word2count: 85 | if least_freq > 0: 86 | if freq <= least_freq: 87 | continue 88 | self.insert(word) 89 | 90 | def save_vocab(self, vocab_file): 91 | with open(vocab_file, 'w') as writer: 92 | for id in range(self.size()): 93 | writer.write(self.id2word[id].encode("utf-8") + "\n") 94 | 95 | np.savez(vocab_file + ".npz", data=self.pretrained_embedding) 96 | 97 | def to_id(self, tokens, append_eos=True): 98 | if not append_eos: 99 | return [self.get_id(token) for token in tokens] 100 | else: 101 | return [self.get_id(token) for token in 102 | tokens + [self.eos_sym]] 103 | 104 | def to_tokens(self, ids): 105 | return [self.get_token(id) for id in ids] 106 | 107 | def eos(self): 108 | return self.get_id(self.eos_sym) 109 | 110 | def pad(self): 111 | return self.get_id(self.pad_sym) 112 | 113 | def make_vocab(self, data_set, use_char=False, embedding_path=None): 114 | tf.logging.info("Starting Reading Data in {} Manner".format(use_char)) 115 | tokenizer = Tokenizer(do_lower_case=False) 116 | 117 | for data_iter in [data_set.get_train_data(), 118 | data_set.get_dev_data(), 119 | data_set.get_test_data()]: 120 | for sample in data_iter: 121 | label, document = sample 122 | 123 | tokens = tokenizer.tokenize(document) 124 | for token in tokens: 125 | if not use_char: 126 | self.insert(token) 127 | else: 128 | for char in list(token): 129 | self.insert(char) 130 | 131 | tf.logging.info("Data Loading Over, Starting Sorted") 132 | self.sort_vocab(least_freq=3 if use_char else -1) 133 | 134 | # process the vocabulary with pretrained-embeddings 135 | if embedding_path is not None: 136 | tf.logging.info("Pretrained Word Embedding Loading") 137 | embed_tokens = {} 138 | embed_size = None 139 | with open(embedding_path, 'r') as reader: 140 | for line in reader: 141 | segs = line.strip().split(' ') 142 | 143 | token = segs[0] 144 | # Not used in our training data, pass 145 | if token not in self.word2id: 146 | continue 147 | embed_tokens[token] = list(map(float, segs[1:])) 148 | 149 | if embed_size is None: 150 | embed_size = len(segs) - 1 151 | 152 | self.clean() 153 | for token in embed_tokens: 154 | self.insert(token) 155 | 156 | # load embeddings 157 | embeddings = np.zeros([len(embed_tokens), embed_size]) 158 | for token in embed_tokens: 159 | # 3: the special symbols 160 | embeddings[self.get_id(token) - 3] = embed_tokens[token] 161 | 162 | self.pretrained_embedding = embeddings 163 | 164 | tf.logging.info("Vocabulary Loading Finished") 165 | -------------------------------------------------------------------------------- /nli/README.md: -------------------------------------------------------------------------------- 1 | ## Natural Language Inference 2 | 3 | The dataset is the Stanford Natural Language Inference (SNLI) corpus, which we treat as a three-way classification task. 4 | We use an encoder-attention-decoder architecture and stack two additional biRNN layers on top of the final sequence representation. 5 | Both the GloVe word embedding and a character embedding are used for word-level representation. 6 | Main experimental results are summarized below.
| Model | #Params | Base ACC | Base Time | +LN ACC | +LN Time | +BERT ACC | +BERT Time | +LN+BERT ACC | +LN+BERT Time |
|-------|---------|----------|-----------|---------|----------|-----------|------------|--------------|---------------|
| Rocktaschel et al. (2016) | 250K | 83.50 | - | - | - | - | - | - | - |
| This work: LSTM | 8.36M | 84.27 | 0.262 | 86.03 | 0.432 | 89.95 | 0.544 | 90.49 | 0.696 |
| This work: GRU | 6.41M | 85.71 | 0.245 | 86.05 | 0.419 | 90.29 | 0.529 | 90.10 | 0.695 |
| This work: ATR | 2.87M | 84.88 | 0.210 | 85.81 | 0.307 | 90.00 | 0.494 | 90.28 | 0.580 |
| This work: SRU | 5.48M | 84.28 | 0.258 | 85.32 | 0.283 | 89.98 | 0.543 | 90.09 | 0.555 |
| This work: LRN | 4.25M | 84.88 | 0.209 | 85.06 | 0.223 | 89.98 | 0.488 | 89.93 | 0.506 |
101 | 102 | *LN*: layer normalization; *Time*: time in seconds per training batch, measured from 1k training steps. 103 | 104 | ## Requirement 105 | tensorflow >= 1.8.1 106 | 107 | ## How to Run? 108 | 109 | - download and preprocess dataset 110 | 111 | - The dataset link: https://nlp.stanford.edu/projects/snli/ 112 | - Prepare separate data files: 113 | 114 | We provide a simple processing script `convert_to_plain.py` in the `scripts` folder. By calling: 115 | ``` 116 | python convert_to_plain.py snli_1.0/[ds].txt 117 | ``` 118 | you can get the `*.p, *.q, *.l` files referenced in `config.py`. *[ds]* indicates `snli_1.0_train.txt`, 119 | `snli_1.0_dev.txt` and `snli_1.0_test.txt`. We only preserve `'entailment'`, `'neutral'` and `'contradiction'` instances; 120 | others are dropped. 121 | 122 | - Prepare embedding and vocabulary 123 | 124 | Download the [pre-trained GloVe embedding](http://nlp.stanford.edu/data/glove.840B.300d.zip). Then prepare 125 | the character and word vocabularies using `vocab.py` as follows: 126 | ``` 127 | # word embedding & vocabulary 128 | python vocab.py --embeddings [path-to-glove-embedding] train.p,train.q,dev.p,dev.q,test.p,test.q word_vocab 129 | # char embedding 130 | python vocab.py --char train.p,train.q,dev.p,dev.q,test.p,test.q char_vocab 131 | ``` 132 | 133 | - Download the pre-trained BERT embedding (if you plan to work with BERT) 134 | 135 | - training and evaluation 136 | 137 | - Train the model as follows: 138 | ``` 139 | # configure your cuda library if necessary 140 | export CUDA_ROOT=XXX 141 | export PATH=$CUDA_ROOT/bin:$PATH 142 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 143 | 144 | # LRN 145 | python code/run.py --mode train --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln" >& log.noln 146 | # LRN + LN 147 | python code/run.py --mode train --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln" >& log.ln 148 | # LRN + BERT 149 | python code/run.py --mode train --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln_bert" >& log.noln.bert 150 | # LRN + LN + BERT 151 | python code/run.py --mode train --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln_bert" >& log.ln.bert 152 | ``` 153 | Other hyperparameter settings are available in the given config.py. 154 | 155 | - Test the model as follows: 156 | ``` 157 | # LRN 158 | python code/run.py --mode test --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln/best",test_output="out.noln" >& log.noln.test 159 | # LRN + LN 160 | python code/run.py --mode test --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln/best",test_output="out.ln" >& log.ln.test 161 | # LRN + BERT 162 | python code/run.py --mode test --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln_bert/best",test_output="out.noln.bert" >& log.noln.bert.test 163 | # LRN + LN + BERT 164 | python code/run.py --mode test --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln_bert/best",test_output="out.ln.bert" >& log.ln.bert.test 165 | ``` 166 | 167 | ## Credits 168 | 169 | Source code structure is adapted from [zero](https://github.com/bzhangGo/zero). --------------------------------------------------------------------------------