├── .gitignore
├── rc
│ ├── rnet
│ │ ├── code
│ │ │ ├── __init__.py
│ │ │ ├── rnns
│ │ │ │ ├── __init__.py
│ │ │ │ ├── lrn.py
│ │ │ │ ├── atr.py
│ │ │ │ ├── cell.py
│ │ │ │ ├── gru.py
│ │ │ │ ├── rnn.py
│ │ │ │ ├── sru.py
│ │ │ │ └── lstm.py
│ │ │ ├── download.sh
│ │ │ └── evaluate-v1.1.py
│ │ ├── config.py
│ │ ├── test_lrn.sh
│ │ └── train_lrn.sh
│ ├── elmo_rnet
│ │ ├── config.py
│ │ ├── code
│ │ │ ├── bilm
│ │ │ │ ├── __init__.py
│ │ │ │ └── elmo.py
│ │ │ ├── rnns
│ │ │ │ ├── __init__.py
│ │ │ │ ├── lrn.py
│ │ │ │ ├── atr.py
│ │ │ │ ├── cell.py
│ │ │ │ ├── gru.py
│ │ │ │ ├── rnn.py
│ │ │ │ ├── sru.py
│ │ │ │ └── lstm.py
│ │ │ ├── download.sh
│ │ │ ├── cycle.py
│ │ │ └── evaluate-v1.1.py
│ │ ├── train_lrn.sh
│ │ └── test_lrn.sh
│ └── README.md
├── doc
│ ├── code
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── recorder.py
│ │ │ ├── thread.py
│ │ │ ├── initializer.py
│ │ │ ├── saver.py
│ │ │ └── cycle.py
│ │ ├── bert
│ │ │ ├── __init__.py
│ │ │ ├── load.py
│ │ │ ├── tokenizer.py
│ │ │ └── vocab.py
│ │ ├── lrs
│ │ │ ├── vanillalr.py
│ │ │ ├── epochlr.py
│ │ │ ├── noamlr.py
│ │ │ ├── lr.py
│ │ │ ├── __init__.py
│ │ │ ├── scorelr.py
│ │ │ └── gnmtplr.py
│ │ ├── rnns
│ │ │ ├── __init__.py
│ │ │ ├── lrn.py
│ │ │ ├── atr.py
│ │ │ ├── cell.py
│ │ │ ├── gru.py
│ │ │ ├── sru.py
│ │ │ └── lstm.py
│ │ ├── tasks.py
│ │ ├── evalu.py
│ │ └── vocab.py
│ ├── config.py
│ └── README.md
├── nli
│ ├── code
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── recorder.py
│ │ │ ├── thread.py
│ │ │ ├── initializer.py
│ │ │ ├── saver.py
│ │ │ └── cycle.py
│ │ ├── bert
│ │ │ ├── __init__.py
│ │ │ ├── load.py
│ │ │ ├── tokenizer.py
│ │ │ └── vocab.py
│ │ ├── lrs
│ │ │ ├── vanillalr.py
│ │ │ ├── epochlr.py
│ │ │ ├── noamlr.py
│ │ │ ├── lr.py
│ │ │ ├── __init__.py
│ │ │ ├── scorelr.py
│ │ │ └── gnmtplr.py
│ │ ├── scripts
│ │ │ └── convert_to_plain.py
│ │ ├── rnns
│ │ │ ├── __init__.py
│ │ │ ├── lrn.py
│ │ │ ├── atr.py
│ │ │ ├── cell.py
│ │ │ ├── gru.py
│ │ │ ├── sru.py
│ │ │ └── lstm.py
│ │ ├── evalu.py
│ │ └── vocab.py
│ ├── config.py
│ ├── config_bert.py
│ └── README.md
├── figures
│ ├── ls_mem.png
│ ├── memory.png
│ └── san_corr.png
├── ner
│ ├── code
│ │ ├── scripts
│ │ │ └── get_test_score.py
│ │ ├── requirements.txt
│ │ ├── ner_glove.py
│ │ ├── callbacks.py
│ │ ├── trainer.py
│ │ └── tagger.py
│ └── README.md
├── lm
│ ├── code
│ │ ├── locked_dropout.py
│ │ ├── get_data.sh
│ │ ├── embed_regularize.py
│ │ ├── utils.py
│ │ ├── generate.py
│ │ ├── weight_drop.py
│ │ └── data.py
│ └── README.md
├── LICENSE
├── mt
│ └── README.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
--------------------------------------------------------------------------------
/rc/rnet/code/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rc/rnet/config.py:
--------------------------------------------------------------------------------
1 | code/config.py
--------------------------------------------------------------------------------
/rc/elmo_rnet/config.py:
--------------------------------------------------------------------------------
1 | code/config.py
--------------------------------------------------------------------------------
/doc/code/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
--------------------------------------------------------------------------------
/nli/code/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
--------------------------------------------------------------------------------
/figures/ls_mem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/ls_mem.png
--------------------------------------------------------------------------------
/figures/memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/memory.png
--------------------------------------------------------------------------------
/figures/san_corr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bzhangGo/lrn/HEAD/figures/san_corr.png
--------------------------------------------------------------------------------
/doc/code/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 |
4 | from .bert import *
5 | from .load import *
6 | from .tokenizer import *
7 |
--------------------------------------------------------------------------------
/nli/code/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 |
4 | from .bert import *
5 | from .load import *
6 | from .tokenizer import *
7 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/bilm/__init__.py:
--------------------------------------------------------------------------------
1 | # Elmo Interface
2 | # Deep contextualized word representations
3 |
4 | from .data import Batcher, TokenBatcher
5 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
6 | dump_bilm_embeddings
7 | from .elmo import weight_layers
8 |
9 |
--------------------------------------------------------------------------------
/rc/rnet/test_lrn.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | export CUDA_ROOT=XXX
4 | export PATH=$CUDA_ROOT/bin:$PATH
5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
6 |
7 | export CUDA_VISIBLE_DEVICES=0
8 |
9 | export name=log_lrn
10 |
11 | python config.py --mode test --cell lrn
12 |
13 |
--------------------------------------------------------------------------------
/rc/rnet/train_lrn.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | export CUDA_ROOT=XXX
4 | export PATH=$CUDA_ROOT/bin:$PATH
5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
6 |
7 | export CUDA_VISIBLE_DEVICES=0
8 |
9 | export name=log_lrn
10 |
11 | python config.py --mode train --cell lrn
12 |
13 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/train_lrn.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | export CUDA_ROOT=XXX
4 | export PATH=$CUDA_ROOT/bin:$PATH
5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
6 |
7 | export CUDA_VISIBLE_DEVICES=0
8 |
9 | export name=log_lrn
10 |
11 | python config.py --mode train --cell lrn
12 |
13 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/test_lrn.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | export CUDA_ROOT=XXX
4 | export PATH=$CUDA_ROOT/bin:$PATH
5 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
6 |
7 | export CUDA_VISIBLE_DEVICES=0
8 |
9 | export name=log_lrn
10 |
11 | python config.py --mode test --cell lrn --batch_size 8
12 |
13 |
--------------------------------------------------------------------------------
/doc/code/lrs/vanillalr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class VanillaLR(lr.Lr):
12 | """Very basic learning rate, constant learning rate"""
13 | def __init__(self,
14 | init_lr,
15 | name="vanilla_lr"
16 | ):
17 | super(VanillaLR, self).__init__(init_lr, name=name)
18 |
--------------------------------------------------------------------------------
/nli/code/lrs/vanillalr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class VanillaLR(lr.Lr):
12 | """Very basic learning rate, constant learning rate"""
13 | def __init__(self,
14 | init_lr,
15 | name="vanilla_lr"
16 | ):
17 | super(VanillaLR, self).__init__(init_lr, name=name)
18 |
--------------------------------------------------------------------------------
/ner/code/scripts/get_test_score.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import sys
4 | import numpy as np
5 |
6 |
7 | def extract_dev_test_score(fname):
8 | test_score = float(open(fname, 'rU').readlines()[-1].strip())
9 |
10 | return test_score
11 |
12 |
13 | cell_type = sys.argv[1]
14 | exp_dirs = sys.argv[2:]
15 |
16 | scores = []
17 | for exp_dir in exp_dirs:
18 | test_score = extract_dev_test_score("{}/log.{}".format(exp_dir, cell_type))
19 | scores.append(test_score)
20 |
21 | print(np.mean(scores), np.std(scores))
22 |
--------------------------------------------------------------------------------
/lm/code/locked_dropout.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 |
5 | class LockedDropout(nn.Module):
6 | def __init__(self):
7 | super().__init__()
8 |
9 | def forward(self, x, dropout=0.5):
10 | if not self.training or not dropout:
11 | return x
12 | m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
13 | # mask = Variable(m, requires_grad=False) / (1 - dropout)
14 | mask = Variable(m.div_(1 - dropout), requires_grad=False)
15 | mask = mask.expand_as(x)
16 | return mask * x
17 |
--------------------------------------------------------------------------------
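
For readers new to variational dropout, a minimal usage sketch of LockedDropout (illustration only, not a repository file; run from lm/code so the import resolves, and the (time, batch, feature) layout is assumed from how the mask is built with shape (1, x.size(1), x.size(2))). The mask is sampled once per call and reused at every time step.

import torch
from locked_dropout import LockedDropout  # import path assumed relative to lm/code

ld = LockedDropout()
ld.train()                        # masks are only applied in training mode

x = torch.ones(5, 2, 4)           # (time, batch, features); all ones for clarity
y = ld(x, dropout=0.5)

# The mask is shared along the time dimension, so every step zeroes the same positions.
print(torch.equal(y[0], y[1]))    # True
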
/ner/code/requirements.txt:
--------------------------------------------------------------------------------
1 | backports.weakref==1.0rc1
2 | bleach==1.5.0
3 | boto==2.48.0
4 | bz2file==0.98
5 | certifi==2017.11.5
6 | chardet==3.0.4
7 | enum34==1.1.6
8 | gensim==3.1.0
9 | h5py==2.7.1
10 | html5lib==0.9999999
11 | idna==2.6
12 | Keras==2.2.0
13 | m2r==0.1.12
14 | Markdown==2.6.9
15 | numpy==1.13.3
16 | protobuf==3.5.1
17 | python-dateutil==2.6.0
18 | pytz==2017.2
19 | PyYAML==4.2b1
20 | requests==2.21.0
21 | scikit-learn==0.19.1
22 | scipy==1.0.0
23 | seqeval==0.0.3
24 | six==1.11.0
25 | smart-open==1.5.3
26 | tensorboard==1.8.0
27 | tensorflow>=1.12.1
28 | Theano==0.9.0
29 | urllib3>=1.24.2
30 | Werkzeug>=0.15.3
31 | allennlp==0.7.1
32 |
--------------------------------------------------------------------------------
/nli/code/scripts/convert_to_plain.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import sys
4 |
5 | data = open(sys.argv[1], 'r')
6 | data.readline()
7 |
8 | out_l = open(sys.argv[2]+".l", 'w')
9 | out_p = open(sys.argv[2]+".p", 'w')
10 | out_q = open(sys.argv[2]+".q", 'w')
11 |
12 | label = {'entailment': 0,
13 | 'neutral': 1,
14 | 'contradiction': 2}
15 |
16 | for line in data:
17 | l, p, q = line.strip().split('\t')[:3]
18 | if l not in label:
19 | continue
20 | out_l.write(str(label[l]) + '\n')
21 | out_p.write(p.replace('( ', '').replace(' )', '') + '\n')
22 | out_q.write(q.replace('( ', '').replace(' )', '') + '\n')
23 |
24 | out_l.close()
25 | out_p.close()
26 | out_q.close()
27 |
--------------------------------------------------------------------------------
/lm/code/get_data.sh:
--------------------------------------------------------------------------------
1 | mkdir data
2 | cd data
3 |
4 | echo "- Downloading Penn Treebank (PTB)"
5 | mkdir -p penn
6 | cd penn
7 | URL="https://raw.githubusercontent.com/lanpa/tensorboard-pytorch-examples/master/word_language_model/data/penn"
8 | wget --quiet --continue $URL/train.txt
9 | wget --quiet --continue $URL/valid.txt
10 | wget --quiet --continue $URL/test.txt
11 | cd ..
12 |
13 | echo "- Downloading WikiText-2 (WT2)"
14 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
15 | unzip -q wikitext-2-v1.zip
16 | cd wikitext-2
17 | mv wiki.train.tokens train.txt
18 | mv wiki.valid.tokens valid.txt
19 | mv wiki.test.tokens test.txt
20 |
21 | echo "---"
22 | echo "Happy language modeling :)"
23 |
24 | cd ..
25 |
--------------------------------------------------------------------------------
/doc/code/lrs/epochlr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class EpochDecayLr(lr.Lr):
12 | """Decay the learning rate after each epoch"""
13 | def __init__(self,
14 | init_lr,
15 | decay=0.5, # learning rate decay rate
16 | name="epoch_decay_lr"
17 | ):
18 | super(EpochDecayLr, self).__init__(init_lr, name=name)
19 |
20 | self.decay = decay
21 |
22 | def after_epoch(self, eidx=None):
23 | if eidx is None:
24 | self.lrate = self.init_lrate * self.decay
25 | else:
26 | self.lrate = self.init_lrate * self.decay ** int(eidx)
27 |
--------------------------------------------------------------------------------
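
A tiny sketch (not repository code; the import assumes doc/code is on PYTHONPATH) of what the schedule above does: after_epoch(eidx) sets the rate to init_lr * decay ** eidx with eidx counted from 1, so with decay=0.5 the rate halves after every epoch.

from lrs.epochlr import EpochDecayLr  # import path assumed

sched = EpochDecayLr(init_lr=1.0, decay=0.5)

for eidx in range(1, 5):          # eidx is assumed to start from 1 (see lrs/lr.py)
    sched.after_epoch(eidx)
    print(eidx, sched.get_lr())   # 0.5, 0.25, 0.125, 0.0625
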
/nli/code/lrs/epochlr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class EpochDecayLr(lr.Lr):
12 | """Decay the learning rate after each epoch"""
13 | def __init__(self,
14 | init_lr,
15 | decay=0.5, # learning rate decay rate
16 | name="epoch_decay_lr"
17 | ):
18 | super(EpochDecayLr, self).__init__(init_lr, name=name)
19 |
20 | self.decay = decay
21 |
22 | def after_epoch(self, eidx=None):
23 | if eidx is None:
24 | self.lrate = self.init_lrate * self.decay
25 | else:
26 | self.lrate = self.init_lrate * self.decay ** int(eidx)
27 |
--------------------------------------------------------------------------------
/doc/code/utils/recorder.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import json
8 | import tensorflow as tf
9 |
10 |
11 | class Recorder(object):
12 | """To save training processes, inspired by Nematus"""
13 |
14 | def load_from_json(self, file_name):
15 | tf.logging.info("Loading recoder file from {}".format(file_name))
16 | record = json.load(open(file_name, 'rb'))
17 | record = dict((key.encode("UTF-8"), value) for (key, value) in record.items())
18 | self.__dict__.update(record)
19 |
20 | def save_to_json(self, file_name):
21 | tf.logging.info("Saving recorder file into {}".format(file_name))
22 | json.dump(self.__dict__, open(file_name, 'wb'), indent=2)
23 |
--------------------------------------------------------------------------------
/nli/code/utils/recorder.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import json
8 | import tensorflow as tf
9 |
10 |
11 | class Recorder(object):
12 | """To save training processes, inspired by Nematus"""
13 |
14 | def load_from_json(self, file_name):
15 | tf.logging.info("Loading recoder file from {}".format(file_name))
16 | record = json.load(open(file_name, 'rb'))
17 | record = dict((key.encode("UTF-8"), value) for (key, value) in record.items())
18 | self.__dict__.update(record)
19 |
20 | def save_to_json(self, file_name):
21 | tf.logging.info("Saving recorder file into {}".format(file_name))
22 | json.dump(self.__dict__, open(file_name, 'wb'), indent=2)
23 |
--------------------------------------------------------------------------------
/doc/code/rnns/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from rnns import gru, lstm, atr, sru, lrn
4 |
5 |
6 | def get_cell(cell_name, hidden_size, ln=False, scope=None):
7 | """Convert the cell_name into cell instance."""
8 | cell_name = cell_name.lower()
9 |
10 | if cell_name == "gru":
11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru")
12 | elif cell_name == "lstm":
13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm")
14 | elif cell_name == "atr":
15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr")
16 | elif cell_name == "sru":
17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru")
18 | elif cell_name == "lrn":
19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn")
20 | else:
21 | raise NotImplementedError(
22 | "{} is not supported".format(cell_name))
23 |
--------------------------------------------------------------------------------
/nli/code/rnns/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from rnns import gru, lstm, atr, sru, lrn
4 |
5 |
6 | def get_cell(cell_name, hidden_size, ln=False, scope=None):
7 | """Convert the cell_name into cell instance."""
8 | cell_name = cell_name.lower()
9 |
10 | if cell_name == "gru":
11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru")
12 | elif cell_name == "lstm":
13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm")
14 | elif cell_name == "atr":
15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr")
16 | elif cell_name == "sru":
17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru")
18 | elif cell_name == "lrn":
19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn")
20 | else:
21 | raise NotImplementedError(
22 | "{} is not supported".format(cell_name))
23 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from rnns import gru, lstm, atr, sru, lrn
4 |
5 |
6 | def get_cell(cell_name, hidden_size, ln=False, scope=None):
7 | """Convert the cell_name into cell instance."""
8 | cell_name = cell_name.lower()
9 |
10 | if cell_name == "gru":
11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru")
12 | elif cell_name == "lstm":
13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm")
14 | elif cell_name == "atr":
15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr")
16 | elif cell_name == "sru":
17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru")
18 | elif cell_name == "lrn":
19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn")
20 | else:
21 | raise NotImplementedError(
22 | "{} is not supported".format(cell_name))
23 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from rnns import gru, lstm, atr, sru, lrn
4 |
5 |
6 | def get_cell(cell_name, hidden_size, ln=False, scope=None):
7 | """Convert the cell_name into cell instance."""
8 | cell_name = cell_name.lower()
9 |
10 | if cell_name == "gru":
11 | return gru.gru(hidden_size, ln=ln, scope=scope or "gru")
12 | elif cell_name == "lstm":
13 | return lstm.lstm(hidden_size, ln=ln, scope=scope or "lstm")
14 | elif cell_name == "atr":
15 | return atr.atr(hidden_size, ln=ln, scope=scope or "atr")
16 | elif cell_name == "sru":
17 | return sru.sru(hidden_size, ln=ln, scope=scope or "sru")
18 | elif cell_name == "lrn":
19 | return lrn.lrn(hidden_size, ln=ln, scope=scope or "lrn")
20 | else:
21 | raise NotImplementedError(
22 | "{} is not supported".format(cell_name))
23 |
--------------------------------------------------------------------------------
/doc/code/utils/thread.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import threading
8 |
9 |
10 | class threadsafe_iter:
11 | """Takes an iterator/generator and makes it thread-safe by
12 | serializing calls to the `next` method of the given iterator/generator.
13 | """
14 |
15 | def __init__(self, it):
16 | self.it = it
17 | self.lock = threading.Lock()
18 |
19 | def __iter__(self):
20 | return self
21 |
22 | def __next__(self):
23 | return self.next()
24 |
25 | def next(self):
26 | with self.lock:
27 | return next(self.it)
28 |
29 |
30 | def threadsafe_generator(f):
31 | """A decorator that takes a generator function and makes it thread-safe.
32 | """
33 |
34 | def g(*a, **kw):
35 | return threadsafe_iter(f(*a, **kw))
36 |
37 | return g
--------------------------------------------------------------------------------
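
A small illustration (not repository code) of how threadsafe_generator is meant to be used: the decorator wraps a generator so several consumer threads can pull from it without racing on partial `next` calls. The toy batches() generator is a stand-in for a real data pipeline, and the import assumes doc/code is on PYTHONPATH.

import threading

from utils.thread import threadsafe_generator  # import path assumed


@threadsafe_generator
def batches():
    # A toy generator standing in for a real data pipeline.
    for i in range(10):
        yield i


gen = batches()
seen = []
seen_lock = threading.Lock()


def consume():
    for item in gen:
        with seen_lock:
            seen.append(item)


threads = [threading.Thread(target=consume) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print(sorted(seen))  # each of 0..9 is handed to exactly one consumer
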
/nli/code/utils/thread.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import threading
8 |
9 |
10 | class threadsafe_iter:
11 | """Takes an iterator/generator and makes it thread-safe by
12 | serializing calls to the `next` method of the given iterator/generator.
13 | """
14 |
15 | def __init__(self, it):
16 | self.it = it
17 | self.lock = threading.Lock()
18 |
19 | def __iter__(self):
20 | return self
21 |
22 | def __next__(self):
23 | return self.next()
24 |
25 | def next(self):
26 | with self.lock:
27 | return next(self.it)
28 |
29 |
30 | def threadsafe_generator(f):
31 | """A decorator that takes a generator function and makes it thread-safe.
32 | """
33 |
34 | def g(*a, **kw):
35 | return threadsafe_iter(f(*a, **kw))
36 |
37 | return g
--------------------------------------------------------------------------------
/rc/rnet/code/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Download SQuAD
4 | SQUAD_DIR=~/data/squad
5 | mkdir -p $SQUAD_DIR
6 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json
7 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json
8 |
9 | # Download GloVe
10 | GLOVE_DIR=~/data/glove
11 | mkdir -p $GLOVE_DIR
12 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O $GLOVE_DIR/glove.840B.300d.zip
13 | unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR
14 |
15 | # Download Glove Character Embedding
16 | # wget https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt -O $GLOVE_DIR/glove.840B.300d-char.txt
17 |
18 | # Download fasttext
19 | # FASTTEXT_DIR=~/data/fasttext
20 | # mkdir -p $FASTTEXT_DIR
21 | # wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip -O $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip
22 | # unzip $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip -d $FASTTEXT_DIR
23 |
24 | # Download Spacy language models
25 | python3 -m spacy download en
26 |
--------------------------------------------------------------------------------
/doc/code/lrs/noamlr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import numpy as np
8 |
9 | from lrs import lr
10 |
11 |
12 | class NoamDecayLr(lr.Lr):
13 | """Decay the learning rate during each training step, follows Transformer"""
14 | def __init__(self,
15 | init_lr, # initial learning rate
16 | warmup_steps, # warmup step
17 | hidden_size, # model hidden size
18 | name="noam_decay_lr" # model name, no use
19 | ):
20 | super(NoamDecayLr, self).__init__(init_lr, name=name)
21 |
22 | self.warmup_steps = warmup_steps
23 | self.hidden_size = hidden_size
24 |
25 | def step(self, step):
26 | step = float(step)
27 | warmup_steps = float(self.warmup_steps)
28 |
29 | multiplier = float(self.hidden_size) ** -0.5
30 | decay = multiplier * np.minimum((step + 1) * (warmup_steps ** -1.5),
31 | (step + 1) ** -0.5)
32 | self.lrate = self.init_lrate * decay
33 |
--------------------------------------------------------------------------------
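
For intuition, a standalone restatement of the schedule above in plain Python (no repository imports; the hidden_size and warmup values are only examples): the rate rises roughly linearly for warmup_steps steps, peaks near step = warmup_steps, and then decays with the inverse square root of the step.

# Minimal re-statement of the Noam schedule from noamlr.py, for intuition only.
def noam_lr(step, init_lr=1.0, warmup_steps=4000, hidden_size=512):
    multiplier = hidden_size ** -0.5
    decay = multiplier * min((step + 1) * warmup_steps ** -1.5,
                             (step + 1) ** -0.5)
    return init_lr * decay


for step in [0, 1000, 4000, 16000, 64000]:
    print(step, round(noam_lr(step), 6))
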
/nli/code/lrs/noamlr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import numpy as np
8 |
9 | from lrs import lr
10 |
11 |
12 | class NoamDecayLr(lr.Lr):
13 | """Decay the learning rate during each training step, follows Transformer"""
14 | def __init__(self,
15 | init_lr, # initial learning rate
16 | warmup_steps, # warmup step
17 | hidden_size, # model hidden size
18 | name="noam_decay_lr" # model name, no use
19 | ):
20 | super(NoamDecayLr, self).__init__(init_lr, name=name)
21 |
22 | self.warmup_steps = warmup_steps
23 | self.hidden_size = hidden_size
24 |
25 | def step(self, step):
26 | step = float(step)
27 | warmup_steps = float(self.warmup_steps)
28 |
29 | multiplier = float(self.hidden_size) ** -0.5
30 | decay = multiplier * np.minimum((step + 1) * (warmup_steps ** -1.5),
31 | (step + 1) ** -0.5)
32 | self.lrate = self.init_lrate * decay
33 |
--------------------------------------------------------------------------------
/ner/code/ner_glove.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from wrapper import Sequence
4 | from utils import load_data_and_labels, load_glove
5 |
6 |
7 | if __name__ == '__main__':
8 | DATA_ROOT = os.path.join(os.path.dirname(__file__), os.environ["data_dir"])
9 | EMBEDDING_PATH = os.path.join(os.path.dirname(__file__), os.environ["glove_dir"])
10 |
11 | train_path = os.path.join(DATA_ROOT, 'train.txt')
12 | valid_path = os.path.join(DATA_ROOT, 'valid.txt')
13 | test_path = os.path.join(DATA_ROOT, 'test.txt')
14 |
15 | print('Loading data...')
16 | x_train, y_train = load_data_and_labels(train_path)
17 | x_valid, y_valid = load_data_and_labels(valid_path)
18 | x_test, y_test = load_data_and_labels(test_path)
19 | print(len(x_train), 'train sequences')
20 | print(len(x_valid), 'valid sequences')
21 | print(len(x_test), 'test sequences')
22 |
23 | embeddings = load_glove(EMBEDDING_PATH)
24 |
25 | # Use pre-trained word embeddings
26 | model = Sequence(cell_type=os.environ['cell_type'], embeddings=embeddings, initial_vocab=embeddings.keys())
27 | # print(model.trainable_weights)
28 |
29 | model.fit(x_train, y_train, x_valid, y_valid, epochs=30)
30 |
31 | print('Testing the model...')
32 | print(model.score(x_test, y_test))
33 |
--------------------------------------------------------------------------------
/doc/code/lrs/lr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | # This is an abstract class that deals with
9 | # different learning rate decay strategies.
10 | # Usually the decay is computed as part of the GPU graph;
11 | # in this paper, we simply decay the learning rate
12 | # on the CPU and feed the decayed lr into the GPU for
13 | # optimization.
14 | class Lr(object):
15 | def __init__(self,
16 | init_lrate, # initial learning rate
17 | name="lr", # learning rate name, no use
18 | ):
19 | self.name = name
20 | self.init_lrate = init_lrate # just record the init learning rate
21 | self.lrate = init_lrate # active learning rate, change with training
22 |
23 | # suppose the eidx starts from 1
24 | def before_epoch(self, eidx=None):
25 | pass
26 |
27 | def after_epoch(self, eidx=None):
28 | pass
29 |
30 | def step(self, step):
31 | pass
32 |
33 | def after_eval(self, eval_score):
34 | pass
35 |
36 | def get_lr(self):
37 | """Return the learning rate whenever you want"""
38 | return self.lrate
39 |
--------------------------------------------------------------------------------
/nli/code/lrs/lr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | # This is an abstract class that deals with
9 | # different learning rate decay strategies.
10 | # Usually the decay is computed as part of the GPU graph;
11 | # in this paper, we simply decay the learning rate
12 | # on the CPU and feed the decayed lr into the GPU for
13 | # optimization.
14 | class Lr(object):
15 | def __init__(self,
16 | init_lrate, # initial learning rate
17 | name="lr", # learning rate name, no use
18 | ):
19 | self.name = name
20 | self.init_lrate = init_lrate # just record the init learning rate
21 | self.lrate = init_lrate # active learning rate, change with training
22 |
23 | # suppose the eidx starts from 1
24 | def before_epoch(self, eidx=None):
25 | pass
26 |
27 | def after_epoch(self, eidx=None):
28 | pass
29 |
30 | def step(self, step):
31 | pass
32 |
33 | def after_eval(self, eval_score):
34 | pass
35 |
36 | def get_lr(self):
37 | """Return the learning rate whenever you want"""
38 | return self.lrate
39 |
--------------------------------------------------------------------------------
/lm/code/embed_regularize.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 |
7 | def embedded_dropout(embed, words, dropout=0.1, scale=None):
8 | if dropout:
9 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
10 | mask = Variable(mask)
11 | masked_embed_weight = mask * embed.weight
12 | else:
13 | masked_embed_weight = embed.weight
14 | if scale:
15 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight
16 |
17 | padding_idx = embed.padding_idx
18 | if padding_idx is None:
19 | padding_idx = -1
20 | X = F.embedding(words, masked_embed_weight,
21 | padding_idx, embed.max_norm, embed.norm_type,
22 | embed.scale_grad_by_freq, embed.sparse
23 | )
24 | return X
25 |
26 | if __name__ == '__main__':
27 | V = 50
28 | h = 4
29 | bptt = 10
30 | batch_size = 2
31 |
32 | embed = torch.nn.Embedding(V, h)
33 |
34 | words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
35 | words = torch.LongTensor(words)
36 | words = Variable(words)
37 |
38 | origX = embed(words)
39 | X = embedded_dropout(embed, words)
40 |
41 | print(origX)
42 | print(X)
43 |
--------------------------------------------------------------------------------
/doc/code/utils/initializer.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 |
10 | def get_initializer(params):
11 | if params.initializer == "uniform":
12 | max_val = params.initializer_gain
13 | return tf.random_uniform_initializer(-max_val, max_val)
14 | elif params.initializer == "normal":
15 | return tf.random_normal_initializer(0.0, params.initializer_gain)
16 | elif params.initializer == "normal_unit_scaling":
17 | return tf.variance_scaling_initializer(params.initializer_gain,
18 | mode="fan_avg",
19 | distribution="normal")
20 | elif params.initializer == "uniform_unit_scaling":
21 | return tf.variance_scaling_initializer(params.initializer_gain,
22 | mode="fan_avg",
23 | distribution="uniform")
24 | else:
25 | tf.logging.warn("Unrecognized initializer: %s" % params.initializer)
26 | tf.logging.warn("Return to default initializer: glorot_uniform_initializer")
27 | return tf.glorot_uniform_initializer()
28 |
--------------------------------------------------------------------------------
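
A hedged usage sketch of get_initializer (TensorFlow 1.x is assumed, matching the tf.logging and tf.variance_scaling_initializer calls above; the import assumes doc/code is on PYTHONPATH). The params object only needs the two attributes the function reads.

from types import SimpleNamespace

import tensorflow as tf

from utils.initializer import get_initializer  # import path assumed

params = SimpleNamespace(initializer="uniform", initializer_gain=0.08)

init = get_initializer(params)  # -> tf.random_uniform_initializer(-0.08, 0.08)
w = tf.get_variable("w", shape=[64, 64], initializer=init)  # TF1-style variable creation
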
/nli/code/utils/initializer.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 |
10 | def get_initializer(params):
11 | if params.initializer == "uniform":
12 | max_val = params.initializer_gain
13 | return tf.random_uniform_initializer(-max_val, max_val)
14 | elif params.initializer == "normal":
15 | return tf.random_normal_initializer(0.0, params.initializer_gain)
16 | elif params.initializer == "normal_unit_scaling":
17 | return tf.variance_scaling_initializer(params.initializer_gain,
18 | mode="fan_avg",
19 | distribution="normal")
20 | elif params.initializer == "uniform_unit_scaling":
21 | return tf.variance_scaling_initializer(params.initializer_gain,
22 | mode="fan_avg",
23 | distribution="uniform")
24 | else:
25 | tf.logging.warn("Unrecognized initializer: %s" % params.initializer)
26 | tf.logging.warn("Return to default initializer: glorot_uniform_initializer")
27 | return tf.glorot_uniform_initializer()
28 |
--------------------------------------------------------------------------------
/doc/code/lrs/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from lrs import vanillalr, noamlr, scorelr, gnmtplr, epochlr
4 |
5 |
6 | def get_lr(params):
7 |
8 | strategy = params.lrate_strategy.lower()
9 |
10 | if strategy == "noam":
11 | return noamlr.NoamDecayLr(
12 | params.lrate,
13 | params.warmup_steps,
14 | params.hidden_size
15 | )
16 | elif strategy == "gnmt+":
17 | return gnmtplr.GNMTPDecayLr(
18 | params.lrate,
19 | params.warmup_steps,
20 | params.nstable,
21 | params.lrdecay_start,
22 | params.lrdecay_end
23 | )
24 | elif strategy == "epoch":
25 | return epochlr.EpochDecayLr(
26 | params.lrate,
27 | params.lrate_decay,
28 | )
29 | elif strategy == "score":
30 | return scorelr.ScoreDecayLr(
31 | params.lrate,
32 | history_scores=[v[1] for v in params.recorder.valid_script_scores],
33 | decay=params.lrate_decay,
34 | patience=params.lrate_patience,
35 | )
36 | elif strategy == "vanilla":
37 | return vanillalr.VanillaLR(
38 | params.lrate,
39 | )
40 | else:
41 | raise NotImplementedError(
42 | "{} is not supported".format(strategy))
--------------------------------------------------------------------------------
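
A usage sketch of the dispatcher above (illustration only): params only needs the attributes read by the selected branch, so a plain namespace is enough. The attribute names below are taken from the "noam" branch, and the import assumes doc/code is on PYTHONPATH.

from types import SimpleNamespace

from lrs import get_lr  # import path assumed

params = SimpleNamespace(
    lrate_strategy="noam",   # selects NoamDecayLr
    lrate=1.0,               # initial learning rate
    warmup_steps=4000,
    hidden_size=512,
)

schedule = get_lr(params)
schedule.step(100)           # update the rate for training step 100
print(schedule.get_lr())
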
/nli/code/lrs/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from lrs import vanillalr, noamlr, scorelr, gnmtplr, epochlr
4 |
5 |
6 | def get_lr(params):
7 |
8 | strategy = params.lrate_strategy.lower()
9 |
10 | if strategy == "noam":
11 | return noamlr.NoamDecayLr(
12 | params.lrate,
13 | params.warmup_steps,
14 | params.hidden_size
15 | )
16 | elif strategy == "gnmt+":
17 | return gnmtplr.GNMTPDecayLr(
18 | params.lrate,
19 | params.warmup_steps,
20 | params.nstable,
21 | params.lrdecay_start,
22 | params.lrdecay_end
23 | )
24 | elif strategy == "epoch":
25 | return epochlr.EpochDecayLr(
26 | params.lrate,
27 | params.lrate_decay,
28 | )
29 | elif strategy == "score":
30 | return scorelr.ScoreDecayLr(
31 | params.lrate,
32 | history_scores=[v[1] for v in params.recorder.valid_script_scores],
33 | decay=params.lrate_decay,
34 | patience=params.lrate_patience,
35 | )
36 | elif strategy == "vanilla":
37 | return vanillalr.VanillaLR(
38 | params.lrate,
39 | )
40 | else:
41 | raise NotImplementedError(
42 | "{} is not supported".format(strategy))
--------------------------------------------------------------------------------
/doc/code/bert/load.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import json
9 | import tensorflow as tf
10 |
11 | from .vocab import Vocab
12 |
13 |
14 | def load_vocab(model_dir):
15 | vocab = Vocab(
16 | vocab_file=os.path.join(model_dir, 'vocab.txt')
17 | )
18 | return vocab
19 |
20 |
21 | def load_config(model_dir):
22 | with tf.gfile.GFile(
23 | os.path.join(model_dir, 'bert_config.json'),
24 | "r"
25 | ) as reader:
26 | text = reader.read()
27 | return json.loads(text)
28 |
29 |
30 | def load_model(session, model_dir):
31 | tf.logging.warn("Starting Loading BERT Pre-trained Model")
32 | ops = []
33 | reader = tf.train.load_checkpoint(
34 | os.path.join(model_dir, "bert_model.ckpt")
35 | )
36 |
37 | for var in tf.global_variables():
38 | name = var.op.name
39 | name = name[name.find('/bert/')+1:]
40 |
41 | if reader.has_tensor(name) and 'Adam' not in name:
42 | tf.logging.info('{} **Good**'.format(name))
43 | ops.append(
44 | tf.assign(var, reader.get_tensor(name)))
45 | else:
46 | tf.logging.warn("{} --Bad--".format(name))
47 | restore_op = tf.group(*ops, name="restore_global_vars")
48 | session.run(restore_op)
49 |
--------------------------------------------------------------------------------
/nli/code/bert/load.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import json
9 | import tensorflow as tf
10 |
11 | from .vocab import Vocab
12 |
13 |
14 | def load_vocab(model_dir):
15 | vocab = Vocab(
16 | vocab_file=os.path.join(model_dir, 'vocab.txt')
17 | )
18 | return vocab
19 |
20 |
21 | def load_config(model_dir):
22 | with tf.gfile.GFile(
23 | os.path.join(model_dir, 'bert_config.json'),
24 | "r"
25 | ) as reader:
26 | text = reader.read()
27 | return json.loads(text)
28 |
29 |
30 | def load_model(session, model_dir):
31 | tf.logging.warn("Starting Loading BERT Pre-trained Model")
32 | ops = []
33 | reader = tf.train.load_checkpoint(
34 | os.path.join(model_dir, "bert_model.ckpt")
35 | )
36 |
37 | for var in tf.global_variables():
38 | name = var.op.name
39 | name = name[name.find('/bert/')+1:]
40 |
41 | if reader.has_tensor(name) and 'Adam' not in name:
42 | tf.logging.info('{} **Good**'.format(name))
43 | ops.append(
44 | tf.assign(var, reader.get_tensor(name)))
45 | else:
46 | tf.logging.warn("{} --Bad--".format(name))
47 | restore_op = tf.group(*ops, name="restore_global_vars")
48 | session.run(restore_op)
49 |
--------------------------------------------------------------------------------
/doc/code/lrs/scorelr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class ScoreDecayLr(lr.Lr):
12 | """Decay the learning rate after each evaluation"""
13 | def __init__(self,
14 | init_lr,
15 | history_scores=None, # evaluation history metric scores, such as BLEU
16 | decay=0.5, # learning rate decay rate
17 | patience=1, # decay after this number of bad counter
18 | name="score_decay_lr" # model name, no use
19 | ):
20 | super(ScoreDecayLr, self).__init__(init_lr, name=name)
21 |
22 | self.decay = decay
23 | self.patience = patience
24 | self.bad_counter = 0
25 | self.best_score = -1e9
26 |
27 | if history_scores is not None:
28 | for score in history_scores:
29 | self.after_eval(score[1])
30 |
31 | def after_eval(self, eval_score):
32 | if eval_score > self.best_score:
33 | self.best_score = eval_score
34 | self.bad_counter = 0
35 | else:
36 | self.bad_counter += 1
37 | if self.bad_counter >= self.patience:
38 | self.lrate = self.lrate * self.decay
39 |
40 | self.bad_counter = 0
41 |
--------------------------------------------------------------------------------
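
A short driver (illustration only, not repository code; the import assumes doc/code is on PYTHONPATH) showing the intended behaviour of ScoreDecayLr: the rate is multiplied by decay once the validation score has failed to improve for patience consecutive evaluations, after which the bad counter is reset.

from lrs.scorelr import ScoreDecayLr  # import path assumed

sched = ScoreDecayLr(init_lr=0.2, decay=0.5, patience=2)

for score in [10.0, 12.0, 11.0, 11.5, 12.5]:
    sched.after_eval(score)
    print(score, sched.get_lr())
# The rate drops from 0.2 to 0.1 after the two non-improving scores (11.0, 11.5)
# and stays at 0.1 once 12.5 sets a new best.
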
/nli/code/lrs/scorelr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | from lrs import lr
9 |
10 |
11 | class ScoreDecayLr(lr.Lr):
12 | """Decay the learning rate after each evaluation"""
13 | def __init__(self,
14 | init_lr,
15 | history_scores=None, # evaluation history metric scores, such as BLEU
16 | decay=0.5, # learning rate decay rate
17 | patience=1, # decay after this number of bad counter
18 | name="score_decay_lr" # model name, no use
19 | ):
20 | super(ScoreDecayLr, self).__init__(init_lr, name=name)
21 |
22 | self.decay = decay
23 | self.patience = patience
24 | self.bad_counter = 0
25 | self.best_score = -1e9
26 |
27 | if history_scores is not None:
28 | for score in history_scores:
29 | self.after_eval(score[1])
30 |
31 | def after_eval(self, eval_score):
32 | if eval_score > self.best_score:
33 | self.best_score = eval_score
34 | self.bad_counter = 0
35 | else:
36 | self.bad_counter += 1
37 | if self.bad_counter >= self.patience:
38 | self.lrate = self.lrate * self.decay
39 |
40 | self.bad_counter = 0
41 |
--------------------------------------------------------------------------------
/ner/code/callbacks.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom callbacks.
3 | """
4 | import numpy as np
5 | from keras.callbacks import Callback
6 | from seqeval.metrics import f1_score, classification_report
7 |
8 |
9 | class F1score(Callback):
10 |
11 | def __init__(self, seq, preprocessor=None, name="callback"):
12 | super(F1score, self).__init__()
13 | self.seq = seq
14 | self.p = preprocessor
15 | self.name = name
16 |
17 | def get_lengths(self, y_true):
18 | lengths = []
19 | for y in np.argmax(y_true, -1):
20 | try:
21 | i = list(y).index(0)
22 | except ValueError:
23 | i = len(y)
24 | lengths.append(i)
25 |
26 | return lengths
27 |
28 | def on_epoch_end(self, epoch, logs={}):
29 | label_true = []
30 | label_pred = []
31 | for i in range(len(self.seq)):
32 | x_true, y_true = self.seq[i]
33 | lengths = self.get_lengths(y_true)
34 | y_pred = self.model.predict_on_batch(x_true)
35 |
36 | y_true = self.p.inverse_transform(y_true, lengths)
37 | y_pred = self.p.inverse_transform(y_pred, lengths)
38 |
39 | label_true.extend(y_true)
40 | label_pred.extend(y_pred)
41 |
42 | score = f1_score(label_true, label_pred)
43 | print('{} - f1: {:04.2f}'.format(self.name, score * 100))
44 | print(classification_report(label_true, label_pred))
45 | logs['f1'] = score
46 |
--------------------------------------------------------------------------------
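
The only non-obvious part of the callback above is get_lengths: label id 0 is treated as padding, and the position of the first padded step is taken as the true sequence length. A standalone sketch of that logic (one-hot labels are an assumption consistent with the np.argmax over the last axis):

import numpy as np

def get_lengths(y_true):
    # Mirrors F1score.get_lengths: length = index of the first padding label (id 0).
    lengths = []
    for y in np.argmax(y_true, -1):
        try:
            i = list(y).index(0)
        except ValueError:
            i = len(y)
        lengths.append(i)
    return lengths

# Two sequences of max length 4 with 3 label classes; the second has one padded step.
y = np.eye(3)[[[1, 2, 1, 2], [2, 1, 1, 0]]]
print(get_lengths(y))  # [4, 3]
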
/doc/code/bert/tokenizer.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | import os
9 | import argparse
10 |
11 | from . import tokenization
12 |
13 |
14 | def load_tokenizer(params):
15 | tokenization.validate_case_matches_checkpoint(
16 | params.lower,
17 | os.path.join(params.bert_dir, 'bert_model.ckpt')
18 | )
19 | tokenizer = tokenization.FullTokenizer(
20 | vocab_file=os.path.join(params.bert_dir, 'vocab.txt'),
21 | do_lower_case=params.lower
22 | )
23 | return tokenizer
24 |
25 |
26 | def tokenize(params):
27 | tokenizer = load_tokenizer(params)
28 |
29 | with open(params.output, 'w') as writer:
30 | with open(params.input, 'r') as reader:
31 | for line in reader:
32 | writer.write(' '.join(tokenizer.tokenize(line.strip())).encode('utf8') + "\n")
33 |
34 |
35 | if __name__ == "__main__":
36 | parser = argparse.ArgumentParser('Vocabulary Preparation')
37 | parser.add_argument('--lower', action='store_true', help='whether to lowercase the input text')
38 | parser.add_argument('--bert_dir', type=str, help='the pre-trained model directory')
39 | parser.add_argument('input', type=str, help='the input un-tokenized file')
40 | parser.add_argument('output', type=str, help='the output tokenized file')
41 |
42 | args = parser.parse_args()
43 |
44 | tokenize(args)
45 |
46 | print("Finishing!")
47 |
--------------------------------------------------------------------------------
/nli/code/bert/tokenizer.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | import os
9 | import argparse
10 |
11 | from . import tokenization
12 |
13 |
14 | def load_tokenizer(params):
15 | tokenization.validate_case_matches_checkpoint(
16 | params.lower,
17 | os.path.join(params.bert_dir, 'bert_model.ckpt')
18 | )
19 | tokenizer = tokenization.FullTokenizer(
20 | vocab_file=os.path.join(params.bert_dir, 'vocab.txt'),
21 | do_lower_case=params.bert_lower
22 | )
23 | return tokenizer
24 |
25 |
26 | def tokenize(params):
27 | tokenizer = load_tokenizer(params)
28 |
29 | with open(params.output, 'w') as writer:
30 | with open(params.input, 'r') as reader:
31 | for line in reader:
32 | writer.write(' '.join(tokenizer.tokenize(line.strip())).encode('utf8') + "\n")
33 |
34 |
35 | if __name__ == "__main__":
36 | parser = argparse.ArgumentParser('Vocabulary Preparation')
37 | parser.add_argument('--lower', action='store_true', help='whether to lowercase the input text')
38 | parser.add_argument('--bert_dir', type=str, help='the pre-trained model directory')
39 | parser.add_argument('input', type=str, help='the input un-tokenized file')
40 | parser.add_argument('output', type=str, help='the output tokenized file')
41 |
42 | args = parser.parse_args()
43 |
44 | tokenize(args)
45 |
46 | print("Finishing!")
47 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Download SQuAD
4 | SQUAD_DIR=~/data/squad
5 | mkdir -p $SQUAD_DIR
6 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json
7 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json
8 |
9 | # Download GloVe
10 | GLOVE_DIR=~/data/glove
11 | mkdir -p $GLOVE_DIR
12 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O $GLOVE_DIR/glove.840B.300d.zip
13 | unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR
14 |
15 | # Download Glove Character Embedding
16 | # wget https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt -O $GLOVE_DIR/glove.840B.300d-char.txt
17 |
18 | # Download fasttext
19 | # FASTTEXT_DIR=~/data/fasttext
20 | # mkdir -p $FASTTEXT_DIR
21 | # wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip -O $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip
22 | # unzip $FASTTEXT_DIR/wiki-news-300d-1M.vec.zip -d $FASTTEXT_DIR
23 |
24 | # Download Elmo
25 | ELMO_DIR=~/data/elmo
26 | wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 -O $ELMO_DIR/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5
27 | wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json -O $ELMO_DIR/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json
28 |
29 | # Download Spacy language models
30 | python3 -m spacy download en
31 |
--------------------------------------------------------------------------------
/doc/code/rnns/lrn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lrn(cell.Cell):
14 | """The Lightweight Recurrent Network."""
15 |
16 | def __init__(self, d, ln=False, scope='lrn'):
17 | super(lrn, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "lrn")):
26 | h = linear(x, self.d * 3,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # p,q,r/x: the current input state
33 | """
34 | p, q, r = W x
35 | i = sigmoid(p + h_)
36 | f = sigmoid(q - h_)
37 | h = i * r + f * h_
38 | """
39 | if isinstance(x, (list, tuple)):
40 | x = x[0]
41 |
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "lrn")):
44 | p, q, r = tf.split(x, 3, -1)
45 |
46 | i = tf.sigmoid(p + h_)
47 | f = tf.sigmoid(q - h_)
48 |
49 | h = tf.tanh(i * r + f * h_)
50 |
51 | return h
52 |
--------------------------------------------------------------------------------
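
To make the recurrence in the docstring concrete, a NumPy sketch of the LRN step (illustration only; layer normalisation and the bias are omitted, and the shapes are toy values). In the repository the projection W x is computed once per sequence in fetch_states, so the per-step work is purely elementwise.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lrn_step(h_prev, px):
    # px = W x, computed once for the whole sequence (see fetch_states).
    p, q, r = np.split(px, 3, axis=-1)
    i = sigmoid(p + h_prev)             # input gate
    f = sigmoid(q - h_prev)             # forget gate
    return np.tanh(i * r + f * h_prev)  # tanh as in the code (the docstring omits it)

d, batch, steps = 4, 2, 5
rng = np.random.default_rng(0)
W = rng.standard_normal((3, 3 * d)) * 0.1   # input dim 3 -> 3 * d, as in fetch_states
x = rng.standard_normal((steps, batch, 3))
px = x @ W                                  # precompute the projection for all steps
h = np.zeros((batch, d))
for t in range(steps):
    h = lrn_step(h, px[t])
print(h.shape)  # (2, 4)
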
/nli/code/rnns/lrn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lrn(cell.Cell):
14 | """The Lightweight Recurrent Neural Network."""
15 |
16 | def __init__(self, d, ln=False, scope='lrn'):
17 | super(lrn, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "lrn")):
26 | h = linear(x, self.d * 3,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # p,q,r/x: the current input state
33 | """
34 | p, q, r = W x
35 | i = sigmoid(p + h_)
36 | f = sigmoid(q - h_)
37 | h = i * r + f * h_
38 | """
39 | if isinstance(x, (list, tuple)):
40 | x = x[0]
41 |
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "lrn")):
44 | p, q, r = tf.split(x, 3, -1)
45 |
46 | i = tf.sigmoid(p + h_)
47 | f = tf.sigmoid(q - h_)
48 |
49 | h = tf.tanh(i * r + f * h_)
50 |
51 | return h
52 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/lrn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lrn(cell.Cell):
14 | """The Lightweight Recurrent Network."""
15 |
16 | def __init__(self, d, ln=False, scope='lrn'):
17 | super(lrn, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "lrn")):
26 | h = linear(x, self.d * 3,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # p,q,r/x: the current input state
33 | """
34 | p, q, r = W x
35 | i = sigmoid(p + h_)
36 | f = sigmoid(q - h_)
37 | h = i * r + f * h_
38 | """
39 | if isinstance(x, (list, tuple)):
40 | x = x[0]
41 |
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "lrn")):
44 | p, q, r = tf.split(x, 3, -1)
45 |
46 | i = tf.sigmoid(p + h_)
47 | f = tf.sigmoid(q - h_)
48 |
49 | h = tf.tanh(i * r + f * h_)
50 |
51 | return h
52 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/lrn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lrn(cell.Cell):
14 | """The Lightweight Recurrent Network."""
15 |
16 | def __init__(self, d, ln=False, scope='lrn'):
17 | super(lrn, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "lrn")):
26 | h = linear(x, self.d * 3,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # p,q,r/x: the current input state
33 | """
34 | p, q, r = W x
35 | i = sigmoid(p + h_)
36 | f = sigmoid(q - h_)
37 | h = i * r + f * h_
38 | """
39 | if isinstance(x, (list, tuple)):
40 | x = x[0]
41 |
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "lrn")):
44 | p, q, r = tf.split(x, 3, -1)
45 |
46 | i = tf.sigmoid(p + h_)
47 | f = tf.sigmoid(q - h_)
48 |
49 | h = tf.tanh(i * r + f * h_)
50 |
51 | return h
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Biao Zhang
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/doc/code/lrs/gnmtplr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import numpy as np
8 |
9 | from lrs import lr
10 |
11 |
12 | class GNMTPDecayLr(lr.Lr):
13 | """Decay the learning rate during each training step, follows GNMT+"""
14 | def __init__(self,
15 | init_lr, # initial learning rate
16 | warmup_steps, # warmup step
17 | nstable, # number of replica
18 | lrdecay_start, # start of learning rate decay
19 | lrdecay_end, # end of learning rate decay
20 | name="gnmtp_decay_lr" # model name, no use
21 | ):
22 | super(GNMTPDecayLr, self).__init__(init_lr, name=name)
23 |
24 | self.warmup_steps = warmup_steps
25 | self.nstable = nstable
26 | self.lrdecay_start = lrdecay_start
27 | self.lrdecay_end = lrdecay_end
28 |
29 | if nstable < 1:
30 | raise Exception("Stabled Lrate Value should "
31 | "greater than 0, but is {}".format(nstable))
32 |
33 | def step(self, step):
34 | t = float(step)
35 | p = float(self.warmup_steps)
36 | n = float(self.nstable)
37 | s = float(self.lrdecay_start)
38 | e = float(self.lrdecay_end)
39 |
40 | decay = np.minimum(1. + t * (n - 1) / (n * p), n)
41 | decay = np.minimum(decay, n * (2 * n) ** ((s - n * t) / (e - s)))
42 |
43 | self.lrate = self.init_lrate * decay
44 |
--------------------------------------------------------------------------------
/nli/code/lrs/gnmtplr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import numpy as np
8 |
9 | from lrs import lr
10 |
11 |
12 | class GNMTPDecayLr(lr.Lr):
13 | """Decay the learning rate during each training step, follows GNMT+"""
14 | def __init__(self,
15 | init_lr, # initial learning rate
16 | warmup_steps, # warmup step
17 |                  nstable,           # number of replicas
18 | lrdecay_start, # start of learning rate decay
19 | lrdecay_end, # end of learning rate decay
20 | name="gnmtp_decay_lr" # model name, no use
21 | ):
22 | super(GNMTPDecayLr, self).__init__(init_lr, name=name)
23 |
24 | self.warmup_steps = warmup_steps
25 | self.nstable = nstable
26 | self.lrdecay_start = lrdecay_start
27 | self.lrdecay_end = lrdecay_end
28 |
29 | if nstable < 1:
30 |             raise Exception("Stable lrate value (nstable) should be "
31 |                             "greater than 0, but is {}".format(nstable))
32 |
33 | def step(self, step):
34 | t = float(step)
35 | p = float(self.warmup_steps)
36 | n = float(self.nstable)
37 | s = float(self.lrdecay_start)
38 | e = float(self.lrdecay_end)
39 |
40 | decay = np.minimum(1. + t * (n - 1) / (n * p), n)
41 | decay = np.minimum(decay, n * (2 * n) ** ((s - n * t) / (e - s)))
42 |
43 | self.lrate = self.init_lrate * decay
44 |
--------------------------------------------------------------------------------
/doc/code/rnns/atr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class atr(cell.Cell):
14 | """The Addition-Subtraction Twin-Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='atr'):
17 | super(atr, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "atr")):
26 | h = linear(x, self.d,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # x: the current input state
33 | """
34 | p = W x
35 | q = U h_
36 | i = sigmoid(p + q)
37 | f = sigmoid(p - q)
38 | h = i * p + f * h_
39 | """
40 | if isinstance(x, (list, tuple)):
41 | x = x[0]
42 |
43 | with tf.variable_scope(
44 | "cell_{}".format(self.scope or "atr")):
45 | q = linear(h_, self.d,
46 | ln=self.ln, scope="hide_h")
47 | p = x
48 |
49 | f = tf.sigmoid(p - q)
50 | i = tf.sigmoid(p + q)
51 |
52 | h = tf.tanh(i * p + f * h_)
53 |
54 | return h
55 |
--------------------------------------------------------------------------------
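A NumPy sketch of one ATR step as implemented above (illustrative only): `p` is the projected input and `q` the projected previous state; note that the code wraps the update in a `tanh`, which the docstring equations omit. The random weight matrices simply stand in for the repository's `linear` projections.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def atr_step(x, h_prev, W, U):
    p = x @ W                        # input contribution (W x)
    q = h_prev @ U                   # recurrent contribution (U h_)
    i = sigmoid(p + q)               # addition (input) gate
    f = sigmoid(p - q)               # subtraction (forget) gate
    return np.tanh(i * p + f * h_prev)

rng = np.random.default_rng(1)
d_in, d = 3, 4
W = rng.standard_normal((d_in, d))
U = rng.standard_normal((d, d))
x, h = rng.standard_normal(d_in), np.zeros(d)
for _ in range(3):                   # a few steps on the same input
    h = atr_step(x, h, W, U)
print(h)
```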
/nli/code/rnns/atr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class atr(cell.Cell):
14 | """The Addition-Subtraction Twin-Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='atr'):
17 | super(atr, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "atr")):
26 | h = linear(x, self.d,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # x: the current input state
33 | """
34 | p = W x
35 | q = U h_
36 | i = sigmoid(p + q)
37 | f = sigmoid(p - q)
38 | h = i * p + f * h_
39 | """
40 | if isinstance(x, (list, tuple)):
41 | x = x[0]
42 |
43 | with tf.variable_scope(
44 | "cell_{}".format(self.scope or "atr")):
45 | q = linear(h_, self.d,
46 | ln=self.ln, scope="hide_h")
47 | p = x
48 |
49 | f = tf.sigmoid(p - q)
50 | i = tf.sigmoid(p + q)
51 |
52 | h = tf.tanh(i * p + f * h_)
53 |
54 | return h
55 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/atr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class atr(cell.Cell):
14 | """The Addition-Subtraction Twin-Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='atr'):
17 | super(atr, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "atr")):
26 | h = linear(x, self.d,
27 | bias=True, ln=self.ln, scope="hide_x")
28 | return (h, )
29 |
30 | def __call__(self, h_, x):
31 | # h_: the previous hidden state
32 | # x: the current input state
33 | """
34 | p = W x
35 | q = U h_
36 | i = sigmoid(p + q)
37 | f = sigmoid(p - q)
38 | h = i * p + f * h_
39 | """
40 | if isinstance(x, (list, tuple)):
41 | x = x[0]
42 |
43 | with tf.variable_scope(
44 | "cell_{}".format(self.scope or "atr")):
45 | q = linear(h_, self.d,
46 | ln=self.ln, scope="hide_h")
47 | p = x
48 |
49 | f = tf.sigmoid(p - q)
50 | i = tf.sigmoid(p + q)
51 |
52 | h = tf.tanh(i * p + f * h_)
53 |
54 | return h
55 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/atr.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class atr(cell.Cell):
14 | """The Addition-Subtraction Twin-Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, twin=False, scope='atr'):
17 | super(atr, self).__init__(d, ln=ln, scope=scope)
18 |
19 | self.twin = twin
20 |
21 | def get_init_state(self, shape=None, x=None, scope=None):
22 | return self._get_init_state(
23 | self.d, shape=shape, x=x, scope=scope)
24 |
25 | def fetch_states(self, x):
26 | with tf.variable_scope(
27 | "fetch_state_{}".format(self.scope or "atr")):
28 | h = linear(x, self.d,
29 | bias=True, ln=self.ln, scope="hide_x")
30 | return (h, )
31 |
32 | def __call__(self, h_, x):
33 | # h_: the previous hidden state
34 | # x: the current input state
35 | """
36 | p = W x
37 | q = U h_
38 | i = sigmoid(p + q)
39 | f = sigmoid(p - q)
40 | h = i * p + f * h_
41 | """
42 | if isinstance(x, (list, tuple)):
43 | x = x[0]
44 |
45 | with tf.variable_scope(
46 | "cell_{}".format(self.scope or "atr")):
47 | q = linear(h_, self.d,
48 | ln=self.ln, scope="hide_h")
49 | p = x
50 |
51 | f = tf.sigmoid(p - q)
52 | i = tf.sigmoid(p + q)
53 |
54 | h = tf.tanh(i * p + f * h_)
55 |
56 | return h
57 |
--------------------------------------------------------------------------------
/doc/code/rnns/cell.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import abc
8 | import tensorflow as tf
9 | from func import linear
10 |
11 |
12 | # This is an abstract class that deals with
13 | # recurrent cells, e.g. GRU, LSTM, ATR
14 | class Cell(object):
15 | def __init__(self,
16 | d, # hidden state dimension
17 | ln=False, # whether use layer normalization
18 | scope=None, # the name scope for this cell
19 | ):
20 | self.d = d
21 | self.scope = scope
22 | self.ln = ln
23 |
24 | def _get_init_state(self, d, shape=None, x=None, scope=None):
25 | # gen init state vector
26 | # if no evidence x is provided, use zero initialization
27 | if x is None:
28 | assert shape is not None, "you should provide shape"
29 | if not isinstance(shape, (tuple, list)):
30 | shape = [shape]
31 | shape = shape + [d]
32 | return tf.zeros(shape, tf.float32)
33 | else:
34 | return linear(
35 | x, d, bias=True, ln=self.ln,
36 | scope="{}_init".format(scope or self.scope)
37 | )
38 |
39 | def get_hidden(self, x):
40 | return x
41 |
42 | @abc.abstractmethod
43 | def get_init_state(self, shape=None, x=None, scope=None):
44 | raise NotImplementedError("Not Supported")
45 |
46 | @abc.abstractmethod
47 | def __call__(self, h_, x):
48 | raise NotImplementedError("Not Supported")
49 |
50 | @abc.abstractmethod
51 | def fetch_states(self, x):
52 | raise NotImplementedError("Not Supported")
53 |
--------------------------------------------------------------------------------
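To illustrate the contract this base class defines, here is a hedged sketch of a toy cell (not part of the repository): `fetch_states` projects the input once outside the recurrence, and `__call__` mixes that projection with the previous state. It uses `tf.layers.dense` instead of the repository's `func.linear` purely to stay self-contained, and assumes the code directory is on `PYTHONPATH` so that `rnns` resolves.

```python
import tensorflow as tf
from rnns import cell   # assumption: the code directory is on PYTHONPATH


class leaky(cell.Cell):
    """Toy cell: h = 0.9 * h_ + 0.1 * tanh(W x), interface-compatible with the real cells."""

    def get_init_state(self, shape=None, x=None, scope=None):
        return self._get_init_state(self.d, shape=shape, x=x, scope=scope)

    def fetch_states(self, x):
        with tf.variable_scope("fetch_state_{}".format(self.scope or "leaky")):
            # project the whole input sequence once, outside the recurrence
            return (tf.layers.dense(x, self.d, name="hide_x"),)

    def __call__(self, h_, x):
        if isinstance(x, (list, tuple)):
            x = x[0]
        return 0.9 * h_ + 0.1 * tf.tanh(x)

# used exactly like the real cells, e.g.:
#   c = leaky(16); proj = c.fetch_states(inputs); h = c(h_prev, proj_t)
```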
/nli/code/rnns/cell.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import abc
8 | import tensorflow as tf
9 | from func import linear
10 |
11 |
12 | # This is an abstract class that deals with
13 | # recurrent cells, e.g. GRU, LSTM, ATR
14 | class Cell(object):
15 | def __init__(self,
16 | d, # hidden state dimension
17 | ln=False, # whether use layer normalization
18 | scope=None, # the name scope for this cell
19 | ):
20 | self.d = d
21 | self.scope = scope
22 | self.ln = ln
23 |
24 | def _get_init_state(self, d, shape=None, x=None, scope=None):
25 | # gen init state vector
26 | # if no evidence x is provided, use zero initialization
27 | if x is None:
28 | assert shape is not None, "you should provide shape"
29 | if not isinstance(shape, (tuple, list)):
30 | shape = [shape]
31 | shape = shape + [d]
32 | return tf.zeros(shape, tf.float32)
33 | else:
34 | return linear(
35 | x, d, bias=True, ln=self.ln,
36 | scope="{}_init".format(scope or self.scope)
37 | )
38 |
39 | def get_hidden(self, x):
40 | return x
41 |
42 | @abc.abstractmethod
43 | def get_init_state(self, shape=None, x=None, scope=None):
44 | raise NotImplementedError("Not Supported")
45 |
46 | @abc.abstractmethod
47 | def __call__(self, h_, x):
48 | raise NotImplementedError("Not Supported")
49 |
50 | @abc.abstractmethod
51 | def fetch_states(self, x):
52 | raise NotImplementedError("Not Supported")
53 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/cell.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import abc
8 | import tensorflow as tf
9 | from func import linear
10 |
11 |
12 | # This is an abstract class that deals with
13 | # recurrent cells, e.g. GRU, LSTM, ATR
14 | class Cell(object):
15 | def __init__(self,
16 | d, # hidden state dimension
17 | ln=False, # whether use layer normalization
18 | scope=None, # the name scope for this cell
19 | ):
20 | self.d = d
21 | self.scope = scope
22 | self.ln = ln
23 |
24 | def _get_init_state(self, d, shape=None, x=None, scope=None):
25 | # gen init state vector
26 | # if no evidence x is provided, use zero initialization
27 | if x is None:
28 | assert shape is not None, "you should provide shape"
29 | if not isinstance(shape, (tuple, list)):
30 | shape = [shape]
31 | shape = shape + [d]
32 | return tf.zeros(shape, tf.float32)
33 | else:
34 | return linear(
35 | x, d, bias=True, ln=self.ln,
36 | scope="{}_init".format(scope or self.scope)
37 | )
38 |
39 | def get_hidden(self, x):
40 | return x
41 |
42 | @abc.abstractmethod
43 | def get_init_state(self, shape=None, x=None, scope=None):
44 | raise NotImplementedError("Not Supported")
45 |
46 | @abc.abstractmethod
47 | def __call__(self, h_, x):
48 | raise NotImplementedError("Not Supported")
49 |
50 | @abc.abstractmethod
51 | def fetch_states(self, x):
52 | raise NotImplementedError("Not Supported")
53 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/cell.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import abc
8 | import tensorflow as tf
9 | from func import linear
10 |
11 |
12 | # This is an abstract class that deals with
13 | # recurrent cells, e.g. GRU, LSTM, ATR
14 | class Cell(object):
15 | def __init__(self,
16 | d, # hidden state dimension
17 | ln=False, # whether use layer normalization
18 | scope=None, # the name scope for this cell
19 | ):
20 | self.d = d
21 | self.scope = scope
22 | self.ln = ln
23 |
24 | def _get_init_state(self, d, shape=None, x=None, scope=None):
25 | # gen init state vector
26 | # if no evidence x is provided, use zero initialization
27 | if x is None:
28 | assert shape is not None, "you should provide shape"
29 | if not isinstance(shape, (tuple, list)):
30 | shape = [shape]
31 | shape = shape + [d]
32 | return tf.zeros(shape, tf.float32)
33 | else:
34 | return linear(
35 | x, d, bias=True, ln=self.ln,
36 | scope="{}_init".format(scope or self.scope)
37 | )
38 |
39 | def get_hidden(self, x):
40 | return x
41 |
42 | @abc.abstractmethod
43 | def get_init_state(self, shape=None, x=None, scope=None):
44 | raise NotImplementedError("Not Supported")
45 |
46 | @abc.abstractmethod
47 | def __call__(self, h_, x):
48 | raise NotImplementedError("Not Supported")
49 |
50 | @abc.abstractmethod
51 | def fetch_states(self, x):
52 | raise NotImplementedError("Not Supported")
53 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/cycle.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 |
10 | def session_run(monitored_session, args):
11 | # Call raw TF session directly
12 | return monitored_session._tf_sess().run(args)
13 |
14 |
15 | def zero_variables(variables, name=None):
16 | ops = []
17 |
18 | for var in variables:
19 | with tf.device(var.device):
20 | op = var.assign(tf.zeros(var.shape.as_list()))
21 | ops.append(op)
22 |
23 | return tf.group(*ops, name=name or "zero_op")
24 |
25 |
26 | def replicate_variables(variables, device=None):
27 | new_vars = []
28 |
29 | for var in variables:
30 | device = device or var.device
31 | with tf.device(device):
32 | name = "replicate/" + var.name.split(":")[0]
33 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()),
34 | name=name, trainable=False))
35 |
36 | return new_vars
37 |
38 |
39 | def collect_gradients(gradients, variables):
40 | ops = []
41 |
42 | for grad, var in zip(gradients, variables):
43 | if isinstance(grad, tf.Tensor):
44 | ops.append(tf.assign_add(var, grad))
45 | else:
46 | ops.append(tf.scatter_add(var, grad.indices, grad.values))
47 |
48 | return tf.group(*ops)
49 |
50 |
51 | def scale_gradients(gradients, scale):
52 | scaled_gradients = []
53 |
54 | for grad in gradients:
55 | if isinstance(grad, tf.IndexedSlices):
56 | slices = tf.IndexedSlices(scale * grad.values, grad.indices)
57 | scaled_gradients.append(slices)
58 | else:
59 | scaled_gradients.append(scale * grad)
60 |
61 | return tuple(scaled_gradients)
--------------------------------------------------------------------------------
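The helpers above (zero, replicate, collect, scale) are the building blocks of a gradient-accumulation loop. Below is a hedged TF1-style sketch of how they can be wired together, assuming a TensorFlow 1.x runtime and that the directory containing `cycle.py` is on `PYTHONPATH`; the quadratic toy loss, the optimizer and the micro-batch count are illustrative, not the repository's training setup.

```python
import tensorflow as tf
from cycle import replicate_variables, zero_variables, collect_gradients, scale_gradients

x = tf.placeholder(tf.float32, [None, 3])
w = tf.get_variable("w", [3], initializer=tf.ones_initializer())
loss = tf.reduce_mean(tf.square(tf.reduce_sum(x * w, -1)))   # toy quadratic loss

tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)

buffers = replicate_variables(tvars)            # one accumulator per trainable variable
zero_op = zero_variables(buffers)               # reset the accumulators
collect_op = collect_gradients(grads, buffers)  # buffer += grad(micro-batch)

num_micro = 4
scaled = scale_gradients(buffers, 1.0 / num_micro)            # average the accumulated gradient
train_op = tf.train.GradientDescentOptimizer(0.1).apply_gradients(list(zip(scaled, tvars)))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(zero_op)
    for _ in range(num_micro):                  # accumulate over several micro-batches
        sess.run(collect_op, {x: [[1., 2., 3.]]})
    sess.run(train_op)                          # one update with the averaged gradient
    print(sess.run(w))
```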
/doc/code/rnns/gru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class gru(cell.Cell):
14 | """The Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='gru'):
17 | super(gru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "gru")):
26 | g = linear(x, self.d * 2,
27 | bias=True, ln=self.ln, scope="gate_x")
28 | h = linear(x, self.d,
29 | bias=True, ln=self.ln, scope="hide_x")
30 | return g, h
31 |
32 | def __call__(self, h_, x):
33 | # h_: the previous hidden state
34 | # x_g/x: the current input state for gate
35 | # x_h/x: the current input state for hidden
36 | """
37 | z = sigmoid(h_, x)
38 | r = sigmoid(h_, x)
39 | h' = tanh(x, r * h_)
40 | h = z * h_ + (1. - z) * h'
41 | """
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "gru")):
44 | x_g, x_h = x
45 |
46 | h_g = linear(h_, self.d * 2,
47 | ln=self.ln, scope="gate_h")
48 | z, r = tf.split(
49 | tf.sigmoid(x_g + h_g), 2, -1)
50 |
51 | h_h = linear(h_ * r, self.d,
52 | ln=self.ln, scope="hide_h")
53 | h = tf.tanh(x_h + h_h)
54 |
55 | h = z * h_ + (1. - z) * h
56 |
57 | return h
58 |
--------------------------------------------------------------------------------
/nli/code/rnns/gru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class gru(cell.Cell):
14 | """The Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='gru'):
17 | super(gru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "gru")):
26 | g = linear(x, self.d * 2,
27 | bias=True, ln=self.ln, scope="gate_x")
28 | h = linear(x, self.d,
29 | bias=True, ln=self.ln, scope="hide_x")
30 | return g, h
31 |
32 | def __call__(self, h_, x):
33 | # h_: the previous hidden state
34 | # x_g/x: the current input state for gate
35 | # x_h/x: the current input state for hidden
36 | """
37 | z = sigmoid(h_, x)
38 | r = sigmoid(h_, x)
39 | h' = tanh(x, r * h_)
40 | h = z * h_ + (1. - z) * h'
41 | """
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "gru")):
44 | x_g, x_h = x
45 |
46 | h_g = linear(h_, self.d * 2,
47 | ln=self.ln, scope="gate_h")
48 | z, r = tf.split(
49 | tf.sigmoid(x_g + h_g), 2, -1)
50 |
51 | h_h = linear(h_ * r, self.d,
52 | ln=self.ln, scope="hide_h")
53 | h = tf.tanh(x_h + h_h)
54 |
55 | h = z * h_ + (1. - z) * h
56 |
57 | return h
58 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/gru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class gru(cell.Cell):
14 | """The Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='gru'):
17 | super(gru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "gru")):
26 | g = linear(x, self.d * 2,
27 | bias=True, ln=self.ln, scope="gate_x")
28 | h = linear(x, self.d,
29 | bias=True, ln=self.ln, scope="hide_x")
30 | return g, h
31 |
32 | def __call__(self, h_, x):
33 | # h_: the previous hidden state
34 | # x_g/x: the current input state for gate
35 | # x_h/x: the current input state for hidden
36 | """
37 | z = sigmoid(h_, x)
38 | r = sigmoid(h_, x)
39 | h' = tanh(x, r * h_)
40 | h = z * h_ + (1. - z) * h'
41 | """
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "gru")):
44 | x_g, x_h = x
45 |
46 | h_g = linear(h_, self.d * 2,
47 | ln=self.ln, scope="gate_h")
48 | z, r = tf.split(
49 | tf.sigmoid(x_g + h_g), 2, -1)
50 |
51 | h_h = linear(h_ * r, self.d,
52 | ln=self.ln, scope="hide_h")
53 | h = tf.tanh(x_h + h_h)
54 |
55 | h = z * h_ + (1. - z) * h
56 |
57 | return h
58 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/gru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class gru(cell.Cell):
14 | """The Gated Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='gru'):
17 | super(gru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d, shape=shape, x=x, scope=scope)
22 |
23 | def fetch_states(self, x):
24 | with tf.variable_scope(
25 | "fetch_state_{}".format(self.scope or "gru")):
26 | g = linear(x, self.d * 2,
27 | bias=True, ln=self.ln, scope="gate_x")
28 | h = linear(x, self.d,
29 | bias=True, ln=self.ln, scope="hide_x")
30 | return g, h
31 |
32 | def __call__(self, h_, x):
33 | # h_: the previous hidden state
34 | # x_g/x: the current input state for gate
35 | # x_h/x: the current input state for hidden
36 | """
37 | z = sigmoid(h_, x)
38 | r = sigmoid(h_, x)
39 | h' = tanh(x, r * h_)
40 | h = z * h_ + (1. - z) * h'
41 | """
42 | with tf.variable_scope(
43 | "cell_{}".format(self.scope or "gru")):
44 | x_g, x_h = x
45 |
46 | h_g = linear(h_, self.d * 2,
47 | ln=self.ln, scope="gate_h")
48 | z, r = tf.split(
49 | tf.sigmoid(x_g + h_g), 2, -1)
50 |
51 | h_h = linear(h_ * r, self.d,
52 | ln=self.ln, scope="hide_h")
53 | h = tf.tanh(x_h + h_h)
54 |
55 | h = z * h_ + (1. - z) * h
56 |
57 | return h
58 |
--------------------------------------------------------------------------------
/rc/README.md:
--------------------------------------------------------------------------------
1 | ## Reading Comprehension
2 |
3 |
4 | We use [SQuAD v1](https://rajpurkar.github.io/SQuAD-explorer/) for experiments and adopt the
5 | [R-NET model](https://www.aclweb.org/anthology/papers/P/P17/P17-1018/).
6 | Main experimental results are summarized below.
7 |
8 |
9 |
10 | | Model | #Params | Base        | +Elmo       |
11 | |-------|---------|-------------|-------------|
12 | | rnet  | -       | 71.1/79.5   | -/-         |
13 | | LSTM  | 2.67M   | 70.46/78.98 | 75.17/82.79 |
14 | | GRU   | 2.31M   | 70.41/79.15 | 75.81/83.12 |
15 | | ATR   | 1.59M   | 69.73/78.70 | 75.06/82.76 |
16 | | SRU   | 2.44M   | 69.27/78.41 | 74.56/82.50 |
17 | | LRN   | 2.14M   | 70.11/78.83 | 76.14/83.83 |
18 |
19 | Exact match/F1-score.
54 |
55 | ## Requirement
56 | tensorflow >= 1.8.1
57 |
58 | ## How to Run?
59 |
60 | - download and preprocess the dataset
61 |
62 |     - see [R-Net](https://github.com/HKUST-KnowComp/R-Net) for the preprocessing of the datasets
63 |     - basically, you need the SQuAD v1.1, GloVe and ELMo data, and you need to convert the raw datasets into the required format
64 |
65 | - no hyperparameters are tuned; we keep all of them at their default values.
66 |
67 | - training and evaluation
68 |
69 | Please see the `train_lrn.sh` and `test_lrn.sh` scripts in `rnet` (Base) and `elmo_rnet` (Base+Elmo).
70 |
71 |     For reporting the final EM/F1 scores, we use the `evaluate-v1.1.py` script.
72 |
73 | ## Credits
74 |
75 | Source code structure is adapted from [R-Net](https://github.com/HKUST-KnowComp/R-Net).
--------------------------------------------------------------------------------
/lm/code/utils.py:
--------------------------------------------------------------------------------
1 | import os, shutil
2 | import torch
3 | from torch.autograd import Variable
4 |
5 | def repackage_hidden(h):
6 | """Wraps hidden states in new Variables, to detach them from their history."""
7 | if isinstance(h, tuple) or isinstance(h, list):
8 | return tuple(repackage_hidden(v) for v in h)
9 | else:
10 | return h.detach()
11 |
12 | def batchify(data, bsz, args):
13 | # Work out how cleanly we can divide the dataset into bsz parts.
14 | nbatch = data.size(0) // bsz
15 | # Trim off any extra elements that wouldn't cleanly fit (remainders).
16 | data = data.narrow(0, 0, nbatch * bsz)
17 | # Evenly divide the data across the bsz batches.
18 | data = data.view(bsz, -1).t().contiguous()
19 | print(data.size())
20 | if args.cuda:
21 | data = data.cuda()
22 | return data
23 |
24 | def get_batch(source, i, args, seq_len=None):
25 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
26 | data = Variable(source[i:i+seq_len])
27 | # target = Variable(source[i+1:i+1+seq_len].view(-1))
28 | target = Variable(source[i+1:i+1+seq_len])
29 | return data, target
30 |
31 | def create_exp_dir(path, scripts_to_save=None):
32 | if not os.path.exists(path):
33 | os.mkdir(path)
34 |
35 | print('Experiment dir : {}'.format(path))
36 | if scripts_to_save is not None:
37 | os.mkdir(os.path.join(path, 'scripts'))
38 | for script in scripts_to_save:
39 | dst_file = os.path.join(path, 'scripts', os.path.basename(script))
40 | shutil.copyfile(script, dst_file)
41 |
42 | def save_checkpoint(model, optimizer, path, finetune=False):
43 | if finetune:
44 | torch.save(model, os.path.join(path, 'finetune_model.pt'))
45 | torch.save(optimizer.state_dict(), os.path.join(path, 'finetune_optimizer.pt'))
46 | else:
47 | torch.save(model, os.path.join(path, 'model.pt'))
48 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer.pt'))
49 |
--------------------------------------------------------------------------------
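A small worked example of the `batchify`/`get_batch` logic above, assuming `lm/code` is the working directory so that `utils` resolves to the file just shown; `SimpleNamespace` merely stands in for the argparse namespace. Twelve token ids with `bsz=3` become a `[4, 3]` matrix whose columns are contiguous sub-streams, and `get_batch` returns an input slice plus a target slice shifted by one step.

```python
import torch
from types import SimpleNamespace
from utils import batchify, get_batch   # lm/code/utils.py

args = SimpleNamespace(cuda=False, bptt=2)   # stand-in for the argparse namespace
stream = torch.arange(12)                    # a toy stream of token ids 0..11
data = batchify(stream, bsz=3, args=args)    # shape [4, 3]; column 0 holds 0,1,2,3
x, y = get_batch(data, i=0, args=args)       # x = rows 0..1, y = rows 1..2 (shifted targets)
print(data)
print(x)
print(y)
```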
/doc/code/bert/vocab.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | class Vocab(object):
9 | def __init__(self, vocab_file=None):
10 | self.word2id = {}
11 | self.id2word = {}
12 | self.word2count = {}
13 |
14 | self.pad_sym = "[PAD]"
15 | self.cls_sym = "[CLS]"
16 | self.sep_sym = "[SEP]"
17 | self.unk_sym = "[UNK]"
18 |
19 | if vocab_file is not None:
20 | self.load_vocab(vocab_file)
21 |
22 | def insert(self, token):
23 | if token not in self.word2id:
24 | index = len(self.word2id)
25 | self.word2id[token] = index
26 | self.id2word[index] = token
27 |
28 | self.word2count[token] = 0
29 | self.word2count[token] += 1
30 |
31 | @property
32 | def size(self):
33 | return len(self.word2id)
34 |
35 | def load_vocab(self, vocab_file):
36 | with open(vocab_file, 'r') as reader:
37 | for token in reader:
38 | self.insert(token.strip())
39 |
40 | def get_token(self, id):
41 | if id in self.id2word:
42 | return self.id2word[id]
43 | return self.unk_sym
44 |
45 | def get_id(self, token):
46 | if token in self.word2id:
47 | return self.word2id[token]
48 | return self.word2id[self.unk_sym]
49 |
50 | def save_vocab(self, vocab_file):
51 | with open(vocab_file, 'w') as writer:
52 | for id in range(self.size):
53 | writer.write(self.id2word[id] + "\n")
54 |
55 | def to_id(self, tokens):
56 | return [self.get_id(token) for token in tokens]
57 |
58 | def to_tokens(self, ids):
59 | return [self.get_token(id) for id in ids]
60 |
61 | @property
62 | def pad(self):
63 | return self.get_id(self.pad_sym)
64 |
65 | @property
66 | def cls(self):
67 | return self.get_id(self.cls_sym)
68 |
69 | @property
70 | def sep(self):
71 | return self.get_id(self.sep_sym)
72 |
--------------------------------------------------------------------------------
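A short usage sketch for the `Vocab` class above (assuming `doc/code` is on `PYTHONPATH` so that `bert.vocab` imports): insert the special symbols first so `[PAD]` receives id 0, add a few tokens, and round-trip between tokens and ids; unknown tokens fall back to `[UNK]`.

```python
from bert.vocab import Vocab   # assumption: doc/code is on PYTHONPATH

v = Vocab()
for sym in [v.pad_sym, v.cls_sym, v.sep_sym, v.unk_sym]:
    v.insert(sym)                                # special symbols get the first ids
for tok in ["the", "cat", "sat"]:
    v.insert(tok)

ids = v.to_id(["[CLS]", "the", "dog", "[SEP]"])  # "dog" is OOV -> maps to [UNK]
print(ids)
print(v.to_tokens(ids))
print(v.size, v.pad, v.cls, v.sep)               # vocabulary size and special-symbol ids
```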
/nli/code/bert/vocab.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | class Vocab(object):
9 | def __init__(self, vocab_file=None):
10 | self.word2id = {}
11 | self.id2word = {}
12 | self.word2count = {}
13 |
14 | self.pad_sym = "[PAD]"
15 | self.cls_sym = "[CLS]"
16 | self.sep_sym = "[SEP]"
17 | self.unk_sym = "[UNK]"
18 |
19 | if vocab_file is not None:
20 | self.load_vocab(vocab_file)
21 |
22 | def insert(self, token):
23 | if token not in self.word2id:
24 | index = len(self.word2id)
25 | self.word2id[token] = index
26 | self.id2word[index] = token
27 |
28 | self.word2count[token] = 0
29 | self.word2count[token] += 1
30 |
31 | @property
32 | def size(self):
33 | return len(self.word2id)
34 |
35 | def load_vocab(self, vocab_file):
36 | with open(vocab_file, 'r') as reader:
37 | for token in reader:
38 | self.insert(token.strip())
39 |
40 | def get_token(self, id):
41 | if id in self.id2word:
42 | return self.id2word[id]
43 | return self.unk_sym
44 |
45 | def get_id(self, token):
46 | if token in self.word2id:
47 | return self.word2id[token]
48 | return self.word2id[self.unk_sym]
49 |
50 | def save_vocab(self, vocab_file):
51 | with open(vocab_file, 'w') as writer:
52 | for id in range(self.size):
53 | writer.write(self.id2word[id] + "\n")
54 |
55 | def to_id(self, tokens):
56 | return [self.get_id(token) for token in tokens]
57 |
58 | def to_tokens(self, ids):
59 | return [self.get_token(id) for id in ids]
60 |
61 | @property
62 | def pad(self):
63 | return self.get_id(self.pad_sym)
64 |
65 | @property
66 | def cls(self):
67 | return self.get_id(self.cls_sym)
68 |
69 | @property
70 | def sep(self):
71 | return self.get_id(self.sep_sym)
72 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/rnn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from rnns import get_cell
10 |
11 |
12 | def rnn(cell_name, x, d, mask=None, ln=False, init_state=None, sm=True):
13 | """Self implemented RNN procedure, supporting mask trick"""
14 | # cell_name: gru, lstm or atr
15 | # x: input sequence embedding matrix, [batch, seq_len, dim]
16 | # d: hidden dimension for rnn
17 | # mask: mask matrix, [batch, seq_len]
18 | # ln: whether use layer normalization
19 | # init_state: the initial hidden states, for cache purpose
20 | # sm: whether apply swap memory during rnn scan
21 | # dp: variational dropout
22 |
23 | in_shape = tf.shape(x)
24 | batch_size, time_steps = in_shape[0], in_shape[1]
25 |
26 | cell = get_cell(cell_name, d, ln=ln)
27 |
28 | if init_state is None:
29 | init_state = cell.get_init_state(shape=[batch_size])
30 | if mask is None:
31 | mask = tf.ones([batch_size, time_steps], tf.float32)
32 |
33 | # prepare projected input
34 | cache_inputs = cell.fetch_states(x)
35 | cache_inputs = [tf.transpose(v, [1, 0, 2])
36 | for v in list(cache_inputs)]
37 | mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
38 |
39 | def _step_fn(prev, x):
40 | t, h_ = prev
41 | m = x[-1]
42 | v = x[:-1]
43 |
44 | h = cell(h_, v)
45 | h = m * h + (1. - m) * h_
46 |
47 | return t + 1, h
48 |
49 | time = tf.constant(0, dtype=tf.int32, name="time")
50 | step_states = (time, init_state)
51 | step_vars = cache_inputs + [mask_ta]
52 |
53 | outputs = tf.scan(_step_fn,
54 | step_vars,
55 | initializer=step_states,
56 | parallel_iterations=32,
57 | swap_memory=sm)
58 |
59 | output_ta = outputs[1]
60 | output_state = outputs[1][-1]
61 |
62 | outputs = tf.transpose(output_ta, [1, 0, 2])
63 |
64 | return (outputs, output_state), \
65 | (cell.get_hidden(outputs), cell.get_hidden(output_state))
66 |
--------------------------------------------------------------------------------
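A hedged usage sketch for the `rnn(...)` helper above, assuming a TensorFlow 1.x runtime and that `rc/rnet/code` is on `PYTHONPATH` so that `rnns` and `func` resolve, and that `get_cell` accepts the name "gru". The mask zeroes out the final position of the second sequence; the second returned pair gives the per-step hidden states and the last hidden state.

```python
import numpy as np
import tensorflow as tf
from rnns.rnn import rnn   # assumption: rc/rnet/code is on PYTHONPATH

dim, hidden = 8, 16
x = tf.placeholder(tf.float32, [None, None, dim])
mask = tf.placeholder(tf.float32, [None, None])

# first pair: raw cell states; second pair: hidden views (identical for gru)
(states, last_state), (hiddens, last_hidden) = rnn("gru", x, hidden, mask=mask)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        x: np.random.randn(2, 5, dim).astype(np.float32),
        mask: np.array([[1, 1, 1, 1, 1],
                        [1, 1, 1, 1, 0]], np.float32),  # second sequence is one step shorter
    }
    out, last = sess.run([hiddens, last_hidden], feed)
    print(out.shape, last.shape)   # (2, 5, 16) and (2, 16)
```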
/rc/elmo_rnet/code/rnns/rnn.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from rnns import get_cell
10 |
11 |
12 | def rnn(cell_name, x, d, mask=None, ln=False, init_state=None, sm=True):
13 | """Self implemented RNN procedure, supporting mask trick"""
14 | # cell_name: gru, lstm or atr
15 | # x: input sequence embedding matrix, [batch, seq_len, dim]
16 | # d: hidden dimension for rnn
17 | # mask: mask matrix, [batch, seq_len]
18 | # ln: whether use layer normalization
19 | # init_state: the initial hidden states, for cache purpose
20 | # sm: whether apply swap memory during rnn scan
21 |     # dp: variational dropout (not used in this implementation)
22 |
23 | in_shape = tf.shape(x)
24 | batch_size, time_steps = in_shape[0], in_shape[1]
25 |
26 | cell = get_cell(cell_name, d, ln=ln)
27 |
28 | if init_state is None:
29 | init_state = cell.get_init_state(shape=[batch_size])
30 | if mask is None:
31 | mask = tf.ones([batch_size, time_steps], tf.float32)
32 |
33 | # prepare projected input
34 | cache_inputs = cell.fetch_states(x)
35 | cache_inputs = [tf.transpose(v, [1, 0, 2])
36 | for v in list(cache_inputs)]
37 | mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
38 |
39 | def _step_fn(prev, x):
40 | t, h_ = prev
41 | m = x[-1]
42 | v = x[:-1]
43 |
44 | h = cell(h_, v)
45 | h = m * h + (1. - m) * h_
46 |
47 | return t + 1, h
48 |
49 | time = tf.constant(0, dtype=tf.int32, name="time")
50 | step_states = (time, init_state)
51 | step_vars = cache_inputs + [mask_ta]
52 |
53 | outputs = tf.scan(_step_fn,
54 | step_vars,
55 | initializer=step_states,
56 | parallel_iterations=32,
57 | swap_memory=sm)
58 |
59 | output_ta = outputs[1]
60 | output_state = outputs[1][-1]
61 |
62 | outputs = tf.transpose(output_ta, [1, 0, 2])
63 |
64 | return (outputs, output_state), \
65 | (cell.get_hidden(outputs), cell.get_hidden(output_state))
66 |
--------------------------------------------------------------------------------
/ner/code/trainer.py:
--------------------------------------------------------------------------------
1 | """Training-related module.
2 | """
3 | from callbacks import F1score
4 | from utils import NERSequence
5 |
6 |
7 | class Trainer(object):
8 | """A trainer that train the model.
9 |
10 | Attributes:
11 | _model: Model.
12 | _preprocessor: Transformer. Preprocessing data for feature extraction.
13 | """
14 |
15 | def __init__(self, model, preprocessor=None):
16 | self._model = model
17 | self._preprocessor = preprocessor
18 |
19 | def train(self, x_train, y_train, x_valid=None, y_valid=None,
20 | epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
21 | """Trains the model for a fixed number of epochs (iterations on a dataset).
22 |
23 | Args:
24 | x_train: list of training data.
25 | y_train: list of training target (label) data.
26 | x_valid: list of validation data.
27 | y_valid: list of validation target (label) data.
28 | batch_size: Integer.
29 | Number of samples per gradient update.
30 | If unspecified, `batch_size` will default to 32.
31 | epochs: Integer. Number of epochs to train the model.
32 | verbose: Integer. 0, 1, or 2. Verbosity mode.
33 | 0 = silent, 1 = progress bar, 2 = one line per epoch.
34 | callbacks: List of `keras.callbacks.Callback` instances.
35 | List of callbacks to apply during training.
36 | shuffle: Boolean (whether to shuffle the training data
37 | before each epoch). `shuffle` will default to True.
38 | """
39 |
40 | train_seq = NERSequence(x_train, y_train, batch_size, self._preprocessor.transform)
41 |
42 | if x_valid and y_valid:
43 | valid_seq = NERSequence(x_valid, y_valid, batch_size, self._preprocessor.transform)
44 | f1 = F1score(valid_seq, preprocessor=self._preprocessor)
45 | callbacks = [f1] + callbacks if callbacks else [f1]
46 |
47 | self._model.fit_generator(generator=train_seq,
48 | epochs=epochs,
49 | callbacks=callbacks,
50 | verbose=verbose,
51 | shuffle=shuffle)
52 |
--------------------------------------------------------------------------------
/doc/code/rnns/sru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class sru(cell.Cell):
14 | """The Simple Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='sru'):
17 | super(sru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "sru")):
29 | h = linear(x, self.d * 4,
30 | bias=True, ln=self.ln, scope="hide_x")
31 | return (h, )
32 |
33 | def __call__(self, h_, x):
34 | # h_: the concatenation of previous hidden state
35 | # and memory cell state
36 | # x_r/x: the current input state for r gate
37 | # x_f/x: the current input state for f gate
38 | # x_c/x: the current input state for candidate cell
39 | # x_h/x: the current input state for hidden output
40 |         # the input is projected to 4 * d because we do not assume that
41 |         # the input dimension equals the output dimension
42 | """
43 | f = sigmoid(Wx, vf * c_)
44 | c = f * c_ + (1 - f) * Wx
45 | r = sigmoid(Wx, vr * c_)
46 | h = r * c + (1 - r) * Ux
47 | """
48 | if isinstance(x, (list, tuple)):
49 | x = x[0]
50 |
51 | with tf.variable_scope(
52 | "cell_{}".format(self.scope or "sru")):
53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1)
54 | h_, c_ = tf.split(h_, 2, -1)
55 |
56 | v_f = tf.get_variable("v_f", [1, self.d])
57 | v_r = tf.get_variable("v_r", [1, self.d])
58 |
59 | f = tf.sigmoid(x_f + v_f * c_)
60 | c = f * c_ + (1. - f) * x_c
61 | r = tf.sigmoid(x_r + v_r * c_)
62 | h = r * c + (1. - r) * x_h
63 |
64 | return tf.concat([h, c], -1)
65 |
--------------------------------------------------------------------------------
/nli/code/rnns/sru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class sru(cell.Cell):
14 | """The Simple Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='sru'):
17 | super(sru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "sru")):
29 | h = linear(x, self.d * 4,
30 | bias=True, ln=self.ln, scope="hide_x")
31 | return (h, )
32 |
33 | def __call__(self, h_, x):
34 | # h_: the concatenation of previous hidden state
35 | # and memory cell state
36 | # x_r/x: the current input state for r gate
37 | # x_f/x: the current input state for f gate
38 | # x_c/x: the current input state for candidate cell
39 | # x_h/x: the current input state for hidden output
40 |         # the input is projected to 4 * d because we do not assume that
41 |         # the input dimension equals the output dimension
42 | """
43 | f = sigmoid(Wx, vf * c_)
44 | c = f * c_ + (1 - f) * Wx
45 | r = sigmoid(Wx, vr * c_)
46 | h = r * c + (1 - r) * Ux
47 | """
48 | if isinstance(x, (list, tuple)):
49 | x = x[0]
50 |
51 | with tf.variable_scope(
52 | "cell_{}".format(self.scope or "sru")):
53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1)
54 | h_, c_ = tf.split(h_, 2, -1)
55 |
56 | v_f = tf.get_variable("v_f", [1, self.d])
57 | v_r = tf.get_variable("v_r", [1, self.d])
58 |
59 | f = tf.sigmoid(x_f + v_f * c_)
60 | c = f * c_ + (1. - f) * x_c
61 | r = tf.sigmoid(x_r + v_r * c_)
62 | h = r * c + (1. - r) * x_h
63 |
64 | return tf.concat([h, c], -1)
65 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/sru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class sru(cell.Cell):
14 | """The Simple Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='sru'):
17 | super(sru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "sru")):
29 | h = linear(x, self.d * 4,
30 | bias=True, ln=self.ln, scope="hide_x")
31 | return (h, )
32 |
33 | def __call__(self, h_, x):
34 | # h_: the concatenation of previous hidden state
35 | # and memory cell state
36 | # x_r/x: the current input state for r gate
37 | # x_f/x: the current input state for f gate
38 | # x_c/x: the current input state for candidate cell
39 | # x_h/x: the current input state for hidden output
40 |         # the input is projected to 4 * d because we do not assume that
41 |         # the input dimension equals the output dimension
42 | """
43 | f = sigmoid(Wx, vf * c_)
44 | c = f * c_ + (1 - f) * Wx
45 | r = sigmoid(Wx, vr * c_)
46 | h = r * c + (1 - r) * Ux
47 | """
48 | if isinstance(x, (list, tuple)):
49 | x = x[0]
50 |
51 | with tf.variable_scope(
52 | "cell_{}".format(self.scope or "sru")):
53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1)
54 | h_, c_ = tf.split(h_, 2, -1)
55 |
56 | v_f = tf.get_variable("v_f", [1, self.d])
57 | v_r = tf.get_variable("v_r", [1, self.d])
58 |
59 | f = tf.sigmoid(x_f + v_f * c_)
60 | c = f * c_ + (1. - f) * x_c
61 | r = tf.sigmoid(x_r + v_r * c_)
62 | h = r * c + (1. - r) * x_h
63 |
64 | return tf.concat([h, c], -1)
65 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/sru.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class sru(cell.Cell):
14 | """The Simple Recurrent Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='sru'):
17 | super(sru, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "sru")):
29 | h = linear(x, self.d * 4,
30 | bias=True, ln=self.ln, scope="hide_x")
31 | return (h, )
32 |
33 | def __call__(self, h_, x):
34 | # h_: the concatenation of previous hidden state
35 | # and memory cell state
36 | # x_r/x: the current input state for r gate
37 | # x_f/x: the current input state for f gate
38 | # x_c/x: the current input state for candidate cell
39 | # x_h/x: the current input state for hidden output
40 |         # the input is projected to 4 * d because we do not assume that
41 |         # the input dimension equals the output dimension
42 | """
43 | f = sigmoid(Wx, vf * c_)
44 | c = f * c_ + (1 - f) * Wx
45 | r = sigmoid(Wx, vr * c_)
46 | h = r * c + (1 - r) * Ux
47 | """
48 | if isinstance(x, (list, tuple)):
49 | x = x[0]
50 |
51 | with tf.variable_scope(
52 | "cell_{}".format(self.scope or "sru")):
53 | x_r, x_f, x_c, x_h = tf.split(x, 4, -1)
54 | h_, c_ = tf.split(h_, 2, -1)
55 |
56 | v_f = tf.get_variable("v_f", [1, self.d])
57 | v_r = tf.get_variable("v_r", [1, self.d])
58 |
59 | f = tf.sigmoid(x_f + v_f * c_)
60 | c = f * c_ + (1. - f) * x_c
61 | r = tf.sigmoid(x_r + v_r * c_)
62 | h = r * c + (1. - r) * x_h
63 |
64 | return tf.concat([h, c], -1)
65 |
--------------------------------------------------------------------------------
/ner/README.md:
--------------------------------------------------------------------------------
1 | ## Named Entity Recognition
2 |
3 |
4 | We employ the BiRNN + CRF architecture of [Lample et al. 2016](https://www.aclweb.org/anthology/N16-1030), and
5 | experiment on the CoNLL-2003 English NER data.
6 | Main experimental results are summarized below.
7 |
8 |
9 |
10 | | Model              | #Params | NER   |
11 | |--------------------|---------|-------|
12 | | Lample et al. 2016 | -       | 90.94 |
13 | | LSTM               | 245K    | 89.61 |
14 | | GRU                | 192K    | 89.35 |
15 | | ATR                | 87K     | 88.46 |
16 | | SRU                | 161K    | 88.89 |
17 | | LRN                | 129K    | 88.56 |
18 |
19 | F1-score.
47 |
48 | ## Requirement
49 | see [requirements.txt](code/requirements.txt) for full list.
50 |
51 | ## How to Run?
52 |
53 | - download and preprocess the dataset
54 |
55 |     - download the CoNLL-2003 dataset from [anago](https://github.com/Hironsan/anago/tree/master/data) (in its data folder)
56 |     - download the GloVe-6B-100d pre-trained word embeddings from http://nlp.stanford.edu/data/glove.6B.zip
57 |
58 | - no hyperparameters are tuned; we keep all of them at their default values.
59 |
60 | - training and evaluation
61 |
62 | the running procedure is as follows:
63 | ```
64 | export CUDA_ROOT=XXX
65 | export PATH=$CUDA_ROOT/bin:$PATH
66 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
67 |
68 | export CUDA_VISIBLE_DEVICES=0
69 |
70 | export data_dir=path-of/conll2003/en/ner
71 | export glove_dir=path-of/glove.6B/glove.6B.100d.txt
72 |
73 | RUN_EXP=5
74 | rnn=lrn
75 |
76 | for i in $(seq 1 $RUN_EXP); do
77 | exp_dir=exp$i
78 | mkdir $exp_dir
79 | cd $exp_dir
80 |
81 | export cell_type=$rnn
82 | python3 ner_glove.py --cell lrn >& log.lrn
83 |
84 | cd ../
85 | done
86 |
87 | python scripts/get_test_score.py $rnn exp* >& score.$rnn
88 | ```
89 | Results are reported over 5 runs.
90 |
91 | ## Credits
92 |
93 | Source code structure is adapted from [anago](https://github.com/Hironsan/anago/tree/master/).
--------------------------------------------------------------------------------
/doc/code/rnns/lstm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lstm(cell.Cell):
14 | """The Long-Short Term Memory Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='lstm'):
17 | super(lstm, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "lstm")):
29 | g = linear(x, self.d * 3,
30 | bias=True, ln=self.ln, scope="gate_x")
31 | c = linear(x, self.d,
32 | bias=True, ln=self.ln, scope="hide_x")
33 | return g, c
34 |
35 | def __call__(self, h_, x):
36 | # h_: the concatenation of previous hidden state
37 | # and memory cell state
38 | # x_i/x: the current input state for input gate
39 | # x_f/x: the current input state for forget gate
40 | # x_o/x: the current input state for output gate
41 | # x_c/x: the current input state for candidate cell
42 | """
43 | f = sigmoid(h_, x)
44 | i = sigmoid(h_, x)
45 | o = sigmoid(h_, x)
46 | c' = tanh(h_, x)
47 | c = f * c_ + i * c'
48 | h = o * tanh(c)
49 | """
50 | with tf.variable_scope(
51 | "cell_{}".format(self.scope or "lstm")):
52 | x_g, x_c = x
53 | h_, c_ = tf.split(h_, 2, -1)
54 |
55 | h_g = linear(h_, self.d * 3,
56 | ln=self.ln, scope="gate_h")
57 | i, f, o = tf.split(
58 | tf.sigmoid(x_g + h_g), 3, -1)
59 |
60 | h_c = linear(h_, self.d,
61 | ln=self.ln, scope="hide_h")
62 | h_c = tf.tanh(x_c + h_c)
63 |
64 | c = i * h_c + f * c_
65 |
66 | h = o * tf.tanh(c)
67 |
68 | return tf.concat([h, c], -1)
69 |
--------------------------------------------------------------------------------
/nli/code/rnns/lstm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lstm(cell.Cell):
14 | """The Long-Short Term Memory Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='lstm'):
17 | super(lstm, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "lstm")):
29 | g = linear(x, self.d * 3,
30 | bias=True, ln=self.ln, scope="gate_x")
31 | c = linear(x, self.d,
32 | bias=True, ln=self.ln, scope="hide_x")
33 | return g, c
34 |
35 | def __call__(self, h_, x):
36 | # h_: the concatenation of previous hidden state
37 | # and memory cell state
38 | # x_i/x: the current input state for input gate
39 | # x_f/x: the current input state for forget gate
40 | # x_o/x: the current input state for output gate
41 | # x_c/x: the current input state for candidate cell
42 | """
43 | f = sigmoid(h_, x)
44 | i = sigmoid(h_, x)
45 | o = sigmoid(h_, x)
46 | c' = tanh(h_, x)
47 | c = f * c_ + i * c'
48 | h = o * tanh(c)
49 | """
50 | with tf.variable_scope(
51 | "cell_{}".format(self.scope or "lstm")):
52 | x_g, x_c = x
53 | h_, c_ = tf.split(h_, 2, -1)
54 |
55 | h_g = linear(h_, self.d * 3,
56 | ln=self.ln, scope="gate_h")
57 | i, f, o = tf.split(
58 | tf.sigmoid(x_g + h_g), 3, -1)
59 |
60 | h_c = linear(h_, self.d,
61 | ln=self.ln, scope="hide_h")
62 | h_c = tf.tanh(x_c + h_c)
63 |
64 | c = i * h_c + f * c_
65 |
66 | h = o * tf.tanh(c)
67 |
68 | return tf.concat([h, c], -1)
69 |
--------------------------------------------------------------------------------
/rc/rnet/code/rnns/lstm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lstm(cell.Cell):
14 | """The Long-Short Term Memory Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='lstm'):
17 | super(lstm, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "lstm")):
29 | g = linear(x, self.d * 3,
30 | bias=True, ln=self.ln, scope="gate_x")
31 | c = linear(x, self.d,
32 | bias=True, ln=self.ln, scope="hide_x")
33 | return g, c
34 |
35 | def __call__(self, h_, x):
36 | # h_: the concatenation of previous hidden state
37 | # and memory cell state
38 | # x_i/x: the current input state for input gate
39 | # x_f/x: the current input state for forget gate
40 | # x_o/x: the current input state for output gate
41 | # x_c/x: the current input state for candidate cell
42 | """
43 | f = sigmoid(h_, x)
44 | i = sigmoid(h_, x)
45 | o = sigmoid(h_, x)
46 | c' = tanh(h_, x)
47 | c = f * c_ + i * c'
48 | h = o * tanh(c)
49 | """
50 | with tf.variable_scope(
51 | "cell_{}".format(self.scope or "lstm")):
52 | x_g, x_c = x
53 | h_, c_ = tf.split(h_, 2, -1)
54 |
55 | h_g = linear(h_, self.d * 3,
56 | ln=self.ln, scope="gate_h")
57 | i, f, o = tf.split(
58 | tf.sigmoid(x_g + h_g), 3, -1)
59 |
60 | h_c = linear(h_, self.d,
61 | ln=self.ln, scope="hide_h")
62 | h_c = tf.tanh(x_c + h_c)
63 |
64 | c = i * h_c + f * c_
65 |
66 | h = o * tf.tanh(c)
67 |
68 | return tf.concat([h, c], -1)
69 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/rnns/lstm.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 | from func import linear
10 | from rnns import cell as cell
11 |
12 |
13 | class lstm(cell.Cell):
14 | """The Long-Short Term Memory Unit."""
15 |
16 | def __init__(self, d, ln=False, scope='lstm'):
17 | super(lstm, self).__init__(d, ln=ln, scope=scope)
18 |
19 | def get_init_state(self, shape=None, x=None, scope=None):
20 | return self._get_init_state(
21 | self.d * 2, shape=shape, x=x, scope=scope)
22 |
23 | def get_hidden(self, x):
24 | return tf.split(x, 2, -1)[0]
25 |
26 | def fetch_states(self, x):
27 | with tf.variable_scope(
28 | "fetch_state_{}".format(self.scope or "lstm")):
29 | g = linear(x, self.d * 3,
30 | bias=True, ln=self.ln, scope="gate_x")
31 | c = linear(x, self.d,
32 | bias=True, ln=self.ln, scope="hide_x")
33 | return g, c
34 |
35 | def __call__(self, h_, x):
36 | # h_: the concatenation of previous hidden state
37 | # and memory cell state
38 | # x_i/x: the current input state for input gate
39 | # x_f/x: the current input state for forget gate
40 | # x_o/x: the current input state for output gate
41 | # x_c/x: the current input state for candidate cell
42 | """
43 | f = sigmoid(h_, x)
44 | i = sigmoid(h_, x)
45 | o = sigmoid(h_, x)
46 | c' = tanh(h_, x)
47 | c = f * c_ + i * c'
48 | h = o * tanh(c)
49 | """
50 | with tf.variable_scope(
51 | "cell_{}".format(self.scope or "lstm")):
52 | x_g, x_c = x
53 | h_, c_ = tf.split(h_, 2, -1)
54 |
55 | h_g = linear(h_, self.d * 3,
56 | ln=self.ln, scope="gate_h")
57 | i, f, o = tf.split(
58 | tf.sigmoid(x_g + h_g), 3, -1)
59 |
60 | h_c = linear(h_, self.d,
61 | ln=self.ln, scope="hide_h")
62 | h_c = tf.tanh(x_c + h_c)
63 |
64 | c = i * h_c + f * c_
65 |
66 | h = o * tf.tanh(c)
67 |
68 | return tf.concat([h, c], -1)
69 |
--------------------------------------------------------------------------------
/lm/code/generate.py:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Language Modeling on Penn Tree Bank
3 | #
4 | # This file generates new sentences sampled from the language model
5 | #
6 | ###############################################################################
7 |
8 | import argparse
9 |
10 | import torch
11 | from torch.autograd import Variable
12 |
13 | import data
14 |
15 | parser = argparse.ArgumentParser(description='PyTorch PTB Language Model')
16 |
17 | # Model parameters.
18 | parser.add_argument('--data', type=str, default='./penn',
19 | help='location of the data corpus')
20 | parser.add_argument('--checkpoint', type=str, default='./model.pt',
21 | help='model checkpoint to use')
22 | parser.add_argument('--outf', type=str, default='generated.txt',
23 | help='output file for generated text')
24 | parser.add_argument('--words', type=int, default=1000,
25 | help='number of words to generate')
26 | parser.add_argument('--seed', type=int, default=1111,
27 | help='random seed')
28 | parser.add_argument('--cuda', action='store_true',
29 | help='use CUDA')
30 | parser.add_argument('--temperature', type=float, default=1.0,
31 | help='temperature - higher will increase diversity')
32 | parser.add_argument('--log-interval', type=int, default=100,
33 | help='reporting interval')
34 | args = parser.parse_args()
35 |
36 | # Set the random seed manually for reproducibility.
37 | torch.manual_seed(args.seed)
38 | if torch.cuda.is_available():
39 | if not args.cuda:
40 | print("WARNING: You have a CUDA device, so you should probably run with --cuda")
41 | else:
42 | torch.cuda.manual_seed(args.seed)
43 |
44 | if args.temperature < 1e-3:
45 |     parser.error("--temperature has to be greater than or equal to 1e-3")
46 |
47 | with open(args.checkpoint, 'rb') as f:
48 | model = torch.load(f)
49 | model.eval()
50 |
51 | if args.cuda:
52 | model.cuda()
53 | else:
54 | model.cpu()
55 |
56 | corpus = data.Corpus(args.data)
57 | ntokens = len(corpus.dictionary)
58 | hidden = model.init_hidden(1)
59 | input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
60 | if args.cuda:
61 | input.data = input.data.cuda()
62 |
63 | with open(args.outf, 'w') as outf:
64 | for i in range(args.words):
65 | output, hidden = model(input, hidden, return_prob=True)
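        # Note (added): the sampling weights below are exp(output / temperature);
        # a higher temperature flattens the distribution (more diverse samples),
        # a lower one sharpens it. return_prob=True asks the model for the
        # (log-)probabilities that this transformation assumes.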
66 | word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
67 | word_idx = torch.multinomial(word_weights, 1)[0]
68 | input.data.fill_(word_idx)
69 | word = corpus.dictionary.idx2word[word_idx]
70 |
71 | outf.write(word + ('\n' if i % 20 == 19 else ' '))
72 |
73 | if i % args.log_interval == 0:
74 | print('| Generated {}/{} words'.format(i, args.words))
75 |
--------------------------------------------------------------------------------
/mt/README.md:
--------------------------------------------------------------------------------
1 | ## Machine Translation
2 |
3 |
4 | Main source code will be available at [zero](https://github.com/bzhangGo/zero) (might require some time, 31/05/2019).
5 | The used NMT structure is in `deepnmt.py`.
6 |
7 |
8 | Main experimental results are summarized below.
9 |
10 |
11 |
12 | | Model | #Params | BLEU  | Train | Decode |
13 | |-------|---------|-------|-------|--------|
14 | | GNMT  | -       | 24.61 | -     | -      |
15 | | GRU   | 206M    | 26.28 | 2.67  | 45.35  |
16 | | ATR   | 122M    | 25.70 | 1.33  | 34.40  |
17 | | SRU   | 170M    | 25.91 | 1.34  | 42.84  |
18 | | LRN   | 143M    | 26.26 | 0.99  | 36.50  |
19 | | oLRN  | 164M    | 26.73 | 1.15  | 40.19  |
59 |
60 |
61 |
62 | *Train*: time in seconds per training batch measured from 0.2k training steps.
63 | *Decode*: time in milliseconds used to decode one sentence measured on newstest2014 dataset.
64 | *BLEU*: case-insensitive tokenized BLEU score on WMT14 English-German translation task.
65 |
66 | ## oLRN structure
67 |
68 |
69 |
70 | Unlike LRN, oLRN employs an additional output gate, inspired by LSTM, to handle output information flow.
71 | This additional gate also helps avoid hidden-state explosion when a linear activation is applied.
72 |
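Below is a minimal sketch of how such an output gate can wrap a recurrent candidate state. It is an illustration only, not the code used in the experiments: the actual LRN/oLRN parameterization lives in `deepnmt.py` of the [zero](https://github.com/bzhangGo/zero) project, and the helper `olrn_output`, its arguments, and the choice of conditioning the gate on both the input and the candidate are assumptions made here.

```python
import tensorflow as tf

def olrn_output(x, candidate, d, scope="olrn_out"):
    """Sketch: apply an LSTM-style output gate to a recurrent candidate state.

    `candidate` stands in for the LRN update computed from input `x` and the
    previous hidden state; `d` is the hidden size.
    """
    with tf.variable_scope(scope):
        # output gate computed from the current input and the candidate state
        o = tf.sigmoid(
            tf.layers.dense(tf.concat([x, candidate], -1), d, name="gate_o"))
        # gating the squashed candidate keeps the exposed hidden state bounded
        # even when the candidate itself uses a linear activation
        return o * tf.tanh(candidate)
```
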
73 | ## How to Run?
74 |
75 | Training and evaluation, please refer to project [zero](https://github.com/bzhangGo/zero).
--------------------------------------------------------------------------------
/rc/rnet/code/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
/lm/code/weight_drop.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import Parameter
3 | from functools import wraps
4 |
5 | class WeightDrop(torch.nn.Module):
6 | def __init__(self, module, weights, dropout=0, variational=False):
7 | super(WeightDrop, self).__init__()
8 | self.module = module
9 | self.weights = weights
10 | self.dropout = dropout
11 | self.variational = variational
12 | self._setup()
13 |
14 | def widget_demagnetizer_y2k_edition(*args, **kwargs):
15 | # We need to replace flatten_parameters with a nothing function
16 | # It must be a function rather than a lambda as otherwise pickling explodes
17 | # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION!
18 | # (╯°□°)╯︵ ┻━┻
19 | return
20 |
21 | def _setup(self):
22 | # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
23 | if issubclass(type(self.module), torch.nn.RNNBase):
24 | self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition
25 |
26 | for name_w in self.weights:
27 | if not hasattr(self.module, name_w):
28 | continue
29 | print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
30 | w = getattr(self.module, name_w)
31 | del self.module._parameters[name_w]
32 | self.module.register_parameter(name_w + '_raw', Parameter(w.data))
33 |
34 | def _setweights(self):
35 | for name_w in self.weights:
36 | if not hasattr(self.module, name_w):
37 | continue
38 |
39 | raw_w = getattr(self.module, name_w + '_raw')
40 | w = None
41 | if self.variational:
42 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1))
43 | if raw_w.is_cuda: mask = mask.cuda()
44 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True)
45 | w = mask.expand_as(raw_w) * raw_w
46 | else:
47 | w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
48 | setattr(self.module, name_w, w)
49 |
50 | def forward(self, *args):
51 | self._setweights()
52 | return self.module.forward(*args)
53 |
54 | if __name__ == '__main__':
55 | import torch
56 | from weight_drop import WeightDrop
57 |
58 | # Input is (seq, batch, input)
59 | x = torch.autograd.Variable(torch.randn(2, 1, 10)).cuda()
60 | h0 = None
61 |
62 | ###
63 |
64 | print('Testing WeightDrop')
65 | print('=-=-=-=-=-=-=-=-=-=')
66 |
67 | ###
68 |
69 | print('Testing WeightDrop with Linear')
70 |
71 | lin = WeightDrop(torch.nn.Linear(10, 10), ['weight'], dropout=0.9)
72 | lin.cuda()
73 | run1 = [x.sum() for x in lin(x).data]
74 | run2 = [x.sum() for x in lin(x).data]
75 |
76 | print('All items should be different')
77 | print('Run 1:', run1)
78 | print('Run 2:', run2)
79 |
80 | assert run1[0] != run2[0]
81 | assert run1[1] != run2[1]
82 |
83 | print('---')
84 |
85 | ###
86 |
87 | print('Testing WeightDrop with LSTM')
88 |
89 | wdrnn = WeightDrop(torch.nn.LSTM(10, 10), ['weight_hh_l0'], dropout=0.9)
90 | wdrnn.cuda()
91 |
92 | run1 = [x.sum() for x in wdrnn(x, h0)[0].data]
93 | run2 = [x.sum() for x in wdrnn(x, h0)[0].data]
94 |
95 | print('First timesteps should be equal, all others should differ')
96 | print('Run 1:', run1)
97 | print('Run 2:', run2)
98 |
99 | # First time step, not influenced by hidden to hidden weights, should be equal
100 | assert run1[0] == run2[0]
101 | # Second step should not
102 | assert run1[1] != run2[1]
103 |
104 | print('---')
105 |
--------------------------------------------------------------------------------
/doc/config.py:
--------------------------------------------------------------------------------
1 | dict(
2 | # lrate decay
3 | # select strategy: noam, gnmt+, epoch, score and vanilla
4 | lrate_strategy="epoch",
5 | # learning decay rate
6 | lrate_decay=0.5,
7 | # weight decay for L2 loss
8 | weight_decay=3e-5,
9 |
10 | # early stopping
11 | estop_patience=100,
12 |
13 | # initialization
14 | # type of initializer
15 | initializer="uniform",
16 | # initializer range control
17 | initializer_gain=0.08,
18 |
19 | # parameters for rnnsearch
20 | # encoder and decoder hidden size
21 | hidden_size=64,
22 | # source and target embedding size
23 | embed_size=300,
24 | # character embedding size
25 | char_embed_size=32,
26 | # dropout value
27 | dropout=0.1,
28 | # word random dropout
29 | word_dropout=0.1,
30 | # label smoothing value
31 | label_smooth=0.1,
32 | # gru, lstm, sru or atr
33 | cell="atr",
34 |     # whether to use layer normalization (it will be slow)
35 |     layer_norm=False,
36 |     # note: when the swap-memory switch is enabled you can train a
37 |     # reasonably larger batch, at the cost of your system using
38 |     # much more cpu memory
39 | swap_memory=True,
40 |
41 | # whether use character embedding
42 | use_char=True,
43 | # whether lowercase word
44 | lower=False,
45 |
46 | # task name
47 | task="amafull",
48 |
49 | model_name="InferNet",
50 |
51 | # constant batch size at 'batch' mode for batch-based batching
52 | batch_size=64,
53 | token_size=2000,
54 | batch_or_token='batch',
55 | # batch size for decoding, i.e. number of source sentences decoded at the same time
56 | eval_batch_size=64,
57 | # whether shuffle batches during training
58 | shuffle_batch=True,
59 |     # whether to use multiprocessing for data reading, default true
60 | data_multiprocessing=True,
61 |
62 | # word vocabulary
63 | word_vocab_file="",
64 | # char vocabulary
65 | char_vocab_file="",
66 | # pretrained word embedding
67 | pretrain_word_embedding_file="path-of/glove.840B.300d.txt",
68 | # dataset path file
69 | data_path="path-of/data",
70 | # output directory
71 | output_dir="train",
72 | # output during testing
73 | test_output="",
74 |
75 | # adam optimizer hyperparameters
76 | beta1=0.9,
77 | beta2=0.999,
78 | epsilon=1e-8,
79 | # gradient clipping value
80 | clip_grad_norm=5.0,
81 | # initial learning rate
82 | lrate=1e-3,
83 |
84 | # allowed maximum sentence length
85 | max_len=400,
86 | # maximum word length
87 | max_w_len=25,
88 | # maximum sentence number
89 | max_p_num=10,
90 |     # whether to enable the hierarchical neural network
91 | enable_hierarchy=False,
92 |
93 | # maximum epochs
94 | epoches=6,
95 | # the effective batch size is: batch/token size * update_cycle
96 | # sequential update cycle
97 | update_cycle=1,
98 | # the number of gpus
99 | gpus=[0],
100 |     # exponential moving average (ema) decay rate
101 | ema_decay=0.9999,
102 |
103 | # print information every disp_freq training steps
104 | disp_freq=10,
105 | # evaluate on the development file every eval_freq steps
106 | eval_freq=10000,
107 | # save the model parameters every save_freq steps
108 | save_freq=5000,
109 | # saved checkpoint number
110 | checkpoints=5,
111 |     # the maximum number of training steps; training stops once epoches or max_training_steps is reached
112 | max_training_steps=1000000000,
113 |
114 | # bert configuration
115 |     # not used in practice, since efficiency is an important issue
116 | bert=None,
117 | bert_dir="path-of/cased_L-12_H-768_A-12/",
118 | tune_bert=False,
119 | enable_bert=False,
120 | use_bert_single=True,
121 |
122 |     # number of threads for threaded reading (seems to have little effect)
123 |     nthreads=3,
124 |     # buffer size controls the number of sentences read at one time
125 |     buffer_size=100000,
126 |     # maximum size of the queue used in multi-thread reading
127 |     max_queue_size=100,
128 |     # random seed control (determinism is not fully guaranteed in tensorflow)
129 | random_seed=1234,
130 | # whether or not train from checkpoint
131 | train_continue=True,
132 | )
133 |
--------------------------------------------------------------------------------
/nli/config.py:
--------------------------------------------------------------------------------
1 | dict(
2 | # lrate decay
3 | # select strategy: noam, gnmt+, epoch, score and vanilla
4 | lrate_strategy="epoch",
5 | # learning decay rate
6 | lrate_decay=0.5,
7 | # weight decay for L2 loss
8 | weight_decay=3e-5,
9 |
10 | # early stopping
11 | estop_patience=100,
12 |
13 | # initialization
14 | # type of initializer
15 | initializer="uniform",
16 | # initializer range control
17 | initializer_gain=0.08,
18 |
19 | # parameters for rnnsearch
20 | # encoder and decoder hidden size
21 | hidden_size=300,
22 | # source and target embedding size
23 | embed_size=300,
24 | # label number
25 | label_size=3,
26 |     # character embedding size
27 | char_embed_size=64,
28 | # dropout value
29 | dropout=0.3,
30 | # label smoothing value
31 | label_smooth=0.1,
32 | # gru, lstm, sru or atr
33 | cell="atr",
34 |     # whether to use layer normalization (it will be slow)
35 |     layer_norm=False,
36 |     # note: when the swap-memory switch is enabled you can train a
37 |     # reasonably larger batch, at the cost of your system using
38 |     # much more cpu memory
39 | swap_memory=True,
40 |
41 | # bert configuration
42 | bert=None,
43 | bert_dir="path-to-bert/cased_L-12_H-768_A-12",
44 | tune_bert=False,
45 | enable_bert=False,
46 | use_bert_single=True,
47 |
48 | # whether use character embedding
49 | use_char=True,
50 | # whether lowercase word
51 | lower=False,
52 | bert_lower=False,
53 |
54 | model_name="nlinet",
55 |
56 | # constant batch size at 'batch' mode for batch-based batching
57 | batch_size=128,
58 | token_size=2000,
59 | batch_or_token='batch',
60 | # batch size for decoding, i.e. number of source sentences decoded at the same time
61 | eval_batch_size=64,
62 | # whether shuffle batches during training
63 | shuffle_batch=True,
64 |     # whether to use multiprocessing for data reading, default true
65 | data_multiprocessing=True,
66 |
67 | # word vocabulary
68 | word_vocab_file="path-of/word_vocab",
69 | # char vocabulary
70 | char_vocab_file="path-of/char_vocab",
71 | # pretrained word embedding
72 | pretrain_word_embedding_file="path-of/word_vocab.npz",
73 | # train file
74 | train_file=["path-of/train.p", "path-of/train.q", "path-of/train.l"],
75 | # dev file
76 | dev_file=["path-of/dev.p", "path-of/dev.q", "path-of/dev.l"],
77 | # test file
78 | test_file=["path-of/test.p", "path-of/test.q", "path-of/test.l"],
79 | # output directory
80 | output_dir="train",
81 | # output during testing
82 | test_output="",
83 |
84 | # adam optimizer hyperparameters
85 | beta1=0.9,
86 | beta2=0.999,
87 | epsilon=1e-8,
88 | # gradient clipping value
89 | clip_grad_norm=5.0,
90 | # initial learning rate
91 | lrate=1e-3,
92 |
93 | # allowed maximum sentence length
94 | max_len=100,
95 | # maximum word length
96 | max_w_len=25,
97 |
98 | # maximum epochs
99 | epoches=10,
100 | # the effective batch size is: batch/token size * update_cycle
101 | # sequential update cycle
102 | update_cycle=1,
103 | # the number of gpus
104 | gpus=[0],
105 |     # exponential moving average (ema) decay rate
106 | ema_decay=0.9999,
107 |
108 | # print information every disp_freq training steps
109 | disp_freq=10,
110 | # evaluate on the development file every eval_freq steps
111 | eval_freq=1000,
112 | # save the model parameters every save_freq steps
113 | save_freq=1000,
114 | # saved checkpoint number
115 | checkpoints=5,
116 |     # the maximum number of training steps; training stops once epoches or max_training_steps is reached
117 | max_training_steps=100000,
118 |
119 |     # number of threads for threaded reading (seems to have little effect)
120 |     nthreads=6,
121 |     # buffer size controls the number of sentences read at one time
122 |     buffer_size=20000,
123 |     # maximum size of the queue used in multi-thread reading
124 |     max_queue_size=100,
125 |     # random seed control (determinism is not fully guaranteed in tensorflow)
126 | random_seed=1234,
127 | # whether or not train from checkpoint
128 | train_continue=True,
129 | )
130 |
--------------------------------------------------------------------------------
/nli/config_bert.py:
--------------------------------------------------------------------------------
1 | dict(
2 | # lrate decay
3 | # select strategy: noam, gnmt+, epoch, score and vanilla
4 | lrate_strategy="vanilla",
5 | # learning decay rate
6 | lrate_decay=0.5,
7 | # weight decay for L2 loss
8 | weight_decay=3e-5,
9 |
10 | # early stopping
11 | estop_patience=100,
12 |
13 | # initialization
14 | # type of initializer
15 | initializer="uniform",
16 | # initializer range control
17 | initializer_gain=0.08,
18 |
19 | # parameters for rnnsearch
20 | # encoder and decoder hidden size
21 | hidden_size=300,
22 | # source and target embedding size
23 | embed_size=300,
24 | # label number
25 | label_size=3,
26 |     # character embedding size
27 | char_embed_size=64,
28 | # dropout value
29 | dropout=0.3,
30 | # label smoothing value
31 | label_smooth=0.1,
32 | # gru, lstm, sru or atr
33 | cell="atr",
34 |     # whether to use layer normalization (it will be slow)
35 |     layer_norm=False,
36 |     # note: when the swap-memory switch is enabled you can train a
37 |     # reasonably larger batch, at the cost of your system using
38 |     # much more cpu memory
39 | swap_memory=True,
40 |
41 | # bert configuration
42 | bert=None,
43 | bert_dir="path-to-bert/cased_L-12_H-768_A-12",
44 | tune_bert=True,
45 | enable_bert=True,
46 | use_bert_single=True,
47 |
48 | # whether use character embedding
49 | use_char=True,
50 | # whether lowercase word
51 | lower=False,
52 | bert_lower=False,
53 |
54 | model_name="nlinet",
55 |
56 | # constant batch size at 'batch' mode for batch-based batching
57 | batch_size=32,
58 | token_size=2000,
59 | batch_or_token='batch',
60 | # batch size for decoding, i.e. number of source sentences decoded at the same time
61 | eval_batch_size=32,
62 | # whether shuffle batches during training
63 | shuffle_batch=True,
64 |     # whether to use multiprocessing for data reading, default true
65 | data_multiprocessing=True,
66 |
67 | # word vocabulary
68 | word_vocab_file="path-of/word_vocab",
69 | # char vocabulary
70 | char_vocab_file="path-of/char_vocab",
71 | # pretrained word embedding
72 | pretrain_word_embedding_file="path-of/word_vocab.npz",
73 | # train file
74 | train_file=["path-of/train.p", "path-of/train.q", "path-of/train.l"],
75 | # dev file
76 | dev_file=["path-of/dev.p", "path-of/dev.q", "path-of/dev.l"],
77 | # test file
78 | test_file=["path-of/test.p", "path-of/test.q", "path-of/test.l"],
79 | # output directory
80 | output_dir="train",
81 | # output during testing
82 | test_output="",
83 |
84 | # adam optimizer hyperparameters
85 | beta1=0.9,
86 | beta2=0.999,
87 | epsilon=1e-8,
88 | # gradient clipping value
89 | clip_grad_norm=5.0,
90 | # initial learning rate
91 | lrate=2e-5,
92 |
93 | # allowed maximum sentence length
94 | max_len=100,
95 | # maximum word length
96 | max_w_len=25,
97 |
98 | # maximum epochs
99 | epoches=5,
100 | # the effective batch size is: batch/token size * update_cycle
101 | # sequential update cycle
102 | update_cycle=1,
103 | # the number of gpus
104 | gpus=[0],
105 |     # exponential moving average (ema) decay rate
106 | ema_decay=0.9999,
107 |
108 | # print information every disp_freq training steps
109 | disp_freq=10,
110 | # evaluate on the development file every eval_freq steps
111 | eval_freq=1000,
112 | # save the model parameters every save_freq steps
113 | save_freq=1000,
114 | # saved checkpoint number
115 | checkpoints=5,
116 |     # the maximum number of training steps; training stops once epoches or max_training_steps is reached
117 | max_training_steps=100000,
118 |
119 |     # number of threads for threaded reading (seems to have little effect)
120 |     nthreads=6,
121 |     # buffer size controls the number of sentences read at one time
122 |     buffer_size=20000,
123 |     # maximum size of the queue used in multi-thread reading
124 |     max_queue_size=100,
125 |     # random seed control (determinism is not fully guaranteed in tensorflow)
126 | random_seed=1234,
127 | # whether or not train from checkpoint
128 | train_continue=True,
129 | )
130 |
--------------------------------------------------------------------------------
/ner/code/tagger.py:
--------------------------------------------------------------------------------
1 | """
2 | Model API.
3 | """
4 | import numpy as np
5 | from seqeval.metrics.sequence_labeling import get_entities
6 |
7 |
8 | class Tagger(object):
9 | """A model API that tags input sentence.
10 |
11 | Attributes:
12 | model: Model.
13 | preprocessor: Transformer. Preprocessing data for feature extraction.
14 | tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
15 | """
16 |
17 | def __init__(self, model, preprocessor, tokenizer=str.split):
18 | self.model = model
19 | self.preprocessor = preprocessor
20 | self.tokenizer = tokenizer
21 |
22 | def predict_proba(self, text):
23 | """Probability estimates.
24 |
25 | The returned estimates for all classes are ordered by the
26 | label of classes.
27 |
28 | Args:
29 | text : string, the input text.
30 |
31 | Returns:
32 | y : array-like, shape = [num_words, num_classes]
33 | Returns the probability of the word for each class in the model,
34 | """
35 | assert isinstance(text, str)
36 |
37 | words = self.tokenizer(text)
38 | X = self.preprocessor.transform([words])
39 | y = self.model.predict(X)
40 | y = y[0] # reduce batch dimension.
41 |
42 | return y
43 |
44 | def _get_prob(self, pred):
45 | prob = np.max(pred, -1)
46 |
47 | return prob
48 |
49 | def _get_tags(self, pred):
50 | tags = self.preprocessor.inverse_transform([pred])
51 | tags = tags[0] # reduce batch dimension
52 |
53 | return tags
54 |
55 | def _build_response(self, sent, tags, prob):
56 | words = self.tokenizer(sent)
57 | res = {
58 | 'words': words,
59 | 'entities': [
60 |
61 | ]
62 | }
63 | chunks = get_entities(tags)
64 |
65 | for chunk_type, chunk_start, chunk_end in chunks:
66 | chunk_end += 1
67 | entity = {
68 | 'text': ' '.join(words[chunk_start: chunk_end]),
69 | 'type': chunk_type,
70 | 'score': float(np.average(prob[chunk_start: chunk_end])),
71 | 'beginOffset': chunk_start,
72 | 'endOffset': chunk_end
73 | }
74 | res['entities'].append(entity)
75 |
76 | return res
77 |
78 | def analyze(self, text):
79 | """Analyze text and return pretty format.
80 |
81 | Args:
82 | text: string, the input text.
83 |
84 | Returns:
85 | res: dict.
86 |
87 | Examples:
88 | >>> text = 'President Obama is speaking at the White House.'
89 | >>> model.analyze(text)
90 | {
91 | "words": [
92 | "President",
93 | "Obama",
94 | "is",
95 | "speaking",
96 | "at",
97 | "the",
98 | "White",
99 | "House."
100 | ],
101 | "entities": [
102 | {
103 | "beginOffset": 1,
104 | "endOffset": 2,
105 | "score": 1,
106 | "text": "Obama",
107 | "type": "PER"
108 | },
109 | {
110 | "beginOffset": 6,
111 | "endOffset": 8,
112 | "score": 1,
113 | "text": "White House.",
114 | "type": "ORG"
115 | }
116 | ]
117 | }
118 | """
119 | pred = self.predict_proba(text)
120 | tags = self._get_tags(pred)
121 | prob = self._get_prob(pred)
122 | res = self._build_response(text, tags, prob)
123 |
124 | return res
125 |
126 | def predict(self, text):
127 | """Predict using the model.
128 |
129 | Args:
130 | text: string, the input text.
131 |
132 | Returns:
133 | tags: list, shape = (num_words,)
134 | Returns predicted values.
135 | """
136 | pred = self.predict_proba(text)
137 | tags = self._get_tags(pred)
138 |
139 | return tags
140 |
--------------------------------------------------------------------------------
/lm/README.md:
--------------------------------------------------------------------------------
1 | ## Language Modeling
2 |
3 |
4 | We run experiments on the PTB and WT2 datasets, using the mixture-of-softmaxes model [MoS](https://arxiv.org/abs/1711.03953) (a hedged sketch of the MoS output layer is given after the results table).
5 | Main experimental results are summarized below.
6 |
7 |
8 |
9 | | Model | #Params | PTB Base | PTB +Finetune | PTB +Dynamic | WT2 Base | WT2 +Finetune | WT2 +Dynamic |
10 | |-------|---------|----------|---------------|--------------|----------|---------------|--------------|
11 | | Yang et al. (2018) | 22M | 55.97 | 54.44 | 47.69 | 63.33 | 61.45 | 40.68 |
12 | | LSTM (this work) | 22M | 63.78 | 62.12 | 53.11 | 69.78 | 68.68 | 44.60 |
13 | | GRU (this work) | 17M | 69.09 | 67.61 | 60.21 | 73.37 | 73.05 | 49.77 |
14 | | ATR (this work) | 9M | 66.24 | 65.86 | 58.29 | 75.36 | 73.35 | 48.65 |
15 | | SRU (this work) | 13M | 69.64 | 65.29 | 60.97 | 85.15 | 84.97 | 57.97 |
16 | | LRN (this work) | 11M | 61.26 | 61.00 | 54.45 | 69.91 | 68.86 | 46.97 |
83 |
84 |
85 | Test perplexity.
86 |
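For reference, the mixture-of-softmaxes output layer combines several expert softmax distributions with input-dependent mixture weights, which is what `--n_experts` controls in the commands below. The following is a hedged PyTorch sketch of that idea, not the MoS repository's implementation; the class and parameter names are ours.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MixtureOfSoftmaxes(nn.Module):
    """Sketch of an MoS output layer: K experts share one output projection,
    and a learned prior mixes their softmax distributions."""

    def __init__(self, d_hidden, d_embed, n_tokens, n_experts=15):
        super(MixtureOfSoftmaxes, self).__init__()
        self.n_experts = n_experts
        self.prior = nn.Linear(d_hidden, n_experts)              # mixture weights
        self.latent = nn.Linear(d_hidden, n_experts * d_embed)   # expert states
        self.decoder = nn.Linear(d_embed, n_tokens)              # shared softmax layer

    def forward(self, h):                                        # h: [batch, d_hidden]
        pi = F.softmax(self.prior(h), dim=-1)                    # [batch, K]
        z = torch.tanh(self.latent(h))                           # [batch, K * d_embed]
        z = z.view(-1, self.n_experts, self.decoder.in_features)
        expert_probs = F.softmax(self.decoder(z), dim=-1)        # [batch, K, V]
        probs = (pi.unsqueeze(-1) * expert_probs).sum(dim=1)     # [batch, V]
        return torch.log(probs + 1e-8)                           # log-probabilities
```
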
87 | ## Requirement
88 | PyTorch >= 0.4.1
89 |
90 | ## How to Run?
91 | - download and preprocess dataset
92 |
93 | - see [MoS](https://github.com/zihangdai/mos) about the preprocessing of datasets
94 |
95 | - training and evaluation
96 |
97 | - training
98 | ```
99 | #! /bin/bash
100 |
101 | export CUDA_VISIBLE_DEVICES=0
102 |
103 | # for PTB
104 | python3 main.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 10.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB --single_gpu --model lrn
105 | # for WT2
106 | python3 main.py --epochs 1000 --data path-of/wikitext-2 --save WT2 --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --nhidlast 650 --emsize 300 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn
107 | ```
108 |
109 | - finetuning
110 | ```
111 | # for PTB
112 | python3 finetune.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 15.0 --epoch 1000 --nhid 960 --emsize 280 --n_experts 15 --save PTB-XXX --single_gpu --model lrn
113 | # for WT2
114 | python3 finetune.py --epochs 1000 --data path-of/wikitext-2 --save WT2-XXX --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --emsize 300 --batch_size 15 --lr 20.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn
115 | ```
116 |
117 | - dynamic evaluation
118 | ```
119 | # for PTB
120 | python3 dynamiceval.py --model PTB-XXX/finetune_model.pt --data path-of/penn --lamb 0.075 --gpu 0
121 | # for WT2
122 | python3 dynamiceval.py --data path-of/wikitext-2 --model WT2-XXX/finetune_model.pt --epsilon 0.002 --gpu 0
123 | ```
124 |
125 | - general evaluation
126 | ```
127 | # for PTB
128 | python3 evaluate.py --data path-of/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 10.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB-XXX --single_gpu --model lrn
129 | # for WT2
130 | python3 evaluate.py --epochs 1000 --data path-of/wikitext-2 --save WT2-XXX --dropouth 0.2 --seed 1882 --n_experts 15 --nhid 1150 --nhidlast 650 --emsize 300 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 5 --max_seq_len_delta 20 --dropouti 0.55 --single_gpu --model lrn
131 | ```
132 |
133 | ## Credits
134 |
135 | Source code structure is adapted from [MoS](https://github.com/zihangdai/mos).
--------------------------------------------------------------------------------
/lm/code/data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 |
4 | from collections import Counter
5 |
6 |
7 | class Dictionary(object):
8 | def __init__(self):
9 | self.word2idx = {}
10 | self.idx2word = []
11 | self.counter = Counter()
12 | self.total = 0
13 |
14 | def add_word(self, word):
15 | if word not in self.word2idx:
16 | self.idx2word.append(word)
17 | self.word2idx[word] = len(self.idx2word) - 1
18 | token_id = self.word2idx[word]
19 | self.counter[token_id] += 1
20 | self.total += 1
21 | return self.word2idx[word]
22 |
23 | def __len__(self):
24 | return len(self.idx2word)
25 |
26 |
27 | class Corpus(object):
28 | def __init__(self, path):
29 | self.dictionary = Dictionary()
30 | self.train = self.tokenize(os.path.join(path, 'train.txt'))
31 | self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
32 | self.test = self.tokenize(os.path.join(path, 'test.txt'))
33 |
34 | def tokenize(self, path):
35 | """Tokenizes a text file."""
36 | assert os.path.exists(path)
37 | # Add words to the dictionary
38 | with open(path, 'r', encoding='utf-8') as f:
39 | tokens = 0
40 | for line in f:
41 |                 words = line.split() + ['<eos>']
42 | tokens += len(words)
43 | for word in words:
44 | self.dictionary.add_word(word)
45 |
46 | # Tokenize file content
47 | with open(path, 'r', encoding='utf-8') as f:
48 | ids = torch.LongTensor(tokens)
49 | token = 0
50 | for line in f:
51 |                 words = line.split() + ['<eos>']
52 | for word in words:
53 | ids[token] = self.dictionary.word2idx[word]
54 | token += 1
55 |
56 | return ids
57 |
58 | class SentCorpus(object):
59 | def __init__(self, path):
60 | self.dictionary = Dictionary()
61 | self.train = self.tokenize(os.path.join(path, 'train.txt'))
62 | self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
63 | self.test = self.tokenize(os.path.join(path, 'test.txt'))
64 |
65 | def tokenize(self, path):
66 | """Tokenizes a text file."""
67 | assert os.path.exists(path)
68 | # Add words to the dictionary
69 | with open(path, 'r', encoding='utf-8') as f:
70 | tokens = 0
71 | for line in f:
72 |                 words = line.split() + ['<eos>']
73 | tokens += len(words)
74 | for word in words:
75 | self.dictionary.add_word(word)
76 |
77 | # Tokenize file content
78 | sents = []
79 | with open(path, 'r', encoding='utf-8') as f:
80 | for line in f:
81 | if not line:
82 | continue
83 |                 words = line.split() + ['<eos>']
84 | sent = torch.LongTensor(len(words))
85 | for i, word in enumerate(words):
86 | sent[i] = self.dictionary.word2idx[word]
87 | sents.append(sent)
88 |
89 | return sents
90 |
91 | class BatchSentLoader(object):
92 | def __init__(self, sents, batch_size, pad_id=0, cuda=False, volatile=False):
93 | self.sents = sents
94 | self.batch_size = batch_size
95 | self.sort_sents = sorted(sents, key=lambda x: x.size(0))
96 | self.cuda = cuda
97 | self.volatile = volatile
98 | self.pad_id = pad_id
99 |
100 | def __next__(self):
101 | if self.idx >= len(self.sort_sents):
102 | raise StopIteration
103 |
104 | batch_size = min(self.batch_size, len(self.sort_sents)-self.idx)
105 | batch = self.sort_sents[self.idx:self.idx+batch_size]
106 | max_len = max([s.size(0) for s in batch])
107 | tensor = torch.LongTensor(max_len, batch_size).fill_(self.pad_id)
108 | for i in range(len(batch)):
109 | s = batch[i]
110 | tensor[:s.size(0),i].copy_(s)
111 | if self.cuda:
112 | tensor = tensor.cuda()
113 |
114 | self.idx += batch_size
115 |
116 | return tensor
117 |
118 | next = __next__
119 |
120 | def __iter__(self):
121 | self.idx = 0
122 | return self
123 |
124 | if __name__ == '__main__':
125 | corpus = SentCorpus('../penn')
126 | loader = BatchSentLoader(corpus.test, 10)
127 | for i, d in enumerate(loader):
128 | print(i, d.size())
129 |
--------------------------------------------------------------------------------
/doc/code/utils/saver.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import tensorflow as tf
9 |
10 |
11 | class Saver(object):
12 | def __init__(self,
13 | checkpoints=5, # save the latest number of checkpoints
14 | output_dir=None # the output directory
15 | ):
16 | if output_dir is None:
17 | output_dir = "./output"
18 | self.output_dir = output_dir
19 | self.output_best_dir = os.path.join(output_dir, "best")
20 |
21 | self.saver = tf.train.Saver(
22 | max_to_keep=checkpoints
23 | )
24 | self.best_saver = tf.train.Saver(
25 | max_to_keep=1
26 | )
27 | self.best_score = -1
28 | self.score_record = tf.gfile.Open(
29 | os.path.join(self.output_best_dir, "metric.log"),
30 | mode="a+"
31 | )
32 |
33 | def save(self, session, step, metric_score=None):
34 | if not tf.gfile.Exists(self.output_dir):
35 | tf.gfile.MkDir(self.output_dir)
36 | if not tf.gfile.Exists(self.output_best_dir):
37 | tf.gfile.MkDir(self.output_best_dir)
38 |
39 | self.saver.save(session,
40 | os.path.join(self.output_dir, "model"),
41 | global_step=step)
42 |
43 | def _move(path, new_path):
44 | if tf.gfile.Exists(path):
45 | if tf.gfile.Exists(new_path):
46 | tf.gfile.Remove(new_path)
47 | tf.gfile.Copy(path, new_path)
48 |
49 | if metric_score is not None and metric_score > self.best_score:
50 | self.best_score = metric_score
51 | self.best_saver.save(
52 | session, os.path.join(self.output_best_dir, "model"))
53 |
54 | _move(os.path.join(self.output_dir, "param.json"),
55 | os.path.join(self.output_best_dir, "param.json"))
56 | _move(os.path.join(self.output_dir, "record.json"),
57 | os.path.join(self.output_best_dir, "record.json"))
58 |
59 |             # this recorder only records best scores
60 | self.score_record.write("Steps {}, Metric Score {}\n"
61 | .format(step, metric_score))
62 |
63 | self.score_record.flush()
64 |
65 | def restore(self, session, path=None):
66 | if path is not None and tf.gfile.Exists(path):
67 | check_dir = path
68 | else:
69 | check_dir = self.output_dir
70 |
71 | checkpoint = os.path.join(check_dir, "checkpoint")
72 | if not tf.gfile.Exists(checkpoint):
73 | tf.logging.warn("No Existing Model detected")
74 | else:
75 | latest_checkpoint = tf.gfile.Open(checkpoint).readline()
76 | model_name = latest_checkpoint.strip().split(":")[1].strip()
77 | model_name = model_name[1:-1] # remove ""
78 | model_path = os.path.join(check_dir, model_name)
79 | model_path = os.path.abspath(model_path)
80 | if not tf.gfile.Exists(model_path+".meta"):
81 |                 tf.logging.error("model '{}' does not exist"
82 | .format(model_path))
83 | else:
84 | try:
85 | self.saver.restore(session, model_path)
86 | except tf.errors.NotFoundError:
87 | # In this case, we simply assume that the cycle part
88 | # is mismatched, where the replicas are missing.
89 | # This would happen if you switch from un-cycle mode
90 | # to cycle mode.
91 | tf.logging.warn("Starting Backup Restore")
92 | ops = []
93 | reader = tf.train.load_checkpoint(model_path)
94 | for var in tf.global_variables():
95 | name = var.op.name
96 |
97 | if reader.has_tensor(name):
98 | tf.logging.info('{} get initialization from {}'
99 | .format(name, name))
100 | ops.append(
101 | tf.assign(var, reader.get_tensor(name)))
102 | else:
103 | tf.logging.warn("{} is missed".format(name))
104 | restore_op = tf.group(*ops, name="restore_global_vars")
105 | session.run(restore_op)
106 |
--------------------------------------------------------------------------------
/nli/code/utils/saver.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import tensorflow as tf
9 |
10 |
11 | class Saver(object):
12 | def __init__(self,
13 | checkpoints=5, # save the latest number of checkpoints
14 | output_dir=None # the output directory
15 | ):
16 | if output_dir is None:
17 | output_dir = "./output"
18 | self.output_dir = output_dir
19 | self.output_best_dir = os.path.join(output_dir, "best")
20 |
21 | self.saver = tf.train.Saver(
22 | max_to_keep=checkpoints
23 | )
24 | self.best_saver = tf.train.Saver(
25 | max_to_keep=1
26 | )
27 | self.best_score = -1
28 | self.score_record = tf.gfile.Open(
29 | os.path.join(self.output_best_dir, "metric.log"),
30 | mode="a+"
31 | )
32 |
33 | def save(self, session, step, metric_score=None):
34 | if not tf.gfile.Exists(self.output_dir):
35 | tf.gfile.MkDir(self.output_dir)
36 | if not tf.gfile.Exists(self.output_best_dir):
37 | tf.gfile.MkDir(self.output_best_dir)
38 |
39 | self.saver.save(session,
40 | os.path.join(self.output_dir, "model"),
41 | global_step=step)
42 |
43 | def _move(path, new_path):
44 | if tf.gfile.Exists(path):
45 | if tf.gfile.Exists(new_path):
46 | tf.gfile.Remove(new_path)
47 | tf.gfile.Copy(path, new_path)
48 |
49 | if metric_score is not None and metric_score > self.best_score:
50 | self.best_score = metric_score
51 | self.best_saver.save(
52 | session, os.path.join(self.output_best_dir, "model"))
53 |
54 | _move(os.path.join(self.output_dir, "param.json"),
55 | os.path.join(self.output_best_dir, "param.json"))
56 | _move(os.path.join(self.output_dir, "record.json"),
57 | os.path.join(self.output_best_dir, "record.json"))
58 |
59 |             # this recorder only records best scores
60 | self.score_record.write("Steps {}, Metric Score {}\n"
61 | .format(step, metric_score))
62 |
63 | self.score_record.flush()
64 |
65 | def restore(self, session, path=None):
66 | if path is not None and tf.gfile.Exists(path):
67 | check_dir = path
68 | else:
69 | check_dir = self.output_dir
70 |
71 | checkpoint = os.path.join(check_dir, "checkpoint")
72 | if not tf.gfile.Exists(checkpoint):
73 | tf.logging.warn("No Existing Model detected")
74 | else:
75 | latest_checkpoint = tf.gfile.Open(checkpoint).readline()
76 | model_name = latest_checkpoint.strip().split(":")[1].strip()
77 | model_name = model_name[1:-1] # remove ""
78 | model_path = os.path.join(check_dir, model_name)
79 | model_path = os.path.abspath(model_path)
80 | if not tf.gfile.Exists(model_path+".meta"):
81 |                 tf.logging.error("model '{}' does not exist"
82 | .format(model_path))
83 | else:
84 | try:
85 | self.saver.restore(session, model_path)
86 | except tf.errors.NotFoundError:
87 | # In this case, we simply assume that the cycle part
88 | # is mismatched, where the replicas are missing.
89 | # This would happen if you switch from un-cycle mode
90 | # to cycle mode.
91 | tf.logging.warn("Starting Backup Restore")
92 | ops = []
93 | reader = tf.train.load_checkpoint(model_path)
94 | for var in tf.global_variables():
95 | name = var.op.name
96 |
97 | if reader.has_tensor(name):
98 | tf.logging.info('{} get initialization from {}'
99 | .format(name, name))
100 | ops.append(
101 | tf.assign(var, reader.get_tensor(name)))
102 | else:
103 | tf.logging.warn("{} is missed".format(name))
104 | restore_op = tf.group(*ops, name="restore_global_vars")
105 | session.run(restore_op)
106 |
--------------------------------------------------------------------------------
/rc/elmo_rnet/code/bilm/elmo.py:
--------------------------------------------------------------------------------
1 |
2 | import tensorflow as tf
3 |
4 | def weight_layers(name, bilm_ops, l2_coef=None,
5 | use_top_only=False, do_layer_norm=False):
6 | '''
7 | Weight the layers of a biLM with trainable scalar weights to
8 | compute ELMo representations.
9 |
10 | For each output layer, this returns two ops. The first computes
11 | a layer specific weighted average of the biLM layers, and
12 | the second the l2 regularizer loss term.
13 |     The regularization terms are also added to tf.GraphKeys.REGULARIZATION_LOSSES
14 |
15 | Input:
16 | name = a string prefix used for the trainable variable names
17 | bilm_ops = the tensorflow ops returned to compute internal
18 | representations from a biLM. This is the return value
19 | from BidirectionalLanguageModel(...)(ids_placeholder)
20 | l2_coef: the l2 regularization coefficient $\lambda$.
21 | Pass None or 0.0 for no regularization.
22 | use_top_only: if True, then only use the top layer.
23 | do_layer_norm: if True, then apply layer normalization to each biLM
24 | layer before normalizing
25 |
26 | Output:
27 | {
28 | 'weighted_op': op to compute weighted average for output,
29 | 'regularization_op': op to compute regularization term
30 | }
31 | '''
32 | def _l2_regularizer(weights):
33 | if l2_coef is not None:
34 | return l2_coef * tf.reduce_sum(tf.square(weights))
35 | else:
36 | return 0.0
37 |
38 | # Get ops for computing LM embeddings and mask
39 | lm_embeddings = bilm_ops['lm_embeddings']
40 | mask = bilm_ops['mask']
41 |
42 | # Disable the first embedding layer
43 | # lm_embeddings = lm_embeddings[:, 1:, :, :]
44 |
45 | n_lm_layers = int(lm_embeddings.get_shape()[1])
46 | lm_dim = int(lm_embeddings.get_shape()[3])
47 |
48 | with tf.control_dependencies([lm_embeddings, mask]):
49 | # Cast the mask and broadcast for layer use.
50 | mask_float = tf.cast(mask, 'float32')
51 | broadcast_mask = tf.expand_dims(mask_float, axis=-1)
52 |
53 | def _do_ln(x):
54 | # do layer normalization excluding the mask
55 | x_masked = x * broadcast_mask
56 | N = tf.reduce_sum(mask_float) * lm_dim
57 | mean = tf.reduce_sum(x_masked) / N
58 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2
59 | ) / N
60 | return tf.nn.batch_normalization(
61 | x, mean, variance, None, None, 1E-12
62 | )
63 |
64 | if use_top_only:
65 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
66 | # just the top layer
67 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1)
68 | # no regularization
69 | reg = 0.0
70 | else:
71 | W = tf.get_variable(
72 | '{}_ELMo_W'.format(name),
73 | shape=(n_lm_layers, ),
74 | initializer=tf.zeros_initializer,
75 | regularizer=_l2_regularizer,
76 | trainable=True,
77 | )
78 |
79 | # normalize the weights
80 | normed_weights = tf.split(
81 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers
82 | )
83 | # split LM layers
84 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
85 |
86 | # compute the weighted, normalized LM activations
87 | pieces = []
88 | for w, t in zip(normed_weights, layers):
89 | if do_layer_norm:
90 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1)))
91 | else:
92 | pieces.append(w * tf.squeeze(t, squeeze_dims=1))
93 | sum_pieces = tf.add_n(pieces)
94 |
95 | # get the regularizer
96 | reg = [
97 | r for r in tf.get_collection(
98 | tf.GraphKeys.REGULARIZATION_LOSSES)
99 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0
100 | ]
101 | if len(reg) != 1:
102 | pass
103 |
104 | # scale the weighted sum by gamma
105 | gamma = tf.get_variable(
106 | '{}_ELMo_gamma'.format(name),
107 | shape=(1, ),
108 | initializer=tf.ones_initializer,
109 | regularizer=None,
110 | trainable=True,
111 | )
112 | weighted_lm_layers = sum_pieces * gamma
113 |
114 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg}
115 |
116 | return ret
117 |
118 |
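# Usage sketch (added for illustration; the variable names below are
# assumptions based on the docstring above, not code from this repository):
#
#   bilm_ops = BidirectionalLanguageModel(...)(ids_placeholder)
#   elmo = weight_layers("input", bilm_ops, l2_coef=0.0)
#   context_rep = elmo['weighted_op']        # weighted sum over biLM layers
#   reg_terms = elmo['regularization_op']    # add to the loss when regularizing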
--------------------------------------------------------------------------------
/doc/code/tasks.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import abc
9 | import csv
10 | import numpy as np
11 |
12 |
13 | def get_task(params, is_training):
14 | name = params.task.lower()
15 |
16 | if name == "amafull":
17 | return AMAFull(params.data_path, is_training)
18 | elif name == "amapolar":
19 | return AMAPolar(params.data_path, is_training)
20 | elif name == "yahoo":
21 | return YaHoo(params.data_path, is_training)
22 | elif name == "yelpfull":
23 | return YelpFull(params.data_path, is_training)
24 | elif name == "yelppolar":
25 | return YelpPolar(params.data_path, is_training)
26 | else:
27 | raise NotImplementedError("Not Supported: {}".format(name))
28 |
29 |
30 | class Task(object):
31 | def __init__(self, data_path, is_training=False):
32 | self.data_path = data_path
33 | self.is_training = is_training
34 |
35 | self.trainset = []
36 | self.devset = []
37 | self.testset = []
38 |
39 | if self.is_training:
40 | self._read_all_train_dev_data()
41 | self._read_all_test_data()
42 |
43 | def _clean_text(self, text_in):
44 | return text_in.replace('\\"', '"').replace('\\n', ' ')
45 |
46 | def _read_all_train_dev_data(self):
47 | train_data_path = os.path.join(self.data_path, "train.csv")
48 |
49 | dataset = []
50 | with open(train_data_path) as tfile:
51 | reader = csv.reader(tfile, delimiter=",")
52 |
53 | for sample in reader:
54 | dataset.append(sample)
55 |
56 | np.random.shuffle(dataset)
57 |
58 | # split the dataset with 90% and 10%
59 | dev_size = int(len(dataset) * 0.1)
60 |
61 | self.devset = dataset[:dev_size]
62 | self.trainset = dataset[dev_size:]
63 |
64 | def _read_all_test_data(self):
65 | test_data_path = os.path.join(self.data_path, "test.csv")
66 |
67 | self.testset = []
68 | with open(test_data_path) as tfile:
69 | reader = csv.reader(tfile, delimiter=",")
70 |
71 | for sample in reader:
72 | self.testset.append(sample)
73 |
74 | def _data_iter(self, iterator):
75 | for sample in iterator:
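            # each csv row is [label, text field(s)...]; labels in the raw
            # files are 1-based, hence the shift to 0-based ids below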
76 | label = int(sample[0]) - 1
77 | document = ' '.join(sample[1:])
78 |
79 | document = self._clean_text(document)
80 |
81 | yield (label, document)
82 |
83 | def get_train_data(self):
84 | np.random.shuffle(self.trainset)
85 | for sample in self._data_iter(self.trainset):
86 | yield sample
87 |
88 | def get_dev_data(self):
89 | for sample in self._data_iter(self.devset):
90 | yield sample
91 |
92 | def get_test_data(self):
93 | for sample in self._data_iter(self.testset):
94 | yield sample
95 |
96 | @abc.abstractmethod
97 | def get_label_size(self):
98 | raise NotImplementedError("Not Supported")
99 |
100 |
101 | # amazon_review_full_csv
102 | class AMAFull(Task):
103 | def __init__(self, data_path, is_training=False):
104 | data_path = os.path.join(data_path, "amazon_review_full_csv")
105 | super(AMAFull, self).__init__(data_path, is_training)
106 |
107 | def get_label_size(self):
108 | return 5
109 |
110 |
111 | # amazon_review_polarity_csv
112 | class AMAPolar(Task):
113 | def __init__(self, data_path, is_training=False):
114 | data_path = os.path.join(data_path, "amazon_review_polarity_csv")
115 | super(AMAPolar, self).__init__(data_path, is_training)
116 |
117 | def get_label_size(self):
118 | return 2
119 |
120 |
121 | # yahoo_answers_csv
122 | class YaHoo(Task):
123 | def __init__(self, data_path, is_training=False):
124 | data_path = os.path.join(data_path, "yahoo_answers_csv")
125 | super(YaHoo, self).__init__(data_path, is_training)
126 |
127 | def get_label_size(self):
128 | return 10
129 |
130 |
131 | # yelp_review_full_csv
132 | class YelpFull(Task):
133 | def __init__(self, data_path, is_training=False):
134 | data_path = os.path.join(data_path, "yelp_review_full_csv")
135 | super(YelpFull, self).__init__(data_path, is_training)
136 |
137 | def get_label_size(self):
138 | return 5
139 |
140 |
141 | # yelp_review_polarity_csv
142 | class YelpPolar(Task):
143 | def __init__(self, data_path, is_training=False):
144 | data_path = os.path.join(data_path, "yelp_review_polarity_csv")
145 | super(YelpPolar, self).__init__(data_path, is_training)
146 |
147 | def get_label_size(self):
148 | return 2
149 |
--------------------------------------------------------------------------------
/doc/code/evalu.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import time
8 | import json
9 | import numpy as np
10 | import tensorflow as tf
11 |
12 | from utils import queuer
13 |
14 |
15 | def decoding(sprobs, samples, params, mask=None):
16 | """Generate decoded sequence from seqs"""
17 | if mask is None:
18 | mask = [1.] * len(sprobs)
19 |
20 | flat_sprobs = []
21 | for _sprobs, _m in zip(sprobs, mask):
22 | if _m < 1.:
23 | continue
24 |
25 | for start_prob in _sprobs:
26 | flat_sprobs.append(start_prob)
27 |
28 | assert len(flat_sprobs) == len(samples), 'Decoding length mismatch!'
29 |
30 | results = []
31 |
32 | for (idx, sample), pred in zip(samples, flat_sprobs):
33 | gold_label = sample['label_id']
34 | pred_label = pred
35 |
36 | results.append({
37 | 'pred_answer': pred_label,
38 | 'sample_id': idx,
39 | 'gold_answer': gold_label
40 | })
41 |
42 | return results
43 |
44 |
45 | def predict(session, features,
46 | out_pred, dataset, params, train="test"):
47 |     """Perform decoding with existing information"""
48 | results = []
49 |
50 | batcher = dataset.batcher(params.eval_batch_size,
51 | buffer_size=params.buffer_size,
52 | shuffle=False, train=train)
53 | eval_queue = queuer.EnQueuer(batcher,
54 | multiprocessing=params.data_multiprocessing,
55 | random_seed=params.random_seed)
56 | eval_queue.start(workers=params.nthreads,
57 | max_queue_size=params.max_queue_size)
58 |
59 | def _predict_one_batch(data_on_gpu):
60 | feed_dicts = {}
61 | flat_raw_data = []
62 | for fidx, data in enumerate(data_on_gpu):
63 | # define feed_dict
64 | feed_dict = {
65 | features[fidx]["t"]: data['token_ids'],
66 | features[fidx]["l"]: data['l_id'],
67 | }
68 | if params.use_char:
69 | feed_dict[features[fidx]["c"]] = data['char_ids']
70 |
71 | if params.enable_bert:
72 | feed_dict[features[fidx]["s"]] = data['subword_ids']
73 | feed_dict[features[fidx]["sb"]] = data['subword_back']
74 |
75 | feed_dicts.update(feed_dict)
76 | flat_raw_data.extend(data['raw'])
77 |
78 | # pick up valid outputs
79 | data_size = len(data_on_gpu)
80 | valid_out_pred = out_pred[:data_size]
81 |
82 | decode_spred = session.run(
83 | valid_out_pred, feed_dict=feed_dicts)
84 |
85 | predictions = decoding(
86 | decode_spred, flat_raw_data, params
87 | )
88 |
89 | return predictions
90 |
91 | very_begin_time = time.time()
92 | data_on_gpu = []
93 | for bidx, data in enumerate(eval_queue.get()):
94 |
95 | data_on_gpu.append(data)
96 |         # with multiple gpus, keep collecting until there is one batch per gpu
97 | if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus):
98 | continue
99 |
100 | start_time = time.time()
101 | predictions = _predict_one_batch(data_on_gpu)
102 | data_on_gpu = []
103 | results.extend(predictions)
104 |
105 | tf.logging.info(
106 | "Decoding Batch {} using {:.3f} s, translating {} "
107 | "sentences using {:.3f} s in total".format(
108 | bidx, time.time() - start_time,
109 | len(results), time.time() - very_begin_time
110 | )
111 | )
112 |
113 | eval_queue.stop()
114 |
115 | if len(data_on_gpu) > 0:
116 | start_time = time.time()
117 | predictions = _predict_one_batch(data_on_gpu)
118 | results.extend(predictions)
119 |
120 | tf.logging.info(
121 | "Decoding Batch {} using {:.3f} s, translating {} "
122 | "sentences using {:.3f} s in total".format(
123 | 'final', time.time() - start_time,
124 | len(results), time.time() - very_begin_time
125 | )
126 | )
127 |
128 | results = sorted(results, key=lambda x: x['sample_id'])
129 |
130 | golds = [result['gold_answer'] for result in results]
131 | preds = [result['pred_answer'] for result in results]
132 |
133 | score = np.sum(np.asarray(golds) == np.asarray(preds)) * 100. / len(golds)
134 |
135 | return results, score
136 |
137 |
138 | def dump_predictions(results, output):
139 | """save translation"""
140 | with tf.gfile.Open(output, 'w') as writer:
141 | for sample in results:
142 | sample['pred_answer'] = int(sample['pred_answer'])  # cast numpy value to a JSON-serializable int
143 | writer.write(json.dumps(sample) + "\n")
144 | tf.logging.info("Saving translations into {}".format(output))
145 |
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | ## Document Classification
2 |
3 | One concern with LRN is that simplifying the recurrent component weakens its modeling capacity, in particular its ability
4 | to capture long-range dependencies. We address this concern with experiments on document classification.
5 |
6 | We choose:
7 | - Amazon Review Polarity (AmaPolar, 2 labels, 3.6M/0.4M for training/testing)
8 | - Amazon Review Full (AmaFull, 5 labels, 3M/0.65M for training/testing)
9 | - Yahoo! Answers (Yahoo, 10 labels, 1.4M/60K for training/testing)
10 | - Yelp Review Polarity (YelpPolar, 2 labels, 0.56M/38K for training/testing)
11 |
12 | The datasets come from [Zhang et al. (2015)](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf).
13 | We use a biRNN model followed by an attentive pooling layer; character and GloVe embeddings are used for word representation
14 | (a minimal sketch of the pooling layer follows the results table). Main experimental results are summarized below.
15 |
16 |
17 |
18 | | Model | #Params | AmaPolar ERR | AmaPolar Time | Yahoo ERR | Yahoo Time | AmaFull ERR | AmaFull Time | YelpPolar ERR | YelpPolar Time |
19 | |-------|---------|--------------|---------------|-----------|------------|-------------|--------------|---------------|----------------|
20 | | Zhang et al. (2015) | - | 6.10 | - | 29.16 | - | 40.57 | - | 5.26 | - |
21 | | This work: LSTM | 227K | 4.37 | 0.947 | 24.62 | 1.332 | 37.22 | 1.003 | 3.58 | 1.362 |
22 | | This work: GRU | 176K | 4.39 | 0.948 | 24.68 | 1.242 | 37.20 | 0.982 | 3.47 | 1.230 |
23 | | This work: ATR | 74K | 4.78 | 0.867 | 25.33 | 1.117 | 38.54 | 0.836 | 4.00 | 1.124 |
24 | | This work: SRU | 194K | 4.95 | 0.919 | 24.78 | 1.394 | 38.23 | 0.907 | 3.99 | 1.310 |
25 | | This work: LRN | 151K | 4.98 | 0.731 | 25.07 | 1.038 | 38.42 | 0.788 | 3.98 | 1.022 |
109 |
110 | *ERR*: test error rate; *Time*: time in seconds per training batch, measured from 1k training steps.
111 |
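As mentioned above, the classifier pools the biRNN states with an attentive pooling layer. The snippet below is a minimal NumPy sketch of that pooling step under simple assumptions (a single learned score vector `w`, softmax over time); the actual TensorFlow implementation in this directory may differ in its details:

```python
import numpy as np

def attentive_pooling(states, w):
    """Collapse a sequence of biRNN states [T, d] into a single vector [d]."""
    scores = states @ w                             # one score per time step, [T]
    scores = scores - scores.max()                  # numerical stability
    alphas = np.exp(scores) / np.exp(scores).sum()  # softmax attention weights
    return alphas @ states                          # weighted sum of states, [d]

# toy usage: 5 time steps, hidden size 4; the pooled vector feeds the classifier
states = np.random.randn(5, 4)
w = np.random.randn(4)
doc_vector = attentive_pooling(states, w)
```
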
112 | ## Requirement
113 | tensorflow >= 1.8.1
114 |
115 | ## How to Run?
116 |
117 | - download and preprocess dataset
118 |
119 | - The dataset link: https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M
120 | - Prepare embedding and vocabulary
121 |
122 | Download the [pre-trained GloVe embedding](http://nlp.stanford.edu/data/glove.840B.300d.zip).
123 | Generate vocabulary for each task as follows:
124 | ```
125 | task=amapolar    # choose one of: amafull, amapolar, yahoo, yelppolar
126 | python code/run.py --mode vocab --config config.py --parameters=task="${task}",output_dir="${task}_vocab"
127 | ```
128 |
129 |
130 | - training and evaluation
131 |
132 | - Train the model as follows:
133 | ```
134 | # configure your CUDA library if necessary
135 | export CUDA_ROOT=XXX
136 | export PATH=$CUDA_ROOT/bin:$PATH
137 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
138 |
139 | task=amapolar    # choose one of: amafull, amapolar, yahoo, yelppolar
140 | python code/run.py --mode train --config config.py --parameters=task="${task}",output_dir="${task}_train",gpus=[1],word_vocab_file="${task}_vocab/vocab.word",char_vocab_file="${task}_vocab/vocab.char",enable_hierarchy=False,nthreads=2,enable_bert=False,cell="lrn",swap_memory=False
141 | ```
142 | Other hyperparameter settings are available in the given config.py.
143 |
144 | - Test the model as follows:
145 | ```
146 | task=amapolar    # choose one of: amafull, amapolar, yahoo, yelppolar
147 | python code/run.py --mode test --config config.py --parameters=task="${task}",output_dir="${task}_train/best",gpus=[0],word_vocab_file="${task}_vocab/vocab.word",char_vocab_file="${task}_vocab/vocab.char",enable_hierarchy=False,nthreads=2,enable_bert=False,cell="lrn",swap_memory=False,train_continue=False,test_output=${task}.out.txt
148 | ```
149 |
150 | ## Credits
151 |
152 | Source code structure is adapted from [zero](https://github.com/bzhangGo/zero).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # lrn
2 | Source code for "A Lightweight Recurrent Network for Sequence Modeling"
3 |
4 |
5 | ## Model Architecture
6 | In our paper, we propose the lightweight recurrent network (LRN), which combines the strengths of
7 | [ATR](https://arxiv.org/abs/1810.12546) and [SRU](https://arxiv.org/abs/1709.02755).
8 |
9 | * ATR reduces model parameters and avoids additional free parameters for the gate computation, through its twin-gate
10 | mechanism.
11 | * SRU follows the [QRNN](https://arxiv.org/abs/1611.01576) and moves all matrix computations outside the recurrence.
12 |
13 | Based on the above units, we propose [LRN](xxx):
14 |
15 |
16 |
17 | where g(·) is an activation function, either *tanh* or *identity*. Wq, Wk and Wv
18 | are model parameters. The matrix computations (as well as the optional layer normalization) can be shifted outside the
19 | recurrence, so the whole model runs fast.
20 |
21 | When the twin-gate mechanism is applied, the output values in **h**t may suffer from an explosion issue,
22 | i.e. they could grow towards infinity. This is why we add the activation function. An alternative solution
23 | is layer normalization, which keeps activation values stable.
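
For concreteness, below is a minimal NumPy sketch of one LRN step as we read the description above (query/key/value projections plus ATR-style twin gates); it is illustrative only, and the actual TensorFlow cells live in `code/rnns/lrn.py` under each task directory:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lrn_step(x_t, h_prev, W_q, W_k, W_v, g=np.tanh):
    """One LRN step (illustrative sketch, not the repo implementation)."""
    # q/k/v depend only on the input, so they can be precomputed for the
    # whole sequence before the element-wise recurrence starts
    q_t, k_t, v_t = W_q @ x_t, W_k @ x_t, W_v @ x_t
    # twin gates reuse the same projections, adding no extra gate parameters
    i_t = sigmoid(k_t + h_prev)   # input gate
    f_t = sigmoid(q_t - h_prev)   # forget gate
    return g(i_t * v_t + f_t * h_prev)
```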
24 |
25 | ## Structure Analysis
26 | One way to understand the model is to unfold the LRN structure along input tokens:
27 |
28 |
29 |
30 | The above structure, which has also been observed by [Zhang et al.](https://arxiv.org/abs/1810.12546), [Lee et al.](https://arxiv.org/abs/1705.07393)
31 | and others, endows the RNN model with multiple interpretations. We provide two as follows:
32 |
33 | * *Relation with Self Attention Networks*
34 |
35 |
36 | Informally, LRN assembles the forget gates from step *t* down to step *k+1* in order to query the key (the input gate). The resulting
37 | weight is assigned to the corresponding value representation and contributes to the final hidden representation.
38 |
39 | Do the learned weights make sense? We ran a classification experiment on the AmaPolar task with a unidirectional linear-LRN,
40 | feeding the final hidden state into the classifier. One example below shows the learned weights: the term *great* gains
41 | a large weight, which decays slowly and contributes to the final *positive* decision (see the sketch after this list).
42 |
43 |
44 | * *Long-term and Short-term Memory*
45 |
46 |
47 | Another view of the unfolded structure is that the different gates form different memory mechanisms. The input gate acts as
48 | a short-term memory and indicates how much information from the current token is activated. The forget gates form a forget
49 | chain that controls how meaningless past information is erased.
50 |
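To make the self-attention reading concrete, the sketch below unrolls the recurrence (with identity activation) and computes the attention-like weight LRN assigns to each past token; it relies on the same assumed step equations as the earlier sketch and is not the repo's code:

```python
import numpy as np

def lrn_unrolled_weights(i_gates, f_gates):
    """Weight placed on value v_k when producing the last state h_t.

    Unrolling h_t = i_t * v_t + f_t * h_{t-1} gives
        h_t = sum_k (prod_{j=k+1..t} f_j) * i_k * v_k,
    so token k is weighted by its input gate, decayed by all later forget gates.
    """
    T = len(i_gates)
    weights = np.zeros(T)
    for k in range(T):
        decay = np.prod(f_gates[k + 1:])   # forget gates from step k+1 up to t
        weights[k] = decay * i_gates[k]    # input gate of token k
    return weights

# toy example: a salient token (large input gate, e.g. "great") keeps a high
# weight as long as the later forget gates stay close to 1
print(lrn_unrolled_weights(np.array([0.2, 0.9, 0.3, 0.4]),
                           np.array([0.8, 0.9, 0.95, 0.9])))
```
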
51 | ## Experiments
52 |
53 | We experimented on six different tasks:
54 | * [Natural Language Inference](nli)
55 | * [Document Classification](doc)
56 | * [Machine Translation](mt)
57 | * [Reading Comprehension](rc)
58 | * [Named Entity Recognition](ner)
59 | * [Language Modeling](lm)
60 |
61 |
62 | ## Citation
63 |
64 | Please cite the following paper:
65 | > Biao Zhang; Rico Sennrich (2019). *A Lightweight Recurrent Network for Sequence Modeling*.
66 | In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. Florence, Italy.
67 |
68 | ```
69 | @inproceedings{zhang-sennrich:2019:ACL,
70 | address = "Florence, Italy",
71 | author = "Zhang, Biao and Sennrich, Rico",
72 | booktitle = "{Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}",
73 | publisher = "Association for Computational Linguistics",
74 | title = "{A Lightweight Recurrent Network for Sequence Modeling}",
75 | year = "2019"
76 | }
77 | ```
78 |
79 | ## Contact
80 |
81 | For any further comments or questions about LRN, please email Biao Zhang.
--------------------------------------------------------------------------------
/doc/code/utils/cycle.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 |
10 | def _zero_variables(variables, name=None):
11 | ops = []
12 |
13 | for var in variables:
14 | with tf.device(var.device):
15 | op = var.assign(tf.zeros(var.shape.as_list()))
16 | ops.append(op)
17 |
18 | return tf.group(*ops, name=name or "zero_variables")
19 |
20 |
21 | def _replicate_variables(variables, device=None, suffix="Replica"):
22 | new_vars = []
23 |
24 | for var in variables:
25 | device = device or var.device
26 | with tf.device(device):
27 | name = var.op.name + "/{}".format(suffix)
28 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()),
29 | name=name, trainable=False))
30 |
31 | return new_vars
32 |
33 |
34 | def _collect_gradients(gradients, variables):
35 | ops = []
36 |
37 | for grad, var in zip(gradients, variables):
38 | if isinstance(grad, tf.Tensor):
39 | ops.append(tf.assign_add(var, grad))
40 | else:
41 | ops.append(tf.scatter_add(var, grad.indices, grad.values))
42 |
43 | return tf.group(*ops, name="collect_gradients")
44 |
45 |
46 | def create_train_op(named_scalars, grads_and_vars, optimizer, global_step, params):
47 | gradients = [item[0] for item in grads_and_vars]
48 | variables = [item[1] for item in grads_and_vars]
49 |
50 | if params.update_cycle == 1:
51 | zero_variables_op = tf.no_op("zero_variables")
52 | collect_op = tf.no_op("collect_op")
53 | else:
54 | named_vars = {}
55 | for name in named_scalars:
56 | named_var = tf.Variable(tf.zeros([]),
57 | name="{}/CTrainOpReplica".format(name),
58 | trainable=False)
59 | named_vars[name] = named_var
60 | count_var = tf.Variable(tf.zeros([]), name="count/CTrainOpReplica",
61 | trainable=False)
62 | slot_variables = _replicate_variables(variables, suffix="CTrainOpReplica")
63 | zero_variables_op = _zero_variables(
64 | slot_variables + [count_var] + named_vars.values())
65 |
66 | collect_ops = []
67 | # collect gradients
68 | collect_grads_op = _collect_gradients(gradients, slot_variables)
69 | collect_ops.append(collect_grads_op)
70 |
71 | # collect other scalars
72 | for name in named_scalars:
73 | scalar = named_scalars[name]
74 | named_var = named_vars[name]
75 | collect_op = tf.assign_add(named_var, scalar)
76 | collect_ops.append(collect_op)
77 | # collect counting variable
78 | collect_count_op = tf.assign_add(count_var, 1.0)
79 | collect_ops.append(collect_count_op)
80 |
81 | collect_op = tf.group(*collect_ops, name="collect_op")
82 | scale = 1.0 / (tf.to_float(count_var) + 1.0)
83 | gradients = [scale * (g + s)
84 | for (g, s) in zip(gradients, slot_variables)]
85 |
86 | for name in named_scalars:
87 | named_scalars[name] = scale * (
88 | named_scalars[name] + named_vars[name])
89 |
90 | global_norm = tf.global_norm(gradients)
91 |
92 | # Gradient clipping
93 | if isinstance(params.clip_grad_norm or None, float):
94 | gradients, _ = tf.clip_by_global_norm(gradients,
95 | params.clip_grad_norm,
96 | use_norm=global_norm)
97 |
98 | # Update variables
99 | grads_and_vars = list(zip(gradients, variables))
100 | train_op = optimizer.apply_gradients(grads_and_vars, global_step)
101 |
102 | ops = {
103 | "zero_op": zero_variables_op,
104 | "collect_op": collect_op,
105 | "train_op": train_op
106 | }
107 |
108 | # apply ema
109 | if params.ema_decay > 0.:
110 | tf.logging.info('Using Exp Moving Average to train the model with decay {}.'.format(params.ema_decay))
111 | ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay, num_updates=global_step)
112 | ema_op = ema.apply(variables)
113 | with tf.control_dependencies([ops['train_op']]):
114 | ops['train_op'] = tf.group(ema_op)
115 | bck_vars = _replicate_variables(variables, suffix="CTrainOpBackUpReplica")
116 |
117 | ops['ema_backup_op'] = tf.group(*(tf.assign(bck, var.read_value())
118 | for bck, var in zip(bck_vars, variables)))
119 | ops['ema_restore_op'] = tf.group(*(tf.assign(var, bck.read_value())
120 | for bck, var in zip(bck_vars, variables)))
121 | ops['ema_assign_op'] = tf.group(*(tf.assign(var, ema.average(var).read_value())
122 | for var in variables))
123 |
124 | ret = named_scalars
125 | ret.update({
126 | "gradient_norm": global_norm,
127 | })
128 |
129 | return ret, ops
130 |
--------------------------------------------------------------------------------
/nli/code/utils/cycle.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import tensorflow as tf
8 |
9 |
10 | def _zero_variables(variables, name=None):
11 | ops = []
12 |
13 | for var in variables:
14 | with tf.device(var.device):
15 | op = var.assign(tf.zeros(var.shape.as_list()))
16 | ops.append(op)
17 |
18 | return tf.group(*ops, name=name or "zero_variables")
19 |
20 |
21 | def _replicate_variables(variables, device=None, suffix="Replica"):
22 | new_vars = []
23 |
24 | for var in variables:
25 | device = device or var.device
26 | with tf.device(device):
27 | name = var.op.name + "/{}".format(suffix)
28 | new_vars.append(tf.Variable(tf.zeros(var.shape.as_list()),
29 | name=name, trainable=False))
30 |
31 | return new_vars
32 |
33 |
34 | def _collect_gradients(gradients, variables):
35 | ops = []
36 |
37 | for grad, var in zip(gradients, variables):
38 | if isinstance(grad, tf.Tensor):
39 | ops.append(tf.assign_add(var, grad))
40 | else:
41 | ops.append(tf.scatter_add(var, grad.indices, grad.values))
42 |
43 | return tf.group(*ops, name="collect_gradients")
44 |
45 |
46 | def create_train_op(named_scalars, grads_and_vars, optimizer, global_step, params):
47 | gradients = [item[0] for item in grads_and_vars]
48 | variables = [item[1] for item in grads_and_vars]
49 |
50 | if params.update_cycle == 1:
51 | zero_variables_op = tf.no_op("zero_variables")
52 | collect_op = tf.no_op("collect_op")
53 | else:
54 | named_vars = {}
55 | for name in named_scalars:
56 | named_var = tf.Variable(tf.zeros([]),
57 | name="{}/CTrainOpReplica".format(name),
58 | trainable=False)
59 | named_vars[name] = named_var
60 | count_var = tf.Variable(tf.zeros([]), name="count/CTrainOpReplica",
61 | trainable=False)
62 | slot_variables = _replicate_variables(variables, suffix="CTrainOpReplica")
63 | zero_variables_op = _zero_variables(
64 | slot_variables + [count_var] + named_vars.values())
65 |
66 | collect_ops = []
67 | # collect gradients
68 | collect_grads_op = _collect_gradients(gradients, slot_variables)
69 | collect_ops.append(collect_grads_op)
70 |
71 | # collect other scalars
72 | for name in named_scalars:
73 | scalar = named_scalars[name]
74 | named_var = named_vars[name]
75 | collect_op = tf.assign_add(named_var, scalar)
76 | collect_ops.append(collect_op)
77 | # collect counting variable
78 | collect_count_op = tf.assign_add(count_var, 1.0)
79 | collect_ops.append(collect_count_op)
80 |
81 | collect_op = tf.group(*collect_ops, name="collect_op")
82 | scale = 1.0 / (tf.to_float(count_var) + 1.0)
83 | gradients = [scale * (g + s)
84 | for (g, s) in zip(gradients, slot_variables)]
85 |
86 | for name in named_scalars:
87 | named_scalars[name] = scale * (
88 | named_scalars[name] + named_vars[name])
89 |
90 | global_norm = tf.global_norm(gradients)
91 |
92 | # Gradient clipping
93 | if isinstance(params.clip_grad_norm or None, float):
94 | gradients, _ = tf.clip_by_global_norm(gradients,
95 | params.clip_grad_norm,
96 | use_norm=global_norm)
97 |
98 | # Update variables
99 | grads_and_vars = list(zip(gradients, variables))
100 | train_op = optimizer.apply_gradients(grads_and_vars, global_step)
101 |
102 | ops = {
103 | "zero_op": zero_variables_op,
104 | "collect_op": collect_op,
105 | "train_op": train_op
106 | }
107 |
108 | # apply ema
109 | if params.ema_decay > 0.:
110 | tf.logging.info('Using Exp Moving Average to train the model with decay {}.'.format(params.ema_decay))
111 | ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay, num_updates=global_step)
112 | ema_op = ema.apply(variables)
113 | with tf.control_dependencies([ops['train_op']]):
114 | ops['train_op'] = tf.group(ema_op)
115 | bck_vars = _replicate_variables(variables, suffix="CTrainOpBackUpReplica")
116 |
117 | ops['ema_backup_op'] = tf.group(*(tf.assign(bck, var.read_value())
118 | for bck, var in zip(bck_vars, variables)))
119 | ops['ema_restore_op'] = tf.group(*(tf.assign(var, bck.read_value())
120 | for bck, var in zip(bck_vars, variables)))
121 | ops['ema_assign_op'] = tf.group(*(tf.assign(var, ema.average(var).read_value())
122 | for var in variables))
123 |
124 | ret = named_scalars
125 | ret.update({
126 | "gradient_norm": global_norm,
127 | })
128 |
129 | return ret, ops
130 |
--------------------------------------------------------------------------------
/nli/code/evalu.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import time
8 | import json
9 | import tensorflow as tf
10 |
11 | from utils import queuer
12 |
13 |
14 | def decoding(sprobs, samples, params, mask=None):
15 | """Generate decoded sequence from seqs"""
16 | if mask is None:
17 | mask = [1.] * len(sprobs)
18 |
19 | flat_sprobs = []
20 | for _sprobs, _m in zip(sprobs, mask):
21 | if _m < 1.: continue
22 |
23 | for start_prob in _sprobs:
24 | flat_sprobs.append(start_prob)
25 |
26 | assert len(flat_sprobs) == len(samples), 'Decoding length mismatch!'
27 |
28 | results = []
29 |
30 | for (idx, sample), pred in zip(samples, flat_sprobs):
31 | gold_label = sample[0]
32 | pred_label = pred
33 |
34 | results.append({
35 | 'pred_answer': int(pred_label),
36 | 'sample_id': idx,
37 | 'gold_answer': gold_label
38 | })
39 |
40 | return results
41 |
42 |
43 | def predict(session, features,
44 | out_pred, dataset, params, train=True):
45 | """Performing decoding with exising information"""
46 | results = []
47 |
48 | batcher = dataset.batcher(params.eval_batch_size,
49 | buffer_size=params.buffer_size,
50 | shuffle=False, train=train)
51 | eval_queue = queuer.EnQueuer(batcher,
52 | multiprocessing=params.data_multiprocessing,
53 | random_seed=params.random_seed)
54 | eval_queue.start(workers=params.nthreads,
55 | max_queue_size=params.max_queue_size)
56 |
57 | def _predict_one_batch(data_on_gpu):
58 | feed_dicts = {}
59 | flat_raw_data = []
60 | for fidx, data in enumerate(data_on_gpu):
61 | # define feed_dict
62 | feed_dict = {
63 | features[fidx]["p"]: data['p_token_ids'],
64 | features[fidx]["h"]: data['h_token_ids'],
65 | features[fidx]["l"]: data['l_id'],
66 | }
67 | if params.use_char:
68 | feed_dict[features[fidx]["pc"]] = data['p_char_ids']
69 | feed_dict[features[fidx]["hc"]] = data['h_char_ids']
70 |
71 | if params.enable_bert:
72 | feed_dict[features[fidx]["ps"]] = data['p_subword_ids']
73 | feed_dict[features[fidx]["hs"]] = data['h_subword_ids']
74 | feed_dict[features[fidx]["pb"]] = data['p_subword_back']
75 | feed_dict[features[fidx]["hb"]] = data['h_subword_back']
76 |
77 | feed_dicts.update(feed_dict)
78 | flat_raw_data.extend(data['raw'])
79 |
80 | # pick up valid outputs
81 | data_size = len(data_on_gpu)
82 | valid_out_pred = out_pred[:data_size]
83 |
84 | decode_spred = session.run(
85 | valid_out_pred, feed_dict=feed_dicts)
86 |
87 | predictions = decoding(
88 | decode_spred, flat_raw_data, params
89 | )
90 |
91 | return predictions
92 |
93 | very_begin_time = time.time()
94 | data_on_gpu = []
95 | for bidx, data in enumerate(eval_queue.get()):
96 |
97 | data_on_gpu.append(data)
98 | # when using multiple GPUs, wait until one batch per GPU has been collected
99 | if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus):
100 | continue
101 |
102 | start_time = time.time()
103 | predictions = _predict_one_batch(data_on_gpu)
104 | data_on_gpu = []
105 | results.extend(predictions)
106 |
107 | tf.logging.info(
108 | "Decoding Batch {} using {:.3f} s, translating {} "
109 | "sentences using {:.3f} s in total".format(
110 | bidx, time.time() - start_time,
111 | len(results), time.time() - very_begin_time
112 | )
113 | )
114 |
115 | eval_queue.stop()
116 |
117 | if len(data_on_gpu) > 0:
118 | start_time = time.time()
119 | predictions = _predict_one_batch(data_on_gpu)
120 | results.extend(predictions)
121 |
122 | tf.logging.info(
123 | "Decoding Batch {} using {:.3f} s, translating {} "
124 | "sentences using {:.3f} s in total".format(
125 | 'final', time.time() - start_time,
126 | len(results), time.time() - very_begin_time
127 | )
128 | )
129 |
130 | return results
131 |
132 |
133 | def eval_metric(results, params):
134 | """BLEU Evaluate """
135 |
136 | crr_cnt, total_cnt = 0, 0
137 |
138 | for result in results:
139 | total_cnt += 1
140 |
141 | p = result['pred_answer']
142 | g = result['gold_answer']
143 |
144 | if p == g:
145 | crr_cnt += 1
146 |
147 | return crr_cnt * 100. / total_cnt
148 |
149 |
150 | def dump_predictions(results, output):
151 | """save translation"""
152 | with tf.gfile.Open(output, 'w') as writer:
153 | for sample in results:
154 | writer.write(json.dumps(sample) + "\n")
155 | tf.logging.info("Saving translations into {}".format(output))
156 |
--------------------------------------------------------------------------------
/nli/code/vocab.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import json
8 | import argparse
9 | import numpy as np
10 |
11 |
12 | class Vocab(object):
13 | def __init__(self, lower=False, vocab_file=None):
14 | self.word2id = {}
15 | self.id2word = {}
16 | self.word2count = {}
17 |
18 | self.pad_sym = "<pad>"
19 | self.eos_sym = "<eos>"
20 | self.unk_sym = "<unk>"
21 |
22 | self.lower = lower
23 |
24 | self.insert(self.pad_sym)
25 | self.insert(self.unk_sym)
26 | self.insert(self.eos_sym)
27 |
28 | if vocab_file is not None:
29 | self.load_vocab(vocab_file)
30 |
31 | def insert(self, token):
32 | token = token if not self.lower else token.lower()
33 | if token not in self.word2id:
34 | index = len(self.word2id)
35 | self.word2id[token] = index
36 | self.id2word[index] = token
37 |
38 | self.word2count[token] = 0
39 | self.word2count[token] += 1
40 |
41 | def size(self):
42 | return len(self.word2id)
43 |
44 | def load_vocab(self, vocab_file):
45 | with open(vocab_file, 'r') as reader:
46 | for token in reader:
47 | self.insert(token.strip())
48 |
49 | def get_token(self, id):
50 | if id in self.id2word:
51 | return self.id2word[id]
52 | return self.unk_sym
53 |
54 | def get_id(self, token):
55 | token = token if not self.lower else token.lower()
56 | if token in self.word2id:
57 | return self.word2id[token]
58 | return self.word2id[self.unk_sym]
59 |
60 | def sort_vocab(self, least_freq=-1):
61 | sorted_word2count = sorted(
62 | self.word2count.items(), key=lambda x: - x[1])
63 | self.word2id, self.id2word, self.word2count = {}, {}, {}
64 | self.insert(self.pad_sym)
65 | self.insert(self.unk_sym)
66 | self.insert(self.eos_sym)
67 | for word, freq in sorted_word2count:
68 | if least_freq > 0:
69 | if freq <= least_freq:
70 | continue
71 | self.insert(word)
72 |
73 | def save_vocab(self, vocab_file):
74 | with open(vocab_file, 'w') as writer:
75 | for id in range(self.size()):
76 | writer.write(self.id2word[id].encode("utf-8") + "\n")
77 |
78 | def to_id(self, tokens, append_eos=True):
79 | if not append_eos:
80 | return [self.get_id(token) for token in tokens]
81 | else:
82 | return [self.get_id(token) for token in
83 | tokens + [self.eos_sym]]
84 |
85 | def to_tokens(self, ids):
86 | return [self.get_token(id) for id in ids]
87 |
88 | def eos(self):
89 | return self.get_id(self.eos_sym)
90 |
91 | def pad(self):
92 | return self.get_id(self.pad_sym)
93 |
94 |
95 | if __name__ == "__main__":
96 | parser = argparse.ArgumentParser('Vocabulary Preparation')
97 | parser.add_argument('--char', action='store_true', help='build char-level vocabulary')
98 | parser.add_argument('--lower', action='store_true', help='lower-case datasets')
99 | parser.add_argument('--embeddings', type=str, default='no', help='pre-trained word embedding path')
100 | parser.add_argument('inputs', type=str, help='the input file path, separate with comma')
101 | parser.add_argument('output', type=str, help='the output file name')
102 |
103 | args = parser.parse_args()
104 |
105 | vocab = Vocab(lower=args.lower)
106 | for data_file in args.inputs.split(','):
107 | with open(data_file, 'r') as reader:
108 | for text in reader:
109 | tokens = text.strip().split()
110 |
111 | for token in tokens:
112 | if not args.char:
113 | vocab.insert(token)
114 | else:
115 | for char in list(token):
116 | vocab.insert(char)
117 |
118 | vocab.sort_vocab(least_freq=3 if args.char else -1)
119 |
120 | # process the vocabulary with pretrained-embeddings
121 | if args.embeddings != "no":
122 | embed_tokens = {}
123 | embed_size = None
124 | with open(args.embeddings, 'r') as reader:
125 | for line in reader:
126 | segs = line.strip().split(' ')
127 |
128 | token = segs[0]
129 | # Not used in our training data, pass
130 | if token not in vocab.word2id:
131 | continue
132 | embed_tokens[token] = list(map(float, segs[1:]))
133 |
134 | if embed_size is None:
135 | embed_size = len(segs) - 1
136 |
137 | vocab = Vocab(lower=args.lower)
138 | for token in embed_tokens:
139 | vocab.insert(token)
140 |
141 | # load embeddings
142 | embeddings = np.zeros([len(embed_tokens), embed_size])
143 | for token in embed_tokens:
144 | # 3: the special symbols
145 | embeddings[vocab.get_id(token) - 3] = embed_tokens[token]
146 | np.savez(args.output + ".npz", data=embeddings)
147 |
148 | vocab.save_vocab(args.output)
149 |
150 | print("Loading {} tokens from {}".format(vocab.size(), args.inputs))
151 |
--------------------------------------------------------------------------------
/doc/code/vocab.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import os
8 | import argparse
9 | import numpy as np
10 | import tensorflow as tf
11 |
12 | import sys
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 | from bert.tokenization import BasicTokenizer as Tokenizer
17 |
18 |
19 | class Vocab(object):
20 | def __init__(self, lower=False, vocab_file=None):
21 | self.lower = lower
22 |
23 | self.word2id = {}
24 | self.id2word = {}
25 | self.word2count = {}
26 |
27 | self.pad_sym = "<pad>"
28 | self.eos_sym = "<eos>"
29 | self.unk_sym = "<unk>"
30 |
31 | self.clean()
32 |
33 | self.pretrained_embedding = None
34 |
35 | if vocab_file is not None:
36 | self.load_vocab(vocab_file)
37 |
38 | if vocab_file is not None and os.path.exists(vocab_file + ".npz"):
39 | pretrain_embedding = np.load(vocab_file + ".npz")['data']
40 | self.pretrained_embedding = pretrain_embedding
41 |
42 | def clean(self):
43 | self.word2id = {}
44 | self.id2word = {}
45 | self.word2count = {}
46 |
47 | self.insert(self.pad_sym)
48 | self.insert(self.unk_sym)
49 | self.insert(self.eos_sym)
50 |
51 | def insert(self, token):
52 | token = token if not self.lower else token.lower()
53 | if token not in self.word2id:
54 | index = len(self.word2id)
55 | self.word2id[token] = index
56 | self.id2word[index] = token
57 |
58 | self.word2count[token] = 0
59 | self.word2count[token] += 1
60 |
61 | def size(self):
62 | return len(self.word2id)
63 |
64 | def load_vocab(self, vocab_file):
65 | with open(vocab_file, 'r') as reader:
66 | for token in reader:
67 | self.insert(token.strip())
68 |
69 | def get_token(self, id):
70 | if id in self.id2word:
71 | return self.id2word[id]
72 | return self.unk_sym
73 |
74 | def get_id(self, token):
75 | token = token if not self.lower else token.lower()
76 | if token in self.word2id:
77 | return self.word2id[token]
78 | return self.word2id[self.unk_sym]
79 |
80 | def sort_vocab(self, least_freq=-1):
81 | sorted_word2count = sorted(
82 | self.word2count.items(), key=lambda x: - x[1])
83 | self.clean()
84 | for word, freq in sorted_word2count:
85 | if least_freq > 0:
86 | if freq <= least_freq:
87 | continue
88 | self.insert(word)
89 |
90 | def save_vocab(self, vocab_file):
91 | with open(vocab_file, 'w') as writer:
92 | for id in range(self.size()):
93 | writer.write(self.id2word[id].encode("utf-8") + "\n")
94 |
95 | np.savez(vocab_file + ".npz", data=self.pretrained_embedding)
96 |
97 | def to_id(self, tokens, append_eos=True):
98 | if not append_eos:
99 | return [self.get_id(token) for token in tokens]
100 | else:
101 | return [self.get_id(token) for token in
102 | tokens + [self.eos_sym]]
103 |
104 | def to_tokens(self, ids):
105 | return [self.get_token(id) for id in ids]
106 |
107 | def eos(self):
108 | return self.get_id(self.eos_sym)
109 |
110 | def pad(self):
111 | return self.get_id(self.pad_sym)
112 |
113 | def make_vocab(self, data_set, use_char=False, embedding_path=None):
114 | tf.logging.info("Starting Reading Data in {} Manner".format(use_char))
115 | tokenizer = Tokenizer(do_lower_case=False)
116 |
117 | for data_iter in [data_set.get_train_data(),
118 | data_set.get_dev_data(),
119 | data_set.get_test_data()]:
120 | for sample in data_iter:
121 | label, document = sample
122 |
123 | tokens = tokenizer.tokenize(document)
124 | for token in tokens:
125 | if not use_char:
126 | self.insert(token)
127 | else:
128 | for char in list(token):
129 | self.insert(char)
130 |
131 | tf.logging.info("Data Loading Over, Starting Sorted")
132 | self.sort_vocab(least_freq=3 if use_char else -1)
133 |
134 | # process the vocabulary with pretrained-embeddings
135 | if embedding_path is not None:
136 | tf.logging.info("Pretrained Word Embedding Loading")
137 | embed_tokens = {}
138 | embed_size = None
139 | with open(embedding_path, 'r') as reader:
140 | for line in reader:
141 | segs = line.strip().split(' ')
142 |
143 | token = segs[0]
144 | # Not used in our training data, pass
145 | if token not in self.word2id:
146 | continue
147 | embed_tokens[token] = list(map(float, segs[1:]))
148 |
149 | if embed_size is None:
150 | embed_size = len(segs) - 1
151 |
152 | self.clean()
153 | for token in embed_tokens:
154 | self.insert(token)
155 |
156 | # load embeddings
157 | embeddings = np.zeros([len(embed_tokens), embed_size])
158 | for token in embed_tokens:
159 | # 3: the special symbols
160 | embeddings[self.get_id(token) - 3] = embed_tokens[token]
161 |
162 | self.pretrained_embedding = embeddings
163 |
164 | tf.logging.info("Vocabulary Loading Finished")
165 |
--------------------------------------------------------------------------------
/nli/README.md:
--------------------------------------------------------------------------------
1 | ## Natural Language Inference
2 |
3 | The dataset is Stanford Natural Language Inference (SNLI), which we treat as a three-way classification task.
4 | We use an encoder-attention-decoder architecture, and stack two additional biRNN layers upon the final sequence representation.
5 | Both GloVe word embeddings and character embeddings are used for word-level representation.
6 | Main experimental results are summarized below.
7 |
8 |
9 |
10 | | Model | #Params | Base ACC | Base Time | +LN ACC | +LN Time | +BERT ACC | +BERT Time | +LN+BERT ACC | +LN+BERT Time |
11 | |-------|---------|----------|-----------|---------|----------|-----------|------------|--------------|---------------|
12 | | Rocktaschel et al. (2016) | 250K | 83.50 | - | - | - | - | - | - | - |
13 | | This work: LSTM | 8.36M | 84.27 | 0.262 | 86.03 | 0.432 | 89.95 | 0.544 | 90.49 | 0.696 |
14 | | This work: GRU | 6.41M | 85.71 | 0.245 | 86.05 | 0.419 | 90.29 | 0.529 | 90.10 | 0.695 |
15 | | This work: ATR | 2.87M | 84.88 | 0.210 | 85.81 | 0.307 | 90.00 | 0.494 | 90.28 | 0.580 |
16 | | This work: SRU | 5.48M | 84.28 | 0.258 | 85.32 | 0.283 | 89.98 | 0.543 | 90.09 | 0.555 |
17 | | This work: LRN | 4.25M | 84.88 | 0.209 | 85.06 | 0.223 | 89.98 | 0.488 | 89.93 | 0.506 |
101 |
102 | *LN*: layer normalization; *ACC*: accuracy; *Time*: time in seconds per training batch, measured from 1k training steps.
103 |
104 | ## Requirement
105 | tensorflow >= 1.8.1
106 |
107 | ## How to Run?
108 |
109 | - download and preprocess dataset
110 |
111 | - The dataset link: https://nlp.stanford.edu/projects/snli/
112 | - Prepare separate data files:
113 |
114 |      We provide a simple processing script `convert_to_plain.py` in the scripts folder. By calling:
115 | ```
116 | python convert_to_plain.py snli_1.0/[ds].txt
117 | ```
118 |      you can get the `*.p, *.q, *.l` files referenced in `config.py`. *[ds]* stands for `snli_1.0_train.txt`,
119 |      `snli_1.0_dev.txt` and `snli_1.0_test.txt`. We only preserve `'entailment', 'neutral', 'contradiction'` instances;
120 |      all others are dropped.
121 |
122 | - Prepare embedding and vocabulary
123 |
124 |     Download the [pre-trained GloVe embedding](http://nlp.stanford.edu/data/glove.840B.300d.zip), and prepare
125 |     the character and word vocabularies using `vocab.py` as follows:
126 | ```
127 | # word embedding & vocabulary
128 | python vocab.py --embeddings [path-to-glove-embedding] train.p,train.q,dev.p,dev.q,test.p,test.q word_vocab
129 | # char embedding
130 | python vocab.py --char train.p,train.q,dev.p,dev.q,test.p,test.q char_vocab
131 | ```
132 |
133 | - Download BERT pre-trained embedding (if you plan to work with BERT)
134 |
135 | - training and evaluation
136 |
137 | - Train the model as follows:
138 | ```
139 | # configure your CUDA library if necessary
140 | export CUDA_ROOT=XXX
141 | export PATH=$CUDA_ROOT/bin:$PATH
142 | export LD_LIBRARY_PATH=$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
143 |
144 | # LRN
145 | python code/run.py --mode train --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln" >& log.noln
146 | # LRN + LN
147 | python code/run.py --mode train --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln" >& log.ln
148 | # LRN + BERT
149 | python code/run.py --mode train --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln_bert" >& log.noln.bert
150 | # LRN + LN + BERT
151 | python code/run.py --mode train --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln_bert" >& log.ln.bert
152 | ```
153 | Other hyperparameter settings are available in the given config.py.
154 |
155 | - Test the model as follows:
156 | ```
157 | # LRN
158 | python code/run.py --mode test --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln/best",test_output="out.noln" >& log.noln.test
159 | # LRN + LN
160 | python code/run.py --mode test --config config.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln/best",test_output="out.ln" >& log.ln.test
161 | # LRN + BERT
162 | python code/run.py --mode test --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=False,output_dir="train_no_ln_bert/best",test_output="out.noln.bert" >& log.noln.bert.test
163 | # LRN + LN + BERT
164 | python code/run.py --mode test --config config_bert.py --parameters=gpus=[0],cell="lrn",layer_norm=True,output_dir="train_ln_bert/best",test_output="out.ln.bert" >& log.ln.bert.test
165 | ```
166 |
167 | ## Credits
168 |
169 | Source code structure is adapted from [zero](https://github.com/bzhangGo/zero).
--------------------------------------------------------------------------------