├── lstm_model
    ├── src
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── config.cpython-36.pyc
    │   │   ├── models.cpython-36.pyc
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── base_model.cpython-36.pyc
    │   │   ├── data_utils.cpython-36.pyc
    │   │   ├── general_utils.cpython-36.pyc
    │   │   └── adversarial_losses.cpython-36.pyc
    │   ├── general_utils.py
    │   ├── base_model.py
    │   ├── config.py
    │   ├── adversarial_losses.py
    │   ├── ner_model.py
    │   └── data_utils.py
    ├── __init__.py
    ├── __pycache__
    │   └── config.cpython-36.pyc
    ├── run_train_pico.py
    ├── run_train.py
    ├── build_data.py
    ├── run_train_nicta.py
    ├── run_train_cross_validate.py
    ├── run_train_cross_validate_nicta.py
    ├── run_train_cross_validate_pico.py
    ├── general_utils.py
    ├── base_model.py
    ├── config.py
    ├── adversarial_losses.py
    ├── ner_model.py
    └── data_utils.py
├── BERT
    ├── pytorch_pretrained_bert
    │   ├── module
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── san.cpython-36.pyc
    │   │   │   ├── common.cpython-36.pyc
    │   │   │   ├── __init__.cpython-36.pyc
    │   │   │   ├── my_optim.cpython-36.pyc
    │   │   │   ├── similarity.cpython-36.pyc
    │   │   │   └── dropout_wrapper.cpython-36.pyc
    │   │   ├── common.py
    │   │   ├── dropout_wrapper.py
    │   │   ├── my_optim.py
    │   │   └── san.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── modeling.cpython-36.pyc
    │   │   ├── file_utils.cpython-36.pyc
    │   │   ├── optimization.cpython-36.pyc
    │   │   └── tokenization.cpython-36.pyc
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── convert_tf_checkpoint_to_pytorch.py
    │   └── file_utils.py
    ├── __pycache__
    │   ├── crf.cpython-36.pyc
    │   ├── utils.cpython-36.pyc
    │   ├── bert_model.cpython-36.pyc
    │   └── adversarial_losses.cpython-36.pyc
    ├── run_classifier_nicta.py
    ├── run_classifier_pico.py
    ├── run_classifier_pico_cross_validate.py
    ├── run_classifier_nicta_cross_validate.py
    ├── utils.py
    ├── adversarial_losses.py
    ├── bert_model.py
    └── crf.py
├── requirements.txt
└── README.md

/lstm_model/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/BERT/pytorch_pretrained_bert/module/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lstm_model/__init__.py:
--------------------------------------------------------------------------------
1 | from . import base_model
2 | from . import config
--------------------------------------------------------------------------------
/BERT/__pycache__/crf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/crf.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT/__pycache__/bert_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/bert_model.cpython-36.pyc
--------------------------------------------------------------------------------
/lstm_model/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/__pycache__/adversarial_losses.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/adversarial_losses.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/base_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/base_model.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/data_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/data_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/general_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/general_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/adversarial_losses.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/adversarial_losses.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/modeling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/modeling.cpython-36.pyc 
-------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/file_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/file_utils.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/optimization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/optimization.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/tokenization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/tokenization.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/san.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/san.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/my_optim.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/my_optim.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/similarity.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/similarity.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/dropout_wrapper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/dropout_wrapper.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/run_classifier_nicta.py: -------------------------------------------------------------------------------- 1 | import os 
2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | command = 'python bert_classifier.py --data_dir ../data/nicta_piboso ' \ 7 | '--bert_model {} ' \ 8 | '--task_name nicta --output_dir results/nicta/biobert_crf ' \ 9 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 10 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 11 | '--rnn_hidden_size 512 --dropout 0.3 '.format(MODEL_PATH) 12 | 13 | os.system(command) -------------------------------------------------------------------------------- /BERT/run_classifier_pico.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | # use bio-bert 7 | command = 'python bert_classifier.py --data_dir ../data/pico ' \ 8 | '--bert_model {} ' \ 9 | '--task_name pico --output_dir results/PICO/biobert_crf ' \ 10 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 11 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 12 | '--rnn_hidden_size 512 --dropout 0.3 '.format(MODEL_PATH) 13 | 14 | os.system(command) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 2 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 3 | BertForMaskedLM, BertForNextSentencePrediction, 4 | BertForSequenceClassification, BertForMultipleChoice, 5 | BertForTokenClassification, BertForQuestionAnswering, 6 | BertForMultipleChoice_SAN) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astor==0.8.1 3 | boto3==1.13.24 4 | botocore==1.16.24 5 | certifi==2020.4.5.1 6 | chardet==3.0.4 7 | docutils==0.15.2 8 | future==0.18.2 9 | gast==0.3.3 10 | grpcio==1.29.0 11 | idna==2.9 12 | importlib-metadata==1.6.1 13 | jmespath==0.10.0 14 | joblib==0.15.1 15 | Markdown==3.2.2 16 | numpy==1.14.5 17 | protobuf==3.12.2 18 | python-dateutil==2.8.1 19 | requests==2.23.0 20 | s3transfer==0.3.3 21 | scikit-learn==0.23.1 22 | scipy==1.4.1 23 | six==1.15.0 24 | sklearn==0.0 25 | tensorboard==1.10.0 26 | tensorflow-gpu==1.10.0 27 | termcolor==1.1.0 28 | threadpoolctl==2.1.0 29 | torch==1.0.0 30 | tqdm==4.46.1 31 | urllib3==1.25.9 32 | Werkzeug==1.0.1 33 | zipp==3.1.0 34 | -------------------------------------------------------------------------------- /BERT/run_classifier_pico_cross_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | for fold in range(1, 11): 7 | # # use bio-bert 8 | command = 'python bert_classifier.py --data_dir ../data/pico/10_folds/{fold} ' \ 9 | '--bert_model {MODEL_PATH} ' \ 10 | '--task_name pico --output_dir results/PICO/biobert_crf_{fold} ' \ 11 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 12 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 13 | '--rnn_hidden_size 512 --dropout 0.2 '.format(fold=fold, 14 | MODEL_PATH=MODEL_PATH) 15 | 16 | os.system(command) -------------------------------------------------------------------------------- 
/BERT/run_classifier_nicta_cross_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | for fold in range(1, 11): 7 | # use bio-bert 8 | command = 'python bert_classifier.py --data_dir ../data/nicta_piboso/10_folds/{fold} ' \ 9 | '--bert_model {MODEL_PATH} ' \ 10 | '--task_name nicta --output_dir results/nicta/biobert_crf_{fold} ' \ 11 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 12 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 13 | '--rnn_hidden_size 512 --dropout 0.3 '.format(fold=fold, 14 | MODEL_PATH=MODEL_PATH) 15 | 16 | os.system(command) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import math 4 | from torch.nn.functional import tanh, relu, prelu, leaky_relu, sigmoid, elu, selu 5 | from torch.nn.init import uniform, normal, eye, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal 6 | 7 | def linear(x): 8 | return x 9 | 10 | def swish(x): 11 | return x * sigmoid(x) 12 | 13 | def bertgelu(x): 14 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 15 | 16 | def gptgelu(x): 17 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 18 | 19 | # default gelue 20 | gelu = bertgelu 21 | 22 | def activation(func_a): 23 | """Activation function wrapper 24 | """ 25 | try: 26 | f = eval(func_a) 27 | except: 28 | f = linear 29 | return f 30 | 31 | def init_wrapper(init='xavier_uniform'): 32 | return eval(init) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 8 | "In that case, it requires TensorFlow to be installed. Please see " 9 | "https://www.tensorflow.org/install/ for installation instructions.") 10 | raise 11 | 12 | if len(sys.argv) != 5: 13 | # pylint: disable=line-too-long 14 | print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 15 | else: 16 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 17 | TF_CONFIG = sys.argv.pop() 18 | TF_CHECKPOINT = sys.argv.pop() 19 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/dropout_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class DropoutWrapper(nn.Module): 7 | """ 8 | This is a dropout wrapper which supports the fix mask dropout 9 | """ 10 | def __init__(self, dropout_p=0, enable_vbp=True): 11 | super(DropoutWrapper, self).__init__() 12 | """variational dropout means fix dropout mask 13 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11 14 | """ 15 | self.enable_variational_dropout = enable_vbp 16 | self.dropout_p = dropout_p 17 | 18 | def forward(self, x): 19 | """ 20 | :param x: batch * len * input_size 21 | """ 22 | if self.training == False or self.dropout_p == 0: 23 | return x 24 | 25 | if len(x.size()) == 3: 26 | mask = 1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)) 27 | mask.requires_grad = False 28 | return mask.unsqueeze(1).expand_as(x) * x 29 | else: 30 | return F.dropout(x, p=self.dropout_p, training=self.training) -------------------------------------------------------------------------------- /lstm_model/run_train_pico.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config(parser) 12 | assert config.data_keyname == 'pico' 13 | 14 | # build model 15 | model = HANNModel(config) 16 | model.build() 17 | 18 | # create datasets 19 | dev = Dataset(config.filename_dev, config.processing_word, 20 | config.processing_tag) 21 | train = Dataset(config.filename_train, config.processing_word, 22 | config.processing_tag) 23 | test = Dataset(config.filename_test, config.processing_word, 24 | config.processing_tag) 25 | if config.num_augmentation: 26 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 27 | else: 28 | data_aug = None 29 | 30 | # train model 31 | model.train(train, dev, data_aug) 32 | 33 | # evaluate model 34 | model.restore_session(config.dir_model) 35 | model.evaluate(test) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /lstm_model/run_train.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import Dataset 2 | from model.models import HANNModel 3 | from model.config import Config 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config(parser) 12 | config.num_augmentation = 20000 13 | config.batch_size = 20 14 | config.batch_size_aug = 20 15 | config.dir_output = 'test-num_augmentation-{}-2'.format(config.num_augmentation) 16 | config.dir_model = os.path.join(config.dir_output, "model.weights") 17 | 18 | # build model 19 | model = HANNModel(config) 20 | model.build() 21 | # if config.restore: 22 | # model.restore_session("results/test/model.weights/") # optional, restore weights 23 | # model.reinitialize_weights("proj") 24 | 25 | # create datasets 26 | dev = Dataset(config.filename_dev, config.processing_word, 27 | config.processing_tag) 28 | train = Dataset(config.filename_train, config.processing_word, 29 | config.processing_tag) 30 | test = Dataset(config.filename_test, config.processing_word, 31 | config.processing_tag) 32 | if config.num_augmentation: 33 | 
data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 34 | else: 35 | data_aug = None 36 | 37 | # train model 38 | model.train(train, dev, data_aug) 39 | 40 | # evaluate model 41 | model.restore_session(config.dir_model) 42 | model.evaluate(test) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /lstm_model/build_data.py: -------------------------------------------------------------------------------- 1 | from src.config import Config 2 | from src.data_utils import Dataset, get_vocabs, UNK, NUM, WORD_PAD, TAG_PAD, \ 3 | get_wordvec_vocab, write_vocab, load_vocab, get_char_vocab, \ 4 | export_trimmed_wordvec_vectors, get_processing_word 5 | import argparse 6 | import sys 7 | 8 | # data_keyname = sys.argv[1] 9 | 10 | def main(): 11 | """Procedure to build data 12 | 13 | You MUST RUN this procedure. It iterates over the whole dataset (train, 14 | dev and test) and extract the vocabularies in terms of words, tags, and 15 | characters. Having built the vocabularies it writes them in a file. The 16 | writing of vocabulary in a file assigns an id (the line #) to each word. 17 | It then extract the relevant GloVe vectors and stores them in a np array 18 | such that the i-th entry corresponds to the i-th word in the vocabulary. 19 | 20 | 21 | Args: 22 | config: (instance of Config) has attributes like hyper-params... 23 | 24 | """ 25 | # get config and processing of words 26 | config = Config(load=False) 27 | processing_word = get_processing_word(lowercase=True) 28 | 29 | # Generators 30 | dev = Dataset(config.filename_dev, processing_word) 31 | test = Dataset(config.filename_test, processing_word) 32 | train = Dataset(config.filename_train, processing_word) 33 | 34 | # add data augmentation dataset 35 | data_aug = Dataset(config.filename_aug, processing_word) 36 | 37 | # Build Word and Tag vocab 38 | vocab_words_freq, vocab_tags = get_vocabs([train, dev, test, data_aug]) 39 | vocab_words_freq_ = {} 40 | for vocab, freq in vocab_words_freq.items(): 41 | if freq > config.min_freq: 42 | vocab_words_freq_[vocab] = freq 43 | vocab_tags.remove('None') 44 | # vocab_glove = get_wordvec_vocab(config.filename_wordvec) 45 | 46 | # vocab = vocab_words & vocab_glove 47 | vocab_words_freq_.update({UNK: 1, WORD_PAD: 1, NUM: 1}) 48 | 49 | # vocab_tags.add(TAG_PAD) 50 | 51 | # Save vocab 52 | write_vocab(vocab_words_freq_, config.filename_words) 53 | write_vocab(vocab_tags, config.filename_tags) 54 | 55 | # Trim GloVe Vectors 56 | vocab, _ = load_vocab(config.filename_words) 57 | export_trimmed_wordvec_vectors(vocab, config.filename_wordvec, 58 | config.filename_wordvec_trimmed) 59 | 60 | # Build and save char vocab 61 | # train = Dataset(config.filename_train) 62 | # vocab_chars = get_char_vocab(train) 63 | # write_vocab(vocab_chars, config.filename_chars) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /lstm_model/run_train_nicta.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | 10 | def main(): 11 | # create instance of config 12 | config = Config() 13 | assert config.data_keyname == 'nicta' 14 | config.num_augmentation = 0 15 | config.batch_size = 
20 16 | config.batch_size_aug = 20 17 | config.attention_size = 50 18 | config.hidden_size_lstm_document = 200 19 | config.dropout = 0.8 20 | config.cnn_filter_num = 150 21 | config.adv_perturb_norm_length = 4 22 | config.va_perturb_norm_length = 4 23 | config.adv_reg_coeff = 0.3 24 | config.va_reg_coeff = 0.3 25 | config.data_root = '../data/nicta_piboso' 26 | config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(config.num_augmentation, 27 | config.va_reg_coeff, 28 | config.adv_reg_coeff) 29 | config.dir_model = os.path.join(config.dir_output, "model.weights") 30 | 31 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 32 | 33 | precisions = defaultdict(list) 34 | recalls = defaultdict(list) 35 | f1s = defaultdict(list) 36 | tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT'] 37 | 38 | # build model 39 | model = HANNModel(config) 40 | model.build() 41 | # if config.restore: 42 | # model.restore_session("results/test/model.weights/") # optional, restore weights 43 | # model.reinitialize_weights("proj") 44 | 45 | # create datasets 46 | train = Dataset(os.path.join(config.data_root, 'train.txt'), config.processing_word, 47 | config.processing_tag) 48 | dev = Dataset(os.path.join(config.data_root, 'test.txt'), config.processing_word, 49 | config.processing_tag) 50 | test = Dataset(os.path.join(config.data_root, 'test.txt'), config.processing_word, 51 | config.processing_tag) 52 | if config.num_augmentation: 53 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 54 | else: 55 | data_aug = None 56 | 57 | # train model 58 | model.train(train, dev, data_aug) 59 | 60 | # evaluate model 61 | model.restore_session(config.dir_model) 62 | metrics = model.evaluate(test) 63 | 64 | [precisions[tag].append(metrics['precision_all'][tag]) for tag in tag_ls] 65 | [recalls[tag].append(metrics['recall_all'][tag]) for tag in tag_ls] 66 | [f1s[tag].append(metrics['f1_all'][tag]) for tag in tag_ls] 67 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision_all'], 68 | metrics['recall_all'], metrics['f1_all']) 69 | print(msg) 70 | with open(result_file_path, 'a') as ofile: 71 | ofile.write(msg) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /BERT/utils.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jindi' 2 | 3 | import collections 4 | from itertools import repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.utils.rnn as rnn_utils 8 | 9 | 10 | def _ntuple(n): 11 | def parse(x): 12 | if isinstance(x, collections.Iterable): 13 | return x 14 | return tuple(repeat(x, n)) 15 | return parse 16 | 17 | _single = _ntuple(1) 18 | _pair = _ntuple(2) 19 | _triple = _ntuple(3) 20 | _quadruple = _ntuple(4) 21 | 22 | 23 | # encode the sequence length information in the batch for RNN use 24 | # this is special for pytorch RNN function 25 | def prepare_rnn_seq(rnn_input, lengths, hx=None, masks=None, batch_first=False): 26 | ''' 27 | 28 | Args: 29 | rnn_input: [seq_len, batch, input_size]: tensor containing the features of the input sequence. 30 | lengths: [batch]: tensor containing the lengthes of the input sequence 31 | hx: [num_layers * num_directions, batch, hidden_size]: tensor containing the initial hidden state for each element in the batch. 
32 | masks: [seq_len, batch]: tensor containing the mask for each element in the batch. 33 | batch_first: If True, then the input and output tensors are provided as [batch, seq_len, feature]. 34 | 35 | Returns: 36 | 37 | ''' 38 | def check_decreasing(lengths): 39 | lens, order = torch.sort(lengths, dim=0, descending=True) 40 | if torch.ne(lens, lengths).sum() == 0: 41 | return None 42 | else: 43 | _, rev_order = torch.sort(order) 44 | return lens, order, rev_order 45 | 46 | check_res = check_decreasing(lengths) 47 | 48 | if check_res is None: 49 | lens = lengths 50 | rev_order = None 51 | else: 52 | lens, order, rev_order = check_res 53 | batch_dim = 0 if batch_first else 1 54 | rnn_input = rnn_input.index_select(batch_dim, order) 55 | if hx is not None: 56 | # hack lstm 57 | if isinstance(hx, tuple): 58 | hx, cx = hx 59 | hx = hx.index_select(1, order) 60 | cx = cx.index_select(1, order) 61 | hx = (hx, cx) 62 | else: 63 | hx = hx.index_select(1, order) 64 | 65 | lens = lens.tolist() 66 | seq = rnn_utils.pack_padded_sequence(rnn_input, lens, batch_first=batch_first) 67 | if masks is not None: 68 | if batch_first: 69 | masks = masks[:, :lens[0]] 70 | else: 71 | masks = masks[:lens[0]] 72 | return seq, hx, rev_order, masks 73 | 74 | 75 | # recover the sequence results from RNN function 76 | # this is special to pytorch RNN function 77 | def recover_rnn_seq(seq, rev_order, hx=None, batch_first=False): 78 | output, _ = rnn_utils.pad_packed_sequence(seq, batch_first=batch_first) 79 | if rev_order is not None: 80 | batch_dim = 0 if batch_first else 1 81 | output = output.index_select(batch_dim, rev_order) 82 | if hx is not None: 83 | # hack lstm 84 | if isinstance(hx, tuple): 85 | hx, cx = hx 86 | hx = hx.index_select(1, rev_order) 87 | cx = cx.index_select(1, rev_order) 88 | hx = (hx, cx) 89 | else: 90 | hx = hx.index_select(1, rev_order) 91 | return output, hx 92 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import Dataset 2 | from model.models import HANNModel 3 | from model.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config() 12 | config.num_augmentation = 20000 13 | config.batch_size = 20 14 | config.batch_size_aug = 20 15 | config.dir_output = 'test-num_augmentation-{}'.format(config.num_augmentation) 16 | config.dir_model = os.path.join(config.dir_output, "model.weights") 17 | 18 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 19 | 20 | precisions = {'P': [], 'I': [], 'O': []} 21 | recalls = {'P': [], 'I': [], 'O': []} 22 | f1s = {'P': [], 'I': [], 'O': []} 23 | 24 | for fold in range(2, 6): 25 | # build model 26 | # tf.reset_default_graph() 27 | print('Fold {}'.format(fold)) 28 | 29 | # build model 30 | model = HANNModel(config) 31 | model.build() 32 | # if config.restore: 33 | # model.restore_session("results/test/model.weights/") # optional, restore weights 34 | # model.reinitialize_weights("proj") 35 | 36 | # create datasets 37 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 38 | config.processing_tag) 39 | dev = Dataset(os.path.join(config.data_root, str(fold), 'dev.txt'), config.processing_word, 40 | config.processing_tag) 41 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), 
config.processing_word, 42 | config.processing_tag) 43 | if config.num_augmentation: 44 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 45 | else: 46 | data_aug = None 47 | 48 | # train model 49 | model.train(train, dev, data_aug) 50 | 51 | # evaluate model 52 | model.restore_session(config.dir_model) 53 | metrics = model.evaluate(test) 54 | 55 | [precisions[tag].append(metrics['precision'][tag]) for tag in ['P', 'I', 'O']] 56 | [recalls[tag].append(metrics['recall'][tag]) for tag in ['P', 'I', 'O']] 57 | [f1s[tag].append(metrics['f1'][tag]) for tag in ['P', 'I', 'O']] 58 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision'], metrics['recall'], metrics['f1']) 59 | print(msg) 60 | with open(result_file_path, 'a') as ofile: 61 | ofile.write(msg) 62 | 63 | 64 | # print('Precision: ', 'P: ', (precisions['P']), 'I: ', (precisions['I']), 'O: ', (precisions['O'])) 65 | # print('Recall: ', 'P: ', (recalls['P']), 'I: ', (recalls['I']), 'O: ', (recalls['O'])) 66 | # print('F1: ', 'P: ', (f1s['P']), 'I: ', (f1s['I']), 'O: ', (f1s['O'])) 67 | # print('Precision: ', 'P: ', np.mean(precisions['P']), 'I: ', np.mean(precisions['I']), 'O: ', np.mean(precisions['O'])) 68 | # print('Recall: ', 'P: ', np.mean(recalls['P']), 'I: ', np.mean(recalls['I']), 'O: ', np.mean(recalls['O'])) 69 | # res = np.mean([np.mean(values) for values in f1s.values()]) 70 | # print('F1: ', 'P: ', np.mean(f1s['P']), 'I: ', np.mean(f1s['I']), 'O: ', np.mean(f1s['O']), 'all avg: ', res) 71 | msg = 'Average Precision: P: {}\tI: {}\tO: {}\n'.format(np.mean(precisions['P']), np.mean(precisions['I']), np.mean(precisions['O'])) 72 | print(msg) 73 | with open(result_file_path, 'a') as ofile: 74 | ofile.write(msg) 75 | msg = 'Average Recall: P: {}\tI: {}\tO: {}\n'.format(np.mean(recalls['P']), np.mean(recalls['I']), np.mean(recalls['O'])) 76 | print(msg) 77 | with open(result_file_path, 'a') as ofile: 78 | ofile.write(msg) 79 | res = np.mean([np.mean(values) for values in f1s.values()]) 80 | msg = 'Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(np.mean(f1s['P']), np.mean(f1s['I']), np.mean(f1s['O']), res) 81 | print(msg) 82 | with open(result_file_path, 'a') as ofile: 83 | ofile.write(msg) 84 | ofile.write('\n\n\n') 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate_nicta.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | 10 | def main(): 11 | # create instance of config 12 | config = Config() 13 | assert config.data_keyname == 'nicta' 14 | config.num_augmentation = 200000 15 | config.batch_size = 20 16 | config.batch_size_aug = 20 17 | config.attention_size = 50 18 | config.hidden_size_lstm_document = 200 19 | config.dropout = 0.8 20 | config.cnn_filter_num = 150 21 | config.adv_perturb_norm_length = 4 22 | config.va_perturb_norm_length = 4 23 | config.adv_reg_coeff = 0.3 24 | config.va_reg_coeff = 0.3 25 | config.data_root = '../data/nicta_piboso/10_folds' 26 | config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(config.num_augmentation, 27 | config.va_reg_coeff, 28 | config.adv_reg_coeff) 29 | config.dir_model = os.path.join(config.dir_output, 
"model.weights") 30 | 31 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 32 | 33 | precisions = defaultdict(list) 34 | recalls = defaultdict(list) 35 | f1s = defaultdict(list) 36 | tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT'] 37 | 38 | for fold in range(1, 11): 39 | # build model 40 | # tf.reset_default_graph() 41 | print('Fold {}'.format(fold)) 42 | 43 | # build model 44 | model = HANNModel(config) 45 | model.build() 46 | # if config.restore: 47 | # model.restore_session("results/test/model.weights/") # optional, restore weights 48 | # model.reinitialize_weights("proj") 49 | 50 | # create datasets 51 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 52 | config.processing_tag) 53 | dev = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 54 | config.processing_tag) 55 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 56 | config.processing_tag) 57 | if config.num_augmentation: 58 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 59 | else: 60 | data_aug = None 61 | 62 | # train model 63 | model.train(train, dev, data_aug) 64 | 65 | # evaluate model 66 | model.restore_session(config.dir_model) 67 | metrics = model.evaluate(test) 68 | 69 | [precisions[tag].append(metrics['precision_all'][tag]) for tag in tag_ls] 70 | [recalls[tag].append(metrics['recall_all'][tag]) for tag in tag_ls] 71 | [f1s[tag].append(metrics['f1_all'][tag]) for tag in tag_ls] 72 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision_all'], 73 | metrics['recall_all'], metrics['f1_all']) 74 | print(msg) 75 | with open(result_file_path, 'a') as ofile: 76 | ofile.write(msg) 77 | 78 | msg = 'Average Precision: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(precisions[tag])) for tag in tag_ls])) 79 | print(msg) 80 | with open(result_file_path, 'a') as ofile: 81 | ofile.write(msg) 82 | msg = 'Average Recall: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(recalls[tag])) for tag in tag_ls])) 83 | print(msg) 84 | with open(result_file_path, 'a') as ofile: 85 | ofile.write(msg) 86 | res = np.mean([np.mean(values) for values in f1s.values()]) 87 | msg = 'Average F1: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(f1s[tag])) for tag in tag_ls])) 88 | print(msg) 89 | with open(result_file_path, 'a') as ofile: 90 | ofile.write(msg) 91 | ofile.write('\n\n\n') 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate_pico.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config() 12 | assert config.data_keyname == 'pico' 13 | config.num_augmentation = 0 14 | config.batch_size = 20 15 | config.batch_size_aug = 20 16 | config.dir_output = 'test-num_augmentation-{}'.format(config.num_augmentation) 17 | config.dir_model = os.path.join(config.dir_output, "model.weights") 18 | config.data_root = '../data/{}/10_folds'.format(config.data_keyname) 19 | 20 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 21 | 22 | precisions = {'P': [], 'I': [], 'O': []} 23 | recalls = {'P': [], 'I': [], 
'O': []} 24 | f1s = {'P': [], 'I': [], 'O': []} 25 | 26 | for fold in range(1, 11): 27 | # build model 28 | # tf.reset_default_graph() 29 | print('Fold {}'.format(fold)) 30 | 31 | # build model 32 | model = HANNModel(config) 33 | model.build() 34 | # if config.restore: 35 | # model.restore_session("results/test/model.weights/") # optional, restore weights 36 | # model.reinitialize_weights("proj") 37 | 38 | # create datasets 39 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 40 | config.processing_tag) 41 | dev = Dataset(os.path.join(config.data_root, str(fold), 'dev.txt'), config.processing_word, 42 | config.processing_tag) 43 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 44 | config.processing_tag) 45 | if config.num_augmentation: 46 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 47 | else: 48 | data_aug = None 49 | 50 | # train model 51 | model.train(train, dev, data_aug) 52 | 53 | # evaluate model 54 | model.restore_session(config.dir_model) 55 | metrics = model.evaluate(test) 56 | 57 | [precisions[tag].append(metrics['precision'][tag]) for tag in ['P', 'I', 'O']] 58 | [recalls[tag].append(metrics['recall'][tag]) for tag in ['P', 'I', 'O']] 59 | [f1s[tag].append(metrics['f1'][tag]) for tag in ['P', 'I', 'O']] 60 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision'], metrics['recall'], metrics['f1']) 61 | print(msg) 62 | with open(result_file_path, 'a') as ofile: 63 | ofile.write(msg) 64 | 65 | 66 | # print('Precision: ', 'P: ', (precisions['P']), 'I: ', (precisions['I']), 'O: ', (precisions['O'])) 67 | # print('Recall: ', 'P: ', (recalls['P']), 'I: ', (recalls['I']), 'O: ', (recalls['O'])) 68 | # print('F1: ', 'P: ', (f1s['P']), 'I: ', (f1s['I']), 'O: ', (f1s['O'])) 69 | # print('Precision: ', 'P: ', np.mean(precisions['P']), 'I: ', np.mean(precisions['I']), 'O: ', np.mean(precisions['O'])) 70 | # print('Recall: ', 'P: ', np.mean(recalls['P']), 'I: ', np.mean(recalls['I']), 'O: ', np.mean(recalls['O'])) 71 | # res = np.mean([np.mean(values) for values in f1s.values()]) 72 | # print('F1: ', 'P: ', np.mean(f1s['P']), 'I: ', np.mean(f1s['I']), 'O: ', np.mean(f1s['O']), 'all avg: ', res) 73 | msg = 'Average Precision: P: {}\tI: {}\tO: {}\n'.format(np.mean(precisions['P']), np.mean(precisions['I']), np.mean(precisions['O'])) 74 | print(msg) 75 | with open(result_file_path, 'a') as ofile: 76 | ofile.write(msg) 77 | msg = 'Average Recall: P: {}\tI: {}\tO: {}\n'.format(np.mean(recalls['P']), np.mean(recalls['I']), np.mean(recalls['O'])) 78 | print(msg) 79 | with open(result_file_path, 'a') as ofile: 80 | ofile.write(msg) 81 | res = np.mean([np.mean(values) for values in f1s.values()]) 82 | msg = 'Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(np.mean(f1s['P']), np.mean(f1s['I']), np.mean(f1s['O']), res) 83 | print(msg) 84 | with open(result_file_path, 'a') as ofile: 85 | ofile.write(msg) 86 | ofile.write('\n\n\n') 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/my_optim.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | from copy import deepcopy 3 | import torch 4 | from torch.nn import Parameter 5 | 6 | class EMA: 7 | def __init__(self, gamma, model): 8 | super(EMA, self).__init__() 9 | self.gamma = gamma 10 | self.shadow = {} 11 | self.model = model 12 | self.setup() 13 | 14 | def setup(self): 15 | for name, para in self.model.named_parameters(): 16 | if para.requires_grad: 17 | self.shadow[name] = para.clone() 18 | def cuda(self): 19 | for k, v in self.shadow.items(): 20 | self.shadow[k] = v.cuda() 21 | 22 | def update(self): 23 | for name,para in self.model.named_parameters(): 24 | if para.requires_grad: 25 | self.shadow[name] = (1.0 - self.gamma) * para + self.gamma * self.shadow[name] 26 | 27 | def swap_parameters(self): 28 | for name, para in self.model.named_parameters(): 29 | if para.requires_grad: 30 | temp_data = para.data 31 | para.data = self.shadow[name].data 32 | self.shadow[name].data = temp_data 33 | 34 | def state_dict(self): 35 | return self.shadow 36 | 37 | 38 | # Adapted from 39 | # https://github.com/pytorch/pytorch/blob/master/torch/nn/utils/weight_norm.py 40 | # and https://github.com/salesforce/awd-lstm-lm/blob/master/weight_drop.py 41 | def _norm(p, dim): 42 | """Computes the norm over all dimensions except dim""" 43 | if dim is None: 44 | return p.norm() 45 | elif dim == 0: 46 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 47 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 48 | elif dim == p.dim() - 1: 49 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 50 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 51 | else: 52 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 53 | 54 | 55 | def _dummy(*args, **kwargs): 56 | # We need to replace flatten_parameters with a nothing function 57 | return 58 | 59 | 60 | class WeightNorm(torch.nn.Module): 61 | 62 | def __init__(self, weights, dim): 63 | super(WeightNorm, self).__init__() 64 | self.weights = weights 65 | self.dim = dim 66 | 67 | def compute_weight(self, module, name): 68 | g = getattr(module, name + '_g') 69 | v = getattr(module, name + '_v') 70 | return v * (g / _norm(v, self.dim)) 71 | 72 | @staticmethod 73 | def apply(module, weights, dim): 74 | # Terrible temporary solution to an issue regarding compacting weights 75 | # re: CUDNN RNN 76 | if issubclass(type(module), torch.nn.RNNBase): 77 | module.flatten_parameters = _dummy 78 | if weights is None: # do for all weight params 79 | weights = [w for w in module._parameters.keys() if 'weight' in w] 80 | fn = WeightNorm(weights, dim) 81 | for name in weights: 82 | if hasattr(module, name): 83 | print('Applying weight norm to {} - {}'.format(str(module), name)) 84 | weight = getattr(module, name) 85 | del module._parameters[name] 86 | module.register_parameter( 87 | name + '_g', Parameter(_norm(weight, dim).data)) 88 | module.register_parameter(name + '_v', Parameter(weight.data)) 89 | setattr(module, name, fn.compute_weight(module, name)) 90 | 91 | module.register_forward_pre_hook(fn) 92 | 93 | return fn 94 | 95 | def remove(self, module): 96 | for name in self.weights: 97 | weight = self.compute_weight(module) 98 | delattr(module, name) 99 | del module._parameters[name + '_g'] 100 | del module._parameters[name + '_v'] 101 | module.register_parameter(name, Parameter(weight.data)) 102 | 103 | def __call__(self, module, inputs): 104 | for name in self.weights: 105 | setattr(module, name, self.compute_weight(module, name)) 106 | 107 | 108 | def weight_norm(module, weights=None, dim=0): 109 | 
    WeightNorm.apply(module, weights, dim)
110 |     return module
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-PICO-Detection
2 | A model for identifying PICO elements in a given biomedical/clinical text.
3 | 
4 | This is the source code for the paper: [Di Jin, Peter Szolovits, Advancing PICO Element Detection in Biomedical Text via Deep Neural Networks, Bioinformatics, btaa256](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaa256/5822877?guestAccessKey=7f54ea86-4ec0-4080-9d5c-1251b730aa42). If you use the code, please cite the paper:
5 | 
6 | ```
7 | @article{10.1093/bioinformatics/btaa256,
8 |     author = {Jin, Di and Szolovits, Peter},
9 |     title = "{Advancing PICO element detection in biomedical text via deep neural networks}",
10 |     journal = {Bioinformatics},
11 |     year = {2020},
12 |     month = {04},
13 |     issn = {1367-4803},
14 |     doi = {10.1093/bioinformatics/btaa256},
15 |     url = {https://doi.org/10.1093/bioinformatics/btaa256},
16 |     note = {btaa256},
17 |     eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaa256/33363807/btaa256.pdf},
18 | }
19 | ```
20 | 
21 | ## Prerequisites:
22 | Run the following command to install the prerequisite packages:
23 | ```
24 | pip install -r requirements.txt
25 | ```
26 | 
27 | ## Data:
28 | Please download the data (including PICO and NICTA-PIBOSO) from [Google Drive](https://drive.google.com/file/d/1M9QCgrRjERZnD9LM2FeK-3jjvXJbjRTl/view?usp=sharing) and unzip it into the main directory of this repository so that the folder layout looks like this:
29 | ```
30 | ./BERT
31 | ./lstm_model
32 | ./data
33 | ```
34 | 
35 | ## How to use
36 | ### For LSTM-based models
37 | * The code for the LSTM-based models is in the "lstm_model" folder, so run the following command to enter it:
38 | ```
39 | cd lstm_model
40 | ```
41 | 
42 | * First, we need to process the data to build the vocabulary and trim the embedding file. The embeddings we used in our experiments are from [here](http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin). Please download it and convert it to "txt" format. Of course, you can also try other kinds of embeddings such as fastText. Then run the following command:
43 | ```
44 | python build_data.py --data_keyname DATA_KEYNAME --filename_wordvec PATH_TO_EMBEDDING_FILE
45 | ```
46 | DATA_KEYNAME can be "pico" for the PICO dataset and "nicta" for the NICTA-PIBOSO dataset; PATH_TO_EMBEDDING_FILE specifies where you store the embedding file.
47 | 
48 | * Then we can start training the model for the PICO dataset by running the following command:
49 | ```
50 | python run_train_pico.py --data_keyname pico
51 | ```
52 | And the following command is for the NICTA-PIBOSO dataset:
53 | ```
54 | python run_train_nicta.py --data_keyname nicta
55 | ```
56 | 
57 | * To run 10-fold cross-validation, use the following commands:
58 | ```
59 | python run_train_cross_validate_pico.py --data_keyname pico
60 | python run_train_cross_validate_nicta.py --data_keyname nicta
61 | ```
62 | 
63 | There are several important arguments in "src/config.py" that configure the model architecture; they are explained here:
64 | 
65 | * --adv_reg_coeff: The coefficient for the adversarial loss regularization. Setting it to zero means we do not conduct adversarial training.
66 | * --va_reg_coeff: The coefficient for the virtual adversarial loss regularization. Setting it to zero means we do not conduct virtual adversarial training.
67 | * --num_augmentation: The number of samples we use for virtual adversarial training.
68 | 
69 | ### For the BERT models
70 | * Code for the BERT models is in the "BERT" folder; please enter this folder.
71 | 
72 | * The best BERT model we found is the [BioBERT model](https://github.com/dmis-lab/biobert). The pretrained model parameter files available in the original repository are TensorFlow-only; if you want the PyTorch version, you can download it from [here](https://drive.google.com/file/d/1H6DTBXlXDZ6tJYcJWdZnZ3UCoY16p19m/view?usp=sharing). Once you obtain the pretrained BERT model files, run the following commands for training:
73 | ```
74 | python run_classifier_pico.py PATH_TO_BERT_MODEL
75 | python run_classifier_nicta.py PATH_TO_BERT_MODEL
76 | ```
77 | In these commands, PATH_TO_BERT_MODEL specifies the directory where you put your downloaded BERT model files.
78 | 
79 | * The following commands are for 10-fold cross-validation:
80 | ```
81 | python run_classifier_pico_cross_validate.py PATH_TO_BERT_MODEL
82 | python run_classifier_nicta_cross_validate.py PATH_TO_BERT_MODEL
83 | ```
84 | 
--------------------------------------------------------------------------------
/BERT/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HugginFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from .modeling import BertConfig, BertForPreTraining 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | config_path = os.path.abspath(bert_config_file) 32 | tf_path = os.path.abspath(tf_checkpoint_path) 33 | print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path)) 34 | # Load weights from TF model 35 | init_vars = tf.train.list_variables(tf_path) 36 | names = [] 37 | arrays = [] 38 | for name, shape in init_vars: 39 | print("Loading TF weight {} with shape {}".format(name, shape)) 40 | array = tf.train.load_variable(tf_path, name) 41 | names.append(name) 42 | arrays.append(array) 43 | 44 | # Initialise PyTorch model 45 | config = BertConfig.from_json_file(bert_config_file) 46 | print("Building PyTorch model from configuration: {}".format(str(config))) 47 | model = BertForPreTraining(config) 48 | 49 | for name, array in zip(names, arrays): 50 | name = name.split('/') 51 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v 52 | # which are not required for using pretrained model 53 | if any(n in ["adam_v", "adam_m", "global_step"] for n in name): 54 | print("Skipping {}".format("/".join(name))) 55 | continue 56 | pointer = model 57 | for m_name in name: 58 | if re.fullmatch(r'[A-Za-z]+_\d+', m_name): 59 | l = re.split(r'_(\d+)', m_name) 60 | else: 61 | l = [m_name] 62 | if l[0] == 'kernel' or l[0] == 'gamma': 63 | pointer = getattr(pointer, 'weight') 64 | elif l[0] == 'output_bias' or l[0] == 'beta': 65 | pointer = getattr(pointer, 'bias') 66 | elif l[0] == 'output_weights': 67 | pointer = getattr(pointer, 'weight') 68 | else: 69 | pointer = getattr(pointer, l[0]) 70 | if len(l) >= 2: 71 | num = int(l[1]) 72 | pointer = pointer[num] 73 | if m_name[-11:] == '_embeddings': 74 | pointer = getattr(pointer, 'weight') 75 | elif m_name == 'kernel': 76 | array = np.transpose(array) 77 | try: 78 | assert pointer.shape == array.shape 79 | except AssertionError as e: 80 | e.args += (pointer.shape, array.shape) 81 | raise 82 | print("Initialize PyTorch weight {}".format(name)) 83 | pointer.data = torch.from_numpy(array) 84 | 85 | # Save pytorch-model 86 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 87 | torch.save(model.state_dict(), pytorch_dump_path) 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | ## Required parameters 93 | parser.add_argument("--tf_checkpoint_path", 94 | default = None, 95 | type = str, 96 | required = True, 97 | help = "Path the TensorFlow checkpoint path.") 98 | parser.add_argument("--bert_config_file", 99 | default = None, 100 | type = str, 101 | required = True, 102 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 103 | "This specifies the model architecture.") 104 | parser.add_argument("--pytorch_dump_path", 105 | default = None, 106 | type = str, 107 | required = True, 108 | help = "Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 111 | args.bert_config_file, 112 | args.pytorch_dump_path) 113 | -------------------------------------------------------------------------------- /lstm_model/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 
58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, 134 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 135 | sys.stdout.write(info + "\n") 136 | 137 | def add(self, n, values=[]): 138 | self.update(self.seen_so_far+n, values) 139 | 140 | 141 | -------------------------------------------------------------------------------- /lstm_model/src/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 
33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, 134 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 135 | sys.stdout.write(info + "\n") 136 | 137 | def add(self, n, values=[]): 138 | self.update(self.seen_so_far+n, values) 139 | 140 | 141 | -------------------------------------------------------------------------------- /BERT/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from six.moves 
import xrange 3 | 4 | # Virtual adversarial training parameters 5 | num_power_iteration = 1 6 | small_constant_for_finite_diff = 1e-1 7 | 8 | 9 | def adversarial_loss(embedded, segment_ids, input_mask, document_mask, label_ids, loss, loss_fn, perturb_norm_length): 10 | """Adds gradient to embedding and recomputes classification loss.""" 11 | grad, = torch.autograd.grad( 12 | loss, 13 | embedded, 14 | retain_graph=True) 15 | grad.detach_() 16 | perturb = _scale_l2(grad, perturb_norm_length) 17 | return loss_fn(token_type_ids=segment_ids, attention_mask=input_mask, 18 | document_mask=document_mask, labels=label_ids, input_embeddings=embedded + perturb) 19 | 20 | 21 | def virtual_adversarial_loss(logits, embedded, segment_ids, input_mask, document_mask, 22 | num_classes, logits_from_embedding_fn, perturb_norm_length): 23 | """Virtual adversarial loss. 24 | Computes virtual adversarial perturbation by finite difference method and 25 | power iteration, adds it to the embedding, and computes the KL divergence 26 | between the new logits and the original logits. 27 | Args: 28 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 29 | num_classes=2, otherwise m=num_classes. 30 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 31 | inputs: VatxtInput. 32 | logits_from_embedding_fn: callable that takes embeddings and returns 33 | classifier logits. 34 | Returns: 35 | kl: float scalar. 36 | """ 37 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 38 | # logits = tf.stop_gradient(logits) 39 | 40 | # Only care about the KL divergence on the final timestep. 41 | # weights = inputs.eos_weights 42 | # assert weights is not None 43 | # if FLAGS.single_label: 44 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 45 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 46 | 47 | # Initialize perturbation with random noise. 48 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 49 | d = torch.autograd.Variable(torch.empty(embedded.size()).normal_(), requires_grad=True).cuda() 50 | 51 | # Perform finite difference method and power iteration. 52 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 53 | # Adding small noise to input and taking gradient with respect to the noise 54 | # corresponds to 1 power iteration. 55 | for _ in xrange(num_power_iteration): 56 | d = _scale_l2( 57 | _mask_by_mask(d, input_mask), small_constant_for_finite_diff) 58 | 59 | _, d_logits, _ = logits_from_embedding_fn(token_type_ids=segment_ids, attention_mask=input_mask, 60 | document_mask=document_mask, input_embeddings=embedded + d) 61 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 62 | perturb, = torch.autograd.grad( 63 | kl, 64 | d) 65 | perturb.detach_() 66 | 67 | perturb = _scale_l2(perturb, perturb_norm_length) 68 | _, vadv_logits, _ = logits_from_embedding_fn(token_type_ids=segment_ids, attention_mask=input_mask, 69 | document_mask=document_mask, input_embeddings=embedded + perturb) 70 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 71 | 72 | 73 | def _scale_l2(x, norm_length): 74 | # shape(x) = (batch, num_timesteps, d) 75 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 
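    # (dividing by alpha keeps each element of x / alpha within [-1, 1], so the
    #  sum of squares below stays well behaved even for large-magnitude gradients)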
76 | # 2norm(x) = a * 2norm(x/a) 77 | # Scale over the full sequence, dims (1, 2) 78 | alpha = torch.max(torch.max(torch.abs(x), dim=1, keepdim=True)[0], dim=2, keepdim=True)[0] + 1e-12 79 | l2_norm = alpha * torch.sqrt( 80 | torch.sum(torch.pow(x / alpha, 2), dim=(1, 2), keepdim=True) + 1e-6) 81 | x_unit = x / l2_norm 82 | return norm_length * x_unit 83 | 84 | 85 | def _mask_by_mask(t, mask): 86 | """Mask t, 3-D [batch, time, dim], by Mask, 2-D [batch, time].""" 87 | 88 | return t * torch.unsqueeze(mask, dim=2).expand(t.size()).float() 89 | 90 | 91 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 92 | """Returns weighted KL divergence between distributions q and p. 93 | Args: 94 | q_logits: logits for 1st argument of KL divergence shape 95 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 96 | [batch_size, num_timesteps] if num_classes == 2. 97 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 98 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 99 | Elements should be 1.0 only on end of sequences 100 | Returns: 101 | KL: float scalar. 102 | """ 103 | # For logistic regression 104 | if num_classes == 2: 105 | # q = tf.nn.sigmoid(q_logits) 106 | # kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 107 | # tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 108 | # kl = tf.squeeze(kl, 2) 109 | raise NotImplementedError 110 | 111 | # For softmax regression 112 | else: 113 | q = torch.nn.functional.softmax(q_logits, -1) 114 | kl = torch.sum( 115 | q * (torch.nn.functional.log_softmax(q_logits, -1) - torch.nn.functional.log_softmax(p_logits, -1)), -1) 116 | 117 | # num_labels = tf.reduce_sum(weights) 118 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 119 | 120 | # kl.get_shape().assert_has_rank(2) 121 | assert len(kl.size()) == 2 122 | # weights.get_shape().assert_has_rank(2) 123 | 124 | loss = torch.mean(kl) 125 | return loss -------------------------------------------------------------------------------- /lstm_model/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | 22 | def reinitialize_weights(self, scope_name): 23 | """Reinitializes the weights of a given layer""" 24 | variables = tf.contrib.framework.get_variables(scope_name) 25 | init = tf.variables_initializer(variables) 26 | self.sess.run(init) 27 | 28 | 29 | def add_train_op(self, lr_method, lr, loss, clip=-1): 30 | """Defines self.train_op that performs an update on a batch 31 | 32 | Args: 33 | lr_method: (string) sgd method, for example "adam" 34 | lr: (tf.placeholder) tf.float32, learning rate 35 | loss: (tensor) tf.float32 loss to minimize 36 | clip: (python float) clipping of gradient. 
If < 0, no clipping 37 | 38 | """ 39 | _lr_m = lr_method.lower() # lower to make sure 40 | 41 | with tf.variable_scope("train_step"): 42 | if _lr_m == 'adam': # sgd method 43 | optimizer = tf.train.AdamOptimizer(lr) 44 | elif _lr_m == 'adagrad': 45 | optimizer = tf.train.AdagradOptimizer(lr) 46 | elif _lr_m == 'sgd': 47 | optimizer = tf.train.GradientDescentOptimizer(lr) 48 | elif _lr_m == 'rmsprop': 49 | optimizer = tf.train.RMSPropOptimizer(lr) 50 | else: 51 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 52 | 53 | if clip > 0: # gradient clipping if clip is positive 54 | grads, vs = zip(*optimizer.compute_gradients(loss)) 55 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 56 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 57 | else: 58 | self.train_op = optimizer.minimize(loss) 59 | 60 | 61 | def initialize_session(self): 62 | """Defines self.sess and initialize the variables""" 63 | self.logger.info("Initializing tf session") 64 | config = tf.ConfigProto() 65 | config.gpu_options.allow_growth = True 66 | self.sess = tf.Session(config=config) 67 | self.sess.run(tf.global_variables_initializer()) 68 | self.saver = tf.train.Saver() 69 | 70 | 71 | def restore_session(self, dir_model): 72 | """Reload weights into session 73 | 74 | Args: 75 | sess: tf.Session() 76 | dir_model: dir with weights 77 | 78 | """ 79 | self.logger.info("Reloading the latest trained model...") 80 | self.saver.restore(self.sess, dir_model) 81 | 82 | 83 | def save_session(self): 84 | """Saves session = weights""" 85 | if not os.path.exists(self.config.dir_model): 86 | os.makedirs(self.config.dir_model) 87 | self.saver.save(self.sess, self.config.dir_model) 88 | 89 | 90 | def close_session(self): 91 | """Closes the session""" 92 | self.sess.close() 93 | 94 | 95 | def add_summary(self): 96 | """Defines variables for Tensorboard 97 | 98 | Args: 99 | dir_output: (string) where the results are written 100 | 101 | """ 102 | self.merged = tf.summary.merge_all() 103 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 104 | self.sess.graph) 105 | 106 | 107 | def train(self, train, dev, data_aug=None): 108 | """Performs training with early stopping and lr exponential decay 109 | 110 | Args: 111 | train: dataset that yields tuple of (sentences, tags) 112 | dev: dataset 113 | 114 | """ 115 | best_score = 0 116 | nepoch_no_imprv = 0 # for early stopping 117 | self.add_summary() # tensorboard 118 | 119 | for epoch in range(self.config.nepochs): 120 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 121 | self.config.nepochs)) 122 | 123 | score = self.run_epoch(train, dev, data_aug=data_aug) 124 | self.config.lr *= self.config.lr_decay # decay learning rate 125 | 126 | # early stopping and saving best parameters 127 | if score >= best_score: 128 | nepoch_no_imprv = 0 129 | self.save_session() 130 | best_score = score 131 | self.logger.info("- new best score!") 132 | else: 133 | nepoch_no_imprv += 1 134 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 135 | self.logger.info("- early stopping {} epochs without "\ 136 | "improvement".format(nepoch_no_imprv)) 137 | break 138 | 139 | return best_score 140 | 141 | 142 | def evaluate(self, test): 143 | """Evaluate model on test set 144 | 145 | Args: 146 | test: instance of class Dataset 147 | 148 | """ 149 | self.logger.info("Testing model over test set") 150 | metrics = self.run_evaluate(test, report=True) 151 | msg = " - ".join(["{} {:04.4f}".format(k, v) 152 | if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: 
{:04.4f}'.format(a, b) \ 153 | for a, b in v.items()])) for k, v in metrics.items()]) 154 | self.logger.info(msg) 155 | 156 | return metrics 157 | -------------------------------------------------------------------------------- /lstm_model/src/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | 22 | def reinitialize_weights(self, scope_name): 23 | """Reinitializes the weights of a given layer""" 24 | variables = tf.contrib.framework.get_variables(scope_name) 25 | init = tf.variables_initializer(variables) 26 | self.sess.run(init) 27 | 28 | 29 | def add_train_op(self, lr_method, lr, loss, clip=-1): 30 | """Defines self.train_op that performs an update on a batch 31 | 32 | Args: 33 | lr_method: (string) sgd method, for example "adam" 34 | lr: (tf.placeholder) tf.float32, learning rate 35 | loss: (tensor) tf.float32 loss to minimize 36 | clip: (python float) clipping of gradient. If < 0, no clipping 37 | 38 | """ 39 | _lr_m = lr_method.lower() # lower to make sure 40 | 41 | with tf.variable_scope("train_step"): 42 | if _lr_m == 'adam': # sgd method 43 | optimizer = tf.train.AdamOptimizer(lr) 44 | elif _lr_m == 'adagrad': 45 | optimizer = tf.train.AdagradOptimizer(lr) 46 | elif _lr_m == 'sgd': 47 | optimizer = tf.train.GradientDescentOptimizer(lr) 48 | elif _lr_m == 'rmsprop': 49 | optimizer = tf.train.RMSPropOptimizer(lr) 50 | else: 51 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 52 | 53 | if clip > 0: # gradient clipping if clip is positive 54 | grads, vs = zip(*optimizer.compute_gradients(loss)) 55 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 56 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 57 | else: 58 | self.train_op = optimizer.minimize(loss) 59 | 60 | 61 | def initialize_session(self): 62 | """Defines self.sess and initialize the variables""" 63 | self.logger.info("Initializing tf session") 64 | config = tf.ConfigProto() 65 | config.gpu_options.allow_growth = True 66 | self.sess = tf.Session(config=config) 67 | self.sess.run(tf.global_variables_initializer()) 68 | self.saver = tf.train.Saver() 69 | 70 | 71 | def restore_session(self, dir_model): 72 | """Reload weights into session 73 | 74 | Args: 75 | sess: tf.Session() 76 | dir_model: dir with weights 77 | 78 | """ 79 | self.logger.info("Reloading the latest trained model...") 80 | self.saver.restore(self.sess, dir_model) 81 | 82 | 83 | def save_session(self): 84 | """Saves session = weights""" 85 | if not os.path.exists(self.config.dir_model): 86 | os.makedirs(self.config.dir_model) 87 | self.saver.save(self.sess, self.config.dir_model) 88 | 89 | 90 | def close_session(self): 91 | """Closes the session""" 92 | self.sess.close() 93 | 94 | 95 | def add_summary(self): 96 | """Defines variables for Tensorboard 97 | 98 | Args: 99 | dir_output: (string) where the results are written 100 | 101 | """ 102 | self.merged = tf.summary.merge_all() 103 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 104 | self.sess.graph) 105 | 106 | 107 | def train(self, train, dev, data_aug=None): 108 | 
"""Performs training with early stopping and lr exponential decay 109 | 110 | Args: 111 | train: dataset that yields tuple of (sentences, tags) 112 | dev: dataset 113 | 114 | """ 115 | best_score = 0 116 | nepoch_no_imprv = 0 # for early stopping 117 | self.add_summary() # tensorboard 118 | 119 | for epoch in range(self.config.nepochs): 120 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 121 | self.config.nepochs)) 122 | 123 | score = self.run_epoch(train, dev, data_aug=data_aug) 124 | self.config.lr *= self.config.lr_decay # decay learning rate 125 | 126 | # early stopping and saving best parameters 127 | if score >= best_score: 128 | nepoch_no_imprv = 0 129 | self.save_session() 130 | best_score = score 131 | self.logger.info("- new best score!") 132 | else: 133 | nepoch_no_imprv += 1 134 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 135 | self.logger.info("- early stopping {} epochs without "\ 136 | "improvement".format(nepoch_no_imprv)) 137 | break 138 | 139 | return best_score 140 | 141 | 142 | def evaluate(self, test): 143 | """Evaluate model on test set 144 | 145 | Args: 146 | test: instance of class Dataset 147 | 148 | """ 149 | self.logger.info("Testing model over test set") 150 | metrics = self.run_evaluate(test, report=True) 151 | msg = " - ".join(["{} {:04.4f}".format(k, v) 152 | if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: {:04.4f}'.format(a, b) \ 153 | for a, b in v.items()])) for k, v in metrics.items()]) 154 | self.logger.info(msg) 155 | 156 | return metrics 157 | -------------------------------------------------------------------------------- /BERT/bert_model.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel 2 | import torch 3 | from torch import nn 4 | import torch.nn.functional as F 5 | 6 | from crf import ChainCRF 7 | import utils 8 | 9 | 10 | class BertForSequentialClassification(BertPreTrainedModel): 11 | """BERT model for classification. 12 | This module is composed of the BERT model with a linear layer on top of 13 | the pooled output. 14 | Params: 15 | `config`: a BertConfig class instance with the configuration to build a new model. 16 | `num_labels`: the number of classes for the classifier. Default = 2. 17 | Inputs: 18 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 19 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 20 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 21 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 22 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 23 | a `sentence B` token (see BERT paper for more details). 24 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 25 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 26 | input sequence length in the current batch. It's the mask that we typically use for attention when 27 | a batch has varying length sentences. 28 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 29 | with indices selected in [0, ..., num_labels]. 30 | Outputs: 31 | if `labels` is not `None`: 32 | Outputs the CrossEntropy classification loss of the output with the labels. 
33 | if `labels` is `None`: 34 | Outputs the classification logits of shape [batch_size, num_labels]. 35 | Example usage: 36 | ```python 37 | # Already been converted into WordPiece token ids 38 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 39 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 40 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 41 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 42 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 43 | num_labels = 2 44 | model = BertForSequenceClassification(config, num_labels) 45 | logits = model(input_ids, token_type_ids, input_mask) 46 | ``` 47 | """ 48 | def __init__(self, config, num_labels, tag_space=0, rnn_mode='LSTM', use_crf=False, 49 | rnn_hidden_size=None, dropout=None): 50 | super(BertForSequentialClassification, self).__init__(config) 51 | self.num_labels = num_labels 52 | self.bert = BertModel(config) 53 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 54 | if dropout is None: 55 | dropout = config.hidden_dropout_prob 56 | self.dropout_other = nn.Dropout(dropout) 57 | self.use_crf = use_crf 58 | 59 | if rnn_mode == 'RNN': 60 | RNN = nn.RNN 61 | elif rnn_mode == 'LSTM': 62 | RNN = nn.LSTM 63 | elif rnn_mode == 'GRU': 64 | RNN = nn.GRU 65 | 66 | if rnn_hidden_size is not None: 67 | self.rnn = RNN(config.hidden_size, rnn_hidden_size, num_layers=1, batch_first=True, bidirectional=True) 68 | out_dim = rnn_hidden_size * 2 69 | else: 70 | self.rnn = None 71 | out_dim = config.hidden_size 72 | 73 | if tag_space: 74 | self.dense = nn.Linear(out_dim, tag_space) 75 | out_dim = tag_space 76 | else: 77 | self.dense = None 78 | 79 | if use_crf: 80 | self.crf = ChainCRF(out_dim, num_labels, bigram=True) 81 | else: 82 | self.dense_softmax = nn.Linear(out_dim, num_labels) 83 | 84 | self.apply(self.init_bert_weights) 85 | 86 | def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, document_mask=None, labels=None, 87 | input_embeddings=None): 88 | _, output, embeddings = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, 89 | input_embeddings=input_embeddings) 90 | output = self.dropout(output) 91 | 92 | # sentence level transform to document level 93 | length = document_mask.sum(dim=1).long() 94 | max_len = length.max() 95 | output = output.view(-1, max_len, self.config.hidden_size) 96 | 97 | # document level RNN processing 98 | if self.rnn is not None: 99 | output, hx, rev_order, mask = utils.prepare_rnn_seq(output, length, hx=None, 100 | masks=document_mask, batch_first=True) 101 | output, hn = self.rnn(output, hx=hx) 102 | output, hn = utils.recover_rnn_seq(output, rev_order, hx=hn, batch_first=True) 103 | 104 | # apply dropout for the output of rnn 105 | output = self.dropout_other(output) 106 | if self.dense is not None: 107 | # [batch, length, tag_space] 108 | output = self.dropout_other(F.elu(self.dense(output))) 109 | 110 | # final output layer 111 | if not self.use_crf: 112 | # not use crf 113 | output = self.dense_softmax(output) # [batch, length, num_labels] 114 | if labels is None: 115 | _, preds = torch.max(output, dim=2) 116 | return preds, None, embeddings 117 | else: 118 | return (F.cross_entropy(output.view(-1, output.size(-1)), labels.view(-1), reduction='none') * 119 | document_mask.view(-1)).sum() / document_mask.sum(), None, embeddings 120 | else: 121 | # CRF processing 122 | if labels is not None: 123 | loss, logits = self.crf.loss(output, labels, mask=document_mask) 124 | return 
loss.mean(), logits, embeddings 125 | else: 126 | seq_pred, logits = self.crf.decode(output, mask=document_mask, leading_symbolic=0) 127 | return seq_pred, logits, embeddings -------------------------------------------------------------------------------- /BERT/crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn.parameter import Parameter 5 | 6 | 7 | def logdet(x): 8 | """ 9 | Args: 10 | x: 2D positive semidefinite matrix. 11 | Returns: log determinant of x 12 | """ 13 | # TODO for pytorch 2.0.4, use inside potrf for variable. 14 | print(torch.log(torch.eig(x.data)[0])) 15 | print(x) 16 | u_chol = x.potrf() 17 | return torch.sum(torch.log(u_chol.diag())) * 2 18 | 19 | 20 | def logsumexp(x, dim=None): 21 | """ 22 | Args: 23 | x: A pytorch tensor (any dimension will do) 24 | dim: int or None, over which to perform the summation. `None`, the 25 | default, performs over all axes. 26 | Returns: The result of the log(sum(exp(...))) operation. 27 | """ 28 | if dim is None: 29 | xmax = x.max() 30 | xmax_ = x.max() 31 | return xmax_ + torch.log(torch.exp(x - xmax).sum()) 32 | else: 33 | xmax, _ = x.max(dim, keepdim=True) 34 | xmax_, _ = x.max(dim) 35 | return xmax_ + torch.log(torch.exp(x - xmax).sum(dim)) 36 | 37 | class ChainCRF(nn.Module): 38 | def __init__(self, input_size, num_labels, bigram=True, **kwargs): 39 | ''' 40 | Args: 41 | input_size: int 42 | the dimension of the input. 43 | num_labels: int 44 | the number of labels of the crf layer 45 | bigram: bool 46 | if apply bi-gram parameter. 47 | **kwargs: 48 | ''' 49 | super(ChainCRF, self).__init__() 50 | self.input_size = input_size 51 | self.num_labels = num_labels + 1 52 | self.pad_label_id = num_labels 53 | self.bigram = bigram 54 | 55 | 56 | # state weight tensor 57 | self.state_nn = nn.Linear(input_size, self.num_labels) 58 | if bigram: 59 | # transition weight tensor 60 | self.trans_nn = nn.Linear(input_size, self.num_labels * self.num_labels) 61 | self.register_parameter('trans_matrix', None) 62 | else: 63 | self.trans_nn = None 64 | self.trans_matrix = Parameter(torch.Tensor(self.num_labels, self.num_labels)) 65 | 66 | self.reset_parameters() 67 | 68 | def reset_parameters(self): 69 | nn.init.constant(self.state_nn.bias, 0.) 70 | if self.bigram: 71 | nn.init.xavier_uniform(self.trans_nn.weight) 72 | nn.init.constant(self.trans_nn.bias, 0.) 
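        # without the bigram option there is no input-dependent transition layer,
        # so the single shared transition matrix is initialized directly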
73 | else: 74 | nn.init.normal(self.trans_matrix) 75 | # if not self.bigram: 76 | # nn.init.normal(self.trans_matrix) 77 | 78 | def forward(self, input, mask=None): 79 | ''' 80 | Args: 81 | input: Tensor 82 | the input tensor with shape = [batch, length, input_size] 83 | mask: Tensor or None 84 | the mask tensor with shape = [batch, length] 85 | Returns: Tensor 86 | the energy tensor with shape = [batch, length, num_label, num_label] 87 | ''' 88 | batch, length, _ = input.size() 89 | 90 | # compute out_s by tensor dot [batch, length, input_size] * [input_size, num_label] 91 | # thus out_s should be [batch, length, num_label] --> [batch, length, num_label, 1] 92 | logits = self.state_nn(input) 93 | out_s = logits.unsqueeze(2) 94 | 95 | if self.bigram: 96 | # compute out_s by tensor dot: [batch, length, input_size] * [input_size, num_label * num_label] 97 | # the output should be [batch, length, num_label, num_label] 98 | out_t = self.trans_nn(input).view(batch, length, self.num_labels, self.num_labels) 99 | output = out_t + out_s 100 | else: 101 | # [batch, length, num_label, num_label] 102 | output = self.trans_matrix + out_s 103 | 104 | if mask is not None: 105 | output = output * mask.unsqueeze(2).unsqueeze(3) 106 | 107 | return output, logits 108 | 109 | def loss(self, input, target, mask=None): 110 | ''' 111 | Args: 112 | input: Tensor 113 | the input tensor with shape = [batch, length, input_size] 114 | target: Tensor 115 | the tensor of target labels with shape [batch, length] 116 | mask:Tensor or None 117 | the mask tensor with shape = [batch, length] 118 | Returns: Tensor 119 | A 1D tensor for minus log likelihood loss 120 | ''' 121 | batch, length, _ = input.size() 122 | energy, logits = self.forward(input, mask=mask) 123 | # shape = [length, batch, num_label, num_label] 124 | energy_transpose = energy.transpose(0, 1) 125 | # shape = [length, batch] 126 | target_transpose = target.transpose(0, 1) 127 | # shape = [length, batch, 1] 128 | mask_transpose = None 129 | if mask is not None: 130 | mask_transpose = mask.unsqueeze(2).transpose(0, 1) 131 | 132 | 133 | # shape = [batch, num_label] 134 | partition = None 135 | 136 | if input.is_cuda: 137 | # shape = [batch] 138 | batch_index = torch.arange(0, batch).long().cuda() 139 | prev_label = torch.cuda.LongTensor(batch).fill_(self.num_labels - 1) 140 | tgt_energy = Variable(torch.zeros(batch)).cuda() 141 | else: 142 | # shape = [batch] 143 | batch_index = torch.arange(0, batch).long() 144 | prev_label = torch.LongTensor(batch).fill_(self.num_labels - 1) 145 | tgt_energy = Variable(torch.zeros(batch)) 146 | 147 | for t in range(length): 148 | # shape = [batch, num_label, num_label] 149 | curr_energy = energy_transpose[t] 150 | if t == 0: 151 | partition = curr_energy[:, -1, :] 152 | else: 153 | # shape = [batch, num_label] 154 | partition_new = logsumexp(curr_energy + partition.unsqueeze(2), dim=1) 155 | if mask_transpose is None: 156 | partition = partition_new 157 | else: 158 | mask_t = mask_transpose[t] 159 | partition = partition + (partition_new - partition) * mask_t 160 | tgt_energy += curr_energy[batch_index, prev_label, target_transpose[t].data] 161 | prev_label = target_transpose[t].data 162 | 163 | return logsumexp(partition, dim=1) - tgt_energy, logits 164 | 165 | def decode(self, input, mask=None, leading_symbolic=0): 166 | """ 167 | Args: 168 | input: Tensor 169 | the input tensor with shape = [batch, length, input_size] 170 | mask: Tensor or None 171 | the mask tensor with shape = [batch, length] 172 | leading_symbolic: nt 
173 | number of symbolic labels leading in type alphabets (set it to 0 if you are not sure) 174 | Returns: Tensor 175 | decoding results in shape [batch, length] 176 | """ 177 | 178 | energy, logits = self.forward(input, mask=mask) 179 | energy = energy.data 180 | 181 | # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels) 182 | # For convenience, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels) 183 | energy_transpose = energy.transpose(0, 1) 184 | 185 | # the last row and column is the tag for pad symbol. reduce these two dimensions by 1 to remove that. 186 | # also remove the first #symbolic rows and columns. 187 | # now the shape of energies_shuffled is [n_time_steps, b_batch, t, t] where t = num_labels - #symbolic - 1. 188 | energy_transpose = energy_transpose[:, :, leading_symbolic:-1, leading_symbolic:-1] 189 | 190 | length, batch_size, num_label, _ = energy_transpose.size() 191 | 192 | if input.is_cuda: 193 | batch_index = torch.arange(0, batch_size).long().cuda() 194 | pi = torch.zeros([length, batch_size, num_label]).cuda() 195 | pointer = torch.cuda.LongTensor(length, batch_size, num_label).zero_() 196 | back_pointer = torch.cuda.LongTensor(length, batch_size).zero_() 197 | else: 198 | batch_index = torch.arange(0, batch_size).long() 199 | pi = torch.zeros([length, batch_size, num_label, 1]) 200 | pointer = torch.LongTensor(length, batch_size, num_label).zero_() 201 | back_pointer = torch.LongTensor(length, batch_size).zero_() 202 | 203 | pi[0] = energy[:, 0, -1, leading_symbolic:-1] 204 | pointer[0] = -1 205 | for t in range(1, length): 206 | pi_prev = pi[t - 1] 207 | pi[t], pointer[t] = torch.max(energy_transpose[t] + pi_prev.unsqueeze(2), dim=1) 208 | 209 | _, back_pointer[-1] = torch.max(pi[-1], dim=1) 210 | for t in reversed(range(length - 1)): 211 | pointer_last = pointer[t + 1] 212 | back_pointer[t] = pointer_last[batch_index, back_pointer[t + 1]] 213 | 214 | return back_pointer.transpose(0, 1) + leading_symbolic, logits -------------------------------------------------------------------------------- /lstm_model/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .general_utils import get_logger 4 | from .data_utils import get_trimmed_wordvec_vectors, load_vocab, \ 5 | get_processing_word 6 | 7 | 8 | def Config(load=True): 9 | """Initialize hyperparameters and load vocabs 10 | 11 | Args: 12 | load_embeddings: (bool) if True, load embeddings into 13 | np array, else None 14 | 15 | """ 16 | def load_(args): 17 | """Loads vocabulary, processing functions and embeddings 18 | 19 | Supposes that build_data.py has been run successfully and that 20 | the corresponding files have been created (vocab and trimmed GloVe 21 | vectors) 22 | 23 | """ 24 | # 1. vocabulary 25 | args.vocab_words, args.vocab_words_freq = load_vocab(args.filename_words) 26 | args.vocab_tags = load_vocab(args.filename_tags) 27 | # args.vocab_chars = load_vocab(args.filename_chars) 28 | 29 | args.nwords = len(args.vocab_words) 30 | # args.nchars = len(args.vocab_chars) 31 | args.ntags = len(args.vocab_tags) 32 | 33 | # 2. get processing functions that map str -> id 34 | # args.use_chars = args.use_lstm_chars | args.use_cnn_chars 35 | args.processing_word = get_processing_word(args.vocab_words, lowercase=True, chars=False) 36 | args.processing_tag = get_processing_word(args.vocab_tags, 37 | lowercase=False, allow_unk=False) 38 | 39 | # 3. 
get pre-trained embeddings 40 | args.embeddings = (get_trimmed_wordvec_vectors(args.filename_wordvec_trimmed, args.vocab_words) 41 | if args.use_pretrained else None) 42 | args.dim_word = args.embeddings.shape[1] 43 | 44 | return args 45 | 46 | ## parse args 47 | parser = argparse.ArgumentParser() 48 | # training parameters 49 | parser.add_argument('--nepochs', default='100', type=int, 50 | help='number of epochs') 51 | parser.add_argument('--dropout', default='0.8', type=float, 52 | help='number of epochs') 53 | parser.add_argument('--batch_size', default='30', type=int, 54 | help='batch size') 55 | parser.add_argument('--batch_size_aug', default='30', type=int, 56 | help='batch size for data augmentation') 57 | parser.add_argument('--lr', default='0.001', type=float, 58 | help='learning rate') 59 | parser.add_argument('--lr_method', default='adam', type=str, 60 | help='optimization method') 61 | parser.add_argument('--lr_decay', default='0.99', type=float, 62 | help='learning rate decay rate') 63 | parser.add_argument('--clip', default='2', type=float, 64 | help='gradient clipping') 65 | parser.add_argument('--nepoch_no_imprv', default='4', type=int, 66 | help='number of epoch patience') 67 | parser.add_argument('--l2_reg_lambda', default='1e-6', type=float, 68 | help='l2 regularization coefficient') 69 | 70 | # data and results paths 71 | parser.add_argument('--dir_output', default='test', type=str, 72 | help='directory for output') 73 | parser.add_argument('--data_keyname', default='nicta', type=str, 74 | help='directory for output') 75 | parser.add_argument('--filename_wordvec_trimmed', default='../data/word2vec_pubmed.trimmed.txt', 76 | type=str, help='directory for trimmed word embeddings file') 77 | parser.add_argument('--filename_wordvec', default='/data/medg/misc/jindi/nlp/embeddings/word2vec/wikipedia-pubmed-and-PMC-w2v.txt', 78 | type=str, help='directory for original word embeddings file') 79 | 80 | # model hyperparameters 81 | parser.add_argument('--hidden_size_char', default='50', type=int, 82 | help='hidden size of character level lstm') 83 | parser.add_argument('--hidden_size_lstm_sentence', default='100', type=int, 84 | help='hidden size of sentence level lstm') 85 | parser.add_argument('--hidden_size_lstm_document', default='100', type=int, 86 | help='hidden size of document level lstm') 87 | parser.add_argument('--attention_size', default='400', type=int, 88 | help='attention vector size') 89 | parser.add_argument('--cnn_filter_num', default='300', type=int, 90 | help='number of cnn filters for each window size') 91 | parser.add_argument('--dim_char', default='50', type=int, 92 | help='character embedding dimension') 93 | parser.add_argument('--cnn_filter_sizes', default='2,3,4', type=str, 94 | help='cnn filter window sizes') 95 | parser.add_argument('--cnn_char_windows', default='3', type=str, 96 | help='cnn filter window sizes') 97 | parser.add_argument('--adv_reg_coeff', default='0.2', type=float, 98 | help='Regularization coefficient of adversarial loss') 99 | parser.add_argument('--va_reg_coeff', default='0.05', type=float, 100 | help='Regularization coefficient of virtual adversarial loss') 101 | parser.add_argument('--adv_perturb_norm_length', default='8.0', type=float, 102 | help='Norm length of adversarial perturbation to be') 103 | parser.add_argument('--va_perturb_norm_length', default='4.0', type=float, 104 | help='Norm length of virtual adversarial perturbation to be') 105 | parser.add_argument('--embedding_dropout', default='0.8', type=float, 106 | 
help='Keep dropout for embeddings') 107 | parser.add_argument('--embedding_normalize', action='store_false', 108 | help='Whether normalize the embeddings') 109 | 110 | # misc 111 | parser.add_argument('--restore', action='store_true', 112 | help='whether restore from previous trained model') 113 | parser.add_argument('--use_crf', action='store_false', 114 | help='whether use crf optimization layer') 115 | parser.add_argument('--use_document_level', action='store_false', 116 | help='whether use document level lstm layer') 117 | parser.add_argument('--use_document_attention', action='store_true', 118 | help='whether use document level attention') 119 | parser.add_argument('--use_attention', action='store_false', 120 | help='whether use attention based pooling') 121 | parser.add_argument('--use_cnn', action='store_false', 122 | help='whether use cnn or lstm for sentence representation') 123 | parser.add_argument('--train_embeddings', action='store_true', 124 | help='whether use cnn or lstm for sentence representation') 125 | parser.add_argument('--use_pretrained', action='store_false', 126 | help='whether use pre-trained word embeddings') 127 | parser.add_argument('--train_accuracy', action='store_true', 128 | help='whether report accuracy while training') 129 | parser.add_argument('--min_freq', default='20', type=int, 130 | help='remove tokens with small frequency for vocab') 131 | parser.add_argument('--num_augmentation', default='0', type=int, 132 | help='Number of abstracts for data augmentation for VADV') 133 | 134 | args = parser.parse_args() 135 | 136 | # args.filename_wordvec = os.path.join('/data/medg/misc/jindi/nlp/embeddings', 137 | # args.filename_wordvec) 138 | args.dir_output = os.path.join('results', args.dir_output) 139 | if not os.path.exists(args.dir_output): 140 | os.makedirs(args.dir_output) 141 | args.dir_model = os.path.join(args.dir_output, "model.weights") 142 | args.path_log = os.path.join(args.dir_output, "log.txt") 143 | 144 | # dataset 145 | if args.data_keyname == 'PICO': 146 | args.data_root = '../data/pico' 147 | args.filename_dev = os.path.join(args.data_root, 'dev.txt') 148 | args.filename_test = os.path.join(args.data_root, 'test.txt') 149 | args.filename_train = os.path.join(args.data_root, 'train.txt') 150 | elif args.data_keyname == 'nicta': 151 | args.data_root = '../data/nicta_piboso' 152 | args.filename_dev = os.path.join(args.data_root, 'test_clean.txt') 153 | args.filename_test = os.path.join(args.data_root, 'test_clean.txt') 154 | args.filename_train = os.path.join(args.data_root, 'train_clean.txt') 155 | 156 | # data augmentation dataset 157 | args.filename_aug = '../data/unlabeled_corpus' 158 | 159 | # vocab (created from dataset with build_data.py) 160 | args.filename_words = os.path.join('data', args.data_keyname, 'words.txt') 161 | args.filename_tags = os.path.join('data', args.data_keyname, 'tags.txt') 162 | # args.filename_chars = os.path.join('data', args.data_keyname, 'chars.txt') 163 | 164 | args.cnn_filter_sizes = [int(i) for i in args.cnn_filter_sizes.split(',')] 165 | args.cnn_char_windows = [int(i) for i in args.cnn_char_windows.split(',')] 166 | 167 | # directory for training outputs 168 | if not os.path.exists(os.path.join('data', args.data_keyname)): 169 | os.makedirs(os.path.join('data', args.data_keyname)) 170 | 171 | # directory for data output 172 | if not os.path.exists(args.dir_output): 173 | os.makedirs(args.dir_output) 174 | 175 | # create instance of logger 176 | args.logger = get_logger(args.path_log) 177 | 178 | # log the 
attributes 179 | msg = ', '.join(['{}: {}'.format(attr, getattr(args, attr)) for attr in dir(args) \ 180 | if not callable(getattr(args, attr)) and not attr.startswith("__")]) 181 | args.logger.info(msg) 182 | 183 | # load if requested (default) 184 | if load: 185 | args = load_(args) 186 | 187 | return args 188 | -------------------------------------------------------------------------------- /lstm_model/src/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .general_utils import get_logger 4 | from .data_utils import get_trimmed_wordvec_vectors, load_vocab, \ 5 | get_processing_word 6 | 7 | 8 | def Config(load=True): 9 | """Initialize hyperparameters and load vocabs 10 | 11 | Args: 12 | load_embeddings: (bool) if True, load embeddings into 13 | np array, else None 14 | 15 | """ 16 | def load_(args): 17 | """Loads vocabulary, processing functions and embeddings 18 | 19 | Supposes that build_data.py has been run successfully and that 20 | the corresponding files have been created (vocab and trimmed GloVe 21 | vectors) 22 | 23 | """ 24 | # 1. vocabulary 25 | args.vocab_words, args.vocab_words_freq = load_vocab(args.filename_words) 26 | args.vocab_tags = load_vocab(args.filename_tags) 27 | # args.vocab_chars = load_vocab(args.filename_chars) 28 | 29 | args.nwords = len(args.vocab_words) 30 | # args.nchars = len(args.vocab_chars) 31 | args.ntags = len(args.vocab_tags) 32 | 33 | # 2. get processing functions that map str -> id 34 | # args.use_chars = args.use_lstm_chars | args.use_cnn_chars 35 | args.processing_word = get_processing_word(args.vocab_words, lowercase=True, chars=False) 36 | args.processing_tag = get_processing_word(args.vocab_tags, 37 | lowercase=False, allow_unk=False) 38 | 39 | # 3. 
get pre-trained embeddings 40 | args.embeddings = (get_trimmed_wordvec_vectors(args.filename_wordvec_trimmed, args.vocab_words) 41 | if args.use_pretrained else None) 42 | args.dim_word = args.embeddings.shape[1] 43 | 44 | return args 45 | 46 | ## parse args 47 | parser = argparse.ArgumentParser() 48 | # training parameters 49 | parser.add_argument('--nepochs', default='100', type=int, 50 | help='number of epochs') 51 | parser.add_argument('--dropout', default='0.8', type=float, 52 | help='number of epochs') 53 | parser.add_argument('--batch_size', default='30', type=int, 54 | help='batch size') 55 | parser.add_argument('--batch_size_aug', default='30', type=int, 56 | help='batch size for data augmentation') 57 | parser.add_argument('--lr', default='0.001', type=float, 58 | help='learning rate') 59 | parser.add_argument('--lr_method', default='adam', type=str, 60 | help='optimization method') 61 | parser.add_argument('--lr_decay', default='0.99', type=float, 62 | help='learning rate decay rate') 63 | parser.add_argument('--clip', default='2', type=float, 64 | help='gradient clipping') 65 | parser.add_argument('--nepoch_no_imprv', default='4', type=int, 66 | help='number of epoch patience') 67 | parser.add_argument('--l2_reg_lambda', default='1e-6', type=float, 68 | help='l2 regularization coefficient') 69 | 70 | # data and results paths 71 | parser.add_argument('--dir_output', default='test', type=str, 72 | help='directory for output') 73 | parser.add_argument('--data_keyname', required=True, type=str, 74 | help='directory for output') 75 | parser.add_argument('--filename_wordvec_trimmed', default='', 76 | type=str, help='directory for trimmed word embeddings file') 77 | parser.add_argument('--filename_wordvec', default='/data/medg/misc/jindi/nlp/embeddings/word2vec/wikipedia-pubmed-and-PMC-w2v.txt', 78 | type=str, help='directory for original word embeddings file') 79 | 80 | # model hyperparameters 81 | parser.add_argument('--hidden_size_char', default='50', type=int, 82 | help='hidden size of character level lstm') 83 | parser.add_argument('--hidden_size_lstm_sentence', default='100', type=int, 84 | help='hidden size of sentence level lstm') 85 | parser.add_argument('--hidden_size_lstm_document', default='100', type=int, 86 | help='hidden size of document level lstm') 87 | parser.add_argument('--attention_size', default='400', type=int, 88 | help='attention vector size') 89 | parser.add_argument('--cnn_filter_num', default='300', type=int, 90 | help='number of cnn filters for each window size') 91 | parser.add_argument('--dim_char', default='50', type=int, 92 | help='character embedding dimension') 93 | parser.add_argument('--cnn_filter_sizes', default='2,3,4', type=str, 94 | help='cnn filter window sizes') 95 | parser.add_argument('--cnn_char_windows', default='3', type=str, 96 | help='cnn filter window sizes') 97 | parser.add_argument('--adv_reg_coeff', default='0.2', type=float, 98 | help='Regularization coefficient of adversarial loss') 99 | parser.add_argument('--va_reg_coeff', default='0.05', type=float, 100 | help='Regularization coefficient of virtual adversarial loss') 101 | parser.add_argument('--adv_perturb_norm_length', default='8.0', type=float, 102 | help='Norm length of adversarial perturbation to be') 103 | parser.add_argument('--va_perturb_norm_length', default='4.0', type=float, 104 | help='Norm length of virtual adversarial perturbation to be') 105 | parser.add_argument('--embedding_dropout', default='0.8', type=float, 106 | help='Keep dropout for embeddings') 107 | 
parser.add_argument('--embedding_normalize', action='store_false', 108 | help='Whether normalize the embeddings') 109 | 110 | # misc 111 | parser.add_argument('--restore', action='store_true', 112 | help='whether restore from previous trained model') 113 | parser.add_argument('--use_crf', action='store_false', 114 | help='whether use crf optimization layer') 115 | parser.add_argument('--use_document_level', action='store_false', 116 | help='whether use document level lstm layer') 117 | parser.add_argument('--use_document_attention', action='store_true', 118 | help='whether use document level attention') 119 | parser.add_argument('--use_attention', action='store_false', 120 | help='whether use attention based pooling') 121 | parser.add_argument('--use_cnn', action='store_false', 122 | help='whether use cnn or lstm for sentence representation') 123 | parser.add_argument('--train_embeddings', action='store_true', 124 | help='whether use cnn or lstm for sentence representation') 125 | parser.add_argument('--use_pretrained', action='store_false', 126 | help='whether use pre-trained word embeddings') 127 | parser.add_argument('--train_accuracy', action='store_true', 128 | help='whether report accuracy while training') 129 | parser.add_argument('--min_freq', default='20', type=int, 130 | help='remove tokens with small frequency for vocab') 131 | parser.add_argument('--num_augmentation', default='0', type=int, 132 | help='Number of abstracts for data augmentation for VADV') 133 | 134 | args = parser.parse_args() 135 | 136 | # args.filename_wordvec = os.path.join('/data/medg/misc/jindi/nlp/embeddings', 137 | # args.filename_wordvec) 138 | args.dir_output = os.path.join('results', args.dir_output) 139 | if not os.path.exists(args.dir_output): 140 | os.makedirs(args.dir_output) 141 | args.dir_model = os.path.join(args.dir_output, "model.weights") 142 | args.path_log = os.path.join(args.dir_output, "log.txt") 143 | 144 | # dataset 145 | if args.data_keyname == 'pico': 146 | args.data_root = '../data/pico' 147 | args.filename_dev = os.path.join(args.data_root, 'dev.txt') 148 | args.filename_test = os.path.join(args.data_root, 'test.txt') 149 | args.filename_train = os.path.join(args.data_root, 'train.txt') 150 | elif args.data_keyname == 'nicta': 151 | args.data_root = '../data/nicta_piboso' 152 | args.filename_dev = os.path.join(args.data_root, 'test.txt') 153 | args.filename_test = os.path.join(args.data_root, 'test.txt') 154 | args.filename_train = os.path.join(args.data_root, 'train.txt') 155 | 156 | # data augmentation dataset 157 | args.filename_aug = '../data/unlabeled_corpus' 158 | 159 | # vocab (created from dataset with build_data.py) 160 | args.filename_words = os.path.join('data', args.data_keyname, 'words.txt') 161 | args.filename_tags = os.path.join('data', args.data_keyname, 'tags.txt') 162 | args.filename_wordvec_trimmed = os.path.join('data', args.data_keyname, 'word.embeddings.trimmed.txt') 163 | # args.filename_chars = os.path.join('data', args.data_keyname, 'chars.txt') 164 | 165 | args.cnn_filter_sizes = [int(i) for i in args.cnn_filter_sizes.split(',')] 166 | args.cnn_char_windows = [int(i) for i in args.cnn_char_windows.split(',')] 167 | 168 | # directory for training outputs 169 | if not os.path.exists(os.path.join('data', args.data_keyname)): 170 | os.makedirs(os.path.join('data', args.data_keyname)) 171 | 172 | # directory for data output 173 | if not os.path.exists(args.dir_output): 174 | os.makedirs(args.dir_output) 175 | 176 | # create instance of logger 177 | args.logger = 
get_logger(args.path_log) 178 | 179 | # log the attributes 180 | msg = ', '.join(['{}: {}'.format(attr, getattr(args, attr)) for attr in dir(args) \ 181 | if not callable(getattr(args, attr)) and not attr.startswith("__")]) 182 | args.logger.info(msg) 183 | 184 | # load if requested (default) 185 | if load: 186 | args = load_(args) 187 | 188 | return args 189 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import shutil 13 | import tempfile 14 | import fnmatch 15 | from functools import wraps 16 | from hashlib import sha256 17 | import sys 18 | from io import open 19 | 20 | import boto3 21 | import requests 22 | from botocore.exceptions import ClientError 23 | from tqdm import tqdm 24 | 25 | CONFIG_NAME = "config.json" 26 | WEIGHTS_NAME = "pytorch_model.bin" 27 | 28 | try: 29 | from torch.hub import _get_torch_home 30 | torch_cache_home = _get_torch_home() 31 | except ImportError: 32 | torch_cache_home = os.path.expanduser( 33 | os.getenv('TORCH_HOME', os.path.join( 34 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 35 | default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') 36 | 37 | try: 38 | from urllib.parse import urlparse 39 | except ImportError: 40 | from urlparse import urlparse 41 | 42 | try: 43 | from pathlib import Path 44 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 45 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) 46 | except (AttributeError, ImportError): 47 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 48 | default_cache_path) 49 | 50 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 51 | 52 | 53 | def url_to_filename(url, etag=None): 54 | """ 55 | Convert `url` into a hashed filename in a repeatable way. 56 | If `etag` is specified, append its hash to the url's, delimited 57 | by a period. 58 | """ 59 | url_bytes = url.encode('utf-8') 60 | url_hash = sha256(url_bytes) 61 | filename = url_hash.hexdigest() 62 | 63 | if etag: 64 | etag_bytes = etag.encode('utf-8') 65 | etag_hash = sha256(etag_bytes) 66 | filename += '.' + etag_hash.hexdigest() 67 | 68 | return filename 69 | 70 | 71 | def filename_to_url(filename, cache_dir=None): 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
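    The metadata is read from the ``<filename>.json`` file that
    ``get_from_cache`` writes next to each cached file.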
75 | """ 76 | if cache_dir is None: 77 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 78 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 79 | cache_dir = str(cache_dir) 80 | 81 | cache_path = os.path.join(cache_dir, filename) 82 | if not os.path.exists(cache_path): 83 | raise EnvironmentError("file {} not found".format(cache_path)) 84 | 85 | meta_path = cache_path + '.json' 86 | if not os.path.exists(meta_path): 87 | raise EnvironmentError("file {} not found".format(meta_path)) 88 | 89 | with open(meta_path, encoding="utf-8") as meta_file: 90 | metadata = json.load(meta_file) 91 | url = metadata['url'] 92 | etag = metadata['etag'] 93 | 94 | return url, etag 95 | 96 | 97 | def cached_path(url_or_filename, cache_dir=None): 98 | """ 99 | Given something that might be a URL (or might be a local path), 100 | determine which. If it's a URL, download the file and cache it, and 101 | return the path to the cached file. If it's already a local path, 102 | make sure the file exists and then return the path. 103 | """ 104 | if cache_dir is None: 105 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 106 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 107 | url_or_filename = str(url_or_filename) 108 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 109 | cache_dir = str(cache_dir) 110 | 111 | parsed = urlparse(url_or_filename) 112 | 113 | if parsed.scheme in ('http', 'https', 's3'): 114 | # URL, so get it from the cache (downloading if necessary) 115 | return get_from_cache(url_or_filename, cache_dir) 116 | elif os.path.exists(url_or_filename): 117 | # File, and it exists. 118 | return url_or_filename 119 | elif parsed.scheme == '': 120 | # File, but it doesn't exist. 121 | raise EnvironmentError("file {} not found".format(url_or_filename)) 122 | else: 123 | # Something unknown 124 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 125 | 126 | 127 | def split_s3_path(url): 128 | """Split a full s3 path into the bucket name and path.""" 129 | parsed = urlparse(url) 130 | if not parsed.netloc or not parsed.path: 131 | raise ValueError("bad s3 path {}".format(url)) 132 | bucket_name = parsed.netloc 133 | s3_path = parsed.path 134 | # Remove '/' at beginning of path. 135 | if s3_path.startswith("/"): 136 | s3_path = s3_path[1:] 137 | return bucket_name, s3_path 138 | 139 | 140 | def s3_request(func): 141 | """ 142 | Wrapper function for s3 requests in order to create more helpful error 143 | messages. 
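    A 404 ``ClientError`` is re-raised as an ``EnvironmentError`` ("file ... not
    found"); any other client error propagates unchanged.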
144 | """ 145 | 146 | @wraps(func) 147 | def wrapper(url, *args, **kwargs): 148 | try: 149 | return func(url, *args, **kwargs) 150 | except ClientError as exc: 151 | if int(exc.response["Error"]["Code"]) == 404: 152 | raise EnvironmentError("file {} not found".format(url)) 153 | else: 154 | raise 155 | 156 | return wrapper 157 | 158 | 159 | @s3_request 160 | def s3_etag(url): 161 | """Check ETag on S3 object.""" 162 | s3_resource = boto3.resource("s3") 163 | bucket_name, s3_path = split_s3_path(url) 164 | s3_object = s3_resource.Object(bucket_name, s3_path) 165 | return s3_object.e_tag 166 | 167 | 168 | @s3_request 169 | def s3_get(url, temp_file): 170 | """Pull a file directly from S3.""" 171 | s3_resource = boto3.resource("s3") 172 | bucket_name, s3_path = split_s3_path(url) 173 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 174 | 175 | 176 | def http_get(url, temp_file): 177 | req = requests.get(url, stream=True) 178 | content_length = req.headers.get('Content-Length') 179 | total = int(content_length) if content_length is not None else None 180 | progress = tqdm(unit="B", total=total) 181 | for chunk in req.iter_content(chunk_size=1024): 182 | if chunk: # filter out keep-alive new chunks 183 | progress.update(len(chunk)) 184 | temp_file.write(chunk) 185 | progress.close() 186 | 187 | 188 | def get_from_cache(url, cache_dir=None): 189 | """ 190 | Given a URL, look for the corresponding dataset in the local cache. 191 | If it's not there, download it. Then return the path to the cached file. 192 | """ 193 | if cache_dir is None: 194 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 195 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 196 | cache_dir = str(cache_dir) 197 | 198 | if not os.path.exists(cache_dir): 199 | os.makedirs(cache_dir) 200 | 201 | # Get eTag to add to filename, if it exists. 202 | if url.startswith("s3://"): 203 | etag = s3_etag(url) 204 | else: 205 | try: 206 | response = requests.head(url, allow_redirects=True) 207 | if response.status_code != 200: 208 | etag = None 209 | else: 210 | etag = response.headers.get("ETag") 211 | except EnvironmentError: 212 | etag = None 213 | 214 | if sys.version_info[0] == 2 and etag is not None: 215 | etag = etag.decode('utf-8') 216 | filename = url_to_filename(url, etag) 217 | 218 | # get cache path to put the file 219 | cache_path = os.path.join(cache_dir, filename) 220 | 221 | # If we don't have a connection (etag is None) and can't identify the file 222 | # try to get the last downloaded one 223 | if not os.path.exists(cache_path) and etag is None: 224 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 225 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 226 | if matching_files: 227 | cache_path = os.path.join(cache_dir, matching_files[-1]) 228 | 229 | if not os.path.exists(cache_path): 230 | # Download to temporary file, then copy to cache dir once finished. 231 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
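        # The steps below: stream the download into a NamedTemporaryFile, flush and
        # rewind it, copy it into the cache under its hashed filename, then write a
        # '<filename>.json' metadata file recording the original url and ETag.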
232 | with tempfile.NamedTemporaryFile() as temp_file: 233 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 234 | 235 | # GET file object 236 | if url.startswith("s3://"): 237 | s3_get(url, temp_file) 238 | else: 239 | http_get(url, temp_file) 240 | 241 | # we are copying the file before closing it, so flush to avoid truncation 242 | temp_file.flush() 243 | # shutil.copyfileobj() starts at the current position, so go to the start 244 | temp_file.seek(0) 245 | 246 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 247 | with open(cache_path, 'wb') as cache_file: 248 | shutil.copyfileobj(temp_file, cache_file) 249 | 250 | logger.info("creating metadata file for %s", cache_path) 251 | meta = {'url': url, 'etag': etag} 252 | meta_path = cache_path + '.json' 253 | with open(meta_path, 'w') as meta_file: 254 | output_string = json.dumps(meta) 255 | if sys.version_info[0] == 2 and isinstance(output_string, str): 256 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 257 | meta_file.write(output_string) 258 | 259 | logger.info("removing temp file %s", temp_file.name) 260 | 261 | return cache_path 262 | 263 | 264 | def read_set_from_file(filename): 265 | ''' 266 | Extract a de-duped collection (set) of text from a file. 267 | Expected file format is one item per line. 268 | ''' 269 | collection = set() 270 | with open(filename, 'r', encoding='utf-8') as file_: 271 | for line in file_: 272 | collection.add(line.rstrip()) 273 | return collection 274 | 275 | 276 | def get_file_extension(path, dot=True, lower=True): 277 | ext = os.path.splitext(path)[1] 278 | ext = ext if dot else ext[1:] 279 | return ext.lower() if lower else ext -------------------------------------------------------------------------------- /lstm_model/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Adversarial losses for text models.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | # Dependency imports 21 | 22 | from six.moves import xrange 23 | import tensorflow as tf 24 | 25 | # # Adversarial and virtual adversarial training parameters. 26 | # flags.DEFINE_float('perturb_norm_length', 5.0, 27 | # 'Norm length of adversarial perturbation to be ' 28 | # 'optimized with validation. ' 29 | # '5.0 is optimal on IMDB with virtual adversarial training. ') 30 | 31 | # Virtual adversarial training parameters 32 | num_power_iteration = 1 33 | small_constant_for_finite_diff = 1e-1 34 | 35 | # # Parameters for building the graph 36 | # flags.DEFINE_string('adv_training_method', None, 37 | # 'The flag which specifies training method. ' 38 | # '"" : non-adversarial training (e.g. 
for running the ' 39 | # ' semi-supervised sequence learning model) ' 40 | # '"rp" : random perturbation training ' 41 | # '"at" : adversarial training ' 42 | # '"vat" : virtual adversarial training ' 43 | # '"atvat" : at + vat ') 44 | # flags.DEFINE_float('adv_reg_coeff', 1.0, 45 | # 'Regularization coefficient of adversarial loss.') 46 | 47 | 48 | def random_perturbation_loss(embedded, length, loss_fn): 49 | """Adds noise to embeddings and recomputes classification loss.""" 50 | noise = tf.random_normal(shape=tf.shape(embedded)) 51 | perturb = _scale_l2(_mask_by_length(noise, length), FLAGS.perturb_norm_length) 52 | return loss_fn(embedded + perturb) 53 | 54 | 55 | def adversarial_loss(embedded, loss, loss_fn, perturb_norm_length): 56 | """Adds gradient to embedding and recomputes classification loss.""" 57 | grad, = tf.gradients( 58 | loss, 59 | embedded, 60 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 61 | grad = tf.stop_gradient(grad) 62 | perturb = _scale_l2_dim4(grad, perturb_norm_length) 63 | return loss_fn(embedded + perturb) 64 | 65 | 66 | def virtual_adversarial_loss(logits, embedded, num_classes, sentence_length, document_length, 67 | logits_from_embedding_fn, perturb_norm_length): 68 | """Virtual adversarial loss. 69 | Computes virtual adversarial perturbation by finite difference method and 70 | power iteration, adds it to the embedding, and computes the KL divergence 71 | between the new logits and the original logits. 72 | Args: 73 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 74 | num_classes=2, otherwise m=num_classes. 75 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 76 | inputs: VatxtInput. 77 | logits_from_embedding_fn: callable that takes embeddings and returns 78 | classifier logits. 79 | Returns: 80 | kl: float scalar. 81 | """ 82 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 83 | logits = tf.stop_gradient(logits) 84 | 85 | # Only care about the KL divergence on the final timestep. 86 | # weights = inputs.eos_weights 87 | # assert weights is not None 88 | # if FLAGS.single_label: 89 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 90 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 91 | 92 | # Initialize perturbation with random noise. 93 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 94 | d = tf.random_normal(shape=tf.shape(embedded)) 95 | 96 | # Perform finite difference method and power iteration. 97 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 98 | # Adding small noise to input and taking gradient with respect to the noise 99 | # corresponds to 1 power iteration. 
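  # Sketch of the update performed by the loop below, writing e = embedded,
  # xi = small_constant_for_finite_diff and eps = perturb_norm_length:
  #   d      <- xi * d / ||d||_2                      (scale the masked noise)
  #   d      <- grad_d KL( p(.|e) || p(.|e + d) )     (one finite-difference power step)
  #   r_vadv <- eps * d / ||d||_2
  # and the returned loss is KL( p(.|e) || p(.|e + r_vadv) ).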
100 | for _ in xrange(num_power_iteration): 101 | d = _scale_l2_dim4( 102 | _mask_by_length_4D(d, sentence_length), small_constant_for_finite_diff) 103 | 104 | d_logits = logits_from_embedding_fn(embedded + d, sentence_length, document_length) 105 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 106 | d, = tf.gradients( 107 | kl, 108 | d, 109 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 110 | d = tf.stop_gradient(d) 111 | 112 | perturb = _scale_l2_dim4(d, perturb_norm_length) 113 | vadv_logits = logits_from_embedding_fn(embedded + perturb, sentence_length, document_length) 114 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 115 | 116 | 117 | def random_perturbation_loss_bidir(embedded, length, loss_fn): 118 | """Adds noise to embeddings and recomputes classification loss.""" 119 | noise = [tf.random_normal(shape=tf.shape(emb)) for emb in embedded] 120 | masked = [_mask_by_length(n, length) for n in noise] 121 | scaled = [_scale_l2(m, FLAGS.perturb_norm_length) for m in masked] 122 | return loss_fn([e + s for (e, s) in zip(embedded, scaled)]) 123 | 124 | 125 | def adversarial_loss_bidir(embedded, loss, loss_fn): 126 | """Adds gradient to embeddings and recomputes classification loss.""" 127 | grads = tf.gradients( 128 | loss, 129 | embedded, 130 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 131 | adv_exs = [ 132 | emb + _scale_l2(tf.stop_gradient(g), FLAGS.perturb_norm_length) 133 | for emb, g in zip(embedded, grads) 134 | ] 135 | return loss_fn(adv_exs) 136 | 137 | 138 | def virtual_adversarial_loss_bidir(logits, embedded, inputs, 139 | logits_from_embedding_fn): 140 | """Virtual adversarial loss for bidirectional models.""" 141 | logits = tf.stop_gradient(logits) 142 | f_inputs, _ = inputs 143 | weights = f_inputs.eos_weights 144 | if FLAGS.single_label: 145 | indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1) 146 | weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1) 147 | assert weights is not None 148 | 149 | perturbs = [ 150 | _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) 151 | for emb in embedded 152 | ] 153 | for _ in xrange(num_power_iteration): 154 | perturbs = [ 155 | _scale_l2(d, small_constant_for_finite_diff) for d in perturbs 156 | ] 157 | d_logits = logits_from_embedding_fn( 158 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 159 | kl = _kl_divergence_with_logits(logits, d_logits, weights) 160 | perturbs = tf.gradients( 161 | kl, 162 | perturbs, 163 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 164 | perturbs = [tf.stop_gradient(d) for d in perturbs] 165 | 166 | perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs] 167 | vadv_logits = logits_from_embedding_fn( 168 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 169 | return _kl_divergence_with_logits(logits, vadv_logits, weights) 170 | 171 | 172 | def _mask_by_length(t, length): 173 | """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,].""" 174 | maxlen = t.get_shape().as_list()[1] 175 | 176 | # Subtract 1 from length to prevent the perturbation from going on 'eos' 177 | mask = tf.sequence_mask(length - 1, maxlen=maxlen) 178 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 179 | # shape(mask) = (batch, num_timesteps, 1) 180 | return t * mask 181 | 182 | 183 | def _mask_by_length_4D(t, length): 184 | """Mask t, 4-D [batch, time1, time2, dim], by length, 2-D [batch, time1].""" 185 | maxlen = 
t.get_shape().as_list()[2] 186 | 187 | mask = tf.sequence_mask(length, maxlen=maxlen) 188 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 189 | # shape(mask) = (batch, num_timesteps, num_timesteps, 1) 190 | return t * mask 191 | 192 | 193 | def _scale_l2(x, norm_length): 194 | # shape(x) = (batch, num_timesteps, d) 195 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 196 | # 2norm(x) = a * 2norm(x/a) 197 | # Scale over the full sequence, dims (1, 2) 198 | alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12 199 | l2_norm = alpha * tf.sqrt( 200 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6) 201 | x_unit = x / l2_norm 202 | return norm_length * x_unit 203 | 204 | 205 | def _scale_l2_dim4(x, norm_length): 206 | # shape(x) = (batch, num_timesteps, num_timesteps, d) 207 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 208 | # 2norm(x) = a * 2norm(x/a) 209 | # Scale over the full sequence, dims (1, 2, 3) 210 | alpha = tf.reduce_max(tf.abs(x), (1, 2, 3), keepdims=True) + 1e-12 211 | l2_norm = alpha * tf.sqrt( 212 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2, 3), keepdims=True) + 1e-6) 213 | x_unit = x / l2_norm 214 | return norm_length * x_unit 215 | 216 | 217 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 218 | """Returns weighted KL divergence between distributions q and p. 219 | Args: 220 | q_logits: logits for 1st argument of KL divergence shape 221 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 222 | [batch_size, num_timesteps] if num_classes == 2. 223 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 224 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 225 | Elements should be 1.0 only on end of sequences 226 | Returns: 227 | KL: float scalar. 228 | """ 229 | # For logistic regression 230 | if num_classes == 2: 231 | q = tf.nn.sigmoid(q_logits) 232 | kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 234 | kl = tf.squeeze(kl, 2) 235 | 236 | # For softmax regression 237 | else: 238 | q = tf.nn.softmax(q_logits) 239 | kl = tf.reduce_sum( 240 | q * (tf.nn.log_softmax(q_logits) - tf.nn.log_softmax(p_logits)), -1) 241 | 242 | # num_labels = tf.reduce_sum(weights) 243 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 244 | 245 | kl.get_shape().assert_has_rank(2) 246 | # weights.get_shape().assert_has_rank(2) 247 | 248 | loss = tf.identity(tf.reduce_mean(kl), name='kl') 249 | return loss -------------------------------------------------------------------------------- /lstm_model/src/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Adversarial losses for text models.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | # Dependency imports 21 | 22 | from six.moves import xrange 23 | import tensorflow as tf 24 | 25 | # # Adversarial and virtual adversarial training parameters. 26 | # flags.DEFINE_float('perturb_norm_length', 5.0, 27 | # 'Norm length of adversarial perturbation to be ' 28 | # 'optimized with validation. ' 29 | # '5.0 is optimal on IMDB with virtual adversarial training. ') 30 | 31 | # Virtual adversarial training parameters 32 | num_power_iteration = 1 33 | small_constant_for_finite_diff = 1e-1 34 | 35 | # # Parameters for building the graph 36 | # flags.DEFINE_string('adv_training_method', None, 37 | # 'The flag which specifies training method. ' 38 | # '"" : non-adversarial training (e.g. for running the ' 39 | # ' semi-supervised sequence learning model) ' 40 | # '"rp" : random perturbation training ' 41 | # '"at" : adversarial training ' 42 | # '"vat" : virtual adversarial training ' 43 | # '"atvat" : at + vat ') 44 | # flags.DEFINE_float('adv_reg_coeff', 1.0, 45 | # 'Regularization coefficient of adversarial loss.') 46 | 47 | 48 | def random_perturbation_loss(embedded, length, loss_fn): 49 | """Adds noise to embeddings and recomputes classification loss.""" 50 | noise = tf.random_normal(shape=tf.shape(embedded)) 51 | perturb = _scale_l2(_mask_by_length(noise, length), FLAGS.perturb_norm_length) 52 | return loss_fn(embedded + perturb) 53 | 54 | 55 | def adversarial_loss(embedded, loss, loss_fn, perturb_norm_length): 56 | """Adds gradient to embedding and recomputes classification loss.""" 57 | grad, = tf.gradients( 58 | loss, 59 | embedded, 60 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 61 | grad = tf.stop_gradient(grad) 62 | perturb = _scale_l2_dim4(grad, perturb_norm_length) 63 | return loss_fn(embedded + perturb) 64 | 65 | 66 | def virtual_adversarial_loss(logits, embedded, num_classes, sentence_length, document_length, 67 | logits_from_embedding_fn, perturb_norm_length): 68 | """Virtual adversarial loss. 69 | Computes virtual adversarial perturbation by finite difference method and 70 | power iteration, adds it to the embedding, and computes the KL divergence 71 | between the new logits and the original logits. 72 | Args: 73 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 74 | num_classes=2, otherwise m=num_classes. 75 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 76 | inputs: VatxtInput. 77 | logits_from_embedding_fn: callable that takes embeddings and returns 78 | classifier logits. 79 | Returns: 80 | kl: float scalar. 81 | """ 82 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 83 | logits = tf.stop_gradient(logits) 84 | 85 | # Only care about the KL divergence on the final timestep. 86 | # weights = inputs.eos_weights 87 | # assert weights is not None 88 | # if FLAGS.single_label: 89 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 90 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 91 | 92 | # Initialize perturbation with random noise. 93 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 94 | d = tf.random_normal(shape=tf.shape(embedded)) 95 | 96 | # Perform finite difference method and power iteration. 
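  # (Same scheme as lstm_model/adversarial_losses.py: scale the masked noise d to
  #  norm xi = small_constant_for_finite_diff, take the gradient of the KL w.r.t. d,
  #  then rescale to eps = perturb_norm_length before computing the final KL term.)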
97 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 98 | # Adding small noise to input and taking gradient with respect to the noise 99 | # corresponds to 1 power iteration. 100 | for _ in xrange(num_power_iteration): 101 | d = _scale_l2_dim4( 102 | _mask_by_length_4D(d, sentence_length), small_constant_for_finite_diff) 103 | 104 | d_logits = logits_from_embedding_fn(embedded + d, sentence_length, document_length) 105 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 106 | d, = tf.gradients( 107 | kl, 108 | d, 109 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 110 | d = tf.stop_gradient(d) 111 | 112 | perturb = _scale_l2_dim4(d, perturb_norm_length) 113 | vadv_logits = logits_from_embedding_fn(embedded + perturb, sentence_length, document_length) 114 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 115 | 116 | 117 | def random_perturbation_loss_bidir(embedded, length, loss_fn): 118 | """Adds noise to embeddings and recomputes classification loss.""" 119 | noise = [tf.random_normal(shape=tf.shape(emb)) for emb in embedded] 120 | masked = [_mask_by_length(n, length) for n in noise] 121 | scaled = [_scale_l2(m, FLAGS.perturb_norm_length) for m in masked] 122 | return loss_fn([e + s for (e, s) in zip(embedded, scaled)]) 123 | 124 | 125 | def adversarial_loss_bidir(embedded, loss, loss_fn): 126 | """Adds gradient to embeddings and recomputes classification loss.""" 127 | grads = tf.gradients( 128 | loss, 129 | embedded, 130 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 131 | adv_exs = [ 132 | emb + _scale_l2(tf.stop_gradient(g), FLAGS.perturb_norm_length) 133 | for emb, g in zip(embedded, grads) 134 | ] 135 | return loss_fn(adv_exs) 136 | 137 | 138 | def virtual_adversarial_loss_bidir(logits, embedded, inputs, 139 | logits_from_embedding_fn): 140 | """Virtual adversarial loss for bidirectional models.""" 141 | logits = tf.stop_gradient(logits) 142 | f_inputs, _ = inputs 143 | weights = f_inputs.eos_weights 144 | if FLAGS.single_label: 145 | indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1) 146 | weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1) 147 | assert weights is not None 148 | 149 | perturbs = [ 150 | _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) 151 | for emb in embedded 152 | ] 153 | for _ in xrange(num_power_iteration): 154 | perturbs = [ 155 | _scale_l2(d, small_constant_for_finite_diff) for d in perturbs 156 | ] 157 | d_logits = logits_from_embedding_fn( 158 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 159 | kl = _kl_divergence_with_logits(logits, d_logits, weights) 160 | perturbs = tf.gradients( 161 | kl, 162 | perturbs, 163 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 164 | perturbs = [tf.stop_gradient(d) for d in perturbs] 165 | 166 | perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs] 167 | vadv_logits = logits_from_embedding_fn( 168 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 169 | return _kl_divergence_with_logits(logits, vadv_logits, weights) 170 | 171 | 172 | def _mask_by_length(t, length): 173 | """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,].""" 174 | maxlen = t.get_shape().as_list()[1] 175 | 176 | # Subtract 1 from length to prevent the perturbation from going on 'eos' 177 | mask = tf.sequence_mask(length - 1, maxlen=maxlen) 178 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 179 | # shape(mask) = (batch, num_timesteps, 1) 180 | 
return t * mask 181 | 182 | 183 | def _mask_by_length_4D(t, length): 184 | """Mask t, 4-D [batch, time1, time2, dim], by length, 2-D [batch, time1].""" 185 | maxlen = t.get_shape().as_list()[2] 186 | 187 | mask = tf.sequence_mask(length, maxlen=maxlen) 188 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 189 | # shape(mask) = (batch, num_timesteps, num_timesteps, 1) 190 | return t * mask 191 | 192 | 193 | def _scale_l2(x, norm_length): 194 | # shape(x) = (batch, num_timesteps, d) 195 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 196 | # 2norm(x) = a * 2norm(x/a) 197 | # Scale over the full sequence, dims (1, 2) 198 | alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12 199 | l2_norm = alpha * tf.sqrt( 200 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6) 201 | x_unit = x / l2_norm 202 | return norm_length * x_unit 203 | 204 | 205 | def _scale_l2_dim4(x, norm_length): 206 | # shape(x) = (batch, num_timesteps, num_timesteps, d) 207 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 208 | # 2norm(x) = a * 2norm(x/a) 209 | # Scale over the full sequence, dims (1, 2, 3) 210 | alpha = tf.reduce_max(tf.abs(x), (1, 2, 3), keepdims=True) + 1e-12 211 | l2_norm = alpha * tf.sqrt( 212 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2, 3), keepdims=True) + 1e-6) 213 | x_unit = x / l2_norm 214 | return norm_length * x_unit 215 | 216 | 217 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 218 | """Returns weighted KL divergence between distributions q and p. 219 | Args: 220 | q_logits: logits for 1st argument of KL divergence shape 221 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 222 | [batch_size, num_timesteps] if num_classes == 2. 223 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 224 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 225 | Elements should be 1.0 only on end of sequences 226 | Returns: 227 | KL: float scalar. 228 | """ 229 | # For logistic regression 230 | if num_classes == 2: 231 | q = tf.nn.sigmoid(q_logits) 232 | kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 234 | kl = tf.squeeze(kl, 2) 235 | 236 | # For softmax regression 237 | else: 238 | q = tf.nn.softmax(q_logits) 239 | kl = tf.reduce_sum( 240 | q * (tf.nn.log_softmax(q_logits) - tf.nn.log_softmax(p_logits)), -1) 241 | 242 | # num_labels = tf.reduce_sum(weights) 243 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 244 | 245 | kl.get_shape().assert_has_rank(2) 246 | # weights.get_shape().assert_has_rank(2) 247 | 248 | loss = tf.identity(tf.reduce_mean(kl), name='kl') 249 | return loss -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/san.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
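# Note: this module bundles the SAN-style decoder components shipped with the BERT
# code in this repo: a pairwise Classifier head plus SANClassifier/SANClassifier2,
# which run multi-turn attention over one input conditioned on a summary of the other.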
2 | import torch 3 | import random 4 | import torch.nn as nn 5 | from torch.nn.utils import weight_norm 6 | from torch.nn.parameter import Parameter 7 | import torch.nn.functional as F 8 | from .dropout_wrapper import DropoutWrapper 9 | from .similarity import FlatSimilarityWrapper, SelfAttnWrapper, DualAttentionWrapper, AttentionWrapper 10 | from .my_optim import weight_norm as WN 11 | from .common import activation, init_wrapper 12 | 13 | SMALL_POS_NUM=1.0e-30 14 | 15 | def generate_mask(new_data, dropout_p=0.0, is_training=False): 16 | if not is_training: dropout_p = 0.0 17 | new_data = (1-dropout_p) * (new_data.zero_() + 1) 18 | for i in range(new_data.size(0)): 19 | one = random.randint(0, new_data.size(1)-1) 20 | new_data[i][one] = 1 21 | mask = 1.0/(1 - dropout_p) * torch.bernoulli(new_data) 22 | mask.requires_grad = False 23 | return mask 24 | 25 | 26 | def masked_select(tensor, mask): 27 | mask_len = mask.sum(dim=-1) 28 | max_seq_len = mask_len.max() 29 | new_tensor = torch.zeros(tensor.size(0), max_seq_len, tensor.size(-1)).cuda() 30 | new_mask = torch.ones(tensor.size(0), max_seq_len).cuda() 31 | for i in range(tensor.size(0)): 32 | new_tensor[i, :mask_len[i]] = torch.masked_select(tensor[i], mask[i].unsqueeze(1).expand_as(tensor[i])).view(-1, tensor.size(-1)) 33 | new_mask[i, :mask_len[i]] = 0 34 | return new_tensor, new_mask.byte() 35 | 36 | 37 | class Classifier(nn.Module): 38 | def __init__(self, x_size, y_size, opt, prefix='decoder', dropout=None): 39 | super(Classifier, self).__init__() 40 | self.opt = opt 41 | if dropout is None: 42 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(prefix), 0)) 43 | else: 44 | self.dropout = dropout 45 | self.merge_opt = opt.get('{}_merge_opt'.format(prefix), 0) 46 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 47 | 48 | if self.merge_opt == 1: 49 | self.proj = nn.Linear(x_size * 4, y_size) 50 | else: 51 | self.proj = nn.Linear(x_size * 2, y_size) 52 | 53 | if self.weight_norm_on: 54 | self.proj = weight_norm(self.proj) 55 | 56 | def forward(self, x1, x2, mask=None, activation=None): 57 | seq_len = None 58 | if len(x1.size()) == 3: 59 | bz, seq_len, hidden_size = x1.size() 60 | x1 = x1.contiguous().view(-1, hidden_size) 61 | x2 = x2.contiguous().view(-1, hidden_size) 62 | 63 | if self.merge_opt == 1: 64 | x = torch.cat([x1, x2, (x1 - x2).abs(), x1 * x2], 1) 65 | else: 66 | x = torch.cat([x1, x2], 1) 67 | x = self.dropout(x) 68 | if activation: 69 | scores = activation(self.proj(x)) 70 | else: 71 | scores = self.proj(x) 72 | 73 | if seq_len: 74 | return scores.view(bz, seq_len, -1) 75 | else: 76 | return scores 77 | 78 | 79 | class SANClassifier(nn.Module): 80 | """Implementation of Stochastic Answer Networks for Natural Language Inference, Xiaodong Liu, Kevin Duh and Jianfeng Gao 81 | https://arxiv.org/abs/1804.07888 82 | """ 83 | def __init__(self, x_size, h_size, label_size, opt={}, prefix='decoder', dropout=None): 84 | super(SANClassifier, self).__init__() 85 | self.prefix = prefix 86 | if dropout is None: 87 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(self.prefix), 0)) 88 | else: 89 | self.dropout = dropout 90 | self.query_wsum = SelfAttnWrapper(x_size, prefix='mem_cum', opt=opt, dropout=self.dropout) 91 | self.attn = FlatSimilarityWrapper(x_size, h_size, prefix, opt, self.dropout) 92 | self.rnn_type = '{}{}'.format(opt.get('{}_rnn_type'.format(prefix), 'gru').upper(), 'Cell') 93 | self.rnn = getattr(nn, self.rnn_type)(x_size, h_size) 94 | self.num_turn = 
opt.get('{}_num_turn'.format(prefix), 5) 95 | self.opt = opt 96 | self.mem_random_drop = opt.get('{}_mem_drop_p'.format(prefix), 0) 97 | self.mem_type = opt.get('{}_mem_type'.format(prefix), 0) 98 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 99 | self.label_size = label_size 100 | self.dump_state = opt.get('dump_state_on', False) 101 | self.alpha = Parameter(torch.zeros(1, 1), requires_grad=False) 102 | # self.hyp_attn = None 103 | # if opt.get('hyp_attn_premise', 0): 104 | # self.hyp_attn = AttentionWrapper(x_size, h_size, prefix=prefix, opt=opt, dropout=self.dropout) 105 | # self.hyp_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 106 | self.f = activation(opt.get('{}_activation'.format(self.prefix), 'relu')) 107 | if self.weight_norm_on: 108 | self.rnn = WN(self.rnn) 109 | 110 | self.classifier = Classifier(x_size, 1, opt, prefix=prefix, dropout=self.dropout) 111 | 112 | def forward(self, x, h0, x_mask=None, h_mask=None, is_training=True): 113 | # if self.hyp_attn: 114 | # h_attn = self.hyp_attn(h0, x, key_padding_mask=x_mask) 115 | # h0 = self.hyp_merge(h0, h_attn, activation=self.f) 116 | 117 | h0 = self.query_wsum(h0, h_mask) 118 | if type(self.rnn) is nn.LSTMCell: 119 | c0 = h0.new(h0.size()).zero_() 120 | scores_list = [] 121 | for turn in range(self.num_turn): 122 | att_scores = self.attn(x, h0, x_mask) 123 | x_sum = torch.bmm(F.softmax(att_scores, 1).unsqueeze(1), x).squeeze(1) 124 | scores = self.classifier(x_sum, h0) 125 | scores_list.append(scores) 126 | # next turn 127 | if self.rnn is not None: 128 | h0 = self.dropout(h0) 129 | if type(self.rnn) is nn.LSTMCell: 130 | h0, c0 = self.rnn(x_sum, (h0, c0)) 131 | else: 132 | h0 = self.rnn(x_sum, h0) 133 | if self.mem_type == 1: 134 | batch_size = x.size(0) // self.label_size 135 | mask = generate_mask(self.alpha.data.new(batch_size, self.num_turn), self.mem_random_drop, is_training) 136 | mask = [m.contiguous() for m in torch.unbind(mask, 1)] 137 | tmp_scores_list = [mask[idx].view(batch_size, 1).expand_as(inp.view(-1, self.label_size)) 138 | * F.softmax(inp.view(-1, self.label_size), 1) 139 | for idx, inp in enumerate(scores_list)] 140 | scores = torch.stack(tmp_scores_list, 2) 141 | scores = torch.mean(scores, 2) 142 | scores = torch.log(scores) 143 | else: 144 | scores = scores_list[-1] 145 | if self.dump_state: 146 | return scores, scores_list 147 | else: 148 | return scores 149 | 150 | 151 | class SANClassifier2(nn.Module): 152 | """Implementation of Stochastic Answer Networks for Natural Language Inference, Xiaodong Liu, Kevin Duh and Jianfeng Gao 153 | https://arxiv.org/abs/1804.07888 154 | """ 155 | def __init__(self, x_size, h_size, label_size, opt={}, prefix='decoder', dropout=None): 156 | super(SANClassifier2, self).__init__() 157 | self.prefix = prefix 158 | if dropout is None: 159 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(self.prefix), 0)) 160 | else: 161 | self.dropout = dropout 162 | self.dual_attn = DualAttentionWrapper(x_size, h_size, prefix, opt, self.dropout) 163 | self.query_wsum = SelfAttnWrapper(x_size, prefix='mem_cum', opt=opt, dropout=self.dropout) 164 | self.attn = FlatSimilarityWrapper(x_size, h_size, prefix, opt, self.dropout) 165 | self.rnn_type = '{}{}'.format(opt.get('{}_rnn_type'.format(prefix), 'gru').upper(), 'Cell') 166 | self.rnn = getattr(nn, self.rnn_type)(x_size, h_size) 167 | self.num_turn = opt.get('{}_num_turn'.format(prefix), 5) 168 | self.opt = opt 169 | self.mem_random_drop = 
opt.get('{}_mem_drop_p'.format(prefix), 0) 170 | self.mem_type = opt.get('{}_mem_type'.format(prefix), 0) 171 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 172 | self.label_size = label_size 173 | self.dump_state = opt.get('dump_state_on', False) 174 | self.alpha = Parameter(torch.zeros(1, 1), requires_grad=False) 175 | self.f = activation(opt.get('{}_activation'.format(self.prefix), 'relu')) 176 | self.hyp_first = opt.get('{}_hyp_first'.format(prefix), 1) 177 | self.hyp_raw = opt.get('{}_hyp_raw'.format(prefix), 0) 178 | if self.weight_norm_on: 179 | self.rnn = WN(self.rnn) 180 | 181 | self.classifier = Classifier(x_size, 1, opt, prefix=prefix, dropout=self.dropout) 182 | 183 | self.premise_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 184 | self.hyp_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 185 | 186 | 187 | def forward(self, x, h, x_mask=None, h_mask=None, is_training=True): 188 | if self.hyp_first and self.hyp_raw: 189 | pass 190 | elif self.hyp_first and not self.hyp_raw: 191 | _, h_attn = self.dual_attn(x, h, x_mask, h_mask) 192 | 193 | # x_prime = self.premise_merge(x, x_attn, activation=self.f) 194 | h = self.hyp_merge(h, h_attn, activation=self.f) 195 | else: 196 | raise NotImplementedError 197 | 198 | # if self.num_turn == 0: 199 | # scores = self.classifier(x_prime.max(dim=1)[0], h_prime.max(dim=1)[0]) 200 | # return scores 201 | 202 | # if self.hyp_first and not self.hyp_raw: 203 | # # x = x_prime 204 | # h = h_prime 205 | # elif self.hyp_first and self.hyp_raw: 206 | # # x = x_prime 207 | # pass 208 | # elif not self.hyp_first and not self.hyp_raw: 209 | # x = h_prime 210 | # h = x_prime 211 | # else: 212 | # h = x 213 | # x = h_prime 214 | 215 | h0 = self.query_wsum(h, h_mask) 216 | if type(self.rnn) is nn.LSTMCell: 217 | c0 = h0.new(h0.size()).zero_() 218 | scores_list = [] 219 | for turn in range(self.num_turn): 220 | att_scores = self.attn(x, h0, x_mask) 221 | x_sum = torch.bmm(F.softmax(att_scores, 1).unsqueeze(1), x).squeeze(1) 222 | scores = self.classifier(x_sum, h0) 223 | scores_list.append(scores) 224 | # next turn 225 | if self.rnn is not None: 226 | h0 = self.dropout(h0) 227 | if type(self.rnn) is nn.LSTMCell: 228 | h0, c0 = self.rnn(x_sum, (h0, c0)) 229 | else: 230 | h0 = self.rnn(x_sum, h0) 231 | if self.mem_type == 1: 232 | batch_size = x.size(0) // self.label_size 233 | mask = generate_mask(self.alpha.data.new(batch_size, self.num_turn), self.mem_random_drop, is_training) 234 | mask = [m.contiguous() for m in torch.unbind(mask, 1)] 235 | tmp_scores_list = [mask[idx].view(batch_size, 1).expand_as(inp.view(-1, self.label_size)) 236 | * F.softmax(inp.view(-1, self.label_size), 1) 237 | for idx, inp in enumerate(scores_list)] 238 | scores = torch.stack(tmp_scores_list, 2) 239 | scores = torch.mean(scores, 2) 240 | scores = torch.log(scores) 241 | else: 242 | scores = scores_list[-1] 243 | if self.dump_state: 244 | return scores, scores_list 245 | else: 246 | return scores -------------------------------------------------------------------------------- /lstm_model/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import tensorflow as tf 4 | 5 | 6 | from .data_utils import minibatches, pad_sequences, get_chunks 7 | from .general_utils import Progbar 8 | from .base_model import BaseModel 9 | 10 | 11 | class NERModel(BaseModel): 12 | """Specialized class of Model for NER""" 13 | 14 | def 
__init__(self, config): 15 | super(NERModel, self).__init__(config) 16 | self.idx_to_tag = {idx: tag for tag, idx in 17 | self.config.vocab_tags.items()} 18 | 19 | 20 | def add_placeholders(self): 21 | """Define placeholders = entries to computational graph""" 22 | # shape = (batch size, max length of sentence in batch) 23 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 24 | name="word_ids") 25 | 26 | # shape = (batch size) 27 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 28 | name="sequence_lengths") 29 | 30 | # shape = (batch size, max length of sentence, max length of word) 31 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 32 | name="char_ids") 33 | 34 | # shape = (batch_size, max_length of sentence) 35 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 36 | name="word_lengths") 37 | 38 | # shape = (batch size, max length of sentence in batch) 39 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 40 | name="labels") 41 | 42 | # hyper parameters 43 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="dropout") 45 | self.lr = tf.placeholder(dtype=tf.float32, shape=[], 46 | name="lr") 47 | 48 | 49 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 50 | """Given some data, pad it and build a feed dictionary 51 | 52 | Args: 53 | words: list of sentences. A sentence is a list of ids of a list of 54 | words. A word is a list of ids 55 | labels: list of ids 56 | lr: (float) learning rate 57 | dropout: (float) keep prob 58 | 59 | Returns: 60 | dict {placeholder: value} 61 | 62 | """ 63 | # perform padding of the given data 64 | if self.config.use_chars: 65 | char_ids, word_ids = zip(*words) 66 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 67 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 68 | nlevels=2) 69 | else: 70 | word_ids, sequence_lengths = pad_sequences(words, 0) 71 | 72 | # build feed dictionary 73 | feed = { 74 | self.word_ids: word_ids, 75 | self.sequence_lengths: sequence_lengths 76 | } 77 | 78 | if self.config.use_chars: 79 | feed[self.char_ids] = char_ids 80 | feed[self.word_lengths] = word_lengths 81 | 82 | if labels is not None: 83 | labels, _ = pad_sequences(labels, 0) 84 | feed[self.labels] = labels 85 | 86 | if lr is not None: 87 | feed[self.lr] = lr 88 | 89 | if dropout is not None: 90 | feed[self.dropout] = dropout 91 | 92 | return feed, sequence_lengths 93 | 94 | 95 | def add_word_embeddings_op(self): 96 | """Defines self.word_embeddings 97 | 98 | If self.config.embeddings is not None and is a np array initialized 99 | with pre-trained word vectors, the word embeddings is just a look-up 100 | and we don't train the vectors. Otherwise, a random matrix with 101 | the correct shape is initialized. 
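        The resulting self.word_embeddings has shape (batch size, max sentence
        length, dim_word), with an extra 2*hidden_size_char dimensions per word
        when self.config.use_chars is True, and has dropout applied.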
102 | """ 103 | with tf.variable_scope("words"): 104 | if self.config.embeddings is None: 105 | self.logger.info("WARNING: randomly initializing word vectors") 106 | _word_embeddings = tf.get_variable( 107 | name="_word_embeddings", 108 | dtype=tf.float32, 109 | shape=[self.config.nwords, self.config.dim_word]) 110 | else: 111 | _word_embeddings = tf.Variable( 112 | self.config.embeddings, 113 | name="_word_embeddings", 114 | dtype=tf.float32, 115 | trainable=self.config.train_embeddings) 116 | 117 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, 118 | self.word_ids, name="word_embeddings") 119 | 120 | with tf.variable_scope("chars"): 121 | if self.config.use_chars: 122 | # get char embeddings matrix 123 | _char_embeddings = tf.get_variable( 124 | name="_char_embeddings", 125 | dtype=tf.float32, 126 | shape=[self.config.nchars, self.config.dim_char]) 127 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 128 | self.char_ids, name="char_embeddings") 129 | 130 | # put the time dimension on axis=1 131 | s = tf.shape(char_embeddings) 132 | char_embeddings = tf.reshape(char_embeddings, 133 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 134 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 135 | 136 | # bi lstm on chars 137 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 138 | state_is_tuple=True) 139 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 140 | state_is_tuple=True) 141 | _output = tf.nn.bidirectional_dynamic_rnn( 142 | cell_fw, cell_bw, char_embeddings, 143 | sequence_length=word_lengths, dtype=tf.float32) 144 | 145 | # read and concat output 146 | _, ((_, output_fw), (_, output_bw)) = _output 147 | output = tf.concat([output_fw, output_bw], axis=-1) 148 | 149 | # shape = (batch size, max sentence length, char hidden size) 150 | output = tf.reshape(output, 151 | shape=[s[0], s[1], 2*self.config.hidden_size_char]) 152 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 153 | 154 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 155 | 156 | 157 | def add_logits_op(self): 158 | """Defines self.logits 159 | 160 | For each word in each sentence of the batch, it corresponds to a vector 161 | of scores, of dimension equal to the number of tags. 162 | """ 163 | with tf.variable_scope("bi-lstm"): 164 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 165 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 166 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 167 | cell_fw, cell_bw, self.word_embeddings, 168 | sequence_length=self.sequence_lengths, dtype=tf.float32) 169 | output = tf.concat([output_fw, output_bw], axis=-1) 170 | output = tf.nn.dropout(output, self.dropout) 171 | 172 | with tf.variable_scope("proj"): 173 | W = tf.get_variable("W", dtype=tf.float32, 174 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 175 | 176 | b = tf.get_variable("b", shape=[self.config.ntags], 177 | dtype=tf.float32, initializer=tf.zeros_initializer()) 178 | 179 | nsteps = tf.shape(output)[1] 180 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 181 | pred = tf.matmul(output, W) + b 182 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 183 | 184 | 185 | def add_pred_op(self): 186 | """Defines self.labels_pred 187 | 188 | This op is defined only in the case where we don't use a CRF since in 189 | that case we can make the prediction "in the graph" (thanks to tf 190 | functions in other words). 
With theCRF, as the inference is coded 191 | in python and not in pure tensroflow, we have to make the prediciton 192 | outside the graph. 193 | """ 194 | if not self.config.use_crf: 195 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 196 | tf.int32) 197 | 198 | 199 | def add_loss_op(self): 200 | """Defines the loss""" 201 | if self.config.use_crf: 202 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 203 | self.logits, self.labels, self.sequence_lengths) 204 | self.trans_params = trans_params # need to evaluate it for decoding 205 | self.loss = tf.reduce_mean(-log_likelihood) 206 | else: 207 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 208 | logits=self.logits, labels=self.labels) 209 | mask = tf.sequence_mask(self.sequence_lengths) 210 | losses = tf.boolean_mask(losses, mask) 211 | self.loss = tf.reduce_mean(losses) 212 | 213 | # for tensorboard 214 | tf.summary.scalar("loss", self.loss) 215 | 216 | 217 | def build(self): 218 | # NER specific functions 219 | self.add_placeholders() 220 | self.add_word_embeddings_op() 221 | self.add_logits_op() 222 | self.add_pred_op() 223 | self.add_loss_op() 224 | 225 | # Generic functions that add training op and initialize session 226 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 227 | self.config.clip) 228 | self.initialize_session() # now self.sess is defined and vars are init 229 | 230 | 231 | def predict_batch(self, words): 232 | """ 233 | Args: 234 | words: list of sentences 235 | 236 | Returns: 237 | labels_pred: list of labels for each sentence 238 | sequence_length 239 | 240 | """ 241 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 242 | 243 | if self.config.use_crf: 244 | # get tag scores and transition params of CRF 245 | viterbi_sequences = [] 246 | logits, trans_params = self.sess.run( 247 | [self.logits, self.trans_params], feed_dict=fd) 248 | 249 | # iterate over the sentences because no batching in vitervi_decode 250 | for logit, sequence_length in zip(logits, sequence_lengths): 251 | logit = logit[:sequence_length] # keep only the valid steps 252 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 253 | logit, trans_params) 254 | viterbi_sequences += [viterbi_seq] 255 | 256 | return viterbi_sequences, sequence_lengths 257 | 258 | else: 259 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 260 | 261 | return labels_pred, sequence_lengths 262 | 263 | 264 | def run_epoch(self, train, dev, epoch): 265 | """Performs one complete pass over the train set and evaluate on dev 266 | 267 | Args: 268 | train: dataset that yields tuple of sentences, tags 269 | dev: dataset 270 | epoch: (int) index of the current epoch 271 | 272 | Returns: 273 | f1: (python float), score to select model on, higher is better 274 | 275 | """ 276 | # progbar stuff for logging 277 | batch_size = self.config.batch_size 278 | nbatches = (len(train) + batch_size - 1) // batch_size 279 | prog = Progbar(target=nbatches) 280 | 281 | # iterate over dataset 282 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 283 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 284 | self.config.dropout) 285 | 286 | _, train_loss, summary = self.sess.run( 287 | [self.train_op, self.loss, self.merged], feed_dict=fd) 288 | 289 | prog.update(i + 1, [("train loss", train_loss)]) 290 | 291 | # tensorboard 292 | if i % 10 == 0: 293 | self.file_writer.add_summary(summary, epoch*nbatches + i) 294 | 295 | metrics = self.run_evaluate(dev) 296 | msg = " - ".join(["{} 
{:04.2f}".format(k, v) 297 | for k, v in metrics.items()]) 298 | self.logger.info(msg) 299 | 300 | return metrics["f1"] 301 | 302 | 303 | def run_evaluate(self, test): 304 | """Evaluates performance on test set 305 | 306 | Args: 307 | test: dataset that yields tuple of (sentences, tags) 308 | 309 | Returns: 310 | metrics: (dict) metrics["acc"] = 98.4, ... 311 | 312 | """ 313 | accs = [] 314 | correct_preds, total_correct, total_preds = 0., 0., 0. 315 | for words, labels in minibatches(test, self.config.batch_size): 316 | labels_pred, sequence_lengths = self.predict_batch(words) 317 | 318 | for lab, lab_pred, length in zip(labels, labels_pred, 319 | sequence_lengths): 320 | lab = lab[:length] 321 | lab_pred = lab_pred[:length] 322 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 323 | 324 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 325 | lab_pred_chunks = set(get_chunks(lab_pred, 326 | self.config.vocab_tags)) 327 | 328 | correct_preds += len(lab_chunks & lab_pred_chunks) 329 | total_preds += len(lab_pred_chunks) 330 | total_correct += len(lab_chunks) 331 | 332 | p = correct_preds / total_preds if correct_preds > 0 else 0 333 | r = correct_preds / total_correct if correct_preds > 0 else 0 334 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 335 | acc = np.mean(accs) 336 | 337 | return {"acc": 100*acc, "f1": 100*f1} 338 | 339 | 340 | def predict(self, words_raw): 341 | """Returns list of tags 342 | 343 | Args: 344 | words_raw: list of words (string), just one sentence (no batch) 345 | 346 | Returns: 347 | preds: list of tags (string), one for each word in the sentence 348 | 349 | """ 350 | words = [self.config.processing_word(w) for w in words_raw] 351 | if type(words[0]) == tuple: 352 | words = zip(*words) 353 | pred_ids, _ = self.predict_batch([words]) 354 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 355 | 356 | return preds 357 | -------------------------------------------------------------------------------- /lstm_model/src/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import tensorflow as tf 4 | 5 | 6 | from .data_utils import minibatches, pad_sequences, get_chunks 7 | from .general_utils import Progbar 8 | from .base_model import BaseModel 9 | 10 | 11 | class NERModel(BaseModel): 12 | """Specialized class of Model for NER""" 13 | 14 | def __init__(self, config): 15 | super(NERModel, self).__init__(config) 16 | self.idx_to_tag = {idx: tag for tag, idx in 17 | self.config.vocab_tags.items()} 18 | 19 | 20 | def add_placeholders(self): 21 | """Define placeholders = entries to computational graph""" 22 | # shape = (batch size, max length of sentence in batch) 23 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 24 | name="word_ids") 25 | 26 | # shape = (batch size) 27 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 28 | name="sequence_lengths") 29 | 30 | # shape = (batch size, max length of sentence, max length of word) 31 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 32 | name="char_ids") 33 | 34 | # shape = (batch_size, max_length of sentence) 35 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 36 | name="word_lengths") 37 | 38 | # shape = (batch size, max length of sentence in batch) 39 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 40 | name="labels") 41 | 42 | # hyper parameters 43 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="dropout") 45 | self.lr 
= tf.placeholder(dtype=tf.float32, shape=[], 46 | name="lr") 47 | 48 | 49 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 50 | """Given some data, pad it and build a feed dictionary 51 | 52 | Args: 53 | words: list of sentences. A sentence is a list of ids of a list of 54 | words. A word is a list of ids 55 | labels: list of ids 56 | lr: (float) learning rate 57 | dropout: (float) keep prob 58 | 59 | Returns: 60 | dict {placeholder: value} 61 | 62 | """ 63 | # perform padding of the given data 64 | if self.config.use_chars: 65 | char_ids, word_ids = zip(*words) 66 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 67 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 68 | nlevels=2) 69 | else: 70 | word_ids, sequence_lengths = pad_sequences(words, 0) 71 | 72 | # build feed dictionary 73 | feed = { 74 | self.word_ids: word_ids, 75 | self.sequence_lengths: sequence_lengths 76 | } 77 | 78 | if self.config.use_chars: 79 | feed[self.char_ids] = char_ids 80 | feed[self.word_lengths] = word_lengths 81 | 82 | if labels is not None: 83 | labels, _ = pad_sequences(labels, 0) 84 | feed[self.labels] = labels 85 | 86 | if lr is not None: 87 | feed[self.lr] = lr 88 | 89 | if dropout is not None: 90 | feed[self.dropout] = dropout 91 | 92 | return feed, sequence_lengths 93 | 94 | 95 | def add_word_embeddings_op(self): 96 | """Defines self.word_embeddings 97 | 98 | If self.config.embeddings is not None and is a np array initialized 99 | with pre-trained word vectors, the word embeddings is just a look-up 100 | and we don't train the vectors. Otherwise, a random matrix with 101 | the correct shape is initialized. 102 | """ 103 | with tf.variable_scope("words"): 104 | if self.config.embeddings is None: 105 | self.logger.info("WARNING: randomly initializing word vectors") 106 | _word_embeddings = tf.get_variable( 107 | name="_word_embeddings", 108 | dtype=tf.float32, 109 | shape=[self.config.nwords, self.config.dim_word]) 110 | else: 111 | _word_embeddings = tf.Variable( 112 | self.config.embeddings, 113 | name="_word_embeddings", 114 | dtype=tf.float32, 115 | trainable=self.config.train_embeddings) 116 | 117 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, 118 | self.word_ids, name="word_embeddings") 119 | 120 | with tf.variable_scope("chars"): 121 | if self.config.use_chars: 122 | # get char embeddings matrix 123 | _char_embeddings = tf.get_variable( 124 | name="_char_embeddings", 125 | dtype=tf.float32, 126 | shape=[self.config.nchars, self.config.dim_char]) 127 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 128 | self.char_ids, name="char_embeddings") 129 | 130 | # put the time dimension on axis=1 131 | s = tf.shape(char_embeddings) 132 | char_embeddings = tf.reshape(char_embeddings, 133 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 134 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 135 | 136 | # bi lstm on chars 137 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 138 | state_is_tuple=True) 139 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 140 | state_is_tuple=True) 141 | _output = tf.nn.bidirectional_dynamic_rnn( 142 | cell_fw, cell_bw, char_embeddings, 143 | sequence_length=word_lengths, dtype=tf.float32) 144 | 145 | # read and concat output 146 | _, ((_, output_fw), (_, output_bw)) = _output 147 | output = tf.concat([output_fw, output_bw], axis=-1) 148 | 149 | # shape = (batch size, max sentence length, char hidden size) 150 | output = tf.reshape(output, 151 | shape=[s[0], 
s[1], 2*self.config.hidden_size_char]) 152 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 153 | 154 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 155 | 156 | 157 | def add_logits_op(self): 158 | """Defines self.logits 159 | 160 | For each word in each sentence of the batch, this op produces a vector 161 | of scores whose dimension equals the number of tags. 162 | """ 163 | with tf.variable_scope("bi-lstm"): 164 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 165 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 166 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 167 | cell_fw, cell_bw, self.word_embeddings, 168 | sequence_length=self.sequence_lengths, dtype=tf.float32) 169 | output = tf.concat([output_fw, output_bw], axis=-1) 170 | output = tf.nn.dropout(output, self.dropout) 171 | 172 | with tf.variable_scope("proj"): 173 | W = tf.get_variable("W", dtype=tf.float32, 174 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 175 | 176 | b = tf.get_variable("b", shape=[self.config.ntags], 177 | dtype=tf.float32, initializer=tf.zeros_initializer()) 178 | 179 | nsteps = tf.shape(output)[1] 180 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 181 | pred = tf.matmul(output, W) + b 182 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 183 | 184 | 185 | def add_pred_op(self): 186 | """Defines self.labels_pred 187 | 188 | This op is defined only in the case where we don't use a CRF, since in 189 | that case we can make the prediction "in the graph" (in other words, thanks 190 | to native tf ops). With the CRF, as the inference is coded 191 | in python and not in pure tensorflow, we have to make the prediction 192 | outside the graph.
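        (In the CRF case the decoding is done in predict_batch with
        tf.contrib.crf.viterbi_decode, using the learned self.trans_params.)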
193 | """ 194 | if not self.config.use_crf: 195 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 196 | tf.int32) 197 | 198 | 199 | def add_loss_op(self): 200 | """Defines the loss""" 201 | if self.config.use_crf: 202 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 203 | self.logits, self.labels, self.sequence_lengths) 204 | self.trans_params = trans_params # need to evaluate it for decoding 205 | self.loss = tf.reduce_mean(-log_likelihood) 206 | else: 207 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 208 | logits=self.logits, labels=self.labels) 209 | mask = tf.sequence_mask(self.sequence_lengths) 210 | losses = tf.boolean_mask(losses, mask) 211 | self.loss = tf.reduce_mean(losses) 212 | 213 | # for tensorboard 214 | tf.summary.scalar("loss", self.loss) 215 | 216 | 217 | def build(self): 218 | # NER specific functions 219 | self.add_placeholders() 220 | self.add_word_embeddings_op() 221 | self.add_logits_op() 222 | self.add_pred_op() 223 | self.add_loss_op() 224 | 225 | # Generic functions that add training op and initialize session 226 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 227 | self.config.clip) 228 | self.initialize_session() # now self.sess is defined and vars are init 229 | 230 | 231 | def predict_batch(self, words): 232 | """ 233 | Args: 234 | words: list of sentences 235 | 236 | Returns: 237 | labels_pred: list of labels for each sentence 238 | sequence_length 239 | 240 | """ 241 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 242 | 243 | if self.config.use_crf: 244 | # get tag scores and transition params of CRF 245 | viterbi_sequences = [] 246 | logits, trans_params = self.sess.run( 247 | [self.logits, self.trans_params], feed_dict=fd) 248 | 249 | # iterate over the sentences because no batching in vitervi_decode 250 | for logit, sequence_length in zip(logits, sequence_lengths): 251 | logit = logit[:sequence_length] # keep only the valid steps 252 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 253 | logit, trans_params) 254 | viterbi_sequences += [viterbi_seq] 255 | 256 | return viterbi_sequences, sequence_lengths 257 | 258 | else: 259 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 260 | 261 | return labels_pred, sequence_lengths 262 | 263 | 264 | def run_epoch(self, train, dev, epoch): 265 | """Performs one complete pass over the train set and evaluate on dev 266 | 267 | Args: 268 | train: dataset that yields tuple of sentences, tags 269 | dev: dataset 270 | epoch: (int) index of the current epoch 271 | 272 | Returns: 273 | f1: (python float), score to select model on, higher is better 274 | 275 | """ 276 | # progbar stuff for logging 277 | batch_size = self.config.batch_size 278 | nbatches = (len(train) + batch_size - 1) // batch_size 279 | prog = Progbar(target=nbatches) 280 | 281 | # iterate over dataset 282 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 283 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 284 | self.config.dropout) 285 | 286 | _, train_loss, summary = self.sess.run( 287 | [self.train_op, self.loss, self.merged], feed_dict=fd) 288 | 289 | prog.update(i + 1, [("train loss", train_loss)]) 290 | 291 | # tensorboard 292 | if i % 10 == 0: 293 | self.file_writer.add_summary(summary, epoch*nbatches + i) 294 | 295 | metrics = self.run_evaluate(dev) 296 | msg = " - ".join(["{} {:04.2f}".format(k, v) 297 | for k, v in metrics.items()]) 298 | self.logger.info(msg) 299 | 300 | return metrics["f1"] 301 | 302 | 303 | def 
run_evaluate(self, test): 304 | """Evaluates performance on test set 305 | 306 | Args: 307 | test: dataset that yields tuple of (sentences, tags) 308 | 309 | Returns: 310 | metrics: (dict) metrics["acc"] = 98.4, ... 311 | 312 | """ 313 | accs = [] 314 | correct_preds, total_correct, total_preds = 0., 0., 0. 315 | for words, labels in minibatches(test, self.config.batch_size): 316 | labels_pred, sequence_lengths = self.predict_batch(words) 317 | 318 | for lab, lab_pred, length in zip(labels, labels_pred, 319 | sequence_lengths): 320 | lab = lab[:length] 321 | lab_pred = lab_pred[:length] 322 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 323 | 324 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 325 | lab_pred_chunks = set(get_chunks(lab_pred, 326 | self.config.vocab_tags)) 327 | 328 | correct_preds += len(lab_chunks & lab_pred_chunks) 329 | total_preds += len(lab_pred_chunks) 330 | total_correct += len(lab_chunks) 331 | 332 | p = correct_preds / total_preds if correct_preds > 0 else 0 333 | r = correct_preds / total_correct if correct_preds > 0 else 0 334 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 335 | acc = np.mean(accs) 336 | 337 | return {"acc": 100*acc, "f1": 100*f1} 338 | 339 | 340 | def predict(self, words_raw): 341 | """Returns list of tags 342 | 343 | Args: 344 | words_raw: list of words (string), just one sentence (no batch) 345 | 346 | Returns: 347 | preds: list of tags (string), one for each word in the sentence 348 | 349 | """ 350 | words = [self.config.processing_word(w) for w in words_raw] 351 | if type(words[0]) == tuple: 352 | words = zip(*words) 353 | pred_ids, _ = self.predict_batch([words]) 354 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 355 | 356 | return preds 357 | -------------------------------------------------------------------------------- /lstm_model/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | 5 | # shared global variables to be imported from model also 6 | UNK = "$UNK$" 7 | NUM = "$NUM$" 8 | NONE = "O" 9 | WORD_PAD = '$W_PAD$' 10 | TAG_PAD = '$T_PAD$' 11 | 12 | # special error message 13 | class MyIOError(Exception): 14 | def __init__(self, filename): 15 | # custom error message 16 | message = """ 17 | ERROR: Unable to locate file {}. 18 | 19 | FIX: Have you tried running python build_data.py first? 20 | This will build vocab file from your train, test and dev sets and 21 | trimm your word vectors. 
22 | """.format(filename) 23 | super(MyIOError, self).__init__(message) 24 | 25 | 26 | def Dataset(filename, processing_word=None, processing_tag=None, max_iter=None): 27 | results = [] 28 | with open(filename) as f: 29 | sentences, tags = [], [] 30 | n_iter = 0 31 | for line in f: 32 | line = line.strip() 33 | if not line: 34 | if len(sentences) != 0: 35 | n_iter += 1 36 | if max_iter is not None and n_iter > max_iter: 37 | break 38 | results.append((sentences, tags)) 39 | sentences, tags = [], [] 40 | elif not line.startswith("###"): 41 | ls = line.split('|') 42 | tag, sentence = ls[1], ls[2].split() 43 | # if tag != 'Others': 44 | if processing_word is not None: 45 | try: 46 | sentence = [processing_word(word) for word in sentence] 47 | except: 48 | pass 49 | if processing_tag is not None: 50 | tag = processing_tag(tag) 51 | sentences += [sentence] 52 | tags += [tag] 53 | 54 | return results 55 | 56 | # class Dataset(object): 57 | # """Class that iterates over CoNLL Dataset 58 | # 59 | # __iter__ method yields a tuple (words, tags) 60 | # words: list of raw words 61 | # tags: list of raw tags 62 | # 63 | # If processing_word and processing_tag are not None, 64 | # optional preprocessing is appplied 65 | # 66 | # Example: 67 | # ```python 68 | # data = CoNLLDataset(filename) 69 | # for sentence, tags in data: 70 | # pass 71 | # ``` 72 | # 73 | # """ 74 | # def __init__(self, filename, processing_word=None, processing_tag=None, max_iter=None): 75 | # """ 76 | # Args: 77 | # filename: path to the file 78 | # processing_words: (optional) function that takes a word as input 79 | # processing_tags: (optional) function that takes a tag as input 80 | # max_iter: (optional) max number of sentences to yield 81 | # 82 | # """ 83 | # self.filename = filename 84 | # self.processing_word = processing_word 85 | # self.processing_tag = processing_tag 86 | # self.length = None 87 | # self.max_iter = max_iter 88 | # 89 | # 90 | # def __iter__(self): 91 | # with open(self.filename) as f: 92 | # sentences, tags = [], [] 93 | # n_iter = 0 94 | # for line in f: 95 | # line = line.strip() 96 | # if not line: 97 | # if len(sentences) != 0: 98 | # n_iter += 1 99 | # if self.max_iter is not None and n_iter > self.max_iter: 100 | # break 101 | # yield sentences, tags 102 | # sentences, tags = [], [] 103 | # elif not line.startswith("###"): 104 | # ls = line.split('|') 105 | # tag, sentence = ls[1], ls[2].split() 106 | # # if tag != 'Others': 107 | # if self.processing_word is not None: 108 | # sentence = [self.processing_word(word) for word in sentence] 109 | # if self.processing_tag is not None: 110 | # tag = self.processing_tag(tag) 111 | # sentences += [sentence] 112 | # tags += [tag] 113 | # 114 | # 115 | # def __len__(self): 116 | # """Iterates once over the corpus to set and store length""" 117 | # if self.length is None: 118 | # self.length = 0 119 | # for _ in self: 120 | # self.length += 1 121 | # 122 | # return self.length 123 | 124 | 125 | class Embedding(object): 126 | """Embedding layer with frequency-based normalization and dropout.""" 127 | def __init__(self, vocab_size=None, 128 | embedding_dim=None, 129 | embeddings=None, 130 | normalize=False, 131 | vocab_freqs=None, 132 | keep_prob=1., 133 | trainable=False): 134 | # super(Embedding, self).__init__(**kwargs) 135 | with tf.variable_scope("words"): 136 | if embeddings is None: 137 | assert vocab_size is not None 138 | assert embedding_dim is not None 139 | self._word_embeddings = tf.get_variable( 140 | name="_word_embeddings", 141 | 
dtype=tf.float32, 142 | shape=[vocab_size, embedding_dim]) 143 | else: 144 | vocab_size = embeddings.shape[0] 145 | self._word_embeddings = tf.Variable( 146 | embeddings, 147 | name="_word_embeddings", 148 | dtype=tf.float32, 149 | trainable=trainable) 150 | 151 | self.keep_prob = keep_prob 152 | 153 | if normalize: 154 | assert vocab_freqs is not None 155 | vocab_freqs = tf.constant( 156 | vocab_freqs, dtype=tf.float32, shape=(vocab_size, 1)) 157 | self._word_embeddings = self._normalize(self._word_embeddings, vocab_freqs) 158 | 159 | def embed(self, x): 160 | with tf.variable_scope("words"): 161 | embedded = tf.nn.embedding_lookup(self._word_embeddings, x) 162 | if self.keep_prob < 1.: 163 | # embedded = tf.nn.dropout(embedded, self.keep_prob) 164 | shape = embedded.get_shape().as_list() 165 | 166 | # Use same dropout masks at each timestep with specifying noise_shape. 167 | # This slightly improves performance. 168 | # Please see https://arxiv.org/abs/1512.05287 for the theoretical 169 | # explanation. 170 | if len(shape) == 3: 171 | embedded = tf.nn.dropout( 172 | embedded, self.keep_prob, noise_shape=(shape[0], 1, shape[2])) 173 | elif len(shape) == 4: 174 | embedded = tf.nn.dropout( 175 | embedded, self.keep_prob, noise_shape=(shape[0], 1, 1, shape[2])) 176 | else: 177 | pass 178 | return embedded 179 | 180 | def _normalize(self, emb, vocab_freqs): 181 | weights = vocab_freqs / tf.reduce_sum(vocab_freqs) 182 | mean = tf.reduce_sum(weights * emb, 0, keepdims=True) 183 | var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keepdims=True) 184 | stddev = tf.sqrt(1e-6 + var) 185 | return (emb - mean) / stddev 186 | 187 | 188 | def get_vocabs(datasets): 189 | """Build vocabulary from an iterable of datasets objects 190 | 191 | Args: 192 | datasets: a list of dataset objects 193 | 194 | Returns: 195 | a set of all the words in the dataset 196 | 197 | """ 198 | print("Building vocab...") 199 | vocab_tags = set() 200 | vocab_words_freq = dict() 201 | for dataset in datasets: 202 | for sentences, tags in dataset: 203 | for sent in sentences: 204 | for token in sent: 205 | vocab_words_freq[token] = vocab_words_freq.get(token, 0) + 1 206 | vocab_tags.update(tags) 207 | print("- done. {} tokens".format(len(vocab_words_freq))) 208 | return vocab_words_freq, vocab_tags 209 | 210 | 211 | def get_char_vocab(dataset): 212 | """Build char vocabulary from an iterable of datasets objects 213 | 214 | Args: 215 | dataset: a iterator yielding tuples (sentence, tags) 216 | 217 | Returns: 218 | a set of all the characters in the dataset 219 | 220 | """ 221 | vocab_char = set() 222 | for sents, _ in dataset: 223 | for sent in sents: 224 | for word in sent: 225 | vocab_char.update(word) 226 | 227 | return vocab_char 228 | 229 | 230 | def get_wordvec_vocab(filename): 231 | """Load vocab from file 232 | 233 | Args: 234 | filename: path to the glove vectors 235 | 236 | Returns: 237 | vocab: set() of strings 238 | """ 239 | print("Building vocab...") 240 | vocab = set() 241 | with open(filename) as f: 242 | for line in f: 243 | word = line.strip().split(' ')[0] 244 | vocab.add(word) 245 | print("- done. {} tokens".format(len(vocab))) 246 | return vocab 247 | 248 | 249 | def write_vocab(vocab, filename): 250 | """Writes a vocab to a file 251 | 252 | Writes one word per line. 
253 | 254 | Args: 255 | vocab: iterable that yields word 256 | filename: path to vocab file 257 | 258 | Returns: 259 | write a word per line 260 | 261 | """ 262 | print("Writing vocab...") 263 | with open(filename, "w") as f: 264 | if isinstance(vocab, dict): 265 | for i, word in enumerate(vocab): 266 | if i != len(vocab) - 1: 267 | f.write("{}\t{}\n".format(word, vocab[word])) 268 | else: 269 | f.write('{}\t{}'.format(word, vocab[word])) 270 | else: 271 | for i, word in enumerate(vocab): 272 | if i != len(vocab) - 1: 273 | f.write("{}\n".format(word)) 274 | else: 275 | f.write(word) 276 | print("- done. {} tokens".format(len(vocab))) 277 | 278 | 279 | def load_vocab(filename): 280 | """Loads vocab from a file 281 | 282 | Args: 283 | filename: (string) the format of the file must be one word per line. 284 | 285 | Returns: 286 | d: dict[word] = index 287 | 288 | """ 289 | try: 290 | d = dict() 291 | vocab_freq = [] 292 | with open(filename) as f: 293 | for idx, line in enumerate(f): 294 | line = line.strip().split() 295 | if len(line) < 2: 296 | word = line[0] 297 | d[word] = idx 298 | else: 299 | word, freq = line 300 | d[word] = idx 301 | try: 302 | vocab_freq.append(int(freq)) 303 | except: 304 | pass 305 | 306 | except IOError: 307 | raise MyIOError(filename) 308 | 309 | if len(vocab_freq) == 0: 310 | return d 311 | else: 312 | return d, vocab_freq 313 | 314 | 315 | def export_trimmed_wordvec_vectors(vocab, wordvec_filename, trimmed_filename): 316 | """Saves glove vectors in numpy array 317 | 318 | Args: 319 | vocab: dictionary vocab[word] = index 320 | glove_filename: a path to a glove file 321 | trimmed_filename: a path where to store a matrix in npy 322 | dim: (int) dimension of embeddings 323 | 324 | """ 325 | num = 0 326 | with open(trimmed_filename, 'w') as outFile: 327 | with open(wordvec_filename, 'r') as inFile: 328 | for line in inFile: 329 | word = line.strip().split(' ')[0] 330 | if word in vocab: 331 | outFile.write(line) 332 | num += 1 333 | 334 | print('{} out of {} tokens can find pre-trained embeddings!'.format(num, len(vocab))) 335 | 336 | 337 | def get_trimmed_wordvec_vectors(filename, vocab): 338 | """ 339 | Args: 340 | filename: path to the npz file 341 | 342 | Returns: 343 | matrix of embeddings (np array) 344 | 345 | """ 346 | f = open(filename, 'r') 347 | f.readline() 348 | dim = len(f.readline().strip().split()) - 1 349 | assert dim > 30 350 | embeddings = np.random.uniform(-0.1, 0.1, size=(len(vocab)+1, dim)) 351 | with open(filename, 'r') as inFile: 352 | for line in inFile: 353 | line = line.strip().split() 354 | word = line[0] 355 | if word in vocab: 356 | embeddings[vocab[word]] = np.array([float(item) for item in line[1:]]) 357 | 358 | return embeddings 359 | 360 | 361 | def get_processing_word(vocab_words=None, vocab_chars=None, 362 | lowercase=False, chars=False, allow_unk=True): 363 | """Return lambda function that transform a word (string) into list, 364 | or tuple of (list, id) of int corresponding to the ids of the word and 365 | its corresponding characters. 366 | 367 | Args: 368 | vocab: dict[word] = idx 369 | 370 | Returns: 371 | f("cat") = ([12, 4, 32], 12345) 372 | = (list of char ids, word id) 373 | 374 | """ 375 | def f(word): 376 | # 0. get chars of words 377 | if vocab_chars is not None and chars == True: 378 | char_ids = [] 379 | for char in word: 380 | # ignore chars out of vocabulary 381 | if char in vocab_chars: 382 | char_ids += [vocab_chars[char]] 383 | 384 | # 1. 
preprocess word 385 | if lowercase: 386 | word = word.lower() 387 | if word.isdigit(): 388 | word = NUM 389 | 390 | # 2. get id of word 391 | if vocab_words is not None: 392 | if word in vocab_words: 393 | word = vocab_words[word] 394 | else: 395 | if allow_unk: 396 | word = vocab_words[UNK] 397 | else: 398 | raise Exception("Unknow key is not allowed. Check that "\ 399 | "your vocab (tags?) is correct") 400 | 401 | # 3. return tuple char ids, word id 402 | if vocab_chars is not None and chars == True: 403 | return char_ids, word 404 | else: 405 | return word 406 | 407 | return f 408 | 409 | 410 | def _pad_sequences(sequences, pad_tok, max_length): 411 | """ 412 | Args: 413 | sequences: a generator of list or tuple 414 | pad_tok: the char to pad with 415 | 416 | Returns: 417 | a list of list where each sublist has same length 418 | """ 419 | sequence_padded, sequence_length = [], [] 420 | 421 | for seq in sequences: 422 | seq = list(seq) 423 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 424 | sequence_padded += [seq_] 425 | sequence_length += [min(len(seq), max_length)] 426 | 427 | return sequence_padded, sequence_length 428 | 429 | 430 | def pad_sequences(sequences, pad_tok, nlevels=2): 431 | """ 432 | Args: 433 | sequences: a generator of list or tuple 434 | pad_tok: the char to pad with 435 | nlevels: "depth" of padding, for the case where we have characters ids 436 | 437 | Returns: 438 | a list of list where each sublist has same length 439 | 440 | """ 441 | if nlevels == 1: 442 | max_length = max(map(lambda x : len(x), sequences)) 443 | sequence_padded, sequence_length = _pad_sequences(sequences, 444 | pad_tok, max_length) 445 | 446 | elif nlevels == 2: 447 | max_length_sentence = max([max(map(lambda x: len(x), seq)) 448 | for seq in sequences]) 449 | sequence_padded, sequence_length = [], [] 450 | for seq in sequences: 451 | # all words are same length now 452 | sp, sl = _pad_sequences(seq, pad_tok, max_length_sentence) 453 | sequence_padded += [sp] 454 | sequence_length += [sl] 455 | 456 | max_length_document = max(map(lambda x : len(x), sequences)) 457 | sequence_padded, _ = _pad_sequences(sequence_padded, 458 | [pad_tok]*max_length_sentence, max_length_document) 459 | sequence_length, _ = _pad_sequences(sequence_length, 0, 460 | max_length_document) 461 | 462 | return sequence_padded, sequence_length 463 | 464 | 465 | def minibatches(data, minibatch_size, shuffle=True): 466 | """ 467 | Args: 468 | data: generator of (sentence, tags) tuples 469 | minibatch_size: (int) 470 | 471 | Yields: 472 | list of tuples 473 | 474 | """ 475 | if shuffle: 476 | random.shuffle(data) 477 | 478 | x_batch, y_batch = [], [] 479 | for (x, y) in data: 480 | if len(x_batch) == minibatch_size: 481 | yield x_batch, y_batch 482 | x_batch, y_batch = [], [] 483 | 484 | # if type(x[0]) == tuple: 485 | # x = zip(*x) 486 | x_batch += [x] 487 | y_batch += [y] 488 | 489 | if len(x_batch) != 0: 490 | yield x_batch, y_batch 491 | 492 | 493 | def get_chunk_type(tok, idx_to_tag): 494 | """ 495 | Args: 496 | tok: id of token, ex 4 497 | idx_to_tag: dictionary {4: "B-PER", ...} 498 | 499 | Returns: 500 | tuple: "B", "PER" 501 | 502 | """ 503 | tag_name = idx_to_tag[tok] 504 | tag_class = tag_name.split('-')[0] 505 | tag_type = tag_name.split('-')[-1] 506 | return tag_class, tag_type 507 | 508 | 509 | def get_chunks(seq, tags): 510 | """Given a sequence of tags, group entities and their position 511 | 512 | Args: 513 | seq: [4, 4, 0, 0, ...] 
sequence of labels
514 |         tags: dict mapping a tag to its index, e.g. tags["O"] = 0
515 | 
516 |     Returns:
517 |         list of (chunk_type, chunk_start, chunk_end)
518 | 
519 |     Example:
520 |         seq = [4, 5, 0, 3]
521 |         tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
522 |         result = [("PER", 0, 2), ("LOC", 3, 4)]
523 | 
524 |     """
525 |     default = tags[NONE]
526 |     idx_to_tag = {idx: tag for tag, idx in tags.items()}
527 |     chunks = []
528 |     chunk_type, chunk_start = None, None
529 |     for i, tok in enumerate(seq):
530 |         # End of a chunk
531 |         if tok == default and chunk_type is not None:
532 |             # Add a chunk.
533 |             chunk = (chunk_type, chunk_start, i)
534 |             chunks.append(chunk)
535 |             chunk_type, chunk_start = None, None
536 | 
537 |         # End of a chunk + start of a chunk!
538 |         elif tok != default:
539 |             tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
540 |             if chunk_type is None:
541 |                 chunk_type, chunk_start = tok_chunk_type, i
542 |             elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
543 |                 chunk = (chunk_type, chunk_start, i)
544 |                 chunks.append(chunk)
545 |                 chunk_type, chunk_start = tok_chunk_type, i
546 |         else:
547 |             pass
548 | 
549 |     # end condition
550 |     if chunk_type is not None:
551 |         chunk = (chunk_type, chunk_start, len(seq))
552 |         chunks.append(chunk)
553 | 
554 |     return chunks
555 | --------------------------------------------------------------------------------
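The `Dataset()` reader above implies the on-disk format: blank lines separate abstracts, lines starting with `###` are skipped, and every remaining line is `<ignored field>|<sentence-level tag>|<whitespace-tokenized sentence>`. Below is a minimal sketch of that format and of the parsing loop; the toy abstract and the `read_abstracts` helper are invented for illustration ('Others' appears in the code's commented-out filter, the other tag strings here are guesses), so only the field layout is taken from the parser above.

```python
# Sketch of the input format parsed by Dataset(): field 0 is ignored,
# field 1 is the sentence tag, field 2 is the tokenized sentence.
toy = """### 12345678
0|Others|This randomized trial enrolled 120 adults .
1|Participants|Patients aged 18 - 65 with type 2 diabetes were included .
2|Intervention|Subjects received metformin 500 mg twice daily .
3|Outcomes|The primary outcome was change in HbA1c at 12 weeks .

"""

def read_abstracts(lines):
    """Re-implements the Dataset() parsing loop without the optional
    processing_word / processing_tag callbacks."""
    results, sentences, tags = [], [], []
    for line in lines:
        line = line.strip()
        if not line:
            if sentences:
                results.append((sentences, tags))
                sentences, tags = [], []
        elif not line.startswith("###"):
            ls = line.split('|')
            tag, sentence = ls[1], ls[2].split()
            sentences.append(sentence)
            tags.append(tag)
    if sentences:                      # flush a trailing abstract
        results.append((sentences, tags))
    return results

abstracts = read_abstracts(toy.splitlines())
print(len(abstracts))    # 1 abstract
print(abstracts[0][1])   # ['Others', 'Participants', 'Intervention', 'Outcomes']
```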
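`pad_sequences` with `nlevels=2` pads twice: every sentence is padded to the longest sentence (in word ids), then every abstract to the longest abstract (in sentences), which is what produces the rectangular batch fed to the model. The sketch below mirrors that logic with standalone helpers so the resulting shapes can be inspected without TensorFlow; `pad_documents` and `_pad` are names chosen here, not functions of the module.

```python
# Worked example of two-level padding (nlevels=2).
def _pad(sequences, pad_tok, max_length):
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)
        padded.append(seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0))
        lengths.append(min(len(seq), max_length))
    return padded, lengths

def pad_documents(docs, pad_tok):
    # pad every sentence to the longest sentence ...
    max_sent = max(max(len(s) for s in doc) for doc in docs)
    padded, lengths = [], []
    for doc in docs:
        sp, sl = _pad(doc, pad_tok, max_sent)
        padded.append(sp)
        lengths.append(sl)
    # ... then pad every document to the longest document
    max_doc = max(len(doc) for doc in docs)
    padded, _ = _pad(padded, [pad_tok] * max_sent, max_doc)
    lengths, _ = _pad(lengths, 0, max_doc)
    return padded, lengths

# two toy "abstracts" of word ids: 2 and 1 sentences, longest sentence 3 words
docs = [[[1, 2, 3], [4, 5]], [[6]]]
padded, lengths = pad_documents(docs, pad_tok=0)
print(padded)   # [[[1, 2, 3], [4, 5, 0]], [[6, 0, 0], [0, 0, 0]]]
print(lengths)  # [[3, 2], [1, 0]]
```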
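`run_evaluate()` in ner_model.py scores predictions at the chunk level: gold and predicted label sequences are turned into sets of (type, start, end) chunks with `get_chunks`, and precision, recall, and F1 are computed over those sets. A small sketch of that computation follows; the toy tag vocabulary and sequences are invented, and the import assumes the snippet is run from the lstm_model directory with TensorFlow installed (data_utils imports it at module level).

```python
# Sketch of the chunk-level scoring used in run_evaluate().
from data_utils import get_chunks

tags = {"O": 0, "B-P": 1, "I-P": 2, "B-I": 3, "I-I": 4}   # toy vocabulary
gold = [1, 2, 0, 3]   # one P chunk (0, 2) and one I chunk (3, 4)
pred = [1, 2, 0, 0]   # the P chunk is found, the I chunk is missed

gold_chunks = set(get_chunks(gold, tags))   # {("P", 0, 2), ("I", 3, 4)}
pred_chunks = set(get_chunks(pred, tags))   # {("P", 0, 2)}

correct = len(gold_chunks & pred_chunks)                 # 1
p = correct / len(pred_chunks) if correct > 0 else 0
r = correct / len(gold_chunks) if correct > 0 else 0
f1 = 2 * p * r / (p + r) if correct > 0 else 0
print(p, r, f1)   # 1.0 0.5 0.666...
```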