├── lstm_model
    ├── src
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── config.cpython-36.pyc
    │   │   ├── models.cpython-36.pyc
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── base_model.cpython-36.pyc
    │   │   ├── data_utils.cpython-36.pyc
    │   │   ├── general_utils.cpython-36.pyc
    │   │   └── adversarial_losses.cpython-36.pyc
    │   ├── general_utils.py
    │   ├── base_model.py
    │   ├── config.py
    │   ├── adversarial_losses.py
    │   ├── ner_model.py
    │   └── data_utils.py
    ├── __init__.py
    ├── __pycache__
    │   └── config.cpython-36.pyc
    ├── run_train_pico.py
    ├── run_train.py
    ├── build_data.py
    ├── run_train_nicta.py
    ├── run_train_cross_validate.py
    ├── run_train_cross_validate_nicta.py
    ├── run_train_cross_validate_pico.py
    ├── general_utils.py
    ├── base_model.py
    ├── config.py
    ├── adversarial_losses.py
    ├── ner_model.py
    └── data_utils.py
├── BERT
    ├── pytorch_pretrained_bert
    │   ├── module
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── san.cpython-36.pyc
    │   │   │   ├── common.cpython-36.pyc
    │   │   │   ├── __init__.cpython-36.pyc
    │   │   │   ├── my_optim.cpython-36.pyc
    │   │   │   ├── similarity.cpython-36.pyc
    │   │   │   └── dropout_wrapper.cpython-36.pyc
    │   │   ├── common.py
    │   │   ├── dropout_wrapper.py
    │   │   ├── my_optim.py
    │   │   └── san.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── modeling.cpython-36.pyc
    │   │   ├── file_utils.cpython-36.pyc
    │   │   ├── optimization.cpython-36.pyc
    │   │   └── tokenization.cpython-36.pyc
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── convert_tf_checkpoint_to_pytorch.py
    │   └── file_utils.py
    ├── __pycache__
    │   ├── crf.cpython-36.pyc
    │   ├── utils.cpython-36.pyc
    │   ├── bert_model.cpython-36.pyc
    │   └── adversarial_losses.cpython-36.pyc
    ├── run_classifier_nicta.py
    ├── run_classifier_pico.py
    ├── run_classifier_pico_cross_validate.py
    ├── run_classifier_nicta_cross_validate.py
    ├── utils.py
    ├── adversarial_losses.py
    ├── bert_model.py
    └── crf.py
├── requirements.txt
└── README.md

/lstm_model/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/BERT/pytorch_pretrained_bert/module/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lstm_model/__init__.py:
--------------------------------------------------------------------------------
1 | from . import base_model
2 | from . import config
--------------------------------------------------------------------------------
/BERT/__pycache__/crf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/crf.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT/__pycache__/bert_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/bert_model.cpython-36.pyc
--------------------------------------------------------------------------------
/lstm_model/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/__pycache__/adversarial_losses.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/__pycache__/adversarial_losses.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/base_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/base_model.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/data_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/data_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/general_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/general_utils.cpython-36.pyc -------------------------------------------------------------------------------- /lstm_model/src/__pycache__/adversarial_losses.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/lstm_model/src/__pycache__/adversarial_losses.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/modeling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/modeling.cpython-36.pyc 
-------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/file_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/file_utils.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/optimization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/optimization.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__pycache__/tokenization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/__pycache__/tokenization.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/san.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/san.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/common.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/common.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/my_optim.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/my_optim.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/similarity.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/similarity.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/__pycache__/dropout_wrapper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jind11/Deep-PICO-Detection/HEAD/BERT/pytorch_pretrained_bert/module/__pycache__/dropout_wrapper.cpython-36.pyc -------------------------------------------------------------------------------- /BERT/run_classifier_nicta.py: -------------------------------------------------------------------------------- 1 | import os 
2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | command = 'python bert_classifier.py --data_dir ../data/nicta_piboso ' \ 7 | '--bert_model {} ' \ 8 | '--task_name nicta --output_dir results/nicta/biobert_crf ' \ 9 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 10 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 11 | '--rnn_hidden_size 512 --dropout 0.3 '.format(MODEL_PATH) 12 | 13 | os.system(command) -------------------------------------------------------------------------------- /BERT/run_classifier_pico.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | # use bio-bert 7 | command = 'python bert_classifier.py --data_dir ../data/pico ' \ 8 | '--bert_model {} ' \ 9 | '--task_name pico --output_dir results/PICO/biobert_crf ' \ 10 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 11 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 12 | '--rnn_hidden_size 512 --dropout 0.3 '.format(MODEL_PATH) 13 | 14 | os.system(command) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 2 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 3 | BertForMaskedLM, BertForNextSentencePrediction, 4 | BertForSequenceClassification, BertForMultipleChoice, 5 | BertForTokenClassification, BertForQuestionAnswering, 6 | BertForMultipleChoice_SAN) 7 | from .optimization import BertAdam 8 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astor==0.8.1 3 | boto3==1.13.24 4 | botocore==1.16.24 5 | certifi==2020.4.5.1 6 | chardet==3.0.4 7 | docutils==0.15.2 8 | future==0.18.2 9 | gast==0.3.3 10 | grpcio==1.29.0 11 | idna==2.9 12 | importlib-metadata==1.6.1 13 | jmespath==0.10.0 14 | joblib==0.15.1 15 | Markdown==3.2.2 16 | numpy==1.14.5 17 | protobuf==3.12.2 18 | python-dateutil==2.8.1 19 | requests==2.23.0 20 | s3transfer==0.3.3 21 | scikit-learn==0.23.1 22 | scipy==1.4.1 23 | six==1.15.0 24 | sklearn==0.0 25 | tensorboard==1.10.0 26 | tensorflow-gpu==1.10.0 27 | termcolor==1.1.0 28 | threadpoolctl==2.1.0 29 | torch==1.0.0 30 | tqdm==4.46.1 31 | urllib3==1.25.9 32 | Werkzeug==1.0.1 33 | zipp==3.1.0 34 | -------------------------------------------------------------------------------- /BERT/run_classifier_pico_cross_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | for fold in range(1, 11): 7 | # # use bio-bert 8 | command = 'python bert_classifier.py --data_dir ../data/pico/10_folds/{fold} ' \ 9 | '--bert_model {MODEL_PATH} ' \ 10 | '--task_name pico --output_dir results/PICO/biobert_crf_{fold} ' \ 11 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 12 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 13 | '--rnn_hidden_size 512 --dropout 0.2 '.format(fold=fold, 14 | MODEL_PATH=MODEL_PATH) 15 | 16 | os.system(command) -------------------------------------------------------------------------------- 
/BERT/run_classifier_nicta_cross_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | MODEL_PATH = sys.argv[1] 5 | 6 | for fold in range(1, 11): 7 | # use bio-bert 8 | command = 'python bert_classifier.py --data_dir ../data/nicta_piboso/10_folds/{fold} ' \ 9 | '--bert_model {MODEL_PATH} ' \ 10 | '--task_name nicta --output_dir results/nicta/biobert_crf_{fold} ' \ 11 | '--train_batch_size 2 --tag_space 0 --max_seq_length 60 --use_crf ' \ 12 | '--do_train --do_eval --do_lower_case --num_train_epochs 3 ' \ 13 | '--rnn_hidden_size 512 --dropout 0.3 '.format(fold=fold, 14 | MODEL_PATH=MODEL_PATH) 15 | 16 | os.system(command) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import math 4 | from torch.nn.functional import tanh, relu, prelu, leaky_relu, sigmoid, elu, selu 5 | from torch.nn.init import uniform, normal, eye, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal 6 | 7 | def linear(x): 8 | return x 9 | 10 | def swish(x): 11 | return x * sigmoid(x) 12 | 13 | def bertgelu(x): 14 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 15 | 16 | def gptgelu(x): 17 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 18 | 19 | # default gelue 20 | gelu = bertgelu 21 | 22 | def activation(func_a): 23 | """Activation function wrapper 24 | """ 25 | try: 26 | f = eval(func_a) 27 | except: 28 | f = linear 29 | return f 30 | 31 | def init_wrapper(init='xavier_uniform'): 32 | return eval(init) -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | try: 5 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 6 | except ModuleNotFoundError: 7 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 8 | "In that case, it requires TensorFlow to be installed. Please see " 9 | "https://www.tensorflow.org/install/ for installation instructions.") 10 | raise 11 | 12 | if len(sys.argv) != 5: 13 | # pylint: disable=line-too-long 14 | print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 15 | else: 16 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 17 | TF_CONFIG = sys.argv.pop() 18 | TF_CHECKPOINT = sys.argv.pop() 19 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/dropout_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class DropoutWrapper(nn.Module): 7 | """ 8 | This is a dropout wrapper which supports the fix mask dropout 9 | """ 10 | def __init__(self, dropout_p=0, enable_vbp=True): 11 | super(DropoutWrapper, self).__init__() 12 | """variational dropout means fix dropout mask 13 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11 14 | """ 15 | self.enable_variational_dropout = enable_vbp 16 | self.dropout_p = dropout_p 17 | 18 | def forward(self, x): 19 | """ 20 | :param x: batch * len * input_size 21 | """ 22 | if self.training == False or self.dropout_p == 0: 23 | return x 24 | 25 | if len(x.size()) == 3: 26 | mask = 1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)) 27 | mask.requires_grad = False 28 | return mask.unsqueeze(1).expand_as(x) * x 29 | else: 30 | return F.dropout(x, p=self.dropout_p, training=self.training) -------------------------------------------------------------------------------- /lstm_model/run_train_pico.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config(parser) 12 | assert config.data_keyname == 'pico' 13 | 14 | # build model 15 | model = HANNModel(config) 16 | model.build() 17 | 18 | # create datasets 19 | dev = Dataset(config.filename_dev, config.processing_word, 20 | config.processing_tag) 21 | train = Dataset(config.filename_train, config.processing_word, 22 | config.processing_tag) 23 | test = Dataset(config.filename_test, config.processing_word, 24 | config.processing_tag) 25 | if config.num_augmentation: 26 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 27 | else: 28 | data_aug = None 29 | 30 | # train model 31 | model.train(train, dev, data_aug) 32 | 33 | # evaluate model 34 | model.restore_session(config.dir_model) 35 | model.evaluate(test) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /lstm_model/run_train.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import Dataset 2 | from model.models import HANNModel 3 | from model.config import Config 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config(parser) 12 | config.num_augmentation = 20000 13 | config.batch_size = 20 14 | config.batch_size_aug = 20 15 | config.dir_output = 'test-num_augmentation-{}-2'.format(config.num_augmentation) 16 | config.dir_model = os.path.join(config.dir_output, "model.weights") 17 | 18 | # build model 19 | model = HANNModel(config) 20 | model.build() 21 | # if config.restore: 22 | # model.restore_session("results/test/model.weights/") # optional, restore weights 23 | # model.reinitialize_weights("proj") 24 | 25 | # create datasets 26 | dev = Dataset(config.filename_dev, config.processing_word, 27 | config.processing_tag) 28 | train = Dataset(config.filename_train, config.processing_word, 29 | config.processing_tag) 30 | test = Dataset(config.filename_test, config.processing_word, 31 | config.processing_tag) 32 | if config.num_augmentation: 33 | 
data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 34 | else: 35 | data_aug = None 36 | 37 | # train model 38 | model.train(train, dev, data_aug) 39 | 40 | # evaluate model 41 | model.restore_session(config.dir_model) 42 | model.evaluate(test) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /lstm_model/build_data.py: -------------------------------------------------------------------------------- 1 | from src.config import Config 2 | from src.data_utils import Dataset, get_vocabs, UNK, NUM, WORD_PAD, TAG_PAD, \ 3 | get_wordvec_vocab, write_vocab, load_vocab, get_char_vocab, \ 4 | export_trimmed_wordvec_vectors, get_processing_word 5 | import argparse 6 | import sys 7 | 8 | # data_keyname = sys.argv[1] 9 | 10 | def main(): 11 | """Procedure to build data 12 | 13 | You MUST RUN this procedure. It iterates over the whole dataset (train, 14 | dev and test) and extract the vocabularies in terms of words, tags, and 15 | characters. Having built the vocabularies it writes them in a file. The 16 | writing of vocabulary in a file assigns an id (the line #) to each word. 17 | It then extract the relevant GloVe vectors and stores them in a np array 18 | such that the i-th entry corresponds to the i-th word in the vocabulary. 19 | 20 | 21 | Args: 22 | config: (instance of Config) has attributes like hyper-params... 23 | 24 | """ 25 | # get config and processing of words 26 | config = Config(load=False) 27 | processing_word = get_processing_word(lowercase=True) 28 | 29 | # Generators 30 | dev = Dataset(config.filename_dev, processing_word) 31 | test = Dataset(config.filename_test, processing_word) 32 | train = Dataset(config.filename_train, processing_word) 33 | 34 | # add data augmentation dataset 35 | data_aug = Dataset(config.filename_aug, processing_word) 36 | 37 | # Build Word and Tag vocab 38 | vocab_words_freq, vocab_tags = get_vocabs([train, dev, test, data_aug]) 39 | vocab_words_freq_ = {} 40 | for vocab, freq in vocab_words_freq.items(): 41 | if freq > config.min_freq: 42 | vocab_words_freq_[vocab] = freq 43 | vocab_tags.remove('None') 44 | # vocab_glove = get_wordvec_vocab(config.filename_wordvec) 45 | 46 | # vocab = vocab_words & vocab_glove 47 | vocab_words_freq_.update({UNK: 1, WORD_PAD: 1, NUM: 1}) 48 | 49 | # vocab_tags.add(TAG_PAD) 50 | 51 | # Save vocab 52 | write_vocab(vocab_words_freq_, config.filename_words) 53 | write_vocab(vocab_tags, config.filename_tags) 54 | 55 | # Trim GloVe Vectors 56 | vocab, _ = load_vocab(config.filename_words) 57 | export_trimmed_wordvec_vectors(vocab, config.filename_wordvec, 58 | config.filename_wordvec_trimmed) 59 | 60 | # Build and save char vocab 61 | # train = Dataset(config.filename_train) 62 | # vocab_chars = get_char_vocab(train) 63 | # write_vocab(vocab_chars, config.filename_chars) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /lstm_model/run_train_nicta.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | 10 | def main(): 11 | # create instance of config 12 | config = Config() 13 | assert config.data_keyname == 'nicta' 14 | config.num_augmentation = 0 15 | config.batch_size = 
20 16 | config.batch_size_aug = 20 17 | config.attention_size = 50 18 | config.hidden_size_lstm_document = 200 19 | config.dropout = 0.8 20 | config.cnn_filter_num = 150 21 | config.adv_perturb_norm_length = 4 22 | config.va_perturb_norm_length = 4 23 | config.adv_reg_coeff = 0.3 24 | config.va_reg_coeff = 0.3 25 | config.data_root = '../data/nicta_piboso' 26 | config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(config.num_augmentation, 27 | config.va_reg_coeff, 28 | config.adv_reg_coeff) 29 | config.dir_model = os.path.join(config.dir_output, "model.weights") 30 | 31 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 32 | 33 | precisions = defaultdict(list) 34 | recalls = defaultdict(list) 35 | f1s = defaultdict(list) 36 | tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT'] 37 | 38 | # build model 39 | model = HANNModel(config) 40 | model.build() 41 | # if config.restore: 42 | # model.restore_session("results/test/model.weights/") # optional, restore weights 43 | # model.reinitialize_weights("proj") 44 | 45 | # create datasets 46 | train = Dataset(os.path.join(config.data_root, 'train.txt'), config.processing_word, 47 | config.processing_tag) 48 | dev = Dataset(os.path.join(config.data_root, 'test.txt'), config.processing_word, 49 | config.processing_tag) 50 | test = Dataset(os.path.join(config.data_root, 'test.txt'), config.processing_word, 51 | config.processing_tag) 52 | if config.num_augmentation: 53 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 54 | else: 55 | data_aug = None 56 | 57 | # train model 58 | model.train(train, dev, data_aug) 59 | 60 | # evaluate model 61 | model.restore_session(config.dir_model) 62 | metrics = model.evaluate(test) 63 | 64 | [precisions[tag].append(metrics['precision_all'][tag]) for tag in tag_ls] 65 | [recalls[tag].append(metrics['recall_all'][tag]) for tag in tag_ls] 66 | [f1s[tag].append(metrics['f1_all'][tag]) for tag in tag_ls] 67 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision_all'], 68 | metrics['recall_all'], metrics['f1_all']) 69 | print(msg) 70 | with open(result_file_path, 'a') as ofile: 71 | ofile.write(msg) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /BERT/utils.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jindi' 2 | 3 | import collections 4 | from itertools import repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.utils.rnn as rnn_utils 8 | 9 | 10 | def _ntuple(n): 11 | def parse(x): 12 | if isinstance(x, collections.Iterable): 13 | return x 14 | return tuple(repeat(x, n)) 15 | return parse 16 | 17 | _single = _ntuple(1) 18 | _pair = _ntuple(2) 19 | _triple = _ntuple(3) 20 | _quadruple = _ntuple(4) 21 | 22 | 23 | # encode the sequence length information in the batch for RNN use 24 | # this is special for pytorch RNN function 25 | def prepare_rnn_seq(rnn_input, lengths, hx=None, masks=None, batch_first=False): 26 | ''' 27 | 28 | Args: 29 | rnn_input: [seq_len, batch, input_size]: tensor containing the features of the input sequence. 30 | lengths: [batch]: tensor containing the lengthes of the input sequence 31 | hx: [num_layers * num_directions, batch, hidden_size]: tensor containing the initial hidden state for each element in the batch. 
32 | masks: [seq_len, batch]: tensor containing the mask for each element in the batch. 33 | batch_first: If True, then the input and output tensors are provided as [batch, seq_len, feature]. 34 | 35 | Returns: 36 | 37 | ''' 38 | def check_decreasing(lengths): 39 | lens, order = torch.sort(lengths, dim=0, descending=True) 40 | if torch.ne(lens, lengths).sum() == 0: 41 | return None 42 | else: 43 | _, rev_order = torch.sort(order) 44 | return lens, order, rev_order 45 | 46 | check_res = check_decreasing(lengths) 47 | 48 | if check_res is None: 49 | lens = lengths 50 | rev_order = None 51 | else: 52 | lens, order, rev_order = check_res 53 | batch_dim = 0 if batch_first else 1 54 | rnn_input = rnn_input.index_select(batch_dim, order) 55 | if hx is not None: 56 | # hack lstm 57 | if isinstance(hx, tuple): 58 | hx, cx = hx 59 | hx = hx.index_select(1, order) 60 | cx = cx.index_select(1, order) 61 | hx = (hx, cx) 62 | else: 63 | hx = hx.index_select(1, order) 64 | 65 | lens = lens.tolist() 66 | seq = rnn_utils.pack_padded_sequence(rnn_input, lens, batch_first=batch_first) 67 | if masks is not None: 68 | if batch_first: 69 | masks = masks[:, :lens[0]] 70 | else: 71 | masks = masks[:lens[0]] 72 | return seq, hx, rev_order, masks 73 | 74 | 75 | # recover the sequence results from RNN function 76 | # this is special to pytorch RNN function 77 | def recover_rnn_seq(seq, rev_order, hx=None, batch_first=False): 78 | output, _ = rnn_utils.pad_packed_sequence(seq, batch_first=batch_first) 79 | if rev_order is not None: 80 | batch_dim = 0 if batch_first else 1 81 | output = output.index_select(batch_dim, rev_order) 82 | if hx is not None: 83 | # hack lstm 84 | if isinstance(hx, tuple): 85 | hx, cx = hx 86 | hx = hx.index_select(1, rev_order) 87 | cx = cx.index_select(1, rev_order) 88 | hx = (hx, cx) 89 | else: 90 | hx = hx.index_select(1, rev_order) 91 | return output, hx 92 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate.py: -------------------------------------------------------------------------------- 1 | from model.data_utils import Dataset 2 | from model.models import HANNModel 3 | from model.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config() 12 | config.num_augmentation = 20000 13 | config.batch_size = 20 14 | config.batch_size_aug = 20 15 | config.dir_output = 'test-num_augmentation-{}'.format(config.num_augmentation) 16 | config.dir_model = os.path.join(config.dir_output, "model.weights") 17 | 18 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 19 | 20 | precisions = {'P': [], 'I': [], 'O': []} 21 | recalls = {'P': [], 'I': [], 'O': []} 22 | f1s = {'P': [], 'I': [], 'O': []} 23 | 24 | for fold in range(2, 6): 25 | # build model 26 | # tf.reset_default_graph() 27 | print('Fold {}'.format(fold)) 28 | 29 | # build model 30 | model = HANNModel(config) 31 | model.build() 32 | # if config.restore: 33 | # model.restore_session("results/test/model.weights/") # optional, restore weights 34 | # model.reinitialize_weights("proj") 35 | 36 | # create datasets 37 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 38 | config.processing_tag) 39 | dev = Dataset(os.path.join(config.data_root, str(fold), 'dev.txt'), config.processing_word, 40 | config.processing_tag) 41 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), 
config.processing_word, 42 | config.processing_tag) 43 | if config.num_augmentation: 44 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 45 | else: 46 | data_aug = None 47 | 48 | # train model 49 | model.train(train, dev, data_aug) 50 | 51 | # evaluate model 52 | model.restore_session(config.dir_model) 53 | metrics = model.evaluate(test) 54 | 55 | [precisions[tag].append(metrics['precision'][tag]) for tag in ['P', 'I', 'O']] 56 | [recalls[tag].append(metrics['recall'][tag]) for tag in ['P', 'I', 'O']] 57 | [f1s[tag].append(metrics['f1'][tag]) for tag in ['P', 'I', 'O']] 58 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision'], metrics['recall'], metrics['f1']) 59 | print(msg) 60 | with open(result_file_path, 'a') as ofile: 61 | ofile.write(msg) 62 | 63 | 64 | # print('Precision: ', 'P: ', (precisions['P']), 'I: ', (precisions['I']), 'O: ', (precisions['O'])) 65 | # print('Recall: ', 'P: ', (recalls['P']), 'I: ', (recalls['I']), 'O: ', (recalls['O'])) 66 | # print('F1: ', 'P: ', (f1s['P']), 'I: ', (f1s['I']), 'O: ', (f1s['O'])) 67 | # print('Precision: ', 'P: ', np.mean(precisions['P']), 'I: ', np.mean(precisions['I']), 'O: ', np.mean(precisions['O'])) 68 | # print('Recall: ', 'P: ', np.mean(recalls['P']), 'I: ', np.mean(recalls['I']), 'O: ', np.mean(recalls['O'])) 69 | # res = np.mean([np.mean(values) for values in f1s.values()]) 70 | # print('F1: ', 'P: ', np.mean(f1s['P']), 'I: ', np.mean(f1s['I']), 'O: ', np.mean(f1s['O']), 'all avg: ', res) 71 | msg = 'Average Precision: P: {}\tI: {}\tO: {}\n'.format(np.mean(precisions['P']), np.mean(precisions['I']), np.mean(precisions['O'])) 72 | print(msg) 73 | with open(result_file_path, 'a') as ofile: 74 | ofile.write(msg) 75 | msg = 'Average Recall: P: {}\tI: {}\tO: {}\n'.format(np.mean(recalls['P']), np.mean(recalls['I']), np.mean(recalls['O'])) 76 | print(msg) 77 | with open(result_file_path, 'a') as ofile: 78 | ofile.write(msg) 79 | res = np.mean([np.mean(values) for values in f1s.values()]) 80 | msg = 'Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(np.mean(f1s['P']), np.mean(f1s['I']), np.mean(f1s['O']), res) 81 | print(msg) 82 | with open(result_file_path, 'a') as ofile: 83 | ofile.write(msg) 84 | ofile.write('\n\n\n') 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate_nicta.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | 10 | def main(): 11 | # create instance of config 12 | config = Config() 13 | assert config.data_keyname == 'nicta' 14 | config.num_augmentation = 200000 15 | config.batch_size = 20 16 | config.batch_size_aug = 20 17 | config.attention_size = 50 18 | config.hidden_size_lstm_document = 200 19 | config.dropout = 0.8 20 | config.cnn_filter_num = 150 21 | config.adv_perturb_norm_length = 4 22 | config.va_perturb_norm_length = 4 23 | config.adv_reg_coeff = 0.3 24 | config.va_reg_coeff = 0.3 25 | config.data_root = '../data/nicta_piboso/10_folds' 26 | config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(config.num_augmentation, 27 | config.va_reg_coeff, 28 | config.adv_reg_coeff) 29 | config.dir_model = os.path.join(config.dir_output, 
"model.weights") 30 | 31 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 32 | 33 | precisions = defaultdict(list) 34 | recalls = defaultdict(list) 35 | f1s = defaultdict(list) 36 | tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT'] 37 | 38 | for fold in range(1, 11): 39 | # build model 40 | # tf.reset_default_graph() 41 | print('Fold {}'.format(fold)) 42 | 43 | # build model 44 | model = HANNModel(config) 45 | model.build() 46 | # if config.restore: 47 | # model.restore_session("results/test/model.weights/") # optional, restore weights 48 | # model.reinitialize_weights("proj") 49 | 50 | # create datasets 51 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 52 | config.processing_tag) 53 | dev = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 54 | config.processing_tag) 55 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 56 | config.processing_tag) 57 | if config.num_augmentation: 58 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 59 | else: 60 | data_aug = None 61 | 62 | # train model 63 | model.train(train, dev, data_aug) 64 | 65 | # evaluate model 66 | model.restore_session(config.dir_model) 67 | metrics = model.evaluate(test) 68 | 69 | [precisions[tag].append(metrics['precision_all'][tag]) for tag in tag_ls] 70 | [recalls[tag].append(metrics['recall_all'][tag]) for tag in tag_ls] 71 | [f1s[tag].append(metrics['f1_all'][tag]) for tag in tag_ls] 72 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision_all'], 73 | metrics['recall_all'], metrics['f1_all']) 74 | print(msg) 75 | with open(result_file_path, 'a') as ofile: 76 | ofile.write(msg) 77 | 78 | msg = 'Average Precision: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(precisions[tag])) for tag in tag_ls])) 79 | print(msg) 80 | with open(result_file_path, 'a') as ofile: 81 | ofile.write(msg) 82 | msg = 'Average Recall: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(recalls[tag])) for tag in tag_ls])) 83 | print(msg) 84 | with open(result_file_path, 'a') as ofile: 85 | ofile.write(msg) 86 | res = np.mean([np.mean(values) for values in f1s.values()]) 87 | msg = 'Average F1: {}'.format('\t'.join(['{}: {}'.format(tag, np.mean(f1s[tag])) for tag in tag_ls])) 88 | print(msg) 89 | with open(result_file_path, 'a') as ofile: 90 | ofile.write(msg) 91 | ofile.write('\n\n\n') 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /lstm_model/run_train_cross_validate_pico.py: -------------------------------------------------------------------------------- 1 | from src.data_utils import Dataset 2 | from src.models import HANNModel 3 | from src.config import Config 4 | import argparse 5 | import os 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | # create instance of config 11 | config = Config() 12 | assert config.data_keyname == 'pico' 13 | config.num_augmentation = 0 14 | config.batch_size = 20 15 | config.batch_size_aug = 20 16 | config.dir_output = 'test-num_augmentation-{}'.format(config.num_augmentation) 17 | config.dir_model = os.path.join(config.dir_output, "model.weights") 18 | config.data_root = '../data/{}/10_folds'.format(config.data_keyname) 19 | 20 | result_file_path = os.path.join(config.dir_output, 'cross_validate_results') 21 | 22 | precisions = {'P': [], 'I': [], 'O': []} 23 | recalls = {'P': [], 'I': [], 
'O': []} 24 | f1s = {'P': [], 'I': [], 'O': []} 25 | 26 | for fold in range(1, 11): 27 | # build model 28 | # tf.reset_default_graph() 29 | print('Fold {}'.format(fold)) 30 | 31 | # build model 32 | model = HANNModel(config) 33 | model.build() 34 | # if config.restore: 35 | # model.restore_session("results/test/model.weights/") # optional, restore weights 36 | # model.reinitialize_weights("proj") 37 | 38 | # create datasets 39 | train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word, 40 | config.processing_tag) 41 | dev = Dataset(os.path.join(config.data_root, str(fold), 'dev.txt'), config.processing_word, 42 | config.processing_tag) 43 | test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word, 44 | config.processing_tag) 45 | if config.num_augmentation: 46 | data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation) 47 | else: 48 | data_aug = None 49 | 50 | # train model 51 | model.train(train, dev, data_aug) 52 | 53 | # evaluate model 54 | model.restore_session(config.dir_model) 55 | metrics = model.evaluate(test) 56 | 57 | [precisions[tag].append(metrics['precision'][tag]) for tag in ['P', 'I', 'O']] 58 | [recalls[tag].append(metrics['recall'][tag]) for tag in ['P', 'I', 'O']] 59 | [f1s[tag].append(metrics['f1'][tag]) for tag in ['P', 'I', 'O']] 60 | msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision'], metrics['recall'], metrics['f1']) 61 | print(msg) 62 | with open(result_file_path, 'a') as ofile: 63 | ofile.write(msg) 64 | 65 | 66 | # print('Precision: ', 'P: ', (precisions['P']), 'I: ', (precisions['I']), 'O: ', (precisions['O'])) 67 | # print('Recall: ', 'P: ', (recalls['P']), 'I: ', (recalls['I']), 'O: ', (recalls['O'])) 68 | # print('F1: ', 'P: ', (f1s['P']), 'I: ', (f1s['I']), 'O: ', (f1s['O'])) 69 | # print('Precision: ', 'P: ', np.mean(precisions['P']), 'I: ', np.mean(precisions['I']), 'O: ', np.mean(precisions['O'])) 70 | # print('Recall: ', 'P: ', np.mean(recalls['P']), 'I: ', np.mean(recalls['I']), 'O: ', np.mean(recalls['O'])) 71 | # res = np.mean([np.mean(values) for values in f1s.values()]) 72 | # print('F1: ', 'P: ', np.mean(f1s['P']), 'I: ', np.mean(f1s['I']), 'O: ', np.mean(f1s['O']), 'all avg: ', res) 73 | msg = 'Average Precision: P: {}\tI: {}\tO: {}\n'.format(np.mean(precisions['P']), np.mean(precisions['I']), np.mean(precisions['O'])) 74 | print(msg) 75 | with open(result_file_path, 'a') as ofile: 76 | ofile.write(msg) 77 | msg = 'Average Recall: P: {}\tI: {}\tO: {}\n'.format(np.mean(recalls['P']), np.mean(recalls['I']), np.mean(recalls['O'])) 78 | print(msg) 79 | with open(result_file_path, 'a') as ofile: 80 | ofile.write(msg) 81 | res = np.mean([np.mean(values) for values in f1s.values()]) 82 | msg = 'Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(np.mean(f1s['P']), np.mean(f1s['I']), np.mean(f1s['O']), res) 83 | print(msg) 84 | with open(result_file_path, 'a') as ofile: 85 | ofile.write(msg) 86 | ofile.write('\n\n\n') 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/my_optim.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | from copy import deepcopy 3 | import torch 4 | from torch.nn import Parameter 5 | 6 | class EMA: 7 | def __init__(self, gamma, model): 8 | super(EMA, self).__init__() 9 | self.gamma = gamma 10 | self.shadow = {} 11 | self.model = model 12 | self.setup() 13 | 14 | def setup(self): 15 | for name, para in self.model.named_parameters(): 16 | if para.requires_grad: 17 | self.shadow[name] = para.clone() 18 | def cuda(self): 19 | for k, v in self.shadow.items(): 20 | self.shadow[k] = v.cuda() 21 | 22 | def update(self): 23 | for name,para in self.model.named_parameters(): 24 | if para.requires_grad: 25 | self.shadow[name] = (1.0 - self.gamma) * para + self.gamma * self.shadow[name] 26 | 27 | def swap_parameters(self): 28 | for name, para in self.model.named_parameters(): 29 | if para.requires_grad: 30 | temp_data = para.data 31 | para.data = self.shadow[name].data 32 | self.shadow[name].data = temp_data 33 | 34 | def state_dict(self): 35 | return self.shadow 36 | 37 | 38 | # Adapted from 39 | # https://github.com/pytorch/pytorch/blob/master/torch/nn/utils/weight_norm.py 40 | # and https://github.com/salesforce/awd-lstm-lm/blob/master/weight_drop.py 41 | def _norm(p, dim): 42 | """Computes the norm over all dimensions except dim""" 43 | if dim is None: 44 | return p.norm() 45 | elif dim == 0: 46 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 47 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 48 | elif dim == p.dim() - 1: 49 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 50 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 51 | else: 52 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 53 | 54 | 55 | def _dummy(*args, **kwargs): 56 | # We need to replace flatten_parameters with a nothing function 57 | return 58 | 59 | 60 | class WeightNorm(torch.nn.Module): 61 | 62 | def __init__(self, weights, dim): 63 | super(WeightNorm, self).__init__() 64 | self.weights = weights 65 | self.dim = dim 66 | 67 | def compute_weight(self, module, name): 68 | g = getattr(module, name + '_g') 69 | v = getattr(module, name + '_v') 70 | return v * (g / _norm(v, self.dim)) 71 | 72 | @staticmethod 73 | def apply(module, weights, dim): 74 | # Terrible temporary solution to an issue regarding compacting weights 75 | # re: CUDNN RNN 76 | if issubclass(type(module), torch.nn.RNNBase): 77 | module.flatten_parameters = _dummy 78 | if weights is None: # do for all weight params 79 | weights = [w for w in module._parameters.keys() if 'weight' in w] 80 | fn = WeightNorm(weights, dim) 81 | for name in weights: 82 | if hasattr(module, name): 83 | print('Applying weight norm to {} - {}'.format(str(module), name)) 84 | weight = getattr(module, name) 85 | del module._parameters[name] 86 | module.register_parameter( 87 | name + '_g', Parameter(_norm(weight, dim).data)) 88 | module.register_parameter(name + '_v', Parameter(weight.data)) 89 | setattr(module, name, fn.compute_weight(module, name)) 90 | 91 | module.register_forward_pre_hook(fn) 92 | 93 | return fn 94 | 95 | def remove(self, module): 96 | for name in self.weights: 97 | weight = self.compute_weight(module) 98 | delattr(module, name) 99 | del module._parameters[name + '_g'] 100 | del module._parameters[name + '_v'] 101 | module.register_parameter(name, Parameter(weight.data)) 102 | 103 | def __call__(self, module, inputs): 104 | for name in self.weights: 105 | setattr(module, name, self.compute_weight(module, name)) 106 | 107 | 108 | def weight_norm(module, weights=None, dim=0): 109 | 
    WeightNorm.apply(module, weights, dim)
110 |     return module
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-PICO-Detection
2 | A model for identifying PICO elements in a given biomedical/clinical text.
3 | 
4 | This is the source code for the paper: [Di Jin, Peter Szolovits, Advancing PICO Element Detection in Biomedical Text via Deep Neural Networks, Bioinformatics, btaa256](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaa256/5822877?guestAccessKey=7f54ea86-4ec0-4080-9d5c-1251b730aa42). If you use the code, please cite the paper:
5 | 
6 | ```
7 | @article{10.1093/bioinformatics/btaa256,
8 |     author = {Jin, Di and Szolovits, Peter},
9 |     title = "{Advancing PICO element detection in biomedical text via deep neural networks}",
10 |     journal = {Bioinformatics},
11 |     year = {2020},
12 |     month = {04},
13 |     issn = {1367-4803},
14 |     doi = {10.1093/bioinformatics/btaa256},
15 |     url = {https://doi.org/10.1093/bioinformatics/btaa256},
16 |     note = {btaa256},
17 |     eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaa256/33363807/btaa256.pdf},
18 | }
19 | ```
20 | 
21 | ## Prerequisites:
22 | Run the following command to install the prerequisite packages:
23 | ```
24 | pip install -r requirements.txt
25 | ```
26 | 
27 | ## Data:
28 | Please download the data (including PICO and NICTA-PIBOSO) from [Google Drive](https://drive.google.com/file/d/1M9QCgrRjERZnD9LM2FeK-3jjvXJbjRTl/view?usp=sharing) and unzip it into the main directory of this repository so that the folder layout looks like this:
29 | ```
30 | ./BERT
31 | ./lstm_model
32 | ./data
33 | ```
34 | 
35 | ## How to use
36 | ### For LSTM-based models
37 | * The code for the LSTM-based models is in the "lstm_model" folder, so run the following command to enter it:
38 | ```
39 | cd lstm_model
40 | ```
41 | 
42 | * First, we need to process the data to build the vocabulary and trim the embedding file. The embeddings we used in our experiments are from [here](http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin). Please download it and convert it to "txt" format. Of course, you can also try other kinds of embeddings such as fastText. Then run the following command:
43 | ```
44 | python build_data.py --data_keyname DATA_KEYNAME --filename_wordvec PATH_TO_EMBEDDING_FILE
45 | ```
46 | DATA_KEYNAME can be "pico" for the PICO dataset and "nicta" for the NICTA-PIBOSO dataset; PATH_TO_EMBEDDING_FILE specifies where you store the embedding file.
47 | 
48 | * Then we can start training the model for the PICO dataset by running the following command:
49 | ```
50 | python run_train_pico.py --data_keyname pico
51 | ```
52 | And the following command is for the NICTA-PIBOSO dataset:
53 | ```
54 | python run_train_nicta.py --data_keyname nicta
55 | ```
56 | 
57 | * To run 10-fold cross-validation, use the following commands:
58 | ```
59 | python run_train_cross_validate_pico.py --data_keyname pico
60 | python run_train_cross_validate_nicta.py --data_keyname nicta
61 | ```
62 | 
63 | There are several important arguments in "src/config.py" that configure the model architecture; they are explained here:
64 | 
65 | * --adv_reg_coeff: The coefficient for the adversarial loss regularization. Setting it to zero means we do not conduct adversarial training.
66 | * --va_reg_coeff: The coefficient for the virtual adversarial loss regularization. Setting it to zero means we do not conduct virtual adversarial training.
67 | * --num_augmentation: The number of samples we use for virtual adversarial training.
68 | 
69 | ### For the BERT models
70 | * Code for the BERT models is in the "BERT" folder; please enter this folder.
71 | 
72 | * The best BERT model we found is the [BioBERT model](https://github.com/dmis-lab/biobert). The pretrained model parameter files available in the original repository are TensorFlow-only; if you want the PyTorch version, you can download it from [here](https://drive.google.com/file/d/1H6DTBXlXDZ6tJYcJWdZnZ3UCoY16p19m/view?usp=sharing). Once you obtain the pretrained BERT model files, run the following commands for training:
73 | ```
74 | python run_classifier_pico.py PATH_TO_BERT_MODEL
75 | python run_classifier_nicta.py PATH_TO_BERT_MODEL
76 | ```
77 | In these commands, PATH_TO_BERT_MODEL specifies the directory where you put your downloaded BERT model files.
78 | 
79 | * The following commands are for 10-fold cross-validation:
80 | ```
81 | python run_classifier_pico_cross_validate.py PATH_TO_BERT_MODEL
82 | python run_classifier_nicta_cross_validate.py PATH_TO_BERT_MODEL
83 | ```
84 | 
--------------------------------------------------------------------------------
/BERT/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HugginFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from .modeling import BertConfig, BertForPreTraining 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | config_path = os.path.abspath(bert_config_file) 32 | tf_path = os.path.abspath(tf_checkpoint_path) 33 | print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path)) 34 | # Load weights from TF model 35 | init_vars = tf.train.list_variables(tf_path) 36 | names = [] 37 | arrays = [] 38 | for name, shape in init_vars: 39 | print("Loading TF weight {} with shape {}".format(name, shape)) 40 | array = tf.train.load_variable(tf_path, name) 41 | names.append(name) 42 | arrays.append(array) 43 | 44 | # Initialise PyTorch model 45 | config = BertConfig.from_json_file(bert_config_file) 46 | print("Building PyTorch model from configuration: {}".format(str(config))) 47 | model = BertForPreTraining(config) 48 | 49 | for name, array in zip(names, arrays): 50 | name = name.split('/') 51 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v 52 | # which are not required for using pretrained model 53 | if any(n in ["adam_v", "adam_m", "global_step"] for n in name): 54 | print("Skipping {}".format("/".join(name))) 55 | continue 56 | pointer = model 57 | for m_name in name: 58 | if re.fullmatch(r'[A-Za-z]+_\d+', m_name): 59 | l = re.split(r'_(\d+)', m_name) 60 | else: 61 | l = [m_name] 62 | if l[0] == 'kernel' or l[0] == 'gamma': 63 | pointer = getattr(pointer, 'weight') 64 | elif l[0] == 'output_bias' or l[0] == 'beta': 65 | pointer = getattr(pointer, 'bias') 66 | elif l[0] == 'output_weights': 67 | pointer = getattr(pointer, 'weight') 68 | else: 69 | pointer = getattr(pointer, l[0]) 70 | if len(l) >= 2: 71 | num = int(l[1]) 72 | pointer = pointer[num] 73 | if m_name[-11:] == '_embeddings': 74 | pointer = getattr(pointer, 'weight') 75 | elif m_name == 'kernel': 76 | array = np.transpose(array) 77 | try: 78 | assert pointer.shape == array.shape 79 | except AssertionError as e: 80 | e.args += (pointer.shape, array.shape) 81 | raise 82 | print("Initialize PyTorch weight {}".format(name)) 83 | pointer.data = torch.from_numpy(array) 84 | 85 | # Save pytorch-model 86 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 87 | torch.save(model.state_dict(), pytorch_dump_path) 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | ## Required parameters 93 | parser.add_argument("--tf_checkpoint_path", 94 | default = None, 95 | type = str, 96 | required = True, 97 | help = "Path the TensorFlow checkpoint path.") 98 | parser.add_argument("--bert_config_file", 99 | default = None, 100 | type = str, 101 | required = True, 102 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 103 | "This specifies the model architecture.") 104 | parser.add_argument("--pytorch_dump_path", 105 | default = None, 106 | type = str, 107 | required = True, 108 | help = "Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 111 | args.bert_config_file, 112 | args.pytorch_dump_path) 113 | -------------------------------------------------------------------------------- /lstm_model/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 
58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, 134 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 135 | sys.stdout.write(info + "\n") 136 | 137 | def add(self, n, values=[]): 138 | self.update(self.seen_so_far+n, values) 139 | 140 | 141 | -------------------------------------------------------------------------------- /lstm_model/src/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 
33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, 134 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 135 | sys.stdout.write(info + "\n") 136 | 137 | def add(self, n, values=[]): 138 | self.update(self.seen_so_far+n, values) 139 | 140 | 141 | -------------------------------------------------------------------------------- /BERT/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from six.moves 
import xrange 3 | 4 | # Virtual adversarial training parameters 5 | num_power_iteration = 1 6 | small_constant_for_finite_diff = 1e-1 7 | 8 | 9 | def adversarial_loss(embedded, segment_ids, input_mask, document_mask, label_ids, loss, loss_fn, perturb_norm_length): 10 | """Adds gradient to embedding and recomputes classification loss.""" 11 | grad, = torch.autograd.grad( 12 | loss, 13 | embedded, 14 | retain_graph=True) 15 | grad.detach_() 16 | perturb = _scale_l2(grad, perturb_norm_length) 17 | return loss_fn(token_type_ids=segment_ids, attention_mask=input_mask, 18 | document_mask=document_mask, labels=label_ids, input_embeddings=embedded + perturb) 19 | 20 | 21 | def virtual_adversarial_loss(logits, embedded, segment_ids, input_mask, document_mask, 22 | num_classes, logits_from_embedding_fn, perturb_norm_length): 23 | """Virtual adversarial loss. 24 | Computes virtual adversarial perturbation by finite difference method and 25 | power iteration, adds it to the embedding, and computes the KL divergence 26 | between the new logits and the original logits. 27 | Args: 28 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 29 | num_classes=2, otherwise m=num_classes. 30 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 31 | inputs: VatxtInput. 32 | logits_from_embedding_fn: callable that takes embeddings and returns 33 | classifier logits. 34 | Returns: 35 | kl: float scalar. 36 | """ 37 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 38 | # logits = tf.stop_gradient(logits) 39 | 40 | # Only care about the KL divergence on the final timestep. 41 | # weights = inputs.eos_weights 42 | # assert weights is not None 43 | # if FLAGS.single_label: 44 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 45 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 46 | 47 | # Initialize perturbation with random noise. 48 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 49 | d = torch.autograd.Variable(torch.empty(embedded.size()).normal_(), requires_grad=True).cuda() 50 | 51 | # Perform finite difference method and power iteration. 52 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 53 | # Adding small noise to input and taking gradient with respect to the noise 54 | # corresponds to 1 power iteration. 55 | for _ in xrange(num_power_iteration): 56 | d = _scale_l2( 57 | _mask_by_mask(d, input_mask), small_constant_for_finite_diff) 58 | 59 | _, d_logits, _ = logits_from_embedding_fn(token_type_ids=segment_ids, attention_mask=input_mask, 60 | document_mask=document_mask, input_embeddings=embedded + d) 61 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 62 | perturb, = torch.autograd.grad( 63 | kl, 64 | d) 65 | perturb.detach_() 66 | 67 | perturb = _scale_l2(perturb, perturb_norm_length) 68 | _, vadv_logits, _ = logits_from_embedding_fn(token_type_ids=segment_ids, attention_mask=input_mask, 69 | document_mask=document_mask, input_embeddings=embedded + perturb) 70 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 71 | 72 | 73 | def _scale_l2(x, norm_length): 74 | # shape(x) = (batch, num_timesteps, d) 75 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 
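    # (dividing by alpha keeps each element of x / alpha within [-1, 1], so the
    #  sum of squares below stays well behaved even for large-magnitude gradients)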
76 | # 2norm(x) = a * 2norm(x/a) 77 | # Scale over the full sequence, dims (1, 2) 78 | alpha = torch.max(torch.max(torch.abs(x), dim=1, keepdim=True)[0], dim=2, keepdim=True)[0] + 1e-12 79 | l2_norm = alpha * torch.sqrt( 80 | torch.sum(torch.pow(x / alpha, 2), dim=(1, 2), keepdim=True) + 1e-6) 81 | x_unit = x / l2_norm 82 | return norm_length * x_unit 83 | 84 | 85 | def _mask_by_mask(t, mask): 86 | """Mask t, 3-D [batch, time, dim], by Mask, 2-D [batch, time].""" 87 | 88 | return t * torch.unsqueeze(mask, dim=2).expand(t.size()).float() 89 | 90 | 91 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 92 | """Returns weighted KL divergence between distributions q and p. 93 | Args: 94 | q_logits: logits for 1st argument of KL divergence shape 95 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 96 | [batch_size, num_timesteps] if num_classes == 2. 97 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 98 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 99 | Elements should be 1.0 only on end of sequences 100 | Returns: 101 | KL: float scalar. 102 | """ 103 | # For logistic regression 104 | if num_classes == 2: 105 | # q = tf.nn.sigmoid(q_logits) 106 | # kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 107 | # tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 108 | # kl = tf.squeeze(kl, 2) 109 | raise NotImplementedError 110 | 111 | # For softmax regression 112 | else: 113 | q = torch.nn.functional.softmax(q_logits, -1) 114 | kl = torch.sum( 115 | q * (torch.nn.functional.log_softmax(q_logits, -1) - torch.nn.functional.log_softmax(p_logits, -1)), -1) 116 | 117 | # num_labels = tf.reduce_sum(weights) 118 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 119 | 120 | # kl.get_shape().assert_has_rank(2) 121 | assert len(kl.size()) == 2 122 | # weights.get_shape().assert_has_rank(2) 123 | 124 | loss = torch.mean(kl) 125 | return loss -------------------------------------------------------------------------------- /lstm_model/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | 22 | def reinitialize_weights(self, scope_name): 23 | """Reinitializes the weights of a given layer""" 24 | variables = tf.contrib.framework.get_variables(scope_name) 25 | init = tf.variables_initializer(variables) 26 | self.sess.run(init) 27 | 28 | 29 | def add_train_op(self, lr_method, lr, loss, clip=-1): 30 | """Defines self.train_op that performs an update on a batch 31 | 32 | Args: 33 | lr_method: (string) sgd method, for example "adam" 34 | lr: (tf.placeholder) tf.float32, learning rate 35 | loss: (tensor) tf.float32 loss to minimize 36 | clip: (python float) clipping of gradient. 
If < 0, no clipping 37 | 38 | """ 39 | _lr_m = lr_method.lower() # lower to make sure 40 | 41 | with tf.variable_scope("train_step"): 42 | if _lr_m == 'adam': # sgd method 43 | optimizer = tf.train.AdamOptimizer(lr) 44 | elif _lr_m == 'adagrad': 45 | optimizer = tf.train.AdagradOptimizer(lr) 46 | elif _lr_m == 'sgd': 47 | optimizer = tf.train.GradientDescentOptimizer(lr) 48 | elif _lr_m == 'rmsprop': 49 | optimizer = tf.train.RMSPropOptimizer(lr) 50 | else: 51 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 52 | 53 | if clip > 0: # gradient clipping if clip is positive 54 | grads, vs = zip(*optimizer.compute_gradients(loss)) 55 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 56 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 57 | else: 58 | self.train_op = optimizer.minimize(loss) 59 | 60 | 61 | def initialize_session(self): 62 | """Defines self.sess and initialize the variables""" 63 | self.logger.info("Initializing tf session") 64 | config = tf.ConfigProto() 65 | config.gpu_options.allow_growth = True 66 | self.sess = tf.Session(config=config) 67 | self.sess.run(tf.global_variables_initializer()) 68 | self.saver = tf.train.Saver() 69 | 70 | 71 | def restore_session(self, dir_model): 72 | """Reload weights into session 73 | 74 | Args: 75 | sess: tf.Session() 76 | dir_model: dir with weights 77 | 78 | """ 79 | self.logger.info("Reloading the latest trained model...") 80 | self.saver.restore(self.sess, dir_model) 81 | 82 | 83 | def save_session(self): 84 | """Saves session = weights""" 85 | if not os.path.exists(self.config.dir_model): 86 | os.makedirs(self.config.dir_model) 87 | self.saver.save(self.sess, self.config.dir_model) 88 | 89 | 90 | def close_session(self): 91 | """Closes the session""" 92 | self.sess.close() 93 | 94 | 95 | def add_summary(self): 96 | """Defines variables for Tensorboard 97 | 98 | Args: 99 | dir_output: (string) where the results are written 100 | 101 | """ 102 | self.merged = tf.summary.merge_all() 103 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 104 | self.sess.graph) 105 | 106 | 107 | def train(self, train, dev, data_aug=None): 108 | """Performs training with early stopping and lr exponential decay 109 | 110 | Args: 111 | train: dataset that yields tuple of (sentences, tags) 112 | dev: dataset 113 | 114 | """ 115 | best_score = 0 116 | nepoch_no_imprv = 0 # for early stopping 117 | self.add_summary() # tensorboard 118 | 119 | for epoch in range(self.config.nepochs): 120 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 121 | self.config.nepochs)) 122 | 123 | score = self.run_epoch(train, dev, data_aug=data_aug) 124 | self.config.lr *= self.config.lr_decay # decay learning rate 125 | 126 | # early stopping and saving best parameters 127 | if score >= best_score: 128 | nepoch_no_imprv = 0 129 | self.save_session() 130 | best_score = score 131 | self.logger.info("- new best score!") 132 | else: 133 | nepoch_no_imprv += 1 134 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 135 | self.logger.info("- early stopping {} epochs without "\ 136 | "improvement".format(nepoch_no_imprv)) 137 | break 138 | 139 | return best_score 140 | 141 | 142 | def evaluate(self, test): 143 | """Evaluate model on test set 144 | 145 | Args: 146 | test: instance of class Dataset 147 | 148 | """ 149 | self.logger.info("Testing model over test set") 150 | metrics = self.run_evaluate(test, report=True) 151 | msg = " - ".join(["{} {:04.4f}".format(k, v) 152 | if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: 
{:04.4f}'.format(a, b) \ 153 | for a, b in v.items()])) for k, v in metrics.items()]) 154 | self.logger.info(msg) 155 | 156 | return metrics 157 | -------------------------------------------------------------------------------- /lstm_model/src/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | 22 | def reinitialize_weights(self, scope_name): 23 | """Reinitializes the weights of a given layer""" 24 | variables = tf.contrib.framework.get_variables(scope_name) 25 | init = tf.variables_initializer(variables) 26 | self.sess.run(init) 27 | 28 | 29 | def add_train_op(self, lr_method, lr, loss, clip=-1): 30 | """Defines self.train_op that performs an update on a batch 31 | 32 | Args: 33 | lr_method: (string) sgd method, for example "adam" 34 | lr: (tf.placeholder) tf.float32, learning rate 35 | loss: (tensor) tf.float32 loss to minimize 36 | clip: (python float) clipping of gradient. If < 0, no clipping 37 | 38 | """ 39 | _lr_m = lr_method.lower() # lower to make sure 40 | 41 | with tf.variable_scope("train_step"): 42 | if _lr_m == 'adam': # sgd method 43 | optimizer = tf.train.AdamOptimizer(lr) 44 | elif _lr_m == 'adagrad': 45 | optimizer = tf.train.AdagradOptimizer(lr) 46 | elif _lr_m == 'sgd': 47 | optimizer = tf.train.GradientDescentOptimizer(lr) 48 | elif _lr_m == 'rmsprop': 49 | optimizer = tf.train.RMSPropOptimizer(lr) 50 | else: 51 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 52 | 53 | if clip > 0: # gradient clipping if clip is positive 54 | grads, vs = zip(*optimizer.compute_gradients(loss)) 55 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 56 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 57 | else: 58 | self.train_op = optimizer.minimize(loss) 59 | 60 | 61 | def initialize_session(self): 62 | """Defines self.sess and initialize the variables""" 63 | self.logger.info("Initializing tf session") 64 | config = tf.ConfigProto() 65 | config.gpu_options.allow_growth = True 66 | self.sess = tf.Session(config=config) 67 | self.sess.run(tf.global_variables_initializer()) 68 | self.saver = tf.train.Saver() 69 | 70 | 71 | def restore_session(self, dir_model): 72 | """Reload weights into session 73 | 74 | Args: 75 | sess: tf.Session() 76 | dir_model: dir with weights 77 | 78 | """ 79 | self.logger.info("Reloading the latest trained model...") 80 | self.saver.restore(self.sess, dir_model) 81 | 82 | 83 | def save_session(self): 84 | """Saves session = weights""" 85 | if not os.path.exists(self.config.dir_model): 86 | os.makedirs(self.config.dir_model) 87 | self.saver.save(self.sess, self.config.dir_model) 88 | 89 | 90 | def close_session(self): 91 | """Closes the session""" 92 | self.sess.close() 93 | 94 | 95 | def add_summary(self): 96 | """Defines variables for Tensorboard 97 | 98 | Args: 99 | dir_output: (string) where the results are written 100 | 101 | """ 102 | self.merged = tf.summary.merge_all() 103 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 104 | self.sess.graph) 105 | 106 | 107 | def train(self, train, dev, data_aug=None): 108 | 
"""Performs training with early stopping and lr exponential decay 109 | 110 | Args: 111 | train: dataset that yields tuple of (sentences, tags) 112 | dev: dataset 113 | 114 | """ 115 | best_score = 0 116 | nepoch_no_imprv = 0 # for early stopping 117 | self.add_summary() # tensorboard 118 | 119 | for epoch in range(self.config.nepochs): 120 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 121 | self.config.nepochs)) 122 | 123 | score = self.run_epoch(train, dev, data_aug=data_aug) 124 | self.config.lr *= self.config.lr_decay # decay learning rate 125 | 126 | # early stopping and saving best parameters 127 | if score >= best_score: 128 | nepoch_no_imprv = 0 129 | self.save_session() 130 | best_score = score 131 | self.logger.info("- new best score!") 132 | else: 133 | nepoch_no_imprv += 1 134 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 135 | self.logger.info("- early stopping {} epochs without "\ 136 | "improvement".format(nepoch_no_imprv)) 137 | break 138 | 139 | return best_score 140 | 141 | 142 | def evaluate(self, test): 143 | """Evaluate model on test set 144 | 145 | Args: 146 | test: instance of class Dataset 147 | 148 | """ 149 | self.logger.info("Testing model over test set") 150 | metrics = self.run_evaluate(test, report=True) 151 | msg = " - ".join(["{} {:04.4f}".format(k, v) 152 | if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: {:04.4f}'.format(a, b) \ 153 | for a, b in v.items()])) for k, v in metrics.items()]) 154 | self.logger.info(msg) 155 | 156 | return metrics 157 | -------------------------------------------------------------------------------- /BERT/bert_model.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel 2 | import torch 3 | from torch import nn 4 | import torch.nn.functional as F 5 | 6 | from crf import ChainCRF 7 | import utils 8 | 9 | 10 | class BertForSequentialClassification(BertPreTrainedModel): 11 | """BERT model for classification. 12 | This module is composed of the BERT model with a linear layer on top of 13 | the pooled output. 14 | Params: 15 | `config`: a BertConfig class instance with the configuration to build a new model. 16 | `num_labels`: the number of classes for the classifier. Default = 2. 17 | Inputs: 18 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 19 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 20 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 21 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 22 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 23 | a `sentence B` token (see BERT paper for more details). 24 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 25 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 26 | input sequence length in the current batch. It's the mask that we typically use for attention when 27 | a batch has varying length sentences. 28 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 29 | with indices selected in [0, ..., num_labels]. 30 | Outputs: 31 | if `labels` is not `None`: 32 | Outputs the CrossEntropy classification loss of the output with the labels. 
33 | if `labels` is `None`: 34 | Outputs the classification logits of shape [batch_size, num_labels]. 35 | Example usage: 36 | ```python 37 | # Already been converted into WordPiece token ids 38 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 39 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 40 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 41 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 42 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 43 | num_labels = 2 44 | model = BertForSequenceClassification(config, num_labels) 45 | logits = model(input_ids, token_type_ids, input_mask) 46 | ``` 47 | """ 48 | def __init__(self, config, num_labels, tag_space=0, rnn_mode='LSTM', use_crf=False, 49 | rnn_hidden_size=None, dropout=None): 50 | super(BertForSequentialClassification, self).__init__(config) 51 | self.num_labels = num_labels 52 | self.bert = BertModel(config) 53 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 54 | if dropout is None: 55 | dropout = config.hidden_dropout_prob 56 | self.dropout_other = nn.Dropout(dropout) 57 | self.use_crf = use_crf 58 | 59 | if rnn_mode == 'RNN': 60 | RNN = nn.RNN 61 | elif rnn_mode == 'LSTM': 62 | RNN = nn.LSTM 63 | elif rnn_mode == 'GRU': 64 | RNN = nn.GRU 65 | 66 | if rnn_hidden_size is not None: 67 | self.rnn = RNN(config.hidden_size, rnn_hidden_size, num_layers=1, batch_first=True, bidirectional=True) 68 | out_dim = rnn_hidden_size * 2 69 | else: 70 | self.rnn = None 71 | out_dim = config.hidden_size 72 | 73 | if tag_space: 74 | self.dense = nn.Linear(out_dim, tag_space) 75 | out_dim = tag_space 76 | else: 77 | self.dense = None 78 | 79 | if use_crf: 80 | self.crf = ChainCRF(out_dim, num_labels, bigram=True) 81 | else: 82 | self.dense_softmax = nn.Linear(out_dim, num_labels) 83 | 84 | self.apply(self.init_bert_weights) 85 | 86 | def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, document_mask=None, labels=None, 87 | input_embeddings=None): 88 | _, output, embeddings = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, 89 | input_embeddings=input_embeddings) 90 | output = self.dropout(output) 91 | 92 | # sentence level transform to document level 93 | length = document_mask.sum(dim=1).long() 94 | max_len = length.max() 95 | output = output.view(-1, max_len, self.config.hidden_size) 96 | 97 | # document level RNN processing 98 | if self.rnn is not None: 99 | output, hx, rev_order, mask = utils.prepare_rnn_seq(output, length, hx=None, 100 | masks=document_mask, batch_first=True) 101 | output, hn = self.rnn(output, hx=hx) 102 | output, hn = utils.recover_rnn_seq(output, rev_order, hx=hn, batch_first=True) 103 | 104 | # apply dropout for the output of rnn 105 | output = self.dropout_other(output) 106 | if self.dense is not None: 107 | # [batch, length, tag_space] 108 | output = self.dropout_other(F.elu(self.dense(output))) 109 | 110 | # final output layer 111 | if not self.use_crf: 112 | # not use crf 113 | output = self.dense_softmax(output) # [batch, length, num_labels] 114 | if labels is None: 115 | _, preds = torch.max(output, dim=2) 116 | return preds, None, embeddings 117 | else: 118 | return (F.cross_entropy(output.view(-1, output.size(-1)), labels.view(-1), reduction='none') * 119 | document_mask.view(-1)).sum() / document_mask.sum(), None, embeddings 120 | else: 121 | # CRF processing 122 | if labels is not None: 123 | loss, logits = self.crf.loss(output, labels, mask=document_mask) 124 | return 
loss.mean(), logits, embeddings 125 | else: 126 | seq_pred, logits = self.crf.decode(output, mask=document_mask, leading_symbolic=0) 127 | return seq_pred, logits, embeddings -------------------------------------------------------------------------------- /BERT/crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn.parameter import Parameter 5 | 6 | 7 | def logdet(x): 8 | """ 9 | Args: 10 | x: 2D positive semidefinite matrix. 11 | Returns: log determinant of x 12 | """ 13 | # TODO for pytorch 2.0.4, use inside potrf for variable. 14 | print(torch.log(torch.eig(x.data)[0])) 15 | print(x) 16 | u_chol = x.potrf() 17 | return torch.sum(torch.log(u_chol.diag())) * 2 18 | 19 | 20 | def logsumexp(x, dim=None): 21 | """ 22 | Args: 23 | x: A pytorch tensor (any dimension will do) 24 | dim: int or None, over which to perform the summation. `None`, the 25 | default, performs over all axes. 26 | Returns: The result of the log(sum(exp(...))) operation. 27 | """ 28 | if dim is None: 29 | xmax = x.max() 30 | xmax_ = x.max() 31 | return xmax_ + torch.log(torch.exp(x - xmax).sum()) 32 | else: 33 | xmax, _ = x.max(dim, keepdim=True) 34 | xmax_, _ = x.max(dim) 35 | return xmax_ + torch.log(torch.exp(x - xmax).sum(dim)) 36 | 37 | class ChainCRF(nn.Module): 38 | def __init__(self, input_size, num_labels, bigram=True, **kwargs): 39 | ''' 40 | Args: 41 | input_size: int 42 | the dimension of the input. 43 | num_labels: int 44 | the number of labels of the crf layer 45 | bigram: bool 46 | if apply bi-gram parameter. 47 | **kwargs: 48 | ''' 49 | super(ChainCRF, self).__init__() 50 | self.input_size = input_size 51 | self.num_labels = num_labels + 1 52 | self.pad_label_id = num_labels 53 | self.bigram = bigram 54 | 55 | 56 | # state weight tensor 57 | self.state_nn = nn.Linear(input_size, self.num_labels) 58 | if bigram: 59 | # transition weight tensor 60 | self.trans_nn = nn.Linear(input_size, self.num_labels * self.num_labels) 61 | self.register_parameter('trans_matrix', None) 62 | else: 63 | self.trans_nn = None 64 | self.trans_matrix = Parameter(torch.Tensor(self.num_labels, self.num_labels)) 65 | 66 | self.reset_parameters() 67 | 68 | def reset_parameters(self): 69 | nn.init.constant(self.state_nn.bias, 0.) 70 | if self.bigram: 71 | nn.init.xavier_uniform(self.trans_nn.weight) 72 | nn.init.constant(self.trans_nn.bias, 0.) 
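        # without the bigram option there is no input-dependent transition layer,
        # so the single shared transition matrix is initialized directly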
73 | else: 74 | nn.init.normal(self.trans_matrix) 75 | # if not self.bigram: 76 | # nn.init.normal(self.trans_matrix) 77 | 78 | def forward(self, input, mask=None): 79 | ''' 80 | Args: 81 | input: Tensor 82 | the input tensor with shape = [batch, length, input_size] 83 | mask: Tensor or None 84 | the mask tensor with shape = [batch, length] 85 | Returns: Tensor 86 | the energy tensor with shape = [batch, length, num_label, num_label] 87 | ''' 88 | batch, length, _ = input.size() 89 | 90 | # compute out_s by tensor dot [batch, length, input_size] * [input_size, num_label] 91 | # thus out_s should be [batch, length, num_label] --> [batch, length, num_label, 1] 92 | logits = self.state_nn(input) 93 | out_s = logits.unsqueeze(2) 94 | 95 | if self.bigram: 96 | # compute out_s by tensor dot: [batch, length, input_size] * [input_size, num_label * num_label] 97 | # the output should be [batch, length, num_label, num_label] 98 | out_t = self.trans_nn(input).view(batch, length, self.num_labels, self.num_labels) 99 | output = out_t + out_s 100 | else: 101 | # [batch, length, num_label, num_label] 102 | output = self.trans_matrix + out_s 103 | 104 | if mask is not None: 105 | output = output * mask.unsqueeze(2).unsqueeze(3) 106 | 107 | return output, logits 108 | 109 | def loss(self, input, target, mask=None): 110 | ''' 111 | Args: 112 | input: Tensor 113 | the input tensor with shape = [batch, length, input_size] 114 | target: Tensor 115 | the tensor of target labels with shape [batch, length] 116 | mask:Tensor or None 117 | the mask tensor with shape = [batch, length] 118 | Returns: Tensor 119 | A 1D tensor for minus log likelihood loss 120 | ''' 121 | batch, length, _ = input.size() 122 | energy, logits = self.forward(input, mask=mask) 123 | # shape = [length, batch, num_label, num_label] 124 | energy_transpose = energy.transpose(0, 1) 125 | # shape = [length, batch] 126 | target_transpose = target.transpose(0, 1) 127 | # shape = [length, batch, 1] 128 | mask_transpose = None 129 | if mask is not None: 130 | mask_transpose = mask.unsqueeze(2).transpose(0, 1) 131 | 132 | 133 | # shape = [batch, num_label] 134 | partition = None 135 | 136 | if input.is_cuda: 137 | # shape = [batch] 138 | batch_index = torch.arange(0, batch).long().cuda() 139 | prev_label = torch.cuda.LongTensor(batch).fill_(self.num_labels - 1) 140 | tgt_energy = Variable(torch.zeros(batch)).cuda() 141 | else: 142 | # shape = [batch] 143 | batch_index = torch.arange(0, batch).long() 144 | prev_label = torch.LongTensor(batch).fill_(self.num_labels - 1) 145 | tgt_energy = Variable(torch.zeros(batch)) 146 | 147 | for t in range(length): 148 | # shape = [batch, num_label, num_label] 149 | curr_energy = energy_transpose[t] 150 | if t == 0: 151 | partition = curr_energy[:, -1, :] 152 | else: 153 | # shape = [batch, num_label] 154 | partition_new = logsumexp(curr_energy + partition.unsqueeze(2), dim=1) 155 | if mask_transpose is None: 156 | partition = partition_new 157 | else: 158 | mask_t = mask_transpose[t] 159 | partition = partition + (partition_new - partition) * mask_t 160 | tgt_energy += curr_energy[batch_index, prev_label, target_transpose[t].data] 161 | prev_label = target_transpose[t].data 162 | 163 | return logsumexp(partition, dim=1) - tgt_energy, logits 164 | 165 | def decode(self, input, mask=None, leading_symbolic=0): 166 | """ 167 | Args: 168 | input: Tensor 169 | the input tensor with shape = [batch, length, input_size] 170 | mask: Tensor or None 171 | the mask tensor with shape = [batch, length] 172 | leading_symbolic: nt 
173 | number of symbolic labels leading in type alphabets (set it to 0 if you are not sure) 174 | Returns: Tensor 175 | decoding results in shape [batch, length] 176 | """ 177 | 178 | energy, logits = self.forward(input, mask=mask) 179 | energy = energy.data 180 | 181 | # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels) 182 | # For convenience, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels) 183 | energy_transpose = energy.transpose(0, 1) 184 | 185 | # the last row and column is the tag for pad symbol. reduce these two dimensions by 1 to remove that. 186 | # also remove the first #symbolic rows and columns. 187 | # now the shape of energies_shuffled is [n_time_steps, b_batch, t, t] where t = num_labels - #symbolic - 1. 188 | energy_transpose = energy_transpose[:, :, leading_symbolic:-1, leading_symbolic:-1] 189 | 190 | length, batch_size, num_label, _ = energy_transpose.size() 191 | 192 | if input.is_cuda: 193 | batch_index = torch.arange(0, batch_size).long().cuda() 194 | pi = torch.zeros([length, batch_size, num_label]).cuda() 195 | pointer = torch.cuda.LongTensor(length, batch_size, num_label).zero_() 196 | back_pointer = torch.cuda.LongTensor(length, batch_size).zero_() 197 | else: 198 | batch_index = torch.arange(0, batch_size).long() 199 | pi = torch.zeros([length, batch_size, num_label, 1]) 200 | pointer = torch.LongTensor(length, batch_size, num_label).zero_() 201 | back_pointer = torch.LongTensor(length, batch_size).zero_() 202 | 203 | pi[0] = energy[:, 0, -1, leading_symbolic:-1] 204 | pointer[0] = -1 205 | for t in range(1, length): 206 | pi_prev = pi[t - 1] 207 | pi[t], pointer[t] = torch.max(energy_transpose[t] + pi_prev.unsqueeze(2), dim=1) 208 | 209 | _, back_pointer[-1] = torch.max(pi[-1], dim=1) 210 | for t in reversed(range(length - 1)): 211 | pointer_last = pointer[t + 1] 212 | back_pointer[t] = pointer_last[batch_index, back_pointer[t + 1]] 213 | 214 | return back_pointer.transpose(0, 1) + leading_symbolic, logits -------------------------------------------------------------------------------- /lstm_model/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .general_utils import get_logger 4 | from .data_utils import get_trimmed_wordvec_vectors, load_vocab, \ 5 | get_processing_word 6 | 7 | 8 | def Config(load=True): 9 | """Initialize hyperparameters and load vocabs 10 | 11 | Args: 12 | load_embeddings: (bool) if True, load embeddings into 13 | np array, else None 14 | 15 | """ 16 | def load_(args): 17 | """Loads vocabulary, processing functions and embeddings 18 | 19 | Supposes that build_data.py has been run successfully and that 20 | the corresponding files have been created (vocab and trimmed GloVe 21 | vectors) 22 | 23 | """ 24 | # 1. vocabulary 25 | args.vocab_words, args.vocab_words_freq = load_vocab(args.filename_words) 26 | args.vocab_tags = load_vocab(args.filename_tags) 27 | # args.vocab_chars = load_vocab(args.filename_chars) 28 | 29 | args.nwords = len(args.vocab_words) 30 | # args.nchars = len(args.vocab_chars) 31 | args.ntags = len(args.vocab_tags) 32 | 33 | # 2. get processing functions that map str -> id 34 | # args.use_chars = args.use_lstm_chars | args.use_cnn_chars 35 | args.processing_word = get_processing_word(args.vocab_words, lowercase=True, chars=False) 36 | args.processing_tag = get_processing_word(args.vocab_tags, 37 | lowercase=False, allow_unk=False) 38 | 39 | # 3. 
get pre-trained embeddings 40 | args.embeddings = (get_trimmed_wordvec_vectors(args.filename_wordvec_trimmed, args.vocab_words) 41 | if args.use_pretrained else None) 42 | args.dim_word = args.embeddings.shape[1] 43 | 44 | return args 45 | 46 | ## parse args 47 | parser = argparse.ArgumentParser() 48 | # training parameters 49 | parser.add_argument('--nepochs', default='100', type=int, 50 | help='number of epochs') 51 | parser.add_argument('--dropout', default='0.8', type=float, 52 | help='number of epochs') 53 | parser.add_argument('--batch_size', default='30', type=int, 54 | help='batch size') 55 | parser.add_argument('--batch_size_aug', default='30', type=int, 56 | help='batch size for data augmentation') 57 | parser.add_argument('--lr', default='0.001', type=float, 58 | help='learning rate') 59 | parser.add_argument('--lr_method', default='adam', type=str, 60 | help='optimization method') 61 | parser.add_argument('--lr_decay', default='0.99', type=float, 62 | help='learning rate decay rate') 63 | parser.add_argument('--clip', default='2', type=float, 64 | help='gradient clipping') 65 | parser.add_argument('--nepoch_no_imprv', default='4', type=int, 66 | help='number of epoch patience') 67 | parser.add_argument('--l2_reg_lambda', default='1e-6', type=float, 68 | help='l2 regularization coefficient') 69 | 70 | # data and results paths 71 | parser.add_argument('--dir_output', default='test', type=str, 72 | help='directory for output') 73 | parser.add_argument('--data_keyname', default='nicta', type=str, 74 | help='directory for output') 75 | parser.add_argument('--filename_wordvec_trimmed', default='../data/word2vec_pubmed.trimmed.txt', 76 | type=str, help='directory for trimmed word embeddings file') 77 | parser.add_argument('--filename_wordvec', default='/data/medg/misc/jindi/nlp/embeddings/word2vec/wikipedia-pubmed-and-PMC-w2v.txt', 78 | type=str, help='directory for original word embeddings file') 79 | 80 | # model hyperparameters 81 | parser.add_argument('--hidden_size_char', default='50', type=int, 82 | help='hidden size of character level lstm') 83 | parser.add_argument('--hidden_size_lstm_sentence', default='100', type=int, 84 | help='hidden size of sentence level lstm') 85 | parser.add_argument('--hidden_size_lstm_document', default='100', type=int, 86 | help='hidden size of document level lstm') 87 | parser.add_argument('--attention_size', default='400', type=int, 88 | help='attention vector size') 89 | parser.add_argument('--cnn_filter_num', default='300', type=int, 90 | help='number of cnn filters for each window size') 91 | parser.add_argument('--dim_char', default='50', type=int, 92 | help='character embedding dimension') 93 | parser.add_argument('--cnn_filter_sizes', default='2,3,4', type=str, 94 | help='cnn filter window sizes') 95 | parser.add_argument('--cnn_char_windows', default='3', type=str, 96 | help='cnn filter window sizes') 97 | parser.add_argument('--adv_reg_coeff', default='0.2', type=float, 98 | help='Regularization coefficient of adversarial loss') 99 | parser.add_argument('--va_reg_coeff', default='0.05', type=float, 100 | help='Regularization coefficient of virtual adversarial loss') 101 | parser.add_argument('--adv_perturb_norm_length', default='8.0', type=float, 102 | help='Norm length of adversarial perturbation to be') 103 | parser.add_argument('--va_perturb_norm_length', default='4.0', type=float, 104 | help='Norm length of virtual adversarial perturbation to be') 105 | parser.add_argument('--embedding_dropout', default='0.8', type=float, 106 | 
help='Keep dropout for embeddings') 107 | parser.add_argument('--embedding_normalize', action='store_false', 108 | help='Whether normalize the embeddings') 109 | 110 | # misc 111 | parser.add_argument('--restore', action='store_true', 112 | help='whether restore from previous trained model') 113 | parser.add_argument('--use_crf', action='store_false', 114 | help='whether use crf optimization layer') 115 | parser.add_argument('--use_document_level', action='store_false', 116 | help='whether use document level lstm layer') 117 | parser.add_argument('--use_document_attention', action='store_true', 118 | help='whether use document level attention') 119 | parser.add_argument('--use_attention', action='store_false', 120 | help='whether use attention based pooling') 121 | parser.add_argument('--use_cnn', action='store_false', 122 | help='whether use cnn or lstm for sentence representation') 123 | parser.add_argument('--train_embeddings', action='store_true', 124 | help='whether use cnn or lstm for sentence representation') 125 | parser.add_argument('--use_pretrained', action='store_false', 126 | help='whether use pre-trained word embeddings') 127 | parser.add_argument('--train_accuracy', action='store_true', 128 | help='whether report accuracy while training') 129 | parser.add_argument('--min_freq', default='20', type=int, 130 | help='remove tokens with small frequency for vocab') 131 | parser.add_argument('--num_augmentation', default='0', type=int, 132 | help='Number of abstracts for data augmentation for VADV') 133 | 134 | args = parser.parse_args() 135 | 136 | # args.filename_wordvec = os.path.join('/data/medg/misc/jindi/nlp/embeddings', 137 | # args.filename_wordvec) 138 | args.dir_output = os.path.join('results', args.dir_output) 139 | if not os.path.exists(args.dir_output): 140 | os.makedirs(args.dir_output) 141 | args.dir_model = os.path.join(args.dir_output, "model.weights") 142 | args.path_log = os.path.join(args.dir_output, "log.txt") 143 | 144 | # dataset 145 | if args.data_keyname == 'PICO': 146 | args.data_root = '../data/pico' 147 | args.filename_dev = os.path.join(args.data_root, 'dev.txt') 148 | args.filename_test = os.path.join(args.data_root, 'test.txt') 149 | args.filename_train = os.path.join(args.data_root, 'train.txt') 150 | elif args.data_keyname == 'nicta': 151 | args.data_root = '../data/nicta_piboso' 152 | args.filename_dev = os.path.join(args.data_root, 'test_clean.txt') 153 | args.filename_test = os.path.join(args.data_root, 'test_clean.txt') 154 | args.filename_train = os.path.join(args.data_root, 'train_clean.txt') 155 | 156 | # data augmentation dataset 157 | args.filename_aug = '../data/unlabeled_corpus' 158 | 159 | # vocab (created from dataset with build_data.py) 160 | args.filename_words = os.path.join('data', args.data_keyname, 'words.txt') 161 | args.filename_tags = os.path.join('data', args.data_keyname, 'tags.txt') 162 | # args.filename_chars = os.path.join('data', args.data_keyname, 'chars.txt') 163 | 164 | args.cnn_filter_sizes = [int(i) for i in args.cnn_filter_sizes.split(',')] 165 | args.cnn_char_windows = [int(i) for i in args.cnn_char_windows.split(',')] 166 | 167 | # directory for training outputs 168 | if not os.path.exists(os.path.join('data', args.data_keyname)): 169 | os.makedirs(os.path.join('data', args.data_keyname)) 170 | 171 | # directory for data output 172 | if not os.path.exists(args.dir_output): 173 | os.makedirs(args.dir_output) 174 | 175 | # create instance of logger 176 | args.logger = get_logger(args.path_log) 177 | 178 | # log the 
attributes 179 | msg = ', '.join(['{}: {}'.format(attr, getattr(args, attr)) for attr in dir(args) \ 180 | if not callable(getattr(args, attr)) and not attr.startswith("__")]) 181 | args.logger.info(msg) 182 | 183 | # load if requested (default) 184 | if load: 185 | args = load_(args) 186 | 187 | return args 188 | -------------------------------------------------------------------------------- /lstm_model/src/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .general_utils import get_logger 4 | from .data_utils import get_trimmed_wordvec_vectors, load_vocab, \ 5 | get_processing_word 6 | 7 | 8 | def Config(load=True): 9 | """Initialize hyperparameters and load vocabs 10 | 11 | Args: 12 | load_embeddings: (bool) if True, load embeddings into 13 | np array, else None 14 | 15 | """ 16 | def load_(args): 17 | """Loads vocabulary, processing functions and embeddings 18 | 19 | Supposes that build_data.py has been run successfully and that 20 | the corresponding files have been created (vocab and trimmed GloVe 21 | vectors) 22 | 23 | """ 24 | # 1. vocabulary 25 | args.vocab_words, args.vocab_words_freq = load_vocab(args.filename_words) 26 | args.vocab_tags = load_vocab(args.filename_tags) 27 | # args.vocab_chars = load_vocab(args.filename_chars) 28 | 29 | args.nwords = len(args.vocab_words) 30 | # args.nchars = len(args.vocab_chars) 31 | args.ntags = len(args.vocab_tags) 32 | 33 | # 2. get processing functions that map str -> id 34 | # args.use_chars = args.use_lstm_chars | args.use_cnn_chars 35 | args.processing_word = get_processing_word(args.vocab_words, lowercase=True, chars=False) 36 | args.processing_tag = get_processing_word(args.vocab_tags, 37 | lowercase=False, allow_unk=False) 38 | 39 | # 3. 
get pre-trained embeddings 40 | args.embeddings = (get_trimmed_wordvec_vectors(args.filename_wordvec_trimmed, args.vocab_words) 41 | if args.use_pretrained else None) 42 | args.dim_word = args.embeddings.shape[1] 43 | 44 | return args 45 | 46 | ## parse args 47 | parser = argparse.ArgumentParser() 48 | # training parameters 49 | parser.add_argument('--nepochs', default='100', type=int, 50 | help='number of epochs') 51 | parser.add_argument('--dropout', default='0.8', type=float, 52 | help='number of epochs') 53 | parser.add_argument('--batch_size', default='30', type=int, 54 | help='batch size') 55 | parser.add_argument('--batch_size_aug', default='30', type=int, 56 | help='batch size for data augmentation') 57 | parser.add_argument('--lr', default='0.001', type=float, 58 | help='learning rate') 59 | parser.add_argument('--lr_method', default='adam', type=str, 60 | help='optimization method') 61 | parser.add_argument('--lr_decay', default='0.99', type=float, 62 | help='learning rate decay rate') 63 | parser.add_argument('--clip', default='2', type=float, 64 | help='gradient clipping') 65 | parser.add_argument('--nepoch_no_imprv', default='4', type=int, 66 | help='number of epoch patience') 67 | parser.add_argument('--l2_reg_lambda', default='1e-6', type=float, 68 | help='l2 regularization coefficient') 69 | 70 | # data and results paths 71 | parser.add_argument('--dir_output', default='test', type=str, 72 | help='directory for output') 73 | parser.add_argument('--data_keyname', required=True, type=str, 74 | help='directory for output') 75 | parser.add_argument('--filename_wordvec_trimmed', default='', 76 | type=str, help='directory for trimmed word embeddings file') 77 | parser.add_argument('--filename_wordvec', default='/data/medg/misc/jindi/nlp/embeddings/word2vec/wikipedia-pubmed-and-PMC-w2v.txt', 78 | type=str, help='directory for original word embeddings file') 79 | 80 | # model hyperparameters 81 | parser.add_argument('--hidden_size_char', default='50', type=int, 82 | help='hidden size of character level lstm') 83 | parser.add_argument('--hidden_size_lstm_sentence', default='100', type=int, 84 | help='hidden size of sentence level lstm') 85 | parser.add_argument('--hidden_size_lstm_document', default='100', type=int, 86 | help='hidden size of document level lstm') 87 | parser.add_argument('--attention_size', default='400', type=int, 88 | help='attention vector size') 89 | parser.add_argument('--cnn_filter_num', default='300', type=int, 90 | help='number of cnn filters for each window size') 91 | parser.add_argument('--dim_char', default='50', type=int, 92 | help='character embedding dimension') 93 | parser.add_argument('--cnn_filter_sizes', default='2,3,4', type=str, 94 | help='cnn filter window sizes') 95 | parser.add_argument('--cnn_char_windows', default='3', type=str, 96 | help='cnn filter window sizes') 97 | parser.add_argument('--adv_reg_coeff', default='0.2', type=float, 98 | help='Regularization coefficient of adversarial loss') 99 | parser.add_argument('--va_reg_coeff', default='0.05', type=float, 100 | help='Regularization coefficient of virtual adversarial loss') 101 | parser.add_argument('--adv_perturb_norm_length', default='8.0', type=float, 102 | help='Norm length of adversarial perturbation to be') 103 | parser.add_argument('--va_perturb_norm_length', default='4.0', type=float, 104 | help='Norm length of virtual adversarial perturbation to be') 105 | parser.add_argument('--embedding_dropout', default='0.8', type=float, 106 | help='Keep dropout for embeddings') 107 | 
parser.add_argument('--embedding_normalize', action='store_false', 108 | help='Whether normalize the embeddings') 109 | 110 | # misc 111 | parser.add_argument('--restore', action='store_true', 112 | help='whether restore from previous trained model') 113 | parser.add_argument('--use_crf', action='store_false', 114 | help='whether use crf optimization layer') 115 | parser.add_argument('--use_document_level', action='store_false', 116 | help='whether use document level lstm layer') 117 | parser.add_argument('--use_document_attention', action='store_true', 118 | help='whether use document level attention') 119 | parser.add_argument('--use_attention', action='store_false', 120 | help='whether use attention based pooling') 121 | parser.add_argument('--use_cnn', action='store_false', 122 | help='whether use cnn or lstm for sentence representation') 123 | parser.add_argument('--train_embeddings', action='store_true', 124 | help='whether use cnn or lstm for sentence representation') 125 | parser.add_argument('--use_pretrained', action='store_false', 126 | help='whether use pre-trained word embeddings') 127 | parser.add_argument('--train_accuracy', action='store_true', 128 | help='whether report accuracy while training') 129 | parser.add_argument('--min_freq', default='20', type=int, 130 | help='remove tokens with small frequency for vocab') 131 | parser.add_argument('--num_augmentation', default='0', type=int, 132 | help='Number of abstracts for data augmentation for VADV') 133 | 134 | args = parser.parse_args() 135 | 136 | # args.filename_wordvec = os.path.join('/data/medg/misc/jindi/nlp/embeddings', 137 | # args.filename_wordvec) 138 | args.dir_output = os.path.join('results', args.dir_output) 139 | if not os.path.exists(args.dir_output): 140 | os.makedirs(args.dir_output) 141 | args.dir_model = os.path.join(args.dir_output, "model.weights") 142 | args.path_log = os.path.join(args.dir_output, "log.txt") 143 | 144 | # dataset 145 | if args.data_keyname == 'pico': 146 | args.data_root = '../data/pico' 147 | args.filename_dev = os.path.join(args.data_root, 'dev.txt') 148 | args.filename_test = os.path.join(args.data_root, 'test.txt') 149 | args.filename_train = os.path.join(args.data_root, 'train.txt') 150 | elif args.data_keyname == 'nicta': 151 | args.data_root = '../data/nicta_piboso' 152 | args.filename_dev = os.path.join(args.data_root, 'test.txt') 153 | args.filename_test = os.path.join(args.data_root, 'test.txt') 154 | args.filename_train = os.path.join(args.data_root, 'train.txt') 155 | 156 | # data augmentation dataset 157 | args.filename_aug = '../data/unlabeled_corpus' 158 | 159 | # vocab (created from dataset with build_data.py) 160 | args.filename_words = os.path.join('data', args.data_keyname, 'words.txt') 161 | args.filename_tags = os.path.join('data', args.data_keyname, 'tags.txt') 162 | args.filename_wordvec_trimmed = os.path.join('data', args.data_keyname, 'word.embeddings.trimmed.txt') 163 | # args.filename_chars = os.path.join('data', args.data_keyname, 'chars.txt') 164 | 165 | args.cnn_filter_sizes = [int(i) for i in args.cnn_filter_sizes.split(',')] 166 | args.cnn_char_windows = [int(i) for i in args.cnn_char_windows.split(',')] 167 | 168 | # directory for training outputs 169 | if not os.path.exists(os.path.join('data', args.data_keyname)): 170 | os.makedirs(os.path.join('data', args.data_keyname)) 171 | 172 | # directory for data output 173 | if not os.path.exists(args.dir_output): 174 | os.makedirs(args.dir_output) 175 | 176 | # create instance of logger 177 | args.logger = 
get_logger(args.path_log) 178 | 179 | # log the attributes 180 | msg = ', '.join(['{}: {}'.format(attr, getattr(args, attr)) for attr in dir(args) \ 181 | if not callable(getattr(args, attr)) and not attr.startswith("__")]) 182 | args.logger.info(msg) 183 | 184 | # load if requested (default) 185 | if load: 186 | args = load_(args) 187 | 188 | return args 189 | -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import shutil 13 | import tempfile 14 | import fnmatch 15 | from functools import wraps 16 | from hashlib import sha256 17 | import sys 18 | from io import open 19 | 20 | import boto3 21 | import requests 22 | from botocore.exceptions import ClientError 23 | from tqdm import tqdm 24 | 25 | CONFIG_NAME = "config.json" 26 | WEIGHTS_NAME = "pytorch_model.bin" 27 | 28 | try: 29 | from torch.hub import _get_torch_home 30 | torch_cache_home = _get_torch_home() 31 | except ImportError: 32 | torch_cache_home = os.path.expanduser( 33 | os.getenv('TORCH_HOME', os.path.join( 34 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 35 | default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') 36 | 37 | try: 38 | from urllib.parse import urlparse 39 | except ImportError: 40 | from urlparse import urlparse 41 | 42 | try: 43 | from pathlib import Path 44 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 45 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) 46 | except (AttributeError, ImportError): 47 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 48 | default_cache_path) 49 | 50 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 51 | 52 | 53 | def url_to_filename(url, etag=None): 54 | """ 55 | Convert `url` into a hashed filename in a repeatable way. 56 | If `etag` is specified, append its hash to the url's, delimited 57 | by a period. 58 | """ 59 | url_bytes = url.encode('utf-8') 60 | url_hash = sha256(url_bytes) 61 | filename = url_hash.hexdigest() 62 | 63 | if etag: 64 | etag_bytes = etag.encode('utf-8') 65 | etag_hash = sha256(etag_bytes) 66 | filename += '.' + etag_hash.hexdigest() 67 | 68 | return filename 69 | 70 | 71 | def filename_to_url(filename, cache_dir=None): 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
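    The metadata is read from the ``<filename>.json`` file that
    ``get_from_cache`` writes next to each cached file.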
75 | """ 76 | if cache_dir is None: 77 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 78 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 79 | cache_dir = str(cache_dir) 80 | 81 | cache_path = os.path.join(cache_dir, filename) 82 | if not os.path.exists(cache_path): 83 | raise EnvironmentError("file {} not found".format(cache_path)) 84 | 85 | meta_path = cache_path + '.json' 86 | if not os.path.exists(meta_path): 87 | raise EnvironmentError("file {} not found".format(meta_path)) 88 | 89 | with open(meta_path, encoding="utf-8") as meta_file: 90 | metadata = json.load(meta_file) 91 | url = metadata['url'] 92 | etag = metadata['etag'] 93 | 94 | return url, etag 95 | 96 | 97 | def cached_path(url_or_filename, cache_dir=None): 98 | """ 99 | Given something that might be a URL (or might be a local path), 100 | determine which. If it's a URL, download the file and cache it, and 101 | return the path to the cached file. If it's already a local path, 102 | make sure the file exists and then return the path. 103 | """ 104 | if cache_dir is None: 105 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 106 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 107 | url_or_filename = str(url_or_filename) 108 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 109 | cache_dir = str(cache_dir) 110 | 111 | parsed = urlparse(url_or_filename) 112 | 113 | if parsed.scheme in ('http', 'https', 's3'): 114 | # URL, so get it from the cache (downloading if necessary) 115 | return get_from_cache(url_or_filename, cache_dir) 116 | elif os.path.exists(url_or_filename): 117 | # File, and it exists. 118 | return url_or_filename 119 | elif parsed.scheme == '': 120 | # File, but it doesn't exist. 121 | raise EnvironmentError("file {} not found".format(url_or_filename)) 122 | else: 123 | # Something unknown 124 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 125 | 126 | 127 | def split_s3_path(url): 128 | """Split a full s3 path into the bucket name and path.""" 129 | parsed = urlparse(url) 130 | if not parsed.netloc or not parsed.path: 131 | raise ValueError("bad s3 path {}".format(url)) 132 | bucket_name = parsed.netloc 133 | s3_path = parsed.path 134 | # Remove '/' at beginning of path. 135 | if s3_path.startswith("/"): 136 | s3_path = s3_path[1:] 137 | return bucket_name, s3_path 138 | 139 | 140 | def s3_request(func): 141 | """ 142 | Wrapper function for s3 requests in order to create more helpful error 143 | messages. 
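    A 404 ``ClientError`` is re-raised as an ``EnvironmentError`` ("file ... not
    found"); any other client error propagates unchanged.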
144 | """ 145 | 146 | @wraps(func) 147 | def wrapper(url, *args, **kwargs): 148 | try: 149 | return func(url, *args, **kwargs) 150 | except ClientError as exc: 151 | if int(exc.response["Error"]["Code"]) == 404: 152 | raise EnvironmentError("file {} not found".format(url)) 153 | else: 154 | raise 155 | 156 | return wrapper 157 | 158 | 159 | @s3_request 160 | def s3_etag(url): 161 | """Check ETag on S3 object.""" 162 | s3_resource = boto3.resource("s3") 163 | bucket_name, s3_path = split_s3_path(url) 164 | s3_object = s3_resource.Object(bucket_name, s3_path) 165 | return s3_object.e_tag 166 | 167 | 168 | @s3_request 169 | def s3_get(url, temp_file): 170 | """Pull a file directly from S3.""" 171 | s3_resource = boto3.resource("s3") 172 | bucket_name, s3_path = split_s3_path(url) 173 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 174 | 175 | 176 | def http_get(url, temp_file): 177 | req = requests.get(url, stream=True) 178 | content_length = req.headers.get('Content-Length') 179 | total = int(content_length) if content_length is not None else None 180 | progress = tqdm(unit="B", total=total) 181 | for chunk in req.iter_content(chunk_size=1024): 182 | if chunk: # filter out keep-alive new chunks 183 | progress.update(len(chunk)) 184 | temp_file.write(chunk) 185 | progress.close() 186 | 187 | 188 | def get_from_cache(url, cache_dir=None): 189 | """ 190 | Given a URL, look for the corresponding dataset in the local cache. 191 | If it's not there, download it. Then return the path to the cached file. 192 | """ 193 | if cache_dir is None: 194 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 195 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 196 | cache_dir = str(cache_dir) 197 | 198 | if not os.path.exists(cache_dir): 199 | os.makedirs(cache_dir) 200 | 201 | # Get eTag to add to filename, if it exists. 202 | if url.startswith("s3://"): 203 | etag = s3_etag(url) 204 | else: 205 | try: 206 | response = requests.head(url, allow_redirects=True) 207 | if response.status_code != 200: 208 | etag = None 209 | else: 210 | etag = response.headers.get("ETag") 211 | except EnvironmentError: 212 | etag = None 213 | 214 | if sys.version_info[0] == 2 and etag is not None: 215 | etag = etag.decode('utf-8') 216 | filename = url_to_filename(url, etag) 217 | 218 | # get cache path to put the file 219 | cache_path = os.path.join(cache_dir, filename) 220 | 221 | # If we don't have a connection (etag is None) and can't identify the file 222 | # try to get the last downloaded one 223 | if not os.path.exists(cache_path) and etag is None: 224 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 225 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 226 | if matching_files: 227 | cache_path = os.path.join(cache_dir, matching_files[-1]) 228 | 229 | if not os.path.exists(cache_path): 230 | # Download to temporary file, then copy to cache dir once finished. 231 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
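        # The steps below: stream the download into a NamedTemporaryFile, flush and
        # rewind it, copy it into the cache under its hashed filename, then write a
        # '<filename>.json' metadata file recording the original url and ETag.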
232 | with tempfile.NamedTemporaryFile() as temp_file: 233 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 234 | 235 | # GET file object 236 | if url.startswith("s3://"): 237 | s3_get(url, temp_file) 238 | else: 239 | http_get(url, temp_file) 240 | 241 | # we are copying the file before closing it, so flush to avoid truncation 242 | temp_file.flush() 243 | # shutil.copyfileobj() starts at the current position, so go to the start 244 | temp_file.seek(0) 245 | 246 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 247 | with open(cache_path, 'wb') as cache_file: 248 | shutil.copyfileobj(temp_file, cache_file) 249 | 250 | logger.info("creating metadata file for %s", cache_path) 251 | meta = {'url': url, 'etag': etag} 252 | meta_path = cache_path + '.json' 253 | with open(meta_path, 'w') as meta_file: 254 | output_string = json.dumps(meta) 255 | if sys.version_info[0] == 2 and isinstance(output_string, str): 256 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 257 | meta_file.write(output_string) 258 | 259 | logger.info("removing temp file %s", temp_file.name) 260 | 261 | return cache_path 262 | 263 | 264 | def read_set_from_file(filename): 265 | ''' 266 | Extract a de-duped collection (set) of text from a file. 267 | Expected file format is one item per line. 268 | ''' 269 | collection = set() 270 | with open(filename, 'r', encoding='utf-8') as file_: 271 | for line in file_: 272 | collection.add(line.rstrip()) 273 | return collection 274 | 275 | 276 | def get_file_extension(path, dot=True, lower=True): 277 | ext = os.path.splitext(path)[1] 278 | ext = ext if dot else ext[1:] 279 | return ext.lower() if lower else ext -------------------------------------------------------------------------------- /lstm_model/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Adversarial losses for text models.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | # Dependency imports 21 | 22 | from six.moves import xrange 23 | import tensorflow as tf 24 | 25 | # # Adversarial and virtual adversarial training parameters. 26 | # flags.DEFINE_float('perturb_norm_length', 5.0, 27 | # 'Norm length of adversarial perturbation to be ' 28 | # 'optimized with validation. ' 29 | # '5.0 is optimal on IMDB with virtual adversarial training. ') 30 | 31 | # Virtual adversarial training parameters 32 | num_power_iteration = 1 33 | small_constant_for_finite_diff = 1e-1 34 | 35 | # # Parameters for building the graph 36 | # flags.DEFINE_string('adv_training_method', None, 37 | # 'The flag which specifies training method. ' 38 | # '"" : non-adversarial training (e.g. 
for running the ' 39 | # ' semi-supervised sequence learning model) ' 40 | # '"rp" : random perturbation training ' 41 | # '"at" : adversarial training ' 42 | # '"vat" : virtual adversarial training ' 43 | # '"atvat" : at + vat ') 44 | # flags.DEFINE_float('adv_reg_coeff', 1.0, 45 | # 'Regularization coefficient of adversarial loss.') 46 | 47 | 48 | def random_perturbation_loss(embedded, length, loss_fn): 49 | """Adds noise to embeddings and recomputes classification loss.""" 50 | noise = tf.random_normal(shape=tf.shape(embedded)) 51 | perturb = _scale_l2(_mask_by_length(noise, length), FLAGS.perturb_norm_length) 52 | return loss_fn(embedded + perturb) 53 | 54 | 55 | def adversarial_loss(embedded, loss, loss_fn, perturb_norm_length): 56 | """Adds gradient to embedding and recomputes classification loss.""" 57 | grad, = tf.gradients( 58 | loss, 59 | embedded, 60 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 61 | grad = tf.stop_gradient(grad) 62 | perturb = _scale_l2_dim4(grad, perturb_norm_length) 63 | return loss_fn(embedded + perturb) 64 | 65 | 66 | def virtual_adversarial_loss(logits, embedded, num_classes, sentence_length, document_length, 67 | logits_from_embedding_fn, perturb_norm_length): 68 | """Virtual adversarial loss. 69 | Computes virtual adversarial perturbation by finite difference method and 70 | power iteration, adds it to the embedding, and computes the KL divergence 71 | between the new logits and the original logits. 72 | Args: 73 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 74 | num_classes=2, otherwise m=num_classes. 75 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 76 | inputs: VatxtInput. 77 | logits_from_embedding_fn: callable that takes embeddings and returns 78 | classifier logits. 79 | Returns: 80 | kl: float scalar. 81 | """ 82 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 83 | logits = tf.stop_gradient(logits) 84 | 85 | # Only care about the KL divergence on the final timestep. 86 | # weights = inputs.eos_weights 87 | # assert weights is not None 88 | # if FLAGS.single_label: 89 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 90 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 91 | 92 | # Initialize perturbation with random noise. 93 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 94 | d = tf.random_normal(shape=tf.shape(embedded)) 95 | 96 | # Perform finite difference method and power iteration. 97 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 98 | # Adding small noise to input and taking gradient with respect to the noise 99 | # corresponds to 1 power iteration. 
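  # Sketch of the update performed by the loop below, writing e = embedded,
  # xi = small_constant_for_finite_diff and eps = perturb_norm_length:
  #   d      <- xi * d / ||d||_2                      (scale the masked noise)
  #   d      <- grad_d KL( p(.|e) || p(.|e + d) )     (one finite-difference power step)
  #   r_vadv <- eps * d / ||d||_2
  # and the returned loss is KL( p(.|e) || p(.|e + r_vadv) ).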
100 | for _ in xrange(num_power_iteration): 101 | d = _scale_l2_dim4( 102 | _mask_by_length_4D(d, sentence_length), small_constant_for_finite_diff) 103 | 104 | d_logits = logits_from_embedding_fn(embedded + d, sentence_length, document_length) 105 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 106 | d, = tf.gradients( 107 | kl, 108 | d, 109 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 110 | d = tf.stop_gradient(d) 111 | 112 | perturb = _scale_l2_dim4(d, perturb_norm_length) 113 | vadv_logits = logits_from_embedding_fn(embedded + perturb, sentence_length, document_length) 114 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 115 | 116 | 117 | def random_perturbation_loss_bidir(embedded, length, loss_fn): 118 | """Adds noise to embeddings and recomputes classification loss.""" 119 | noise = [tf.random_normal(shape=tf.shape(emb)) for emb in embedded] 120 | masked = [_mask_by_length(n, length) for n in noise] 121 | scaled = [_scale_l2(m, FLAGS.perturb_norm_length) for m in masked] 122 | return loss_fn([e + s for (e, s) in zip(embedded, scaled)]) 123 | 124 | 125 | def adversarial_loss_bidir(embedded, loss, loss_fn): 126 | """Adds gradient to embeddings and recomputes classification loss.""" 127 | grads = tf.gradients( 128 | loss, 129 | embedded, 130 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 131 | adv_exs = [ 132 | emb + _scale_l2(tf.stop_gradient(g), FLAGS.perturb_norm_length) 133 | for emb, g in zip(embedded, grads) 134 | ] 135 | return loss_fn(adv_exs) 136 | 137 | 138 | def virtual_adversarial_loss_bidir(logits, embedded, inputs, 139 | logits_from_embedding_fn): 140 | """Virtual adversarial loss for bidirectional models.""" 141 | logits = tf.stop_gradient(logits) 142 | f_inputs, _ = inputs 143 | weights = f_inputs.eos_weights 144 | if FLAGS.single_label: 145 | indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1) 146 | weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1) 147 | assert weights is not None 148 | 149 | perturbs = [ 150 | _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) 151 | for emb in embedded 152 | ] 153 | for _ in xrange(num_power_iteration): 154 | perturbs = [ 155 | _scale_l2(d, small_constant_for_finite_diff) for d in perturbs 156 | ] 157 | d_logits = logits_from_embedding_fn( 158 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 159 | kl = _kl_divergence_with_logits(logits, d_logits, weights) 160 | perturbs = tf.gradients( 161 | kl, 162 | perturbs, 163 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 164 | perturbs = [tf.stop_gradient(d) for d in perturbs] 165 | 166 | perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs] 167 | vadv_logits = logits_from_embedding_fn( 168 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 169 | return _kl_divergence_with_logits(logits, vadv_logits, weights) 170 | 171 | 172 | def _mask_by_length(t, length): 173 | """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,].""" 174 | maxlen = t.get_shape().as_list()[1] 175 | 176 | # Subtract 1 from length to prevent the perturbation from going on 'eos' 177 | mask = tf.sequence_mask(length - 1, maxlen=maxlen) 178 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 179 | # shape(mask) = (batch, num_timesteps, 1) 180 | return t * mask 181 | 182 | 183 | def _mask_by_length_4D(t, length): 184 | """Mask t, 4-D [batch, time1, time2, dim], by length, 2-D [batch, time1].""" 185 | maxlen = 
t.get_shape().as_list()[2] 186 | 187 | mask = tf.sequence_mask(length, maxlen=maxlen) 188 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 189 | # shape(mask) = (batch, num_timesteps, num_timesteps, 1) 190 | return t * mask 191 | 192 | 193 | def _scale_l2(x, norm_length): 194 | # shape(x) = (batch, num_timesteps, d) 195 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 196 | # 2norm(x) = a * 2norm(x/a) 197 | # Scale over the full sequence, dims (1, 2) 198 | alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12 199 | l2_norm = alpha * tf.sqrt( 200 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6) 201 | x_unit = x / l2_norm 202 | return norm_length * x_unit 203 | 204 | 205 | def _scale_l2_dim4(x, norm_length): 206 | # shape(x) = (batch, num_timesteps, num_timesteps, d) 207 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 208 | # 2norm(x) = a * 2norm(x/a) 209 | # Scale over the full sequence, dims (1, 2, 3) 210 | alpha = tf.reduce_max(tf.abs(x), (1, 2, 3), keepdims=True) + 1e-12 211 | l2_norm = alpha * tf.sqrt( 212 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2, 3), keepdims=True) + 1e-6) 213 | x_unit = x / l2_norm 214 | return norm_length * x_unit 215 | 216 | 217 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 218 | """Returns weighted KL divergence between distributions q and p. 219 | Args: 220 | q_logits: logits for 1st argument of KL divergence shape 221 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 222 | [batch_size, num_timesteps] if num_classes == 2. 223 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 224 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 225 | Elements should be 1.0 only on end of sequences 226 | Returns: 227 | KL: float scalar. 228 | """ 229 | # For logistic regression 230 | if num_classes == 2: 231 | q = tf.nn.sigmoid(q_logits) 232 | kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 234 | kl = tf.squeeze(kl, 2) 235 | 236 | # For softmax regression 237 | else: 238 | q = tf.nn.softmax(q_logits) 239 | kl = tf.reduce_sum( 240 | q * (tf.nn.log_softmax(q_logits) - tf.nn.log_softmax(p_logits)), -1) 241 | 242 | # num_labels = tf.reduce_sum(weights) 243 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 244 | 245 | kl.get_shape().assert_has_rank(2) 246 | # weights.get_shape().assert_has_rank(2) 247 | 248 | loss = tf.identity(tf.reduce_mean(kl), name='kl') 249 | return loss -------------------------------------------------------------------------------- /lstm_model/src/adversarial_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Adversarial losses for text models.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | # Dependency imports 21 | 22 | from six.moves import xrange 23 | import tensorflow as tf 24 | 25 | # # Adversarial and virtual adversarial training parameters. 26 | # flags.DEFINE_float('perturb_norm_length', 5.0, 27 | # 'Norm length of adversarial perturbation to be ' 28 | # 'optimized with validation. ' 29 | # '5.0 is optimal on IMDB with virtual adversarial training. ') 30 | 31 | # Virtual adversarial training parameters 32 | num_power_iteration = 1 33 | small_constant_for_finite_diff = 1e-1 34 | 35 | # # Parameters for building the graph 36 | # flags.DEFINE_string('adv_training_method', None, 37 | # 'The flag which specifies training method. ' 38 | # '"" : non-adversarial training (e.g. for running the ' 39 | # ' semi-supervised sequence learning model) ' 40 | # '"rp" : random perturbation training ' 41 | # '"at" : adversarial training ' 42 | # '"vat" : virtual adversarial training ' 43 | # '"atvat" : at + vat ') 44 | # flags.DEFINE_float('adv_reg_coeff', 1.0, 45 | # 'Regularization coefficient of adversarial loss.') 46 | 47 | 48 | def random_perturbation_loss(embedded, length, loss_fn): 49 | """Adds noise to embeddings and recomputes classification loss.""" 50 | noise = tf.random_normal(shape=tf.shape(embedded)) 51 | perturb = _scale_l2(_mask_by_length(noise, length), FLAGS.perturb_norm_length) 52 | return loss_fn(embedded + perturb) 53 | 54 | 55 | def adversarial_loss(embedded, loss, loss_fn, perturb_norm_length): 56 | """Adds gradient to embedding and recomputes classification loss.""" 57 | grad, = tf.gradients( 58 | loss, 59 | embedded, 60 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 61 | grad = tf.stop_gradient(grad) 62 | perturb = _scale_l2_dim4(grad, perturb_norm_length) 63 | return loss_fn(embedded + perturb) 64 | 65 | 66 | def virtual_adversarial_loss(logits, embedded, num_classes, sentence_length, document_length, 67 | logits_from_embedding_fn, perturb_norm_length): 68 | """Virtual adversarial loss. 69 | Computes virtual adversarial perturbation by finite difference method and 70 | power iteration, adds it to the embedding, and computes the KL divergence 71 | between the new logits and the original logits. 72 | Args: 73 | logits: 3-D float Tensor, [batch_size, num_timesteps, m], where m=1 if 74 | num_classes=2, otherwise m=num_classes. 75 | embedded: 3-D float Tensor, [batch_size, num_timesteps, embedding_dim]. 76 | inputs: VatxtInput. 77 | logits_from_embedding_fn: callable that takes embeddings and returns 78 | classifier logits. 79 | Returns: 80 | kl: float scalar. 81 | """ 82 | # Stop gradient of logits. See https://arxiv.org/abs/1507.00677 for details. 83 | logits = tf.stop_gradient(logits) 84 | 85 | # Only care about the KL divergence on the final timestep. 86 | # weights = inputs.eos_weights 87 | # assert weights is not None 88 | # if FLAGS.single_label: 89 | # indices = tf.stack([tf.range(FLAGS.batch_size), inputs.length - 1], 1) 90 | # weights = tf.expand_dims(tf.gather_nd(inputs.eos_weights, indices), 1) 91 | 92 | # Initialize perturbation with random noise. 93 | # shape(embedded) = (batch_size, num_timesteps, embedding_dim) 94 | d = tf.random_normal(shape=tf.shape(embedded)) 95 | 96 | # Perform finite difference method and power iteration. 
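  # (Same scheme as lstm_model/adversarial_losses.py: scale the masked noise d to
  #  norm xi = small_constant_for_finite_diff, take the gradient of the KL w.r.t. d,
  #  then rescale to eps = perturb_norm_length before computing the final KL term.)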
97 | # See Eq.(8) in the paper http://arxiv.org/pdf/1507.00677.pdf, 98 | # Adding small noise to input and taking gradient with respect to the noise 99 | # corresponds to 1 power iteration. 100 | for _ in xrange(num_power_iteration): 101 | d = _scale_l2_dim4( 102 | _mask_by_length_4D(d, sentence_length), small_constant_for_finite_diff) 103 | 104 | d_logits = logits_from_embedding_fn(embedded + d, sentence_length, document_length) 105 | kl = _kl_divergence_with_logits(logits, d_logits, num_classes) 106 | d, = tf.gradients( 107 | kl, 108 | d, 109 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 110 | d = tf.stop_gradient(d) 111 | 112 | perturb = _scale_l2_dim4(d, perturb_norm_length) 113 | vadv_logits = logits_from_embedding_fn(embedded + perturb, sentence_length, document_length) 114 | return _kl_divergence_with_logits(logits, vadv_logits, num_classes) 115 | 116 | 117 | def random_perturbation_loss_bidir(embedded, length, loss_fn): 118 | """Adds noise to embeddings and recomputes classification loss.""" 119 | noise = [tf.random_normal(shape=tf.shape(emb)) for emb in embedded] 120 | masked = [_mask_by_length(n, length) for n in noise] 121 | scaled = [_scale_l2(m, FLAGS.perturb_norm_length) for m in masked] 122 | return loss_fn([e + s for (e, s) in zip(embedded, scaled)]) 123 | 124 | 125 | def adversarial_loss_bidir(embedded, loss, loss_fn): 126 | """Adds gradient to embeddings and recomputes classification loss.""" 127 | grads = tf.gradients( 128 | loss, 129 | embedded, 130 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 131 | adv_exs = [ 132 | emb + _scale_l2(tf.stop_gradient(g), FLAGS.perturb_norm_length) 133 | for emb, g in zip(embedded, grads) 134 | ] 135 | return loss_fn(adv_exs) 136 | 137 | 138 | def virtual_adversarial_loss_bidir(logits, embedded, inputs, 139 | logits_from_embedding_fn): 140 | """Virtual adversarial loss for bidirectional models.""" 141 | logits = tf.stop_gradient(logits) 142 | f_inputs, _ = inputs 143 | weights = f_inputs.eos_weights 144 | if FLAGS.single_label: 145 | indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1) 146 | weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1) 147 | assert weights is not None 148 | 149 | perturbs = [ 150 | _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) 151 | for emb in embedded 152 | ] 153 | for _ in xrange(num_power_iteration): 154 | perturbs = [ 155 | _scale_l2(d, small_constant_for_finite_diff) for d in perturbs 156 | ] 157 | d_logits = logits_from_embedding_fn( 158 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 159 | kl = _kl_divergence_with_logits(logits, d_logits, weights) 160 | perturbs = tf.gradients( 161 | kl, 162 | perturbs, 163 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 164 | perturbs = [tf.stop_gradient(d) for d in perturbs] 165 | 166 | perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs] 167 | vadv_logits = logits_from_embedding_fn( 168 | [emb + d for (emb, d) in zip(embedded, perturbs)]) 169 | return _kl_divergence_with_logits(logits, vadv_logits, weights) 170 | 171 | 172 | def _mask_by_length(t, length): 173 | """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,].""" 174 | maxlen = t.get_shape().as_list()[1] 175 | 176 | # Subtract 1 from length to prevent the perturbation from going on 'eos' 177 | mask = tf.sequence_mask(length - 1, maxlen=maxlen) 178 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 179 | # shape(mask) = (batch, num_timesteps, 1) 180 | 
return t * mask 181 | 182 | 183 | def _mask_by_length_4D(t, length): 184 | """Mask t, 4-D [batch, time1, time2, dim], by length, 2-D [batch, time1].""" 185 | maxlen = t.get_shape().as_list()[2] 186 | 187 | mask = tf.sequence_mask(length, maxlen=maxlen) 188 | mask = tf.expand_dims(tf.cast(mask, tf.float32), -1) 189 | # shape(mask) = (batch, num_timesteps, num_timesteps, 1) 190 | return t * mask 191 | 192 | 193 | def _scale_l2(x, norm_length): 194 | # shape(x) = (batch, num_timesteps, d) 195 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 196 | # 2norm(x) = a * 2norm(x/a) 197 | # Scale over the full sequence, dims (1, 2) 198 | alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12 199 | l2_norm = alpha * tf.sqrt( 200 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6) 201 | x_unit = x / l2_norm 202 | return norm_length * x_unit 203 | 204 | 205 | def _scale_l2_dim4(x, norm_length): 206 | # shape(x) = (batch, num_timesteps, num_timesteps, d) 207 | # Divide x by max(abs(x)) for a numerically stable L2 norm. 208 | # 2norm(x) = a * 2norm(x/a) 209 | # Scale over the full sequence, dims (1, 2, 3) 210 | alpha = tf.reduce_max(tf.abs(x), (1, 2, 3), keepdims=True) + 1e-12 211 | l2_norm = alpha * tf.sqrt( 212 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2, 3), keepdims=True) + 1e-6) 213 | x_unit = x / l2_norm 214 | return norm_length * x_unit 215 | 216 | 217 | def _kl_divergence_with_logits(q_logits, p_logits, num_classes): 218 | """Returns weighted KL divergence between distributions q and p. 219 | Args: 220 | q_logits: logits for 1st argument of KL divergence shape 221 | [batch_size, num_timesteps, num_classes] if num_classes > 2, and 222 | [batch_size, num_timesteps] if num_classes == 2. 223 | p_logits: logits for 2nd argument of KL divergence with same shape q_logits. 224 | weights: 1-D float tensor with shape [batch_size, num_timesteps]. 225 | Elements should be 1.0 only on end of sequences 226 | Returns: 227 | KL: float scalar. 228 | """ 229 | # For logistic regression 230 | if num_classes == 2: 231 | q = tf.nn.sigmoid(q_logits) 232 | kl = (-tf.nn.sigmoid_cross_entropy_with_logits(logits=q_logits, labels=q) + 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=p_logits, labels=q)) 234 | kl = tf.squeeze(kl, 2) 235 | 236 | # For softmax regression 237 | else: 238 | q = tf.nn.softmax(q_logits) 239 | kl = tf.reduce_sum( 240 | q * (tf.nn.log_softmax(q_logits) - tf.nn.log_softmax(p_logits)), -1) 241 | 242 | # num_labels = tf.reduce_sum(weights) 243 | # num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels) 244 | 245 | kl.get_shape().assert_has_rank(2) 246 | # weights.get_shape().assert_has_rank(2) 247 | 248 | loss = tf.identity(tf.reduce_mean(kl), name='kl') 249 | return loss -------------------------------------------------------------------------------- /BERT/pytorch_pretrained_bert/module/san.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
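# Note: this module bundles the SAN-style decoder components shipped with the BERT
# code in this repo: a pairwise Classifier head plus SANClassifier/SANClassifier2,
# which run multi-turn attention over one input conditioned on a summary of the other.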
2 | import torch 3 | import random 4 | import torch.nn as nn 5 | from torch.nn.utils import weight_norm 6 | from torch.nn.parameter import Parameter 7 | import torch.nn.functional as F 8 | from .dropout_wrapper import DropoutWrapper 9 | from .similarity import FlatSimilarityWrapper, SelfAttnWrapper, DualAttentionWrapper, AttentionWrapper 10 | from .my_optim import weight_norm as WN 11 | from .common import activation, init_wrapper 12 | 13 | SMALL_POS_NUM=1.0e-30 14 | 15 | def generate_mask(new_data, dropout_p=0.0, is_training=False): 16 | if not is_training: dropout_p = 0.0 17 | new_data = (1-dropout_p) * (new_data.zero_() + 1) 18 | for i in range(new_data.size(0)): 19 | one = random.randint(0, new_data.size(1)-1) 20 | new_data[i][one] = 1 21 | mask = 1.0/(1 - dropout_p) * torch.bernoulli(new_data) 22 | mask.requires_grad = False 23 | return mask 24 | 25 | 26 | def masked_select(tensor, mask): 27 | mask_len = mask.sum(dim=-1) 28 | max_seq_len = mask_len.max() 29 | new_tensor = torch.zeros(tensor.size(0), max_seq_len, tensor.size(-1)).cuda() 30 | new_mask = torch.ones(tensor.size(0), max_seq_len).cuda() 31 | for i in range(tensor.size(0)): 32 | new_tensor[i, :mask_len[i]] = torch.masked_select(tensor[i], mask[i].unsqueeze(1).expand_as(tensor[i])).view(-1, tensor.size(-1)) 33 | new_mask[i, :mask_len[i]] = 0 34 | return new_tensor, new_mask.byte() 35 | 36 | 37 | class Classifier(nn.Module): 38 | def __init__(self, x_size, y_size, opt, prefix='decoder', dropout=None): 39 | super(Classifier, self).__init__() 40 | self.opt = opt 41 | if dropout is None: 42 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(prefix), 0)) 43 | else: 44 | self.dropout = dropout 45 | self.merge_opt = opt.get('{}_merge_opt'.format(prefix), 0) 46 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 47 | 48 | if self.merge_opt == 1: 49 | self.proj = nn.Linear(x_size * 4, y_size) 50 | else: 51 | self.proj = nn.Linear(x_size * 2, y_size) 52 | 53 | if self.weight_norm_on: 54 | self.proj = weight_norm(self.proj) 55 | 56 | def forward(self, x1, x2, mask=None, activation=None): 57 | seq_len = None 58 | if len(x1.size()) == 3: 59 | bz, seq_len, hidden_size = x1.size() 60 | x1 = x1.contiguous().view(-1, hidden_size) 61 | x2 = x2.contiguous().view(-1, hidden_size) 62 | 63 | if self.merge_opt == 1: 64 | x = torch.cat([x1, x2, (x1 - x2).abs(), x1 * x2], 1) 65 | else: 66 | x = torch.cat([x1, x2], 1) 67 | x = self.dropout(x) 68 | if activation: 69 | scores = activation(self.proj(x)) 70 | else: 71 | scores = self.proj(x) 72 | 73 | if seq_len: 74 | return scores.view(bz, seq_len, -1) 75 | else: 76 | return scores 77 | 78 | 79 | class SANClassifier(nn.Module): 80 | """Implementation of Stochastic Answer Networks for Natural Language Inference, Xiaodong Liu, Kevin Duh and Jianfeng Gao 81 | https://arxiv.org/abs/1804.07888 82 | """ 83 | def __init__(self, x_size, h_size, label_size, opt={}, prefix='decoder', dropout=None): 84 | super(SANClassifier, self).__init__() 85 | self.prefix = prefix 86 | if dropout is None: 87 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(self.prefix), 0)) 88 | else: 89 | self.dropout = dropout 90 | self.query_wsum = SelfAttnWrapper(x_size, prefix='mem_cum', opt=opt, dropout=self.dropout) 91 | self.attn = FlatSimilarityWrapper(x_size, h_size, prefix, opt, self.dropout) 92 | self.rnn_type = '{}{}'.format(opt.get('{}_rnn_type'.format(prefix), 'gru').upper(), 'Cell') 93 | self.rnn = getattr(nn, self.rnn_type)(x_size, h_size) 94 | self.num_turn = 
opt.get('{}_num_turn'.format(prefix), 5) 95 | self.opt = opt 96 | self.mem_random_drop = opt.get('{}_mem_drop_p'.format(prefix), 0) 97 | self.mem_type = opt.get('{}_mem_type'.format(prefix), 0) 98 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 99 | self.label_size = label_size 100 | self.dump_state = opt.get('dump_state_on', False) 101 | self.alpha = Parameter(torch.zeros(1, 1), requires_grad=False) 102 | # self.hyp_attn = None 103 | # if opt.get('hyp_attn_premise', 0): 104 | # self.hyp_attn = AttentionWrapper(x_size, h_size, prefix=prefix, opt=opt, dropout=self.dropout) 105 | # self.hyp_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 106 | self.f = activation(opt.get('{}_activation'.format(self.prefix), 'relu')) 107 | if self.weight_norm_on: 108 | self.rnn = WN(self.rnn) 109 | 110 | self.classifier = Classifier(x_size, 1, opt, prefix=prefix, dropout=self.dropout) 111 | 112 | def forward(self, x, h0, x_mask=None, h_mask=None, is_training=True): 113 | # if self.hyp_attn: 114 | # h_attn = self.hyp_attn(h0, x, key_padding_mask=x_mask) 115 | # h0 = self.hyp_merge(h0, h_attn, activation=self.f) 116 | 117 | h0 = self.query_wsum(h0, h_mask) 118 | if type(self.rnn) is nn.LSTMCell: 119 | c0 = h0.new(h0.size()).zero_() 120 | scores_list = [] 121 | for turn in range(self.num_turn): 122 | att_scores = self.attn(x, h0, x_mask) 123 | x_sum = torch.bmm(F.softmax(att_scores, 1).unsqueeze(1), x).squeeze(1) 124 | scores = self.classifier(x_sum, h0) 125 | scores_list.append(scores) 126 | # next turn 127 | if self.rnn is not None: 128 | h0 = self.dropout(h0) 129 | if type(self.rnn) is nn.LSTMCell: 130 | h0, c0 = self.rnn(x_sum, (h0, c0)) 131 | else: 132 | h0 = self.rnn(x_sum, h0) 133 | if self.mem_type == 1: 134 | batch_size = x.size(0) // self.label_size 135 | mask = generate_mask(self.alpha.data.new(batch_size, self.num_turn), self.mem_random_drop, is_training) 136 | mask = [m.contiguous() for m in torch.unbind(mask, 1)] 137 | tmp_scores_list = [mask[idx].view(batch_size, 1).expand_as(inp.view(-1, self.label_size)) 138 | * F.softmax(inp.view(-1, self.label_size), 1) 139 | for idx, inp in enumerate(scores_list)] 140 | scores = torch.stack(tmp_scores_list, 2) 141 | scores = torch.mean(scores, 2) 142 | scores = torch.log(scores) 143 | else: 144 | scores = scores_list[-1] 145 | if self.dump_state: 146 | return scores, scores_list 147 | else: 148 | return scores 149 | 150 | 151 | class SANClassifier2(nn.Module): 152 | """Implementation of Stochastic Answer Networks for Natural Language Inference, Xiaodong Liu, Kevin Duh and Jianfeng Gao 153 | https://arxiv.org/abs/1804.07888 154 | """ 155 | def __init__(self, x_size, h_size, label_size, opt={}, prefix='decoder', dropout=None): 156 | super(SANClassifier2, self).__init__() 157 | self.prefix = prefix 158 | if dropout is None: 159 | self.dropout = DropoutWrapper(opt.get('{}_dropout_p'.format(self.prefix), 0)) 160 | else: 161 | self.dropout = dropout 162 | self.dual_attn = DualAttentionWrapper(x_size, h_size, prefix, opt, self.dropout) 163 | self.query_wsum = SelfAttnWrapper(x_size, prefix='mem_cum', opt=opt, dropout=self.dropout) 164 | self.attn = FlatSimilarityWrapper(x_size, h_size, prefix, opt, self.dropout) 165 | self.rnn_type = '{}{}'.format(opt.get('{}_rnn_type'.format(prefix), 'gru').upper(), 'Cell') 166 | self.rnn = getattr(nn, self.rnn_type)(x_size, h_size) 167 | self.num_turn = opt.get('{}_num_turn'.format(prefix), 5) 168 | self.opt = opt 169 | self.mem_random_drop = 
opt.get('{}_mem_drop_p'.format(prefix), 0) 170 | self.mem_type = opt.get('{}_mem_type'.format(prefix), 0) 171 | self.weight_norm_on = opt.get('{}_weight_norm_on'.format(prefix), False) 172 | self.label_size = label_size 173 | self.dump_state = opt.get('dump_state_on', False) 174 | self.alpha = Parameter(torch.zeros(1, 1), requires_grad=False) 175 | self.f = activation(opt.get('{}_activation'.format(self.prefix), 'relu')) 176 | self.hyp_first = opt.get('{}_hyp_first'.format(prefix), 1) 177 | self.hyp_raw = opt.get('{}_hyp_raw'.format(prefix), 0) 178 | if self.weight_norm_on: 179 | self.rnn = WN(self.rnn) 180 | 181 | self.classifier = Classifier(x_size, 1, opt, prefix=prefix, dropout=self.dropout) 182 | 183 | self.premise_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 184 | self.hyp_merge = Classifier(x_size, x_size, opt, prefix=prefix, dropout=self.dropout) 185 | 186 | 187 | def forward(self, x, h, x_mask=None, h_mask=None, is_training=True): 188 | if self.hyp_first and self.hyp_raw: 189 | pass 190 | elif self.hyp_first and not self.hyp_raw: 191 | _, h_attn = self.dual_attn(x, h, x_mask, h_mask) 192 | 193 | # x_prime = self.premise_merge(x, x_attn, activation=self.f) 194 | h = self.hyp_merge(h, h_attn, activation=self.f) 195 | else: 196 | raise NotImplementedError 197 | 198 | # if self.num_turn == 0: 199 | # scores = self.classifier(x_prime.max(dim=1)[0], h_prime.max(dim=1)[0]) 200 | # return scores 201 | 202 | # if self.hyp_first and not self.hyp_raw: 203 | # # x = x_prime 204 | # h = h_prime 205 | # elif self.hyp_first and self.hyp_raw: 206 | # # x = x_prime 207 | # pass 208 | # elif not self.hyp_first and not self.hyp_raw: 209 | # x = h_prime 210 | # h = x_prime 211 | # else: 212 | # h = x 213 | # x = h_prime 214 | 215 | h0 = self.query_wsum(h, h_mask) 216 | if type(self.rnn) is nn.LSTMCell: 217 | c0 = h0.new(h0.size()).zero_() 218 | scores_list = [] 219 | for turn in range(self.num_turn): 220 | att_scores = self.attn(x, h0, x_mask) 221 | x_sum = torch.bmm(F.softmax(att_scores, 1).unsqueeze(1), x).squeeze(1) 222 | scores = self.classifier(x_sum, h0) 223 | scores_list.append(scores) 224 | # next turn 225 | if self.rnn is not None: 226 | h0 = self.dropout(h0) 227 | if type(self.rnn) is nn.LSTMCell: 228 | h0, c0 = self.rnn(x_sum, (h0, c0)) 229 | else: 230 | h0 = self.rnn(x_sum, h0) 231 | if self.mem_type == 1: 232 | batch_size = x.size(0) // self.label_size 233 | mask = generate_mask(self.alpha.data.new(batch_size, self.num_turn), self.mem_random_drop, is_training) 234 | mask = [m.contiguous() for m in torch.unbind(mask, 1)] 235 | tmp_scores_list = [mask[idx].view(batch_size, 1).expand_as(inp.view(-1, self.label_size)) 236 | * F.softmax(inp.view(-1, self.label_size), 1) 237 | for idx, inp in enumerate(scores_list)] 238 | scores = torch.stack(tmp_scores_list, 2) 239 | scores = torch.mean(scores, 2) 240 | scores = torch.log(scores) 241 | else: 242 | scores = scores_list[-1] 243 | if self.dump_state: 244 | return scores, scores_list 245 | else: 246 | return scores -------------------------------------------------------------------------------- /lstm_model/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import tensorflow as tf 4 | 5 | 6 | from .data_utils import minibatches, pad_sequences, get_chunks 7 | from .general_utils import Progbar 8 | from .base_model import BaseModel 9 | 10 | 11 | class NERModel(BaseModel): 12 | """Specialized class of Model for NER""" 13 | 14 | def 
__init__(self, config): 15 | super(NERModel, self).__init__(config) 16 | self.idx_to_tag = {idx: tag for tag, idx in 17 | self.config.vocab_tags.items()} 18 | 19 | 20 | def add_placeholders(self): 21 | """Define placeholders = entries to computational graph""" 22 | # shape = (batch size, max length of sentence in batch) 23 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 24 | name="word_ids") 25 | 26 | # shape = (batch size) 27 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 28 | name="sequence_lengths") 29 | 30 | # shape = (batch size, max length of sentence, max length of word) 31 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 32 | name="char_ids") 33 | 34 | # shape = (batch_size, max_length of sentence) 35 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 36 | name="word_lengths") 37 | 38 | # shape = (batch size, max length of sentence in batch) 39 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 40 | name="labels") 41 | 42 | # hyper parameters 43 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="dropout") 45 | self.lr = tf.placeholder(dtype=tf.float32, shape=[], 46 | name="lr") 47 | 48 | 49 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 50 | """Given some data, pad it and build a feed dictionary 51 | 52 | Args: 53 | words: list of sentences. A sentence is a list of ids of a list of 54 | words. A word is a list of ids 55 | labels: list of ids 56 | lr: (float) learning rate 57 | dropout: (float) keep prob 58 | 59 | Returns: 60 | dict {placeholder: value} 61 | 62 | """ 63 | # perform padding of the given data 64 | if self.config.use_chars: 65 | char_ids, word_ids = zip(*words) 66 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 67 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 68 | nlevels=2) 69 | else: 70 | word_ids, sequence_lengths = pad_sequences(words, 0) 71 | 72 | # build feed dictionary 73 | feed = { 74 | self.word_ids: word_ids, 75 | self.sequence_lengths: sequence_lengths 76 | } 77 | 78 | if self.config.use_chars: 79 | feed[self.char_ids] = char_ids 80 | feed[self.word_lengths] = word_lengths 81 | 82 | if labels is not None: 83 | labels, _ = pad_sequences(labels, 0) 84 | feed[self.labels] = labels 85 | 86 | if lr is not None: 87 | feed[self.lr] = lr 88 | 89 | if dropout is not None: 90 | feed[self.dropout] = dropout 91 | 92 | return feed, sequence_lengths 93 | 94 | 95 | def add_word_embeddings_op(self): 96 | """Defines self.word_embeddings 97 | 98 | If self.config.embeddings is not None and is a np array initialized 99 | with pre-trained word vectors, the word embeddings is just a look-up 100 | and we don't train the vectors. Otherwise, a random matrix with 101 | the correct shape is initialized. 
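        The resulting self.word_embeddings has shape (batch size, max sentence
        length, dim_word), with an extra 2*hidden_size_char dimensions per word
        when self.config.use_chars is True, and has dropout applied.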
102 | """ 103 | with tf.variable_scope("words"): 104 | if self.config.embeddings is None: 105 | self.logger.info("WARNING: randomly initializing word vectors") 106 | _word_embeddings = tf.get_variable( 107 | name="_word_embeddings", 108 | dtype=tf.float32, 109 | shape=[self.config.nwords, self.config.dim_word]) 110 | else: 111 | _word_embeddings = tf.Variable( 112 | self.config.embeddings, 113 | name="_word_embeddings", 114 | dtype=tf.float32, 115 | trainable=self.config.train_embeddings) 116 | 117 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, 118 | self.word_ids, name="word_embeddings") 119 | 120 | with tf.variable_scope("chars"): 121 | if self.config.use_chars: 122 | # get char embeddings matrix 123 | _char_embeddings = tf.get_variable( 124 | name="_char_embeddings", 125 | dtype=tf.float32, 126 | shape=[self.config.nchars, self.config.dim_char]) 127 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 128 | self.char_ids, name="char_embeddings") 129 | 130 | # put the time dimension on axis=1 131 | s = tf.shape(char_embeddings) 132 | char_embeddings = tf.reshape(char_embeddings, 133 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 134 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 135 | 136 | # bi lstm on chars 137 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 138 | state_is_tuple=True) 139 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 140 | state_is_tuple=True) 141 | _output = tf.nn.bidirectional_dynamic_rnn( 142 | cell_fw, cell_bw, char_embeddings, 143 | sequence_length=word_lengths, dtype=tf.float32) 144 | 145 | # read and concat output 146 | _, ((_, output_fw), (_, output_bw)) = _output 147 | output = tf.concat([output_fw, output_bw], axis=-1) 148 | 149 | # shape = (batch size, max sentence length, char hidden size) 150 | output = tf.reshape(output, 151 | shape=[s[0], s[1], 2*self.config.hidden_size_char]) 152 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 153 | 154 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 155 | 156 | 157 | def add_logits_op(self): 158 | """Defines self.logits 159 | 160 | For each word in each sentence of the batch, it corresponds to a vector 161 | of scores, of dimension equal to the number of tags. 162 | """ 163 | with tf.variable_scope("bi-lstm"): 164 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 165 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 166 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 167 | cell_fw, cell_bw, self.word_embeddings, 168 | sequence_length=self.sequence_lengths, dtype=tf.float32) 169 | output = tf.concat([output_fw, output_bw], axis=-1) 170 | output = tf.nn.dropout(output, self.dropout) 171 | 172 | with tf.variable_scope("proj"): 173 | W = tf.get_variable("W", dtype=tf.float32, 174 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 175 | 176 | b = tf.get_variable("b", shape=[self.config.ntags], 177 | dtype=tf.float32, initializer=tf.zeros_initializer()) 178 | 179 | nsteps = tf.shape(output)[1] 180 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 181 | pred = tf.matmul(output, W) + b 182 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 183 | 184 | 185 | def add_pred_op(self): 186 | """Defines self.labels_pred 187 | 188 | This op is defined only in the case where we don't use a CRF since in 189 | that case we can make the prediction "in the graph" (thanks to tf 190 | functions in other words). 
With theCRF, as the inference is coded 191 | in python and not in pure tensroflow, we have to make the prediciton 192 | outside the graph. 193 | """ 194 | if not self.config.use_crf: 195 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 196 | tf.int32) 197 | 198 | 199 | def add_loss_op(self): 200 | """Defines the loss""" 201 | if self.config.use_crf: 202 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 203 | self.logits, self.labels, self.sequence_lengths) 204 | self.trans_params = trans_params # need to evaluate it for decoding 205 | self.loss = tf.reduce_mean(-log_likelihood) 206 | else: 207 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 208 | logits=self.logits, labels=self.labels) 209 | mask = tf.sequence_mask(self.sequence_lengths) 210 | losses = tf.boolean_mask(losses, mask) 211 | self.loss = tf.reduce_mean(losses) 212 | 213 | # for tensorboard 214 | tf.summary.scalar("loss", self.loss) 215 | 216 | 217 | def build(self): 218 | # NER specific functions 219 | self.add_placeholders() 220 | self.add_word_embeddings_op() 221 | self.add_logits_op() 222 | self.add_pred_op() 223 | self.add_loss_op() 224 | 225 | # Generic functions that add training op and initialize session 226 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 227 | self.config.clip) 228 | self.initialize_session() # now self.sess is defined and vars are init 229 | 230 | 231 | def predict_batch(self, words): 232 | """ 233 | Args: 234 | words: list of sentences 235 | 236 | Returns: 237 | labels_pred: list of labels for each sentence 238 | sequence_length 239 | 240 | """ 241 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 242 | 243 | if self.config.use_crf: 244 | # get tag scores and transition params of CRF 245 | viterbi_sequences = [] 246 | logits, trans_params = self.sess.run( 247 | [self.logits, self.trans_params], feed_dict=fd) 248 | 249 | # iterate over the sentences because no batching in vitervi_decode 250 | for logit, sequence_length in zip(logits, sequence_lengths): 251 | logit = logit[:sequence_length] # keep only the valid steps 252 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 253 | logit, trans_params) 254 | viterbi_sequences += [viterbi_seq] 255 | 256 | return viterbi_sequences, sequence_lengths 257 | 258 | else: 259 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 260 | 261 | return labels_pred, sequence_lengths 262 | 263 | 264 | def run_epoch(self, train, dev, epoch): 265 | """Performs one complete pass over the train set and evaluate on dev 266 | 267 | Args: 268 | train: dataset that yields tuple of sentences, tags 269 | dev: dataset 270 | epoch: (int) index of the current epoch 271 | 272 | Returns: 273 | f1: (python float), score to select model on, higher is better 274 | 275 | """ 276 | # progbar stuff for logging 277 | batch_size = self.config.batch_size 278 | nbatches = (len(train) + batch_size - 1) // batch_size 279 | prog = Progbar(target=nbatches) 280 | 281 | # iterate over dataset 282 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 283 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 284 | self.config.dropout) 285 | 286 | _, train_loss, summary = self.sess.run( 287 | [self.train_op, self.loss, self.merged], feed_dict=fd) 288 | 289 | prog.update(i + 1, [("train loss", train_loss)]) 290 | 291 | # tensorboard 292 | if i % 10 == 0: 293 | self.file_writer.add_summary(summary, epoch*nbatches + i) 294 | 295 | metrics = self.run_evaluate(dev) 296 | msg = " - ".join(["{} 
{:04.2f}".format(k, v) 297 | for k, v in metrics.items()]) 298 | self.logger.info(msg) 299 | 300 | return metrics["f1"] 301 | 302 | 303 | def run_evaluate(self, test): 304 | """Evaluates performance on test set 305 | 306 | Args: 307 | test: dataset that yields tuple of (sentences, tags) 308 | 309 | Returns: 310 | metrics: (dict) metrics["acc"] = 98.4, ... 311 | 312 | """ 313 | accs = [] 314 | correct_preds, total_correct, total_preds = 0., 0., 0. 315 | for words, labels in minibatches(test, self.config.batch_size): 316 | labels_pred, sequence_lengths = self.predict_batch(words) 317 | 318 | for lab, lab_pred, length in zip(labels, labels_pred, 319 | sequence_lengths): 320 | lab = lab[:length] 321 | lab_pred = lab_pred[:length] 322 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 323 | 324 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 325 | lab_pred_chunks = set(get_chunks(lab_pred, 326 | self.config.vocab_tags)) 327 | 328 | correct_preds += len(lab_chunks & lab_pred_chunks) 329 | total_preds += len(lab_pred_chunks) 330 | total_correct += len(lab_chunks) 331 | 332 | p = correct_preds / total_preds if correct_preds > 0 else 0 333 | r = correct_preds / total_correct if correct_preds > 0 else 0 334 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 335 | acc = np.mean(accs) 336 | 337 | return {"acc": 100*acc, "f1": 100*f1} 338 | 339 | 340 | def predict(self, words_raw): 341 | """Returns list of tags 342 | 343 | Args: 344 | words_raw: list of words (string), just one sentence (no batch) 345 | 346 | Returns: 347 | preds: list of tags (string), one for each word in the sentence 348 | 349 | """ 350 | words = [self.config.processing_word(w) for w in words_raw] 351 | if type(words[0]) == tuple: 352 | words = zip(*words) 353 | pred_ids, _ = self.predict_batch([words]) 354 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 355 | 356 | return preds 357 | -------------------------------------------------------------------------------- /lstm_model/src/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import tensorflow as tf 4 | 5 | 6 | from .data_utils import minibatches, pad_sequences, get_chunks 7 | from .general_utils import Progbar 8 | from .base_model import BaseModel 9 | 10 | 11 | class NERModel(BaseModel): 12 | """Specialized class of Model for NER""" 13 | 14 | def __init__(self, config): 15 | super(NERModel, self).__init__(config) 16 | self.idx_to_tag = {idx: tag for tag, idx in 17 | self.config.vocab_tags.items()} 18 | 19 | 20 | def add_placeholders(self): 21 | """Define placeholders = entries to computational graph""" 22 | # shape = (batch size, max length of sentence in batch) 23 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 24 | name="word_ids") 25 | 26 | # shape = (batch size) 27 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 28 | name="sequence_lengths") 29 | 30 | # shape = (batch size, max length of sentence, max length of word) 31 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 32 | name="char_ids") 33 | 34 | # shape = (batch_size, max_length of sentence) 35 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 36 | name="word_lengths") 37 | 38 | # shape = (batch size, max length of sentence in batch) 39 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 40 | name="labels") 41 | 42 | # hyper parameters 43 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="dropout") 45 | self.lr 
= tf.placeholder(dtype=tf.float32, shape=[], 46 | name="lr") 47 | 48 | 49 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 50 | """Given some data, pad it and build a feed dictionary 51 | 52 | Args: 53 | words: list of sentences. A sentence is a list of ids of a list of 54 | words. A word is a list of ids 55 | labels: list of ids 56 | lr: (float) learning rate 57 | dropout: (float) keep prob 58 | 59 | Returns: 60 | dict {placeholder: value} 61 | 62 | """ 63 | # perform padding of the given data 64 | if self.config.use_chars: 65 | char_ids, word_ids = zip(*words) 66 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 67 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 68 | nlevels=2) 69 | else: 70 | word_ids, sequence_lengths = pad_sequences(words, 0) 71 | 72 | # build feed dictionary 73 | feed = { 74 | self.word_ids: word_ids, 75 | self.sequence_lengths: sequence_lengths 76 | } 77 | 78 | if self.config.use_chars: 79 | feed[self.char_ids] = char_ids 80 | feed[self.word_lengths] = word_lengths 81 | 82 | if labels is not None: 83 | labels, _ = pad_sequences(labels, 0) 84 | feed[self.labels] = labels 85 | 86 | if lr is not None: 87 | feed[self.lr] = lr 88 | 89 | if dropout is not None: 90 | feed[self.dropout] = dropout 91 | 92 | return feed, sequence_lengths 93 | 94 | 95 | def add_word_embeddings_op(self): 96 | """Defines self.word_embeddings 97 | 98 | If self.config.embeddings is not None and is a np array initialized 99 | with pre-trained word vectors, the word embeddings is just a look-up 100 | and we don't train the vectors. Otherwise, a random matrix with 101 | the correct shape is initialized. 102 | """ 103 | with tf.variable_scope("words"): 104 | if self.config.embeddings is None: 105 | self.logger.info("WARNING: randomly initializing word vectors") 106 | _word_embeddings = tf.get_variable( 107 | name="_word_embeddings", 108 | dtype=tf.float32, 109 | shape=[self.config.nwords, self.config.dim_word]) 110 | else: 111 | _word_embeddings = tf.Variable( 112 | self.config.embeddings, 113 | name="_word_embeddings", 114 | dtype=tf.float32, 115 | trainable=self.config.train_embeddings) 116 | 117 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, 118 | self.word_ids, name="word_embeddings") 119 | 120 | with tf.variable_scope("chars"): 121 | if self.config.use_chars: 122 | # get char embeddings matrix 123 | _char_embeddings = tf.get_variable( 124 | name="_char_embeddings", 125 | dtype=tf.float32, 126 | shape=[self.config.nchars, self.config.dim_char]) 127 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 128 | self.char_ids, name="char_embeddings") 129 | 130 | # put the time dimension on axis=1 131 | s = tf.shape(char_embeddings) 132 | char_embeddings = tf.reshape(char_embeddings, 133 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 134 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 135 | 136 | # bi lstm on chars 137 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 138 | state_is_tuple=True) 139 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 140 | state_is_tuple=True) 141 | _output = tf.nn.bidirectional_dynamic_rnn( 142 | cell_fw, cell_bw, char_embeddings, 143 | sequence_length=word_lengths, dtype=tf.float32) 144 | 145 | # read and concat output 146 | _, ((_, output_fw), (_, output_bw)) = _output 147 | output = tf.concat([output_fw, output_bw], axis=-1) 148 | 149 | # shape = (batch size, max sentence length, char hidden size) 150 | output = tf.reshape(output, 151 | shape=[s[0], 
s[1], 2*self.config.hidden_size_char]) 152 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 153 | 154 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 155 | 156 | 157 | def add_logits_op(self): 158 | """Defines self.logits 159 | 160 | For each word in each sentence of the batch, this op produces a vector 161 | of scores whose dimension equals the number of tags. 162 | """ 163 | with tf.variable_scope("bi-lstm"): 164 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 165 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 166 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 167 | cell_fw, cell_bw, self.word_embeddings, 168 | sequence_length=self.sequence_lengths, dtype=tf.float32) 169 | output = tf.concat([output_fw, output_bw], axis=-1) 170 | output = tf.nn.dropout(output, self.dropout) 171 | 172 | with tf.variable_scope("proj"): 173 | W = tf.get_variable("W", dtype=tf.float32, 174 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 175 | 176 | b = tf.get_variable("b", shape=[self.config.ntags], 177 | dtype=tf.float32, initializer=tf.zeros_initializer()) 178 | 179 | nsteps = tf.shape(output)[1] 180 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 181 | pred = tf.matmul(output, W) + b 182 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 183 | 184 | 185 | def add_pred_op(self): 186 | """Defines self.labels_pred 187 | 188 | This op is defined only in the case where we don't use a CRF, since in 189 | that case we can make the prediction "in the graph" (in other words, thanks 190 | to native tf ops). With the CRF, as the inference is coded 191 | in python and not in pure tensorflow, we have to make the prediction 192 | outside the graph.
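        (In the CRF case the decoding is done in predict_batch with
        tf.contrib.crf.viterbi_decode, using the learned self.trans_params.)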
193 | """ 194 | if not self.config.use_crf: 195 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 196 | tf.int32) 197 | 198 | 199 | def add_loss_op(self): 200 | """Defines the loss""" 201 | if self.config.use_crf: 202 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 203 | self.logits, self.labels, self.sequence_lengths) 204 | self.trans_params = trans_params # need to evaluate it for decoding 205 | self.loss = tf.reduce_mean(-log_likelihood) 206 | else: 207 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 208 | logits=self.logits, labels=self.labels) 209 | mask = tf.sequence_mask(self.sequence_lengths) 210 | losses = tf.boolean_mask(losses, mask) 211 | self.loss = tf.reduce_mean(losses) 212 | 213 | # for tensorboard 214 | tf.summary.scalar("loss", self.loss) 215 | 216 | 217 | def build(self): 218 | # NER specific functions 219 | self.add_placeholders() 220 | self.add_word_embeddings_op() 221 | self.add_logits_op() 222 | self.add_pred_op() 223 | self.add_loss_op() 224 | 225 | # Generic functions that add training op and initialize session 226 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 227 | self.config.clip) 228 | self.initialize_session() # now self.sess is defined and vars are init 229 | 230 | 231 | def predict_batch(self, words): 232 | """ 233 | Args: 234 | words: list of sentences 235 | 236 | Returns: 237 | labels_pred: list of labels for each sentence 238 | sequence_length 239 | 240 | """ 241 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 242 | 243 | if self.config.use_crf: 244 | # get tag scores and transition params of CRF 245 | viterbi_sequences = [] 246 | logits, trans_params = self.sess.run( 247 | [self.logits, self.trans_params], feed_dict=fd) 248 | 249 | # iterate over the sentences because no batching in vitervi_decode 250 | for logit, sequence_length in zip(logits, sequence_lengths): 251 | logit = logit[:sequence_length] # keep only the valid steps 252 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 253 | logit, trans_params) 254 | viterbi_sequences += [viterbi_seq] 255 | 256 | return viterbi_sequences, sequence_lengths 257 | 258 | else: 259 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 260 | 261 | return labels_pred, sequence_lengths 262 | 263 | 264 | def run_epoch(self, train, dev, epoch): 265 | """Performs one complete pass over the train set and evaluate on dev 266 | 267 | Args: 268 | train: dataset that yields tuple of sentences, tags 269 | dev: dataset 270 | epoch: (int) index of the current epoch 271 | 272 | Returns: 273 | f1: (python float), score to select model on, higher is better 274 | 275 | """ 276 | # progbar stuff for logging 277 | batch_size = self.config.batch_size 278 | nbatches = (len(train) + batch_size - 1) // batch_size 279 | prog = Progbar(target=nbatches) 280 | 281 | # iterate over dataset 282 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 283 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 284 | self.config.dropout) 285 | 286 | _, train_loss, summary = self.sess.run( 287 | [self.train_op, self.loss, self.merged], feed_dict=fd) 288 | 289 | prog.update(i + 1, [("train loss", train_loss)]) 290 | 291 | # tensorboard 292 | if i % 10 == 0: 293 | self.file_writer.add_summary(summary, epoch*nbatches + i) 294 | 295 | metrics = self.run_evaluate(dev) 296 | msg = " - ".join(["{} {:04.2f}".format(k, v) 297 | for k, v in metrics.items()]) 298 | self.logger.info(msg) 299 | 300 | return metrics["f1"] 301 | 302 | 303 | def 
run_evaluate(self, test): 304 | """Evaluates performance on test set 305 | 306 | Args: 307 | test: dataset that yields tuple of (sentences, tags) 308 | 309 | Returns: 310 | metrics: (dict) metrics["acc"] = 98.4, ... 311 | 312 | """ 313 | accs = [] 314 | correct_preds, total_correct, total_preds = 0., 0., 0. 315 | for words, labels in minibatches(test, self.config.batch_size): 316 | labels_pred, sequence_lengths = self.predict_batch(words) 317 | 318 | for lab, lab_pred, length in zip(labels, labels_pred, 319 | sequence_lengths): 320 | lab = lab[:length] 321 | lab_pred = lab_pred[:length] 322 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 323 | 324 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 325 | lab_pred_chunks = set(get_chunks(lab_pred, 326 | self.config.vocab_tags)) 327 | 328 | correct_preds += len(lab_chunks & lab_pred_chunks) 329 | total_preds += len(lab_pred_chunks) 330 | total_correct += len(lab_chunks) 331 | 332 | p = correct_preds / total_preds if correct_preds > 0 else 0 333 | r = correct_preds / total_correct if correct_preds > 0 else 0 334 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 335 | acc = np.mean(accs) 336 | 337 | return {"acc": 100*acc, "f1": 100*f1} 338 | 339 | 340 | def predict(self, words_raw): 341 | """Returns list of tags 342 | 343 | Args: 344 | words_raw: list of words (string), just one sentence (no batch) 345 | 346 | Returns: 347 | preds: list of tags (string), one for each word in the sentence 348 | 349 | """ 350 | words = [self.config.processing_word(w) for w in words_raw] 351 | if type(words[0]) == tuple: 352 | words = zip(*words) 353 | pred_ids, _ = self.predict_batch([words]) 354 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 355 | 356 | return preds 357 | -------------------------------------------------------------------------------- /lstm_model/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | 5 | # shared global variables to be imported from model also 6 | UNK = "$UNK$" 7 | NUM = "$NUM$" 8 | NONE = "O" 9 | WORD_PAD = '$W_PAD$' 10 | TAG_PAD = '$T_PAD$' 11 | 12 | # special error message 13 | class MyIOError(Exception): 14 | def __init__(self, filename): 15 | # custom error message 16 | message = """ 17 | ERROR: Unable to locate file {}. 18 | 19 | FIX: Have you tried running python build_data.py first? 20 | This will build vocab file from your train, test and dev sets and 21 | trimm your word vectors. 
22 | """.format(filename) 23 | super(MyIOError, self).__init__(message) 24 | 25 | 26 | def Dataset(filename, processing_word=None, processing_tag=None, max_iter=None): 27 | results = [] 28 | with open(filename) as f: 29 | sentences, tags = [], [] 30 | n_iter = 0 31 | for line in f: 32 | line = line.strip() 33 | if not line: 34 | if len(sentences) != 0: 35 | n_iter += 1 36 | if max_iter is not None and n_iter > max_iter: 37 | break 38 | results.append((sentences, tags)) 39 | sentences, tags = [], [] 40 | elif not line.startswith("###"): 41 | ls = line.split('|') 42 | tag, sentence = ls[1], ls[2].split() 43 | # if tag != 'Others': 44 | if processing_word is not None: 45 | try: 46 | sentence = [processing_word(word) for word in sentence] 47 | except: 48 | pass 49 | if processing_tag is not None: 50 | tag = processing_tag(tag) 51 | sentences += [sentence] 52 | tags += [tag] 53 | 54 | return results 55 | 56 | # class Dataset(object): 57 | # """Class that iterates over CoNLL Dataset 58 | # 59 | # __iter__ method yields a tuple (words, tags) 60 | # words: list of raw words 61 | # tags: list of raw tags 62 | # 63 | # If processing_word and processing_tag are not None, 64 | # optional preprocessing is appplied 65 | # 66 | # Example: 67 | # ```python 68 | # data = CoNLLDataset(filename) 69 | # for sentence, tags in data: 70 | # pass 71 | # ``` 72 | # 73 | # """ 74 | # def __init__(self, filename, processing_word=None, processing_tag=None, max_iter=None): 75 | # """ 76 | # Args: 77 | # filename: path to the file 78 | # processing_words: (optional) function that takes a word as input 79 | # processing_tags: (optional) function that takes a tag as input 80 | # max_iter: (optional) max number of sentences to yield 81 | # 82 | # """ 83 | # self.filename = filename 84 | # self.processing_word = processing_word 85 | # self.processing_tag = processing_tag 86 | # self.length = None 87 | # self.max_iter = max_iter 88 | # 89 | # 90 | # def __iter__(self): 91 | # with open(self.filename) as f: 92 | # sentences, tags = [], [] 93 | # n_iter = 0 94 | # for line in f: 95 | # line = line.strip() 96 | # if not line: 97 | # if len(sentences) != 0: 98 | # n_iter += 1 99 | # if self.max_iter is not None and n_iter > self.max_iter: 100 | # break 101 | # yield sentences, tags 102 | # sentences, tags = [], [] 103 | # elif not line.startswith("###"): 104 | # ls = line.split('|') 105 | # tag, sentence = ls[1], ls[2].split() 106 | # # if tag != 'Others': 107 | # if self.processing_word is not None: 108 | # sentence = [self.processing_word(word) for word in sentence] 109 | # if self.processing_tag is not None: 110 | # tag = self.processing_tag(tag) 111 | # sentences += [sentence] 112 | # tags += [tag] 113 | # 114 | # 115 | # def __len__(self): 116 | # """Iterates once over the corpus to set and store length""" 117 | # if self.length is None: 118 | # self.length = 0 119 | # for _ in self: 120 | # self.length += 1 121 | # 122 | # return self.length 123 | 124 | 125 | class Embedding(object): 126 | """Embedding layer with frequency-based normalization and dropout.""" 127 | def __init__(self, vocab_size=None, 128 | embedding_dim=None, 129 | embeddings=None, 130 | normalize=False, 131 | vocab_freqs=None, 132 | keep_prob=1., 133 | trainable=False): 134 | # super(Embedding, self).__init__(**kwargs) 135 | with tf.variable_scope("words"): 136 | if embeddings is None: 137 | assert vocab_size is not None 138 | assert embedding_dim is not None 139 | self._word_embeddings = tf.get_variable( 140 | name="_word_embeddings", 141 | 
dtype=tf.float32, 142 | shape=[vocab_size, embedding_dim]) 143 | else: 144 | vocab_size = embeddings.shape[0] 145 | self._word_embeddings = tf.Variable( 146 | embeddings, 147 | name="_word_embeddings", 148 | dtype=tf.float32, 149 | trainable=trainable) 150 | 151 | self.keep_prob = keep_prob 152 | 153 | if normalize: 154 | assert vocab_freqs is not None 155 | vocab_freqs = tf.constant( 156 | vocab_freqs, dtype=tf.float32, shape=(vocab_size, 1)) 157 | self._word_embeddings = self._normalize(self._word_embeddings, vocab_freqs) 158 | 159 | def embed(self, x): 160 | with tf.variable_scope("words"): 161 | embedded = tf.nn.embedding_lookup(self._word_embeddings, x) 162 | if self.keep_prob < 1.: 163 | # embedded = tf.nn.dropout(embedded, self.keep_prob) 164 | shape = embedded.get_shape().as_list() 165 | 166 | # Use same dropout masks at each timestep with specifying noise_shape. 167 | # This slightly improves performance. 168 | # Please see https://arxiv.org/abs/1512.05287 for the theoretical 169 | # explanation. 170 | if len(shape) == 3: 171 | embedded = tf.nn.dropout( 172 | embedded, self.keep_prob, noise_shape=(shape[0], 1, shape[2])) 173 | elif len(shape) == 4: 174 | embedded = tf.nn.dropout( 175 | embedded, self.keep_prob, noise_shape=(shape[0], 1, 1, shape[2])) 176 | else: 177 | pass 178 | return embedded 179 | 180 | def _normalize(self, emb, vocab_freqs): 181 | weights = vocab_freqs / tf.reduce_sum(vocab_freqs) 182 | mean = tf.reduce_sum(weights * emb, 0, keepdims=True) 183 | var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keepdims=True) 184 | stddev = tf.sqrt(1e-6 + var) 185 | return (emb - mean) / stddev 186 | 187 | 188 | def get_vocabs(datasets): 189 | """Build vocabulary from an iterable of datasets objects 190 | 191 | Args: 192 | datasets: a list of dataset objects 193 | 194 | Returns: 195 | a set of all the words in the dataset 196 | 197 | """ 198 | print("Building vocab...") 199 | vocab_tags = set() 200 | vocab_words_freq = dict() 201 | for dataset in datasets: 202 | for sentences, tags in dataset: 203 | for sent in sentences: 204 | for token in sent: 205 | vocab_words_freq[token] = vocab_words_freq.get(token, 0) + 1 206 | vocab_tags.update(tags) 207 | print("- done. {} tokens".format(len(vocab_words_freq))) 208 | return vocab_words_freq, vocab_tags 209 | 210 | 211 | def get_char_vocab(dataset): 212 | """Build char vocabulary from an iterable of datasets objects 213 | 214 | Args: 215 | dataset: a iterator yielding tuples (sentence, tags) 216 | 217 | Returns: 218 | a set of all the characters in the dataset 219 | 220 | """ 221 | vocab_char = set() 222 | for sents, _ in dataset: 223 | for sent in sents: 224 | for word in sent: 225 | vocab_char.update(word) 226 | 227 | return vocab_char 228 | 229 | 230 | def get_wordvec_vocab(filename): 231 | """Load vocab from file 232 | 233 | Args: 234 | filename: path to the glove vectors 235 | 236 | Returns: 237 | vocab: set() of strings 238 | """ 239 | print("Building vocab...") 240 | vocab = set() 241 | with open(filename) as f: 242 | for line in f: 243 | word = line.strip().split(' ')[0] 244 | vocab.add(word) 245 | print("- done. {} tokens".format(len(vocab))) 246 | return vocab 247 | 248 | 249 | def write_vocab(vocab, filename): 250 | """Writes a vocab to a file 251 | 252 | Writes one word per line. 
253 | 254 | Args: 255 | vocab: iterable that yields word 256 | filename: path to vocab file 257 | 258 | Returns: 259 | write a word per line 260 | 261 | """ 262 | print("Writing vocab...") 263 | with open(filename, "w") as f: 264 | if isinstance(vocab, dict): 265 | for i, word in enumerate(vocab): 266 | if i != len(vocab) - 1: 267 | f.write("{}\t{}\n".format(word, vocab[word])) 268 | else: 269 | f.write('{}\t{}'.format(word, vocab[word])) 270 | else: 271 | for i, word in enumerate(vocab): 272 | if i != len(vocab) - 1: 273 | f.write("{}\n".format(word)) 274 | else: 275 | f.write(word) 276 | print("- done. {} tokens".format(len(vocab))) 277 | 278 | 279 | def load_vocab(filename): 280 | """Loads vocab from a file 281 | 282 | Args: 283 | filename: (string) the format of the file must be one word per line. 284 | 285 | Returns: 286 | d: dict[word] = index 287 | 288 | """ 289 | try: 290 | d = dict() 291 | vocab_freq = [] 292 | with open(filename) as f: 293 | for idx, line in enumerate(f): 294 | line = line.strip().split() 295 | if len(line) < 2: 296 | word = line[0] 297 | d[word] = idx 298 | else: 299 | word, freq = line 300 | d[word] = idx 301 | try: 302 | vocab_freq.append(int(freq)) 303 | except: 304 | pass 305 | 306 | except IOError: 307 | raise MyIOError(filename) 308 | 309 | if len(vocab_freq) == 0: 310 | return d 311 | else: 312 | return d, vocab_freq 313 | 314 | 315 | def export_trimmed_wordvec_vectors(vocab, wordvec_filename, trimmed_filename): 316 | """Saves glove vectors in numpy array 317 | 318 | Args: 319 | vocab: dictionary vocab[word] = index 320 | glove_filename: a path to a glove file 321 | trimmed_filename: a path where to store a matrix in npy 322 | dim: (int) dimension of embeddings 323 | 324 | """ 325 | num = 0 326 | with open(trimmed_filename, 'w') as outFile: 327 | with open(wordvec_filename, 'r') as inFile: 328 | for line in inFile: 329 | word = line.strip().split(' ')[0] 330 | if word in vocab: 331 | outFile.write(line) 332 | num += 1 333 | 334 | print('{} out of {} tokens can find pre-trained embeddings!'.format(num, len(vocab))) 335 | 336 | 337 | def get_trimmed_wordvec_vectors(filename, vocab): 338 | """ 339 | Args: 340 | filename: path to the npz file 341 | 342 | Returns: 343 | matrix of embeddings (np array) 344 | 345 | """ 346 | f = open(filename, 'r') 347 | f.readline() 348 | dim = len(f.readline().strip().split()) - 1 349 | assert dim > 30 350 | embeddings = np.random.uniform(-0.1, 0.1, size=(len(vocab)+1, dim)) 351 | with open(filename, 'r') as inFile: 352 | for line in inFile: 353 | line = line.strip().split() 354 | word = line[0] 355 | if word in vocab: 356 | embeddings[vocab[word]] = np.array([float(item) for item in line[1:]]) 357 | 358 | return embeddings 359 | 360 | 361 | def get_processing_word(vocab_words=None, vocab_chars=None, 362 | lowercase=False, chars=False, allow_unk=True): 363 | """Return lambda function that transform a word (string) into list, 364 | or tuple of (list, id) of int corresponding to the ids of the word and 365 | its corresponding characters. 366 | 367 | Args: 368 | vocab: dict[word] = idx 369 | 370 | Returns: 371 | f("cat") = ([12, 4, 32], 12345) 372 | = (list of char ids, word id) 373 | 374 | """ 375 | def f(word): 376 | # 0. get chars of words 377 | if vocab_chars is not None and chars == True: 378 | char_ids = [] 379 | for char in word: 380 | # ignore chars out of vocabulary 381 | if char in vocab_chars: 382 | char_ids += [vocab_chars[char]] 383 | 384 | # 1. 
preprocess word 385 | if lowercase: 386 | word = word.lower() 387 | if word.isdigit(): 388 | word = NUM 389 | 390 | # 2. get id of word 391 | if vocab_words is not None: 392 | if word in vocab_words: 393 | word = vocab_words[word] 394 | else: 395 | if allow_unk: 396 | word = vocab_words[UNK] 397 | else: 398 | raise Exception("Unknow key is not allowed. Check that "\ 399 | "your vocab (tags?) is correct") 400 | 401 | # 3. return tuple char ids, word id 402 | if vocab_chars is not None and chars == True: 403 | return char_ids, word 404 | else: 405 | return word 406 | 407 | return f 408 | 409 | 410 | def _pad_sequences(sequences, pad_tok, max_length): 411 | """ 412 | Args: 413 | sequences: a generator of list or tuple 414 | pad_tok: the char to pad with 415 | 416 | Returns: 417 | a list of list where each sublist has same length 418 | """ 419 | sequence_padded, sequence_length = [], [] 420 | 421 | for seq in sequences: 422 | seq = list(seq) 423 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 424 | sequence_padded += [seq_] 425 | sequence_length += [min(len(seq), max_length)] 426 | 427 | return sequence_padded, sequence_length 428 | 429 | 430 | def pad_sequences(sequences, pad_tok, nlevels=2): 431 | """ 432 | Args: 433 | sequences: a generator of list or tuple 434 | pad_tok: the char to pad with 435 | nlevels: "depth" of padding, for the case where we have characters ids 436 | 437 | Returns: 438 | a list of list where each sublist has same length 439 | 440 | """ 441 | if nlevels == 1: 442 | max_length = max(map(lambda x : len(x), sequences)) 443 | sequence_padded, sequence_length = _pad_sequences(sequences, 444 | pad_tok, max_length) 445 | 446 | elif nlevels == 2: 447 | max_length_sentence = max([max(map(lambda x: len(x), seq)) 448 | for seq in sequences]) 449 | sequence_padded, sequence_length = [], [] 450 | for seq in sequences: 451 | # all words are same length now 452 | sp, sl = _pad_sequences(seq, pad_tok, max_length_sentence) 453 | sequence_padded += [sp] 454 | sequence_length += [sl] 455 | 456 | max_length_document = max(map(lambda x : len(x), sequences)) 457 | sequence_padded, _ = _pad_sequences(sequence_padded, 458 | [pad_tok]*max_length_sentence, max_length_document) 459 | sequence_length, _ = _pad_sequences(sequence_length, 0, 460 | max_length_document) 461 | 462 | return sequence_padded, sequence_length 463 | 464 | 465 | def minibatches(data, minibatch_size, shuffle=True): 466 | """ 467 | Args: 468 | data: generator of (sentence, tags) tuples 469 | minibatch_size: (int) 470 | 471 | Yields: 472 | list of tuples 473 | 474 | """ 475 | if shuffle: 476 | random.shuffle(data) 477 | 478 | x_batch, y_batch = [], [] 479 | for (x, y) in data: 480 | if len(x_batch) == minibatch_size: 481 | yield x_batch, y_batch 482 | x_batch, y_batch = [], [] 483 | 484 | # if type(x[0]) == tuple: 485 | # x = zip(*x) 486 | x_batch += [x] 487 | y_batch += [y] 488 | 489 | if len(x_batch) != 0: 490 | yield x_batch, y_batch 491 | 492 | 493 | def get_chunk_type(tok, idx_to_tag): 494 | """ 495 | Args: 496 | tok: id of token, ex 4 497 | idx_to_tag: dictionary {4: "B-PER", ...} 498 | 499 | Returns: 500 | tuple: "B", "PER" 501 | 502 | """ 503 | tag_name = idx_to_tag[tok] 504 | tag_class = tag_name.split('-')[0] 505 | tag_type = tag_name.split('-')[-1] 506 | return tag_class, tag_type 507 | 508 | 509 | def get_chunks(seq, tags): 510 | """Given a sequence of tags, group entities and their position 511 | 512 | Args: 513 | seq: [4, 4, 0, 0, ...] 
sequence of labels
514 |         tags: dict mapping a tag to its index, e.g. tags["O"] = 0
515 | 
516 |     Returns:
517 |         list of (chunk_type, chunk_start, chunk_end)
518 | 
519 |     Example:
520 |         seq = [4, 5, 0, 3]
521 |         tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
522 |         result = [("PER", 0, 2), ("LOC", 3, 4)]
523 | 
524 |     """
525 |     default = tags[NONE]
526 |     idx_to_tag = {idx: tag for tag, idx in tags.items()}
527 |     chunks = []
528 |     chunk_type, chunk_start = None, None
529 |     for i, tok in enumerate(seq):
530 |         # End of a chunk
531 |         if tok == default and chunk_type is not None:
532 |             # Add a chunk.
533 |             chunk = (chunk_type, chunk_start, i)
534 |             chunks.append(chunk)
535 |             chunk_type, chunk_start = None, None
536 | 
537 |         # End of a chunk + start of a chunk!
538 |         elif tok != default:
539 |             tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
540 |             if chunk_type is None:
541 |                 chunk_type, chunk_start = tok_chunk_type, i
542 |             elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
543 |                 chunk = (chunk_type, chunk_start, i)
544 |                 chunks.append(chunk)
545 |                 chunk_type, chunk_start = tok_chunk_type, i
546 |         else:
547 |             pass
548 | 
549 |     # end condition
550 |     if chunk_type is not None:
551 |         chunk = (chunk_type, chunk_start, len(seq))
552 |         chunks.append(chunk)
553 | 
554 |     return chunks
555 | --------------------------------------------------------------------------------
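The `Dataset()` reader above implies the on-disk format: blank lines separate abstracts, lines starting with `###` are skipped, and every remaining line is `<ignored field>|<sentence-level tag>|<whitespace-tokenized sentence>`. Below is a minimal sketch of that format and of the parsing loop; the toy abstract and the `read_abstracts` helper are invented for illustration ('Others' appears in the code's commented-out filter, the other tag strings here are guesses), so only the field layout is taken from the parser above.

```python
# Sketch of the input format parsed by Dataset(): field 0 is ignored,
# field 1 is the sentence tag, field 2 is the tokenized sentence.
toy = """### 12345678
0|Others|This randomized trial enrolled 120 adults .
1|Participants|Patients aged 18 - 65 with type 2 diabetes were included .
2|Intervention|Subjects received metformin 500 mg twice daily .
3|Outcomes|The primary outcome was change in HbA1c at 12 weeks .

"""

def read_abstracts(lines):
    """Re-implements the Dataset() parsing loop without the optional
    processing_word / processing_tag callbacks."""
    results, sentences, tags = [], [], []
    for line in lines:
        line = line.strip()
        if not line:
            if sentences:
                results.append((sentences, tags))
                sentences, tags = [], []
        elif not line.startswith("###"):
            ls = line.split('|')
            tag, sentence = ls[1], ls[2].split()
            sentences.append(sentence)
            tags.append(tag)
    if sentences:                      # flush a trailing abstract
        results.append((sentences, tags))
    return results

abstracts = read_abstracts(toy.splitlines())
print(len(abstracts))    # 1 abstract
print(abstracts[0][1])   # ['Others', 'Participants', 'Intervention', 'Outcomes']
```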
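`pad_sequences` with `nlevels=2` pads twice: every sentence is padded to the longest sentence (in word ids), then every abstract to the longest abstract (in sentences), which is what produces the rectangular batch fed to the model. The sketch below mirrors that logic with standalone helpers so the resulting shapes can be inspected without TensorFlow; `pad_documents` and `_pad` are names chosen here, not functions of the module.

```python
# Worked example of two-level padding (nlevels=2).
def _pad(sequences, pad_tok, max_length):
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)
        padded.append(seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0))
        lengths.append(min(len(seq), max_length))
    return padded, lengths

def pad_documents(docs, pad_tok):
    # pad every sentence to the longest sentence ...
    max_sent = max(max(len(s) for s in doc) for doc in docs)
    padded, lengths = [], []
    for doc in docs:
        sp, sl = _pad(doc, pad_tok, max_sent)
        padded.append(sp)
        lengths.append(sl)
    # ... then pad every document to the longest document
    max_doc = max(len(doc) for doc in docs)
    padded, _ = _pad(padded, [pad_tok] * max_sent, max_doc)
    lengths, _ = _pad(lengths, 0, max_doc)
    return padded, lengths

# two toy "abstracts" of word ids: 2 and 1 sentences, longest sentence 3 words
docs = [[[1, 2, 3], [4, 5]], [[6]]]
padded, lengths = pad_documents(docs, pad_tok=0)
print(padded)   # [[[1, 2, 3], [4, 5, 0]], [[6, 0, 0], [0, 0, 0]]]
print(lengths)  # [[3, 2], [1, 0]]
```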
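`run_evaluate()` in ner_model.py scores predictions at the chunk level: gold and predicted label sequences are turned into sets of (type, start, end) chunks with `get_chunks`, and precision, recall, and F1 are computed over those sets. A small sketch of that computation follows; the toy tag vocabulary and sequences are invented, and the import assumes the snippet is run from the lstm_model directory with TensorFlow installed (data_utils imports it at module level).

```python
# Sketch of the chunk-level scoring used in run_evaluate().
from data_utils import get_chunks

tags = {"O": 0, "B-P": 1, "I-P": 2, "B-I": 3, "I-I": 4}   # toy vocabulary
gold = [1, 2, 0, 3]   # one P chunk (0, 2) and one I chunk (3, 4)
pred = [1, 2, 0, 0]   # the P chunk is found, the I chunk is missed

gold_chunks = set(get_chunks(gold, tags))   # {("P", 0, 2), ("I", 3, 4)}
pred_chunks = set(get_chunks(pred, tags))   # {("P", 0, 2)}

correct = len(gold_chunks & pred_chunks)                 # 1
p = correct / len(pred_chunks) if correct > 0 else 0
r = correct / len(gold_chunks) if correct > 0 else 0
f1 = 2 * p * r / (p + r) if correct > 0 else 0
print(p, r, f1)   # 1.0 0.5 0.666...
```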