├── requirements.txt
├── FusionModel
│   ├── utils.py
│   ├── model.py
│   ├── FusionNet.py
│   └── layers.py
├── download.sh
├── README.md
├── predict.py
├── train.py
├── prepro.py
└── general_utils.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.14.2
2 | msgpack-python==0.5.6
3 | spacy==1.10.1
4 | jsonlines==1.2.0
--------------------------------------------------------------------------------
/FusionModel/utils.py:
--------------------------------------------------------------------------------
 1 | class AverageMeter(object):
 2 |     """Computes and stores the average and current value."""
 3 |     def __init__(self):
 4 |         self.reset()
 5 | 
 6 |     def reset(self):
 7 |         self.val = 0
 8 |         self.avg = 0
 9 |         self.sum = 0
10 |         self.count = 0
11 | 
12 |     def update(self, val, n=1):
13 |         self.val = val
14 |         self.sum += val * n
15 |         self.count += n
16 |         self.avg = self.sum / self.count
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Download MultiNLI
 4 | wget http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
 5 | unzip -a multinli_1.0.zip
 6 | rm -f multinli_1.0.zip
 7 | 
 8 | # Download GloVe
 9 | mkdir -p glove
10 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O glove/glove.840B.300d.zip
11 | unzip glove/glove.840B.300d.zip -d glove
12 | 
13 | # Download CoVe
14 | wget https://s3.amazonaws.com/research.metamind.io/cove/wmtlstm-b142a7f2.pth -O glove/MT-LSTM.pth
15 | 
16 | # Download spaCy English language models
17 | python -m spacy download en
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FusionNet for Natural Language Inference
 2 | 
 3 | This is an example of applying FusionNet to the natural language inference task.
 4 | For more details on FusionNet, please refer to our paper:
 5 | [FusionNet: Fusing via Fully-Aware Attention with Application to Machine Comprehension](https://arxiv.org/abs/1711.07341)
 6 | 
 7 | Requirements
 8 | ------------
 9 | + Python (version 3.5.2)
10 | + PyTorch (0.2.0)
11 | + spaCy (1.x)
12 | + NumPy
13 | + JSON Lines
14 | + MessagePack
15 | 
16 | Since package updates sometimes break backward compatibility, it is recommended to use Docker, which can be downloaded from [here](https://www.docker.com/community-edition#/download). To enable GPU support, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) may also need to be installed.
17 | 
18 | After setting up Docker, simply run `docker pull momohuang/fusionnet-docker` to pull the Docker image. Note that this may take some time to download. Then we can run the image through
19 | `docker run -it momohuang/fusionnet-docker` (CPU only)
20 | or
21 | `nvidia-docker run -it momohuang/fusionnet-docker` (GPU-enabled).
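
For example, to make a local clone of this repository (and the data fetched by `download.sh`) visible inside the container, one option is to bind-mount the current directory; the mount point `/workspace` below is an arbitrary choice, not something the image requires:

`nvidia-docker run -it -v $(pwd):/workspace momohuang/fusionnet-docker`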
22 | 
23 | Quick Start
24 | -----------
25 | `pip install -r requirements.txt`
26 | `bash download.sh`
27 | `python prepro.py`
28 | `python train.py`
29 | 
30 | `train.py` supports an option `--full_att_type`, where
31 | `--full_att_type 0`: standard attention
32 | `--full_att_type 1`: fully-aware attention
33 | `--full_att_type 2`: fully-aware multi-level attention
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import pickle
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter, defaultdict
12 | import torch
13 | import msgpack
14 | import numpy as np
15 | from FusionModel.model import FusionNet_Model
16 | from general_utils import BatchGen, load_train_data, load_eval_data
17 | 
18 | parser = argparse.ArgumentParser(
19 |     description='Predict using FusionNet model for Natural Language Inference.'
20 | )
21 | parser.add_argument('-m', '--model', default='',
22 |                     help='testing model pathname, e.g. "models/checkpoint_epoch_11.pt"')
23 | parser.add_argument('--test_data', default='multinli_1.0/dev_match_preprocessed.msgpack',
24 |                     help='path to preprocessed testing (dev set 2) data file.')
25 | parser.add_argument('-bs', '--batch_size', type=int, default=32)
26 | parser.add_argument('--show', type=int, default=30)
27 | parser.add_argument('--seed', type=int, default=1023,
28 |                     help='random seed for data shuffling, dropout, etc.')
29 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
30 |                     help='whether to use GPU acceleration.')
31 | args = parser.parse_args()
32 | 
33 | random.seed(args.seed)
34 | np.random.seed(args.seed)
35 | torch.manual_seed(args.seed)
36 | if args.cuda:
37 |     torch.cuda.manual_seed_all(args.seed)
38 | 
39 | log = logging.getLogger(__name__)
40 | log.setLevel(logging.DEBUG)
41 | ch = logging.StreamHandler(sys.stdout)
42 | ch.setLevel(logging.INFO)
43 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
44 | ch.setFormatter(formatter)
45 | log.addHandler(ch)
46 | 
47 | def main():
48 |     log.info('[program starts.]')
49 |     checkpoint = torch.load(args.model)
50 | 
51 |     opt = checkpoint['config']
52 |     state_dict = checkpoint['state_dict']
53 |     model = FusionNet_Model(opt, state_dict = state_dict)
54 |     log.info('[Model loaded.]')
55 | 
56 |     test, test_embedding, test_ans = load_eval_data(opt, args.test_data)
57 |     model.setup_eval_embed(test_embedding)
58 |     log.info('[Data loaded.]')
59 | 
60 |     if args.cuda:
61 |         model.cuda()
62 | 
63 |     batches = BatchGen(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
64 |     predictions = []
65 |     for batch in batches:
66 |         predictions.extend(model.predict(batch))
67 |     acc = sum([x == y for x, y in zip(predictions, test_ans)]) / len(test_ans) * 100.0
68 |     print("Accuracy =", acc)
69 |     print(predictions[:args.show])
70 |     print(test_ans[:args.show])
71 | 
72 | if __name__ == '__main__':
73 |     main()
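
# Example usage (hypothetical checkpoint path; any model saved by train.py works):
#   python predict.py -m models/best_model.pt --test_data multinli_1.0/dev_match_preprocessed.msgpack
# This prints the overall accuracy, then the first `--show` predicted and gold class ids.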
--------------------------------------------------------------------------------
/FusionModel/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.optim as optim
 4 | import torch.nn.functional as F
 5 | import numpy as np
 6 | import logging
 7 | 
 8 | from torch.autograd import Variable
 9 | from .utils import AverageMeter
10 | from .FusionNet import FusionNet
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | class FusionNet_Model(object):
15 |     """
16 |     High level model that handles initializing the underlying network
17 |     architecture, saving, updating examples, and predicting examples.
18 |     """
19 | 
20 |     def __init__(self, opt, embedding=None, state_dict=None):
21 |         # Book-keeping.
22 |         self.opt = opt
23 |         self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
24 |         self.eval_embed_transfer = True
25 |         self.train_loss = AverageMeter()
26 | 
27 |         # Building network.
28 |         self.network = FusionNet(opt, embedding)
29 |         if state_dict:
30 |             new_state = set(self.network.state_dict().keys())
31 |             for k in list(state_dict['network'].keys()):
32 |                 if k not in new_state:
33 |                     del state_dict['network'][k]
34 |             for k, v in list(self.network.state_dict().items()):
35 |                 if k not in state_dict['network']:
36 |                     state_dict['network'][k] = v
37 |             self.network.load_state_dict(state_dict['network'])
38 | 
39 |         # Building optimizer.
40 |         parameters = [p for p in self.network.parameters() if p.requires_grad]
41 |         if opt['optimizer'] == 'adamax':
42 |             self.optimizer = optim.Adamax(parameters)
43 |         elif opt['optimizer'] == 'adam':
44 |             self.optimizer = optim.Adam(parameters)
45 |         else:
46 |             raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])
47 |         if state_dict and 'optimizer' in state_dict:
48 |             self.optimizer.load_state_dict(state_dict['optimizer'])
49 | 
50 |         if opt['fix_embeddings']:
51 |             wvec_size = 0
52 |         else:
53 |             wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim']
54 |         self.total_param = sum([p.nelement() for p in parameters]) - wvec_size
55 | 
56 |     def update(self, ex):
57 |         # Train mode
58 |         self.network.train()
59 | 
60 |         # Transfer to GPU; ex holds the 10 BatchGen input tensors (P_id, P_feature, P_tag, P_ent, P_mask, H_id, H_feature, H_tag, H_ent, H_mask) followed by the label
61 |         if self.opt['cuda']:
62 |             inputs = [Variable(e.cuda(async=True)) for e in ex[:10]]
63 |             targets = Variable(ex[10].cuda(async=True))
64 |         else:
65 |             inputs = [Variable(e) for e in ex[:10]]
66 |             targets = Variable(ex[10])
67 | 
68 |         # Run forward
69 |         scores = self.network(*inputs)  # output: [batch_size, 3]
70 | 
71 |         # Compute loss and accuracies
72 |         loss = F.cross_entropy(scores, targets)
73 |         self.train_loss.update(loss.data[0], ex[0].size(0))
74 | 
75 |         # Clear gradients and run backward
76 |         self.optimizer.zero_grad()
77 |         loss.backward()
78 | 
79 |         # Clip gradients
80 |         torch.nn.utils.clip_grad_norm(self.network.parameters(),
81 |                                       self.opt['grad_clipping'])
82 | 
83 |         # Update parameters
84 |         self.optimizer.step()
85 |         self.updates += 1
86 | 
87 |         # Reset any partially fixed parameters (e.g. 
rare words) 88 | self.reset_embeddings() 89 | self.eval_embed_transfer = True 90 | 91 | def predict(self, ex, best_nth=1): 92 | # Eval mode 93 | self.network.eval() 94 | 95 | # Transfer trained embedding to evaluation embedding 96 | if self.eval_embed_transfer: 97 | self.update_eval_embed() 98 | self.eval_embed_transfer = False 99 | 100 | # Transfer to GPU 101 | if self.opt['cuda']: 102 | # volatile means no gradient is needed 103 | inputs = [Variable(e.cuda(async=True), volatile=True) 104 | for e in ex[:10]] 105 | else: 106 | inputs = [Variable(e, volatile=True) for e in ex[:10]] 107 | 108 | # Run forward 109 | scores = self.network(*inputs) # output: [batch_size, 3] 110 | 111 | # Transfer to CPU/normal tensors and find classes for instances 112 | scores = scores.data.cpu() 113 | predictions = torch.max(scores, 1)[1].tolist() 114 | 115 | return predictions # list of classes 116 | 117 | # allow the evaluation embedding be larger than training embedding 118 | # this is helpful if we have pretrained word embeddings 119 | def setup_eval_embed(self, eval_embed, padding_idx = 0): 120 | # eval_embed should be a supermatrix of training embedding 121 | self.network.eval_embed = nn.Embedding(eval_embed.size(0), 122 | eval_embed.size(1), 123 | padding_idx = padding_idx) 124 | self.network.eval_embed.weight.data = eval_embed 125 | for p in self.network.eval_embed.parameters(): 126 | p.requires_grad = False 127 | self.eval_embed_transfer = True 128 | 129 | self.network.CoVe.setup_eval_embed(eval_embed) 130 | 131 | def update_eval_embed(self): 132 | # update evaluation embedding to trained embedding 133 | if self.opt['tune_partial'] > 0: 134 | offset = self.opt['tune_partial'] 135 | self.network.eval_embed.weight.data[0:offset] \ 136 | = self.network.embedding.weight.data[0:offset] 137 | else: 138 | offset = 10 139 | self.network.eval_embed.weight.data[0:offset] \ 140 | = self.network.embedding.weight.data[0:offset] 141 | 142 | def reset_embeddings(self): 143 | # Reset fixed embeddings to original value 144 | if self.opt['tune_partial'] > 0: 145 | offset = self.opt['tune_partial'] 146 | if offset < self.network.embedding.weight.data.size(0): 147 | self.network.embedding.weight.data[offset:] \ 148 | = self.network.fixed_embedding 149 | 150 | def save_for_predict(self, filename, epoch): 151 | network_state = self.network.state_dict() 152 | if 'eval_embed.weight' in network_state: 153 | del network_state['eval_embed.weight'] 154 | if 'fixed_embedding' in network_state: 155 | del network_state['fixed_embedding'] 156 | params = { 157 | 'state_dict': {'network': network_state}, 158 | 'config': self.opt, 159 | } 160 | try: 161 | torch.save(params, filename) 162 | logger.info('model saved to {}'.format(filename)) 163 | except BaseException: 164 | logger.warn('[ WARN: Saving failed... continuing anyway. 
]')
165 | 
166 |     def cuda(self):
167 |         self.network.cuda()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import pickle
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter, defaultdict
12 | import torch
13 | import msgpack
14 | import numpy as np
15 | from FusionModel.model import FusionNet_Model
16 | from general_utils import BatchGen, load_train_data, load_eval_data
17 | 
18 | parser = argparse.ArgumentParser(
19 |     description='Train FusionNet model for Natural Language Inference.'
20 | )
21 | # system
22 | parser.add_argument('--name', default='', help='additional name of the current run')
23 | parser.add_argument('--log_file', default='output.log',
24 |                     help='path for log file.')
25 | parser.add_argument('--log_per_updates', type=int, default=80,
26 |                     help='log model loss per x updates (mini-batches).')
27 | 
28 | parser.add_argument('--train_meta', default='multinli_1.0/train_meta.msgpack',
29 |                     help='path to preprocessed training meta file.')
30 | parser.add_argument('--train_data', default='multinli_1.0/train_data.msgpack',
31 |                     help='path to preprocessed training data file.')
32 | parser.add_argument('--dev_data', default='multinli_1.0/dev_mismatch_preprocessed.msgpack',
33 |                     help='path to preprocessed validation data file.')
34 | parser.add_argument('--test_data', default='multinli_1.0/dev_match_preprocessed.msgpack',
35 |                     help='path to preprocessed testing (dev set 2) data file.')
36 | 
37 | parser.add_argument('--MTLSTM_path', default='glove/MT-LSTM.pth')
38 | parser.add_argument('--model_dir', default='models',
39 |                     help='path to store saved models.')
40 | parser.add_argument('--save_all', dest="save_best_only", action='store_false',
41 |                     help='save all models in addition to the best.')
42 | parser.add_argument('--do_not_save', action='store_true', help='don\'t save any model')
43 | parser.add_argument('--seed', type=int, default=1023,
44 |                     help='random seed for data shuffling, dropout, etc.')
45 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
46 |                     help='whether to use GPU acceleration.')
47 | # training
48 | parser.add_argument('-e', '--epoches', type=int, default=20)
49 | parser.add_argument('-bs', '--batch_size', type=int, default=32)
50 | parser.add_argument('-op', '--optimizer', default='adamax',
51 |                     help='supported optimizers: adamax, adam.')
52 | parser.add_argument('-gc', '--grad_clipping', type=float, default=10)
53 | parser.add_argument('-tp', '--tune_partial', type=int, default=1000,
54 |                     help='finetune top-x embeddings (including <PAD>, <UNK>).')
55 | parser.add_argument('--fix_embeddings', action='store_true',
56 |                     help='if true, `tune_partial` will be ignored.')
57 | # model
58 | parser.add_argument('--number_of_class', type=int, default=3)
59 | parser.add_argument('--final_merge', default='linear_self_attn')
60 | 
61 | parser.add_argument('--hidden_size', type=int, default=125)
62 | parser.add_argument('--enc_rnn_layers', type=int, default=2, help="Encoding RNN layers")
63 | parser.add_argument('--inf_rnn_layers', type=int, default=2, help="Inference RNN layers")
64 | parser.add_argument('--full_att_type', type=int, default=2)
65 | 
66 | parser.add_argument('--pos_size', type=int, default=56,
67 |                     help='how many kinds of POS tags.')
68 | parser.add_argument('--pos_dim', type=int, default=12,
69 |                     help='the embedding dimension for POS tags.')
70 | parser.add_argument('--ner_size', type=int, default=19,
71 |                     help='how many kinds of named entity tags.')
72 | parser.add_argument('--ner_dim', type=int, default=8,
73 |                     help='the embedding dimension for named entity tags.')
74 | 
75 | parser.add_argument('--no_seq_dropout', dest='do_seq_dropout', action='store_false')
76 | parser.add_argument('--my_dropout_p', type=float, default=0.3)
77 | parser.add_argument('--dropout_emb', type=float, default=0.3)
78 | parser.add_argument('--dropout_EM', type=float, default=0.6)
79 | 
80 | args = parser.parse_args()
81 | 
82 | if args.name != '':
83 |     args.model_dir = args.model_dir + '_' + args.name
84 |     args.log_file = os.path.join(os.path.dirname(args.log_file), 'output_' + args.name + '.log')
85 | 
86 | # set model dir
87 | model_dir = args.model_dir
88 | os.makedirs(model_dir, exist_ok=True)
89 | model_dir = os.path.abspath(model_dir)
90 | 
91 | # set random seed
92 | random.seed(args.seed)
93 | np.random.seed(args.seed)
94 | torch.manual_seed(args.seed)
95 | if args.cuda:
96 |     torch.cuda.manual_seed_all(args.seed)
97 | 
98 | # setup logger
99 | log = logging.getLogger(__name__)
100 | log.setLevel(logging.DEBUG)
101 | 
102 | ch = logging.StreamHandler(sys.stdout)
103 | ch.setLevel(logging.INFO)
104 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
105 | ch.setFormatter(formatter)
106 | log.addHandler(ch)
107 | 
108 | def main():
109 |     log.info('[program starts.]')
110 |     opt = vars(args)  # changing opt will change args
111 |     train, train_embedding, opt = load_train_data(opt, args.train_meta, args.train_data)
112 |     dev, dev_embedding, dev_ans = load_eval_data(opt, args.dev_data)
113 |     test, test_embedding, test_ans = load_eval_data(opt, args.test_data)
114 |     log.info('[Data loaded.]')
115 | 
116 |     model = FusionNet_Model(opt, train_embedding)
117 |     if args.cuda: model.cuda()
118 |     log.info("[dev] Total number of params: {}".format(model.total_param))
119 | 
120 |     best_acc = 0.0
121 | 
122 |     for epoch in range(1, 1 + args.epoches):
123 |         log.warning('Epoch {}'.format(epoch))
124 | 
125 |         # train
126 |         batches = BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
127 |         start = datetime.now()
128 |         for i, batch in enumerate(batches):
129 |             model.update(batch)
130 |             if i % args.log_per_updates == 0:
131 |                 log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
132 |                     model.updates, model.train_loss.avg,
133 |                     str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
134 | 
135 |         # dev eval
136 |         model.setup_eval_embed(dev_embedding)
137 |         if args.cuda: model.cuda()
138 | 
139 |         batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
140 |         predictions = []
141 |         for batch in batches:
142 |             predictions.extend(model.predict(batch))
143 |         acc = sum([x == y for x, y in zip(predictions, dev_ans)]) / len(dev_ans) * 100.0
144 | 
145 |         # test (or dev 2) eval
146 |         model.setup_eval_embed(test_embedding)
147 |         if args.cuda: model.cuda()
148 | 
149 |         batches = BatchGen(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
150 |         predictions = []
151 |         for batch in batches:
152 |             predictions.extend(model.predict(batch))
153 |         corr_acc = sum([x == y for x, y in zip(predictions, test_ans)]) / len(test_ans) * 100.0
154 | 
155 |         # save for predict
156 |         if not args.do_not_save:
157 |             if args.save_best_only:
158 |                 if (acc + corr_acc)/2 > best_acc:
159 |                     model_file 
= os.path.join(model_dir, 'best_model.pt') 160 | model.save_for_predict(model_file, epoch) 161 | log.info('[new best model saved.]') 162 | else: 163 | model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch)) 164 | model.save_for_predict(model_file, epoch) 165 | if (acc + corr_acc)/2 > best_acc: 166 | copyfile( 167 | os.path.join(model_dir, model_file), 168 | os.path.join(model_dir, 'best_model.pt')) 169 | log.info('[new best model saved.]') 170 | if (acc + corr_acc)/2 > best_acc: 171 | best_acc = (acc + corr_acc)/2 172 | 173 | log.warning("Epoch {0} - dev Acc: {1:.3f}, dev2 Acc: {2:.3f} (best Acc: {3:.3f})".format(epoch, acc, corr_acc, best_acc)) 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /prepro.py: -------------------------------------------------------------------------------- 1 | import re 2 | import spacy 3 | import msgpack 4 | import unicodedata 5 | import numpy as np 6 | import argparse 7 | import collections 8 | import os.path 9 | import multiprocessing 10 | import logging 11 | import random 12 | from general_utils import normalize_text, build_embedding, load_glove_vocab, pre_proc, feature_gen, token2id, process_jsonlines 13 | 14 | # Fixed Parameters for MultiNLI_1.0 15 | trn_file = 'multinli_1.0/multinli_1.0_train.jsonl' 16 | trn_meta_msgpack = 'multinli_1.0/train_meta.msgpack' 17 | trn_data_msgpack = 'multinli_1.0/train_data.msgpack' 18 | 19 | dev_file = 'multinli_1.0/multinli_1.0_dev_mismatched.jsonl' 20 | dev_msgpack = 'multinli_1.0/dev_mismatch_preprocessed.msgpack' 21 | 22 | tst_file = 'multinli_1.0/multinli_1.0_dev_matched.jsonl' 23 | tst_msgpack = 'multinli_1.0/dev_match_preprocessed.msgpack' 24 | 25 | # Parameters 26 | parser = argparse.ArgumentParser( 27 | description='Preprocess the data.' 28 | ) 29 | parser.add_argument('--wv_file', default='glove/glove.840B.300d.txt', 30 | help='path to word vector file.') 31 | parser.add_argument('--wv_dim', type=int, default=300, 32 | help='word vector dimension.') 33 | parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), 34 | help='number of threads for preprocessing.') 35 | parser.add_argument('--seed', type=int, default=1023, 36 | help='random seed for data shuffling, embedding init, etc.') 37 | 38 | args = parser.parse_args() 39 | wv_file = args.wv_file 40 | wv_dim = args.wv_dim 41 | nlp = spacy.load('en', parser=False) 42 | 43 | random.seed(args.seed) 44 | np.random.seed(args.seed) 45 | 46 | #================================================================ 47 | #=========================== GloVe ============================== 48 | #================================================================ 49 | 50 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG, 51 | datefmt='%m/%d/%Y %I:%M:%S') 52 | log = logging.getLogger(__name__) 53 | 54 | log.info('start data preparing... 
(using {} threads)'.format(args.threads))
55 | 
56 | glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
57 | log.info('glove loaded.')
58 | 
59 | #===============================================================
60 | #=================== Work on training data =====================
61 | #===============================================================
62 | 
63 | train = process_jsonlines(trn_file)
64 | log.info('train jsonline data flattened.')
65 | 
66 | trP_iter = (pre_proc(p) for p in train.P)
67 | trH_iter = (pre_proc(h) for h in train.H)
68 | trP_docs = [doc for doc in nlp.pipe(
69 |     trP_iter, batch_size=64, n_threads=args.threads)]
70 | trH_docs = [doc for doc in nlp.pipe(
71 |     trH_iter, batch_size=64, n_threads=args.threads)]
72 | 
73 | # tokens
74 | trP_tokens = [[normalize_text(w.text) for w in doc] for doc in trP_docs]
75 | trH_tokens = [[normalize_text(w.text) for w in doc] for doc in trH_docs]
76 | log.info('All tokens for training are obtained.')
77 | 
78 | # features
79 | trP_tags, trP_ents, trP_features = feature_gen(trP_docs, trH_docs)
80 | trH_tags, trH_ents, trH_features = feature_gen(trH_docs, trP_docs)
81 | log.info('features for training are generated.')
82 | 
83 | def build_train_vocab(A, B):  # vocabulary will also be sorted accordingly
84 |     counter = collections.Counter(w for doc in A + B for w in doc)
85 |     vocab = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True)
86 | 
87 |     total = sum(counter.values())
88 |     matched = sum(counter[t] for t in vocab)
89 |     log.info('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
90 |         len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
91 |     vocab.insert(0, "<PAD>")
92 |     vocab.insert(1, "<UNK>")
93 |     return vocab
94 | 
95 | # vocab
96 | tr_vocab = build_train_vocab(trH_tokens, trP_tokens)
97 | trP_ids = token2id(trP_tokens, tr_vocab, unk_id=1)
98 | trH_ids = token2id(trH_tokens, tr_vocab, unk_id=1)
99 | 
100 | # tags
101 | vocab_tag = [''] + list(nlp.tagger.labels)
102 | trP_tag_ids = token2id(trP_tags, vocab_tag)
103 | trH_tag_ids = token2id(trH_tags, vocab_tag)
104 | 
105 | # entities
106 | vocab_ent = list(set([ent for sent in trP_ents+trH_ents for ent in sent]))
107 | trP_ent_ids = token2id(trP_ents, vocab_ent)
108 | trH_ent_ids = token2id(trH_ents, vocab_ent)
109 | 
110 | log.info('Found {} POS tags.'.format(len(vocab_tag)))
111 | log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
112 | log.info('vocabulary for training is built.')
113 | 
114 | tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)
115 | log.info('got embedding matrix for training.')
116 | 
117 | meta = {
118 |     'vocab': tr_vocab,
119 |     'embedding': tr_embedding.tolist()
120 | }
121 | with open(trn_meta_msgpack, 'wb') as f:
122 |     msgpack.dump(meta, f, encoding='utf8')
123 | 
124 | result = {
125 |     'premise_ids': trP_ids,
126 |     'premise_features': trP_features,  # exact match, tf
127 |     'premise_tags': trP_tag_ids,  # POS tagging
128 |     'premise_ents': trP_ent_ids,  # Entity recognition
129 |     'hypothesis_ids': trH_ids,
130 |     'hypothesis_features': trH_features,  # exact match, tf
131 |     'hypothesis_tags': trH_tag_ids,  # POS tagging
132 |     'hypothesis_ents': trH_ent_ids,  # Entity recognition
133 |     'answers': train.label
134 | }
135 | with open(trn_data_msgpack, 'wb') as f:
136 |     msgpack.dump(result, f, encoding='utf8')
137 | 
138 | log.info('saved training to disk.')
139 | 
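# A sketch of what is now on disk (assuming the default paths above):
#   train_meta.msgpack -> {'vocab': ['<PAD>', '<UNK>', 'the', ...],
#                          'embedding': [[...wv_dim floats...], ...]}
#   train_data.msgpack -> {'premise_ids': [[17, 4, ...], ...], ...,
#                          'answers': ['neutral', 'entailment', ...]}
# Ids 0 and 1 are reserved for padding and unknown words, respectively.
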
140 | #==========================================================
141 | #=================== Work on dev & test ====================
142 | #==========================================================
143 | 
144 | def preprocess_eval_data(filename, output_msgpack):
145 |     EvalData = process_jsonlines(filename)
146 | 
147 |     filename = os.path.basename(filename)
148 |     log.info(filename + ' flattened.')
149 | 
150 |     EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
151 |     EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
152 |     EvalDataP_docs = [doc for doc in nlp.pipe(
153 |         EvalDataP_iter, batch_size=64, n_threads=args.threads)]
154 |     EvalDataH_docs = [doc for doc in nlp.pipe(
155 |         EvalDataH_iter, batch_size=64, n_threads=args.threads)]
156 | 
157 |     # tokens
158 |     EvalDataP_tokens = [[normalize_text(w.text) for w in doc] for doc in EvalDataP_docs]
159 |     EvalDataH_tokens = [[normalize_text(w.text) for w in doc] for doc in EvalDataH_docs]
160 |     log.info('All tokens for ' + filename + ' are obtained.')
161 | 
162 |     # features
163 |     EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(EvalDataP_docs, EvalDataH_docs)
164 |     EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(EvalDataH_docs, EvalDataP_docs)
165 |     log.info('features for ' + filename + ' are generated.')
166 | 
167 |     def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
168 |         existing_vocab = set(tr_vocab)
169 |         new_vocab = list(set([w for doc in A + B for w in doc if w not in existing_vocab and w in glove_vocab]))
170 |         vocab = tr_vocab + new_vocab
171 |         log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
172 |         return vocab
173 | 
174 |     # vocab
175 |     EvalData_vocab = build_EvalData_vocab(EvalDataP_tokens, EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
176 |     EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
177 |     EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)
178 | 
179 |     # tags
180 |     EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
181 |     EvalDataH_tag_ids = token2id(EvalDataH_tags, vocab_tag)  # vocab_tag same as training
182 | 
183 |     # entities
184 |     EvalDataP_ent_ids = token2id(EvalDataP_ents, vocab_ent)  # vocab_ent same as training
185 |     EvalDataH_ent_ids = token2id(EvalDataH_ents, vocab_ent)  # vocab_ent same as training
186 |     log.info('vocabulary for ' + filename + ' is built.')
187 | 
188 |     EvalData_embedding = build_embedding(wv_file, EvalData_vocab, wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
189 |     log.info('got embedding matrix for ' + filename)
190 | 
191 |     result = {
192 |         'premise_ids': EvalDataP_ids,
193 |         'premise_features': EvalDataP_features,  # exact match, tf
194 |         'premise_tags': EvalDataP_tag_ids,  # POS tagging
195 |         'premise_ents': EvalDataP_ent_ids,  # Entity recognition
196 |         'hypothesis_ids': EvalDataH_ids,
197 |         'hypothesis_features': EvalDataH_features,  # exact match, tf
198 |         'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
199 |         'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
200 |         'vocab': EvalData_vocab,
201 |         'embedding': EvalData_embedding.tolist(),
202 |         'answers': EvalData.label
203 |     }
204 |     with open(output_msgpack, 'wb') as f:
205 |         msgpack.dump(result, f)
206 | 
207 |     log.info('saved ' + output_msgpack + ' to disk.')
208 | 
209 | preprocess_eval_data(dev_file, dev_msgpack)
210 | preprocess_eval_data(tst_file, tst_msgpack)
--------------------------------------------------------------------------------
/general_utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import unicodedata
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter
12 | import torch
13 | import msgpack
14 | import jsonlines
15 | import numpy as np
16 | 
17 | #===========================================================================
18 | #================= All for preprocessing the NLI data set ==================
19 | #===========================================================================
20 | 
21 | def normalize_text(text):
22 |     return unicodedata.normalize('NFD', text)
23 | 
24 | def load_glove_vocab(file, wv_dim):
25 |     vocab = set()
26 |     with open(file, encoding="utf8") as f:
27 |         for line in f:
28 |             elems = line.split()
29 |             token = normalize_text(''.join(elems[0:-wv_dim]))
30 |             vocab.add(token)
31 |     return vocab
32 | 
33 | def space_extend(matchobj):
34 |     return ' ' + matchobj.group(0) + ' '
35 | 
36 | def pre_proc(text):
37 |     # make hyphens, spaces clean
38 |     text = re.sub(u'-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/', space_extend, text)
39 |     text = text.strip(' \n')
40 |     text = re.sub('\s+', ' ', text)
41 |     return text
42 | 
43 | class SNLIData:
44 |     def __init__(self, label, sent1, sent2):
45 |         self.label = label
46 |         self.P = sent1  # Premise
47 |         self.H = sent2  # Hypothesis
48 | 
49 | def process_jsonlines(data_file):
50 |     with jsonlines.open(data_file) as reader:
51 |         snli_label = []
52 |         snli_sent1 = []
53 |         snli_sent2 = []
54 |         for obj in reader:
55 |             if obj['gold_label'] != '-':
56 |                 snli_label.append(obj['gold_label'])
57 |                 snli_sent1.append(obj['sentence1'])
58 |                 snli_sent2.append(obj['sentence2'])
59 |     return SNLIData(snli_label, snli_sent1, snli_sent2)
60 | 
61 | def feature_gen(A_docs, B_docs):
62 |     A_tags = [[w.tag_ for w in doc] for doc in A_docs]
63 |     A_ents = [[w.ent_type_ for w in doc] for doc in A_docs]
64 |     A_features = []
65 | 
66 |     for textA, textB in zip(A_docs, B_docs):
67 |         counter_ = Counter(w.text.lower() for w in textA)
68 |         total = sum(counter_.values())
69 |         term_freq = [counter_[w.text.lower()] / total for w in textA]
70 | 
71 |         question_word = {w.text for w in textB}
72 |         question_lower = {w.text.lower() for w in textB}
73 |         question_lemma = {w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower() for w in textB}
74 |         match_origin = [w.text in question_word for w in textA]
75 |         match_lower = [w.text.lower() in question_lower for w in textA]
76 |         match_lemma = [(w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()) in question_lemma for w in textA]
77 |         A_features.append(list(zip(term_freq, match_origin, match_lower, match_lemma)))
78 | 
79 |     return A_tags, A_ents, A_features
80 | 
81 | def build_embedding(embed_file, targ_vocab, wv_dim):
82 |     vocab_size = len(targ_vocab)
83 |     emb = np.random.uniform(-1, 1, (vocab_size, wv_dim))
84 |     emb[0] = 0  # the <PAD> embedding should be all 0 (using broadcast)
85 | 
86 |     w2id = {w: i for i, w in enumerate(targ_vocab)}
87 |     with open(embed_file, encoding="utf8") as f:
88 |         for line in f:
89 |             elems = line.split()
90 |             token = normalize_text(''.join(elems[0:-wv_dim]))
91 |             if token in w2id:
92 |                 emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
93 |     return emb
94 | 
95 | def token2id(docs, vocab, unk_id=None):
96 |     w2id = {w: i for i, w in enumerate(vocab)}
97 |     ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
98 |     return ids
99 | 
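# Illustrative sketch (not part of the pipeline): with vocab = ['<PAD>', '<UNK>', 'a', 'cat'],
#   token2id([['a', 'cat'], ['a', 'dog']], vocab, unk_id=1)  =>  [[2, 3], [2, 1]]
# i.e. out-of-vocabulary tokens ('dog') map to unk_id, while id 0 is reserved
# for padding positions.
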
100 | #===========================================================================
101 | #=================== Load Training and Evaluation data =====================
102 | #===========================================================================
103 | 
104 | def text2class(ans):
105 |     if ans == "neutral": return 0
106 |     if ans == "entailment": return 1
107 |     if ans == "contradiction": return 2
108 |     raise ValueError('unknown label: ' + ans)
109 | 
110 | def load_train_data(opt, train_meta, train_data):
111 |     with open(train_meta, 'rb') as f:
112 |         meta = msgpack.load(f, encoding='utf8')
113 |     embedding = torch.Tensor(meta['embedding'])
114 |     opt['vocab_size'] = embedding.size(0)
115 |     opt['embedding_dim'] = embedding.size(1)
116 | 
117 |     with open(train_data, 'rb') as f:
118 |         data = msgpack.load(f, encoding='utf8')
119 |     opt['num_features'] = len(data['premise_features'][0][0])
120 | 
121 |     train = list(zip(  # list() due to lazy evaluation of zip
122 |         data['premise_ids'],
123 |         data['premise_features'],
124 |         data['premise_tags'],
125 |         data['premise_ents'],
126 |         data['hypothesis_ids'],
127 |         data['hypothesis_features'],
128 |         data['hypothesis_tags'],
129 |         data['hypothesis_ents'],
130 |         [text2class(ans) for ans in data['answers']]
131 |     ))
132 |     return train, embedding, opt
133 | 
134 | def load_eval_data(opt, eval_data):  # can be extended to true test set
135 |     with open(eval_data, 'rb') as f:
136 |         data = msgpack.load(f, encoding='utf8')
137 |     embedding = torch.Tensor(data['embedding'])
138 | 
139 |     assert opt['embedding_dim'] == embedding.size(1)
140 |     assert opt['num_features'] == len(data['premise_features'][0][0])
141 | 
142 |     eval_set = list(zip(
143 |         data['premise_ids'],
144 |         data['premise_features'],
145 |         data['premise_tags'],
146 |         data['premise_ents'],
147 |         data['hypothesis_ids'],
148 |         data['hypothesis_features'],
149 |         data['hypothesis_tags'],
150 |         data['hypothesis_ents']
151 |     ))
152 |     return eval_set, embedding, [text2class(ans) for ans in data['answers']]
153 | 
154 | #===========================================================================
155 | #================ For batch generation (train & predict) ===================
156 | #===========================================================================
157 | 
158 | class BatchGen:
159 |     def __init__(self, data, batch_size, gpu, evaluation=False):
160 |         '''
161 |         input:
162 |             data - list of lists
163 |             batch_size - int
164 |         '''
165 |         self.batch_size = batch_size
166 |         self.eval = evaluation
167 |         self.gpu = gpu
168 | 
169 |         # random shuffle for training
170 |         if not evaluation:
171 |             indices = list(range(len(data)))
172 |             random.shuffle(indices)
173 |             data = [data[i] for i in indices]  # apply the shuffled order
174 |         # chunk into batches (if i + batch_size > data.size(0), it's fine)
175 |         data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
176 |         self.data = data
177 | 
178 |     def __len__(self):
179 |         return len(self.data)
180 | 
181 |     def __iter__(self):
182 |         for batch in self.data:
183 |             batch_size = len(batch)
184 |             batch = list(zip(*batch))
185 |             if self.eval:
186 |                 assert len(batch) == 8
187 |             else:
188 |                 assert len(batch) == 9  # + answer
189 | 
190 |             P_len = max(len(x) for x in batch[0])
191 |             H_len = max(len(x) for x in batch[4])
192 |             feature_len = len(batch[1][0][0])
193 | 
194 |             # Premise Tokens
195 |             P_id = torch.LongTensor(batch_size, P_len).fill_(0)
196 |             for i, doc in enumerate(batch[0]):
197 |                 P_id[i, :len(doc)] = torch.LongTensor(doc)
198 | 
199 |             # Premise Features
200 |             P_feature = torch.Tensor(batch_size, P_len, feature_len).fill_(0)
201 |             for i, doc in enumerate(batch[1]):
202 |                 for j, feature in enumerate(doc):
203 |                     P_feature[i, j, :] = torch.Tensor(feature)
204 | 
205 |             # Premise PoS
206 |             P_tag = 
torch.LongTensor(batch_size, P_len).fill_(0) 207 | for i, doc in enumerate(batch[2]): 208 | P_tag[i, :len(doc)] = torch.LongTensor(doc) 209 | 210 | # Premise NER 211 | P_ent = torch.LongTensor(batch_size, P_len).fill_(0) 212 | for i, doc in enumerate(batch[3]): 213 | P_ent[i, :len(doc)] = torch.LongTensor(doc) 214 | 215 | # Hypothesis Tokens 216 | H_id = torch.LongTensor(batch_size, H_len).fill_(0) 217 | for i, doc in enumerate(batch[4]): 218 | H_id[i, :len(doc)] = torch.LongTensor(doc) 219 | 220 | # Hypothesis Features 221 | H_feature = torch.Tensor(batch_size, H_len, feature_len).fill_(0) 222 | for i, doc in enumerate(batch[5]): 223 | for j, feature in enumerate(doc): 224 | H_feature[i, j, :] = torch.Tensor(feature) 225 | 226 | # Hypothesis PoS 227 | H_tag = torch.LongTensor(batch_size, H_len).fill_(0) 228 | for i, doc in enumerate(batch[6]): 229 | H_tag[i, :len(doc)] = torch.LongTensor(doc) 230 | 231 | # Hypothesis NER 232 | H_ent = torch.LongTensor(batch_size, H_len).fill_(0) 233 | for i, doc in enumerate(batch[7]): 234 | H_ent[i, :len(doc)] = torch.LongTensor(doc) 235 | 236 | # Premise, Hypothesis Masks 237 | P_mask = torch.eq(P_id, 0) 238 | H_mask = torch.eq(H_id, 0) 239 | 240 | # Label: neutral (0), entailment (1), contradiction (2) 241 | if not self.eval: 242 | label = torch.LongTensor(batch[8]) 243 | 244 | if self.gpu: # page locked memory for async data transfer 245 | P_id = P_id.pin_memory() 246 | P_feature = P_feature.pin_memory() 247 | P_tag = P_tag.pin_memory() 248 | P_ent = P_ent.pin_memory() 249 | 250 | H_id = H_id.pin_memory() 251 | H_feature = H_feature.pin_memory() 252 | H_tag = H_tag.pin_memory() 253 | H_ent = H_ent.pin_memory() 254 | 255 | P_mask = P_mask.pin_memory() 256 | H_mask = H_mask.pin_memory() 257 | 258 | if self.eval: 259 | yield (P_id, P_feature, P_tag, P_ent, P_mask, 260 | H_id, H_feature, H_tag, H_ent, H_mask) 261 | else: 262 | yield (P_id, P_feature, P_tag, P_ent, P_mask, 263 | H_id, H_feature, H_tag, H_ent, H_mask, label) 264 | -------------------------------------------------------------------------------- /FusionModel/FusionNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from . 
import layers
 5 | 
 6 | class FusionNet(nn.Module):
 7 |     """Network for the FusionNet Module."""
 8 |     def __init__(self, opt, embedding=None, padding_idx=0):
 9 |         super(FusionNet, self).__init__()
10 | 
11 |         # Input size to RNN: word emb + CoVe emb + POS emb + NER emb (manual features enter as aux input)
12 |         input_size = 0
13 | 
14 |         layers.set_my_dropout_prob(opt['my_dropout_p'])
15 |         layers.set_seq_dropout(opt['do_seq_dropout'])
16 | 
17 |         # Word embeddings
18 |         self.embedding = nn.Embedding(opt['vocab_size'],
19 |                                       opt['embedding_dim'],
20 |                                       padding_idx=padding_idx)
21 |         if embedding is not None:
22 |             self.embedding.weight.data = embedding
23 |             if opt['fix_embeddings'] or opt['tune_partial'] == 0:
24 |                 opt['fix_embeddings'] = True
25 |                 opt['tune_partial'] = 0
26 |                 for p in self.embedding.parameters():
27 |                     p.requires_grad = False
28 |             else:
29 |                 assert opt['tune_partial'] < embedding.size(0)
30 |                 fixed_embedding = embedding[opt['tune_partial']:]
31 |                 # a persistent buffer for the nn.Module
32 |                 self.register_buffer('fixed_embedding', fixed_embedding)
33 |                 self.fixed_embedding = fixed_embedding
34 |         embedding_dim = opt['embedding_dim']
35 |         input_size += embedding_dim
36 |         # Contextualized embeddings
37 |         self.CoVe = layers.MTLSTM(opt, embedding)
38 |         input_size += self.CoVe.output_size
39 |         # POS embeddings
40 |         self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
41 |         input_size += opt['pos_dim']
42 |         # NER embeddings
43 |         self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
44 |         input_size += opt['ner_dim']
45 | 
46 |         if opt['full_att_type'] == 2:
47 |             aux_input = opt['num_features']
48 |         else:
49 |             aux_input = 1
50 | 
51 |         # Setup the vector size for [premise, hypothesis]
52 |         # they will be modified in the following code
53 |         cur_hidden_size = input_size
54 |         print('Initially, the vector_size is {} (+ {})'.format(cur_hidden_size, aux_input))
55 | 
56 |         # RNN premise encoder
57 |         self.P_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['enc_rnn_layers'], aux_size = aux_input)
58 |         # RNN hypothesis encoder
59 |         self.H_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['enc_rnn_layers'], aux_size = aux_input)
60 |         cur_hidden_size = opt['hidden_size'] * 2
61 | 
62 |         # Output sizes of rnn encoders
63 |         print('After Input LSTM, the vector_size is [', cur_hidden_size, '] *', opt['enc_rnn_layers'])
64 | 
65 |         # Multi-level Fusion
66 |         if opt['full_att_type'] == 0:
67 |             self.full_attn_P = layers.FullAttention(cur_hidden_size, cur_hidden_size, 1)
68 |             self.full_attn_H = layers.FullAttention(cur_hidden_size, cur_hidden_size, 1)
69 |         elif opt['full_att_type'] == 1:
70 |             self.full_attn_P = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size, cur_hidden_size, 1)
71 |             self.full_attn_H = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size, cur_hidden_size, 1)
72 |         elif opt['full_att_type'] == 2:
73 |             self.full_attn_P = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size,
74 |                                                     opt['enc_rnn_layers'] * cur_hidden_size, opt['enc_rnn_layers'])
75 |             self.full_attn_H = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size,
76 |                                                     opt['enc_rnn_layers'] * cur_hidden_size, opt['enc_rnn_layers'])
77 |         else:
78 |             raise NotImplementedError('full_att_type = %s' % opt['full_att_type'])
79 |         cur_hidden_size = self.full_attn_P.output_size * 2
80 | 
81 |         # RNN premise inference
82 |         self.P_infer_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['inf_rnn_layers'])
83 |         # RNN hypothesis inference
84 
self.H_infer_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['inf_rnn_layers']) 85 | cur_hidden_size = opt['hidden_size'] * 2 * opt['inf_rnn_layers'] 86 | 87 | print('Before answer finding, hidden size is', cur_hidden_size) 88 | 89 | # Question merging 90 | if opt['final_merge'] == 'linear_self_attn': 91 | self.self_attn_P = layers.LinearSelfAttn(cur_hidden_size) 92 | self.self_attn_H = layers.LinearSelfAttn(cur_hidden_size) 93 | elif opt['final_merge'] != 'avg': 94 | raise NotImplementedError('final_merge = %s' % opt['final_merge']) 95 | 96 | self.classifier = layers.MLPFunc(cur_hidden_size * 4, cur_hidden_size, opt['number_of_class']) 97 | 98 | # Store config 99 | self.opt = opt 100 | 101 | def forward(self, x1, x1_f, x1_pos, x1_ner, x1_mask, x2, x2_f, x2_pos, x2_ner, x2_mask): 102 | """Inputs: 103 | x1 = premise word indices [batch * len_1] 104 | x1_f = premise word features indices [batch * len_1 * nfeat] 105 | x1_pos = premise POS tags [batch * len_1] 106 | x1_ner = premise entity tags [batch * len_1] 107 | x1_mask = premise padding mask [batch * len_1] 108 | x2 = hypothesis word indices [batch * len_2] 109 | x2_f = hypothesis word features indices [batch * len_2 * nfeat] 110 | x2_pos = hypothesis POS tags [batch * len_2] 111 | x2_ner = hypothesis entity tags [batch * len_2] 112 | x2_mask = hypothesis padding mask [batch * len_2] 113 | """ 114 | # Prepare premise and hypothesis input 115 | Prnn_input_list = [] 116 | Hrnn_input_list = [] 117 | 118 | # Word embeddings 119 | emb = self.embedding if self.training else self.eval_embed 120 | x1_emb, x2_emb = emb(x1), emb(x2) 121 | # Dropout on embeddings 122 | if self.opt['dropout_emb'] > 0: 123 | x1_emb = layers.dropout(x1_emb, p=self.opt['dropout_emb'], training=self.training) 124 | x2_emb = layers.dropout(x2_emb, p=self.opt['dropout_emb'], training=self.training) 125 | Prnn_input_list.append(x1_emb) 126 | Hrnn_input_list.append(x2_emb) 127 | 128 | # Contextualized embeddings 129 | _, x1_cove = self.CoVe(x1, x1_mask) 130 | _, x2_cove = self.CoVe(x2, x2_mask) 131 | if self.opt['dropout_emb'] > 0: 132 | x1_cove = layers.dropout(x1_cove, p=self.opt['dropout_emb'], training=self.training) 133 | x2_cove = layers.dropout(x2_cove, p=self.opt['dropout_emb'], training=self.training) 134 | Prnn_input_list.append(x1_cove) 135 | Hrnn_input_list.append(x2_cove) 136 | 137 | # POS embeddings 138 | x1_pos_emb = self.pos_embedding(x1_pos) 139 | x2_pos_emb = self.pos_embedding(x2_pos) 140 | Prnn_input_list.append(x1_pos_emb) 141 | Hrnn_input_list.append(x2_pos_emb) 142 | 143 | # NER embeddings 144 | x1_ner_emb = self.ner_embedding(x1_ner) 145 | x2_ner_emb = self.ner_embedding(x2_ner) 146 | Prnn_input_list.append(x1_ner_emb) 147 | Hrnn_input_list.append(x2_ner_emb) 148 | 149 | x1_input = torch.cat(Prnn_input_list, 2) 150 | x2_input = torch.cat(Hrnn_input_list, 2) 151 | 152 | # Now the features are ready 153 | # x1_input: [batch_size, doc_len, input_size] 154 | # x2_input: [batch_size, doc_len, input_size] 155 | 156 | if self.opt['full_att_type'] == 2: 157 | x1_f = layers.dropout(x1_f, p=self.opt['dropout_EM'], training=self.training) 158 | x2_f = layers.dropout(x2_f, p=self.opt['dropout_EM'], training=self.training) 159 | Paux_input, Haux_input = x1_f, x2_f 160 | else: 161 | Paux_input = x1_f[:, :, 0].contiguous().view(x1_f.size(0), x1_f.size(1), 1) 162 | Haux_input = x2_f[:, :, 0].contiguous().view(x2_f.size(0), x2_f.size(1), 1) 163 | 164 | # Encode premise with RNN 165 | P_abstr_ls = self.P_rnn(x1_input, x1_mask, aux_input=Paux_input) 166 
| # Encode hypothesis with RNN 167 | H_abstr_ls = self.H_rnn(x2_input, x2_mask, aux_input=Haux_input) 168 | 169 | # Fusion 170 | if self.opt['full_att_type'] == 0: 171 | P_atts = P_abstr_ls[-1].contiguous() 172 | H_atts = H_abstr_ls[-1].contiguous() 173 | P_xs = P_abstr_ls[-1].contiguous() 174 | H_xs = H_abstr_ls[-1].contiguous() 175 | elif self.opt['full_att_type'] == 1: 176 | P_atts = torch.cat([x1_input] + P_abstr_ls, 2) 177 | H_atts = torch.cat([x2_input] + H_abstr_ls, 2) 178 | P_xs = P_abstr_ls[-1].contiguous() 179 | H_xs = H_abstr_ls[-1].contiguous() 180 | elif self.opt['full_att_type'] == 2: 181 | P_atts = torch.cat([x1_input] + P_abstr_ls, 2) 182 | H_atts = torch.cat([x2_input] + H_abstr_ls, 2) 183 | P_xs = torch.cat(P_abstr_ls, 2) 184 | H_xs = torch.cat(H_abstr_ls, 2) 185 | aP_xs = self.full_attn_P(P_atts, H_atts, P_xs, H_xs, x2_mask) 186 | aH_xs = self.full_attn_H(H_atts, P_atts, H_xs, P_xs, x1_mask) 187 | P_hiddens = torch.cat([P_xs, aP_xs], 2) 188 | H_hiddens = torch.cat([H_xs, aH_xs], 2) 189 | 190 | # Inference on premise and hypothesis 191 | P_hiddens = torch.cat(self.P_infer_rnn(P_hiddens, x1_mask), 2) 192 | H_hiddens = torch.cat(self.H_infer_rnn(H_hiddens, x2_mask), 2) 193 | 194 | # Merge hiddens for answer classification 195 | if self.opt['final_merge'] == 'avg': 196 | P_merge_weights = layers.uniform_weights(P_hiddens, x1_mask) 197 | H_merge_weights = layers.uniform_weights(H_hiddens, x2_mask) 198 | elif self.opt['final_merge'] == 'linear_self_attn': 199 | P_merge_weights = self.self_attn_P(P_hiddens, x1_mask) 200 | H_merge_weights = self.self_attn_H(H_hiddens, x2_mask) 201 | P_avg_hidden = layers.weighted_avg(P_hiddens, P_merge_weights) 202 | H_avg_hidden = layers.weighted_avg(H_hiddens, H_merge_weights) 203 | P_max_hidden = torch.max(P_hiddens, 1)[0] 204 | H_max_hidden = torch.max(H_hiddens, 1)[0] 205 | 206 | # Predict scores for different classes 207 | scores = self.classifier(torch.cat([P_avg_hidden, H_avg_hidden, P_max_hidden, H_max_hidden], 1)) 208 | 209 | return scores # -inf to inf 210 | -------------------------------------------------------------------------------- /FusionModel/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from torch.nn.parameter import Parameter 8 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 9 | from torch.nn.utils.rnn import pack_padded_sequence as pack 10 | 11 | # ------------------------------------------------------------------------------ 12 | # Neural Modules 13 | # ------------------------------------------------------------------------------ 14 | 15 | def set_seq_dropout(option): # option = True or False 16 | global do_seq_dropout 17 | do_seq_dropout = option 18 | 19 | def set_my_dropout_prob(p): # p between 0 to 1 20 | global my_dropout_p 21 | my_dropout_p = p 22 | 23 | def seq_dropout(x, p=0, training=False): 24 | """ 25 | x: batch * len * input_size 26 | """ 27 | if training == False or p == 0: 28 | return x 29 | dropout_mask = Variable(1.0 / (1-p) * torch.bernoulli((1-p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)), requires_grad=False) 30 | return dropout_mask.unsqueeze(1).expand_as(x) * x 31 | 32 | def dropout(x, p=0, training=False): 33 | """ 34 | x: (batch * len * input_size) or (any other shape) 35 | """ 36 | if do_seq_dropout and len(x.size()) == 3: # if x is (batch * len * input_size) 37 | return 
seq_dropout(x, p=p, training=training) 38 | else: 39 | return F.dropout(x, p=p, training=training) 40 | 41 | class RNNEncoder(nn.Module): 42 | def __init__(self, input_size, hidden_size, num_layers, rnn_type=nn.LSTM, aux_size=0): 43 | super(RNNEncoder, self).__init__() 44 | self.num_layers = num_layers 45 | self.rnns = nn.ModuleList() 46 | for i in range(num_layers): 47 | input_size_ = (input_size + 2 * hidden_size * i) 48 | if i == 0: input_size_ += aux_size 49 | self.rnns.append(rnn_type(input_size_, hidden_size, num_layers=1, bidirectional=True)) 50 | 51 | def forward(self, x, x_mask, aux_input=None): 52 | # Transpose batch and sequence dims 53 | x = x.transpose(0, 1) 54 | if aux_input is not None: 55 | aux_input = aux_input.transpose(0, 1) 56 | 57 | # Encode all layers 58 | hiddens = [x] 59 | for i in range(self.num_layers): 60 | rnn_input = torch.cat(hiddens, 2) 61 | if i == 0 and aux_input is not None: 62 | rnn_input = torch.cat([rnn_input, aux_input], 2) 63 | 64 | # Apply dropout to input 65 | if my_dropout_p > 0: 66 | rnn_input = dropout(rnn_input, p=my_dropout_p, training=self.training) 67 | # Forward 68 | rnn_output = self.rnns[i](rnn_input)[0] 69 | hiddens.append(rnn_output) 70 | 71 | # Transpose back 72 | hiddens = [h.transpose(0, 1) for h in hiddens] 73 | return hiddens[1:] 74 | 75 | class MTLSTM(nn.Module): 76 | def __init__(self, opt, embedding=None, padding_idx=0): 77 | """Initialize an MTLSTM 78 | 79 | Arguments: 80 | embedding (Float Tensor): If not None, initialize embedding matrix with specified embedding vectors 81 | """ 82 | super(MTLSTM, self).__init__() 83 | 84 | self.embedding = nn.Embedding(opt['vocab_size'], opt['embedding_dim'], padding_idx=padding_idx) 85 | if embedding is not None: 86 | self.embedding.weight.data = embedding 87 | 88 | state_dict = torch.load(opt['MTLSTM_path']) 89 | self.rnn1 = nn.LSTM(300, 300, num_layers=1, bidirectional=True) 90 | self.rnn2 = nn.LSTM(600, 300, num_layers=1, bidirectional=True) 91 | 92 | state_dict1 = dict([(name, param.data) if isinstance(param, Parameter) else (name, param) 93 | for name, param in state_dict.items() if '0' in name]) 94 | state_dict2 = dict([(name.replace('1', '0'), param.data) if isinstance(param, Parameter) else (name.replace('1', '0'), param) 95 | for name, param in state_dict.items() if '1' in name]) 96 | self.rnn1.load_state_dict(state_dict1) 97 | self.rnn2.load_state_dict(state_dict2) 98 | 99 | for p in self.embedding.parameters(): 100 | p.requires_grad = False 101 | for p in self.rnn1.parameters(): 102 | p.requires_grad = False 103 | for p in self.rnn2.parameters(): 104 | p.requires_grad = False 105 | 106 | self.output_size = 600 107 | 108 | def setup_eval_embed(self, eval_embed, padding_idx=0): 109 | """Allow evaluation vocabulary size to be greater than training vocabulary size 110 | 111 | Arguments: 112 | eval_embed (Float Tensor): Initialize eval_embed to be the specified embedding vectors 113 | """ 114 | self.eval_embed = nn.Embedding(eval_embed.size(0), eval_embed.size(1), padding_idx = padding_idx) 115 | self.eval_embed.weight.data = eval_embed 116 | 117 | for p in self.eval_embed.parameters(): 118 | p.requires_grad = False 119 | 120 | def forward(self, x_idx, x_mask): 121 | """A pretrained MT-LSTM (McCann et. al. 2017). 122 | This LSTM was trained with 300d 840B GloVe on the WMT 2017 machine translation dataset. 123 | 124 | Arguments: 125 | x_idx (Long Tensor): a Long Tensor of size (batch * len). 126 | x_mask (Byte Tensor): a Byte Tensor of mask for the input tensor (batch * len). 
127 | """ 128 | emb = self.embedding if self.training else self.eval_embed 129 | x_hiddens = emb(x_idx) 130 | 131 | lengths = x_mask.data.eq(0).long().sum(1).squeeze() 132 | lens, indices = torch.sort(lengths, 0, True) 133 | 134 | output1, _ = self.rnn1(pack(x_hiddens[indices], lens.tolist(), batch_first=True)) 135 | output2, _ = self.rnn2(output1) 136 | 137 | output1 = unpack(output1, batch_first=True)[0] 138 | output2 = unpack(output2, batch_first=True)[0] 139 | 140 | _, _indices = torch.sort(indices, 0) 141 | output1 = output1[_indices] 142 | output2 = output2[_indices] 143 | 144 | return output1, output2 145 | 146 | # Attention layer 147 | class FullAttention(nn.Module): 148 | def __init__(self, full_size, hidden_size, num_level): 149 | super(FullAttention, self).__init__() 150 | assert(hidden_size % num_level == 0) 151 | self.full_size = full_size 152 | self.hidden_size = hidden_size 153 | self.attsize_per_lvl = hidden_size // num_level 154 | self.num_level = num_level 155 | self.linear = nn.Linear(full_size, hidden_size, bias=False) 156 | self.linear_final = Parameter(torch.ones(1, hidden_size), requires_grad = True) 157 | self.output_size = hidden_size 158 | print("Full Attention: (atten. {} -> {}, take {}) x {}".format(self.full_size, self.attsize_per_lvl, hidden_size // num_level, self.num_level)) 159 | 160 | def forward(self, x1_att, x2_att, x1, x2, x2_mask): 161 | """ 162 | x1_att: batch * len1 * full_size 163 | x2_att: batch * len2 * full_size 164 | x1: batch * len1 * hidden_size 165 | x2: batch * len2 * hidden_size 166 | x2_mask: batch * len2 167 | """ 168 | x1_att = dropout(x1_att, p=my_dropout_p, training=self.training) 169 | x2_att = dropout(x2_att, p=my_dropout_p, training=self.training) 170 | 171 | x1_key = F.relu(self.linear(x1_att.view(-1, self.full_size))) 172 | x2_key = F.relu(self.linear(x2_att.view(-1, self.full_size))) 173 | final_v = self.linear_final.expand_as(x2_key) 174 | x2_key = final_v * x2_key 175 | 176 | x1_rep = x1_key.view(-1, x1.size(1), self.num_level, self.attsize_per_lvl).transpose(1, 2).contiguous().view(-1, x1.size(1), self.attsize_per_lvl) 177 | x2_rep = x2_key.view(-1, x2.size(1), self.num_level, self.attsize_per_lvl).transpose(1, 2).contiguous().view(-1, x2.size(1), self.attsize_per_lvl) 178 | 179 | scores = x1_rep.bmm(x2_rep.transpose(1, 2)).view(-1, self.num_level, x1.size(1), x2.size(1)) # batch * num_level * len1 * len2 180 | 181 | x2_mask = x2_mask.unsqueeze(1).unsqueeze(2).expand_as(scores) 182 | scores.data.masked_fill_(x2_mask.data, -float('inf')) 183 | 184 | alpha_flat = F.softmax(scores.view(-1, x2.size(1))) 185 | alpha = alpha_flat.view(-1, x1.size(1), x2.size(1)) 186 | 187 | size_per_level = self.hidden_size // self.num_level 188 | atten_seq = alpha.bmm(x2.contiguous().view(-1, x2.size(1), self.num_level, size_per_level).transpose(1, 2).contiguous().view(-1, x2.size(1), size_per_level)) 189 | 190 | return atten_seq.view(-1, self.num_level, x1.size(1), size_per_level).transpose(1, 2).contiguous().view(-1, x1.size(1), self.hidden_size) 191 | 192 | # For summarizing a set of vectors into a single vector 193 | class LinearSelfAttn(nn.Module): 194 | """Self attention over a sequence: 195 | * o_i = softmax(Wx_i) for x_i in X. 
196 | """ 197 | def __init__(self, input_size): 198 | super(LinearSelfAttn, self).__init__() 199 | self.linear = nn.Linear(input_size, 1) 200 | 201 | def forward(self, x, x_mask): 202 | """ 203 | x = batch * len * hdim 204 | x_mask = batch * len 205 | """ 206 | x = dropout(x, p=my_dropout_p, training=self.training) 207 | 208 | x_flat = x.contiguous().view(-1, x.size(-1)) 209 | scores = self.linear(x_flat).view(x.size(0), x.size(1)) 210 | scores.data.masked_fill_(x_mask.data, -float('inf')) 211 | alpha = F.softmax(scores) 212 | return alpha 213 | 214 | # Answer finding 215 | class MLPFunc(nn.Module): 216 | """ 217 | A multi-layer perceptron function for x: o = v'tanh(Wx+b). 218 | """ 219 | def __init__(self, input_size, hidden_size, num_class): 220 | super(MLPFunc, self).__init__() 221 | self.linear = nn.Linear(input_size, hidden_size) 222 | self.linear_final = nn.Linear(hidden_size, num_class, bias=False) 223 | 224 | def forward(self, x): 225 | """ 226 | x = batch * input_size 227 | """ 228 | x = dropout(x, p=my_dropout_p, training=self.training) 229 | h = F.tanh(self.linear(x)) 230 | h = dropout(h, p=my_dropout_p, training=self.training) 231 | o = self.linear_final(h) 232 | return o # batch * num_classes 233 | 234 | # ------------------------------------------------------------------------------ 235 | # Functional 236 | # ------------------------------------------------------------------------------ 237 | 238 | # by default in PyTorch, +-*/ are all element-wise 239 | def uniform_weights(x, x_mask): # used in lego_reader.py 240 | """Return uniform weights over non-masked input.""" 241 | alpha = Variable(torch.ones(x.size(0), x.size(1))) 242 | if x.data.is_cuda: 243 | alpha = alpha.cuda() 244 | alpha = alpha * x_mask.eq(0).float() 245 | alpha = alpha / alpha.sum(1).expand(alpha.size()) 246 | return alpha 247 | 248 | def weighted_avg(x, weights): # used in lego_reader.py 249 | """ x = batch * len * d 250 | weights = batch * len 251 | """ 252 | return weights.unsqueeze(1).bmm(x).squeeze(1) 253 | --------------------------------------------------------------------------------