├── requirements.txt
├── FusionModel
│   ├── utils.py
│   ├── model.py
│   ├── FusionNet.py
│   └── layers.py
├── download.sh
├── README.md
├── predict.py
├── train.py
├── prepro.py
└── general_utils.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.14.2
2 | msgpack-python==0.5.6
3 | spacy==1.10.1
4 | jsonlines==1.2.0
--------------------------------------------------------------------------------
/FusionModel/utils.py:
--------------------------------------------------------------------------------
 1 | class AverageMeter(object):
 2 |     """Computes and stores the average and current value."""
 3 |     def __init__(self):
 4 |         self.reset()
 5 | 
 6 |     def reset(self):
 7 |         self.val = 0
 8 |         self.avg = 0
 9 |         self.sum = 0
10 |         self.count = 0
11 | 
12 |     def update(self, val, n=1):
13 |         self.val = val
14 |         self.sum += val * n
15 |         self.count += n
16 |         self.avg = self.sum / self.count
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Download MultiNLI
 4 | wget http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
 5 | unzip -a multinli_1.0.zip
 6 | rm -f multinli_1.0.zip
 7 | 
 8 | # Download GloVe
 9 | mkdir -p glove
10 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip -O glove/glove.840B.300d.zip
11 | unzip glove/glove.840B.300d.zip -d glove
12 | 
13 | # Download CoVe
14 | wget https://s3.amazonaws.com/research.metamind.io/cove/wmtlstm-b142a7f2.pth -O glove/MT-LSTM.pth
15 | 
16 | # Download spaCy English language models
17 | python -m spacy download en
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FusionNet for Natural Language Inference
 2 | 
 3 | This is an example of applying FusionNet to the natural language inference task.
 4 | For more details on FusionNet, please refer to our paper:
 5 | [FusionNet: Fusing via Fully-Aware Attention with Application to Machine Comprehension](https://arxiv.org/abs/1711.07341)
 6 | 
 7 | Requirements
 8 | ------------
 9 | + Python (version 3.5.2)
10 | + PyTorch (0.2.0)
11 | + spaCy (1.x)
12 | + NumPy
13 | + JSON Lines
14 | + MessagePack
15 | 
16 | Since package updates sometimes break backward compatibility, it is recommended to use Docker, which can be downloaded from [here](https://www.docker.com/community-edition#/download). To enable GPU support, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) may also need to be installed.
17 | 
18 | After setting up Docker, simply run `docker pull momohuang/fusionnet-docker` to pull the Docker image. Note that this may take some time to download. Then we can run the image through
19 | `docker run -it momohuang/fusionnet-docker` (CPU only)
20 | or
21 | `nvidia-docker run -it momohuang/fusionnet-docker` (GPU-enabled).
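
For example, to make a local clone of this repository (and the data fetched by `download.sh`) visible inside the container, one option is to bind-mount the current directory; the mount point `/workspace` below is an arbitrary choice, not something the image requires:

`nvidia-docker run -it -v $(pwd):/workspace momohuang/fusionnet-docker`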
22 | 
23 | Quick Start
24 | -----------
25 | `pip install -r requirements.txt`
26 | `bash download.sh`
27 | `python prepro.py`
28 | `python train.py`
29 | 
30 | `train.py` supports an option `--full_att_type`, where
31 | `--full_att_type 0`: standard attention
32 | `--full_att_type 1`: fully-aware attention
33 | `--full_att_type 2`: fully-aware multi-level attention
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import pickle
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter, defaultdict
12 | import torch
13 | import msgpack
14 | import numpy as np
15 | from FusionModel.model import FusionNet_Model
16 | from general_utils import BatchGen, load_train_data, load_eval_data
17 | 
18 | parser = argparse.ArgumentParser(
19 |     description='Predict using FusionNet model for Natural Language Inference.'
20 | )
21 | parser.add_argument('-m', '--model', default='',
22 |                     help='testing model pathname, e.g. "models/checkpoint_epoch_11.pt"')
23 | parser.add_argument('--test_data', default='multinli_1.0/dev_match_preprocessed.msgpack',
24 |                     help='path to preprocessed testing (dev set 2) data file.')
25 | parser.add_argument('-bs', '--batch_size', type=int, default=32)
26 | parser.add_argument('--show', type=int, default=30)
27 | parser.add_argument('--seed', type=int, default=1023,
28 |                     help='random seed for data shuffling, dropout, etc.')
29 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
30 |                     help='whether to use GPU acceleration.')
31 | args = parser.parse_args()
32 | 
33 | random.seed(args.seed)
34 | np.random.seed(args.seed)
35 | torch.manual_seed(args.seed)
36 | if args.cuda:
37 |     torch.cuda.manual_seed_all(args.seed)
38 | 
39 | log = logging.getLogger(__name__)
40 | log.setLevel(logging.DEBUG)
41 | ch = logging.StreamHandler(sys.stdout)
42 | ch.setLevel(logging.INFO)
43 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
44 | ch.setFormatter(formatter)
45 | log.addHandler(ch)
46 | 
47 | def main():
48 |     log.info('[program starts.]')
49 |     checkpoint = torch.load(args.model)
50 | 
51 |     opt = checkpoint['config']
52 |     state_dict = checkpoint['state_dict']
53 |     model = FusionNet_Model(opt, state_dict = state_dict)
54 |     log.info('[Model loaded.]')
55 | 
56 |     test, test_embedding, test_ans = load_eval_data(opt, args.test_data)
57 |     model.setup_eval_embed(test_embedding)
58 |     log.info('[Data loaded.]')
59 | 
60 |     if args.cuda:
61 |         model.cuda()
62 | 
63 |     batches = BatchGen(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
64 |     predictions = []
65 |     for batch in batches:
66 |         predictions.extend(model.predict(batch))
67 |     acc = sum([x == y for x, y in zip(predictions, test_ans)]) / len(test_ans) * 100.0
68 |     print("Accuracy =", acc)
69 |     print(predictions[:args.show])
70 |     print(test_ans[:args.show])
71 | 
72 | if __name__ == '__main__':
73 |     main()
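
# Example usage (hypothetical checkpoint path; any model saved by train.py works):
#   python predict.py -m models/best_model.pt --test_data multinli_1.0/dev_match_preprocessed.msgpack
# This prints the overall accuracy, then the first `--show` predicted and gold class ids.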
--------------------------------------------------------------------------------
/FusionModel/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.optim as optim
 4 | import torch.nn.functional as F
 5 | import numpy as np
 6 | import logging
 7 | 
 8 | from torch.autograd import Variable
 9 | from .utils import AverageMeter
10 | from .FusionNet import FusionNet
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | class FusionNet_Model(object):
15 |     """
16 |     High level model that handles initializing the underlying network
17 |     architecture, saving, updating examples, and predicting examples.
18 |     """
19 | 
20 |     def __init__(self, opt, embedding=None, state_dict=None):
21 |         # Book-keeping.
22 |         self.opt = opt
23 |         self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
24 |         self.eval_embed_transfer = True
25 |         self.train_loss = AverageMeter()
26 | 
27 |         # Building network.
28 |         self.network = FusionNet(opt, embedding)
29 |         if state_dict:
30 |             new_state = set(self.network.state_dict().keys())
31 |             for k in list(state_dict['network'].keys()):
32 |                 if k not in new_state:
33 |                     del state_dict['network'][k]
34 |             for k, v in list(self.network.state_dict().items()):
35 |                 if k not in state_dict['network']:
36 |                     state_dict['network'][k] = v
37 |             self.network.load_state_dict(state_dict['network'])
38 | 
39 |         # Building optimizer.
40 |         parameters = [p for p in self.network.parameters() if p.requires_grad]
41 |         if opt['optimizer'] == 'adamax':
42 |             self.optimizer = optim.Adamax(parameters)
43 |         elif opt['optimizer'] == 'adam':
44 |             self.optimizer = optim.Adam(parameters)
45 |         else:
46 |             raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])
47 |         if state_dict and 'optimizer' in state_dict:
48 |             self.optimizer.load_state_dict(state_dict['optimizer'])
49 | 
50 |         if opt['fix_embeddings']:
51 |             wvec_size = 0
52 |         else:
53 |             wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim']
54 |         self.total_param = sum([p.nelement() for p in parameters]) - wvec_size
55 | 
56 |     def update(self, ex):
57 |         # Train mode
58 |         self.network.train()
59 | 
60 |         # Transfer to GPU; ex holds the 10 BatchGen input tensors (P_id, P_feature, P_tag, P_ent, P_mask, H_id, H_feature, H_tag, H_ent, H_mask) followed by the label
61 |         if self.opt['cuda']:
62 |             inputs = [Variable(e.cuda(async=True)) for e in ex[:10]]
63 |             targets = Variable(ex[10].cuda(async=True))
64 |         else:
65 |             inputs = [Variable(e) for e in ex[:10]]
66 |             targets = Variable(ex[10])
67 | 
68 |         # Run forward
69 |         scores = self.network(*inputs)  # output: [batch_size, 3]
70 | 
71 |         # Compute loss and accuracies
72 |         loss = F.cross_entropy(scores, targets)
73 |         self.train_loss.update(loss.data[0], ex[0].size(0))
74 | 
75 |         # Clear gradients and run backward
76 |         self.optimizer.zero_grad()
77 |         loss.backward()
78 | 
79 |         # Clip gradients
80 |         torch.nn.utils.clip_grad_norm(self.network.parameters(),
81 |                                       self.opt['grad_clipping'])
82 | 
83 |         # Update parameters
84 |         self.optimizer.step()
85 |         self.updates += 1
86 | 
87 |         # Reset any partially fixed parameters (e.g. 
rare words) 88 | self.reset_embeddings() 89 | self.eval_embed_transfer = True 90 | 91 | def predict(self, ex, best_nth=1): 92 | # Eval mode 93 | self.network.eval() 94 | 95 | # Transfer trained embedding to evaluation embedding 96 | if self.eval_embed_transfer: 97 | self.update_eval_embed() 98 | self.eval_embed_transfer = False 99 | 100 | # Transfer to GPU 101 | if self.opt['cuda']: 102 | # volatile means no gradient is needed 103 | inputs = [Variable(e.cuda(async=True), volatile=True) 104 | for e in ex[:10]] 105 | else: 106 | inputs = [Variable(e, volatile=True) for e in ex[:10]] 107 | 108 | # Run forward 109 | scores = self.network(*inputs) # output: [batch_size, 3] 110 | 111 | # Transfer to CPU/normal tensors and find classes for instances 112 | scores = scores.data.cpu() 113 | predictions = torch.max(scores, 1)[1].tolist() 114 | 115 | return predictions # list of classes 116 | 117 | # allow the evaluation embedding be larger than training embedding 118 | # this is helpful if we have pretrained word embeddings 119 | def setup_eval_embed(self, eval_embed, padding_idx = 0): 120 | # eval_embed should be a supermatrix of training embedding 121 | self.network.eval_embed = nn.Embedding(eval_embed.size(0), 122 | eval_embed.size(1), 123 | padding_idx = padding_idx) 124 | self.network.eval_embed.weight.data = eval_embed 125 | for p in self.network.eval_embed.parameters(): 126 | p.requires_grad = False 127 | self.eval_embed_transfer = True 128 | 129 | self.network.CoVe.setup_eval_embed(eval_embed) 130 | 131 | def update_eval_embed(self): 132 | # update evaluation embedding to trained embedding 133 | if self.opt['tune_partial'] > 0: 134 | offset = self.opt['tune_partial'] 135 | self.network.eval_embed.weight.data[0:offset] \ 136 | = self.network.embedding.weight.data[0:offset] 137 | else: 138 | offset = 10 139 | self.network.eval_embed.weight.data[0:offset] \ 140 | = self.network.embedding.weight.data[0:offset] 141 | 142 | def reset_embeddings(self): 143 | # Reset fixed embeddings to original value 144 | if self.opt['tune_partial'] > 0: 145 | offset = self.opt['tune_partial'] 146 | if offset < self.network.embedding.weight.data.size(0): 147 | self.network.embedding.weight.data[offset:] \ 148 | = self.network.fixed_embedding 149 | 150 | def save_for_predict(self, filename, epoch): 151 | network_state = self.network.state_dict() 152 | if 'eval_embed.weight' in network_state: 153 | del network_state['eval_embed.weight'] 154 | if 'fixed_embedding' in network_state: 155 | del network_state['fixed_embedding'] 156 | params = { 157 | 'state_dict': {'network': network_state}, 158 | 'config': self.opt, 159 | } 160 | try: 161 | torch.save(params, filename) 162 | logger.info('model saved to {}'.format(filename)) 163 | except BaseException: 164 | logger.warn('[ WARN: Saving failed... continuing anyway. 
]')
165 | 
166 |     def cuda(self):
167 |         self.network.cuda()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import pickle
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter, defaultdict
12 | import torch
13 | import msgpack
14 | import numpy as np
15 | from FusionModel.model import FusionNet_Model
16 | from general_utils import BatchGen, load_train_data, load_eval_data
17 | 
18 | parser = argparse.ArgumentParser(
19 |     description='Train FusionNet model for Natural Language Inference.'
20 | )
21 | # system
22 | parser.add_argument('--name', default='', help='additional name of the current run')
23 | parser.add_argument('--log_file', default='output.log',
24 |                     help='path for log file.')
25 | parser.add_argument('--log_per_updates', type=int, default=80,
26 |                     help='log model loss per x updates (mini-batches).')
27 | 
28 | parser.add_argument('--train_meta', default='multinli_1.0/train_meta.msgpack',
29 |                     help='path to preprocessed training meta file.')
30 | parser.add_argument('--train_data', default='multinli_1.0/train_data.msgpack',
31 |                     help='path to preprocessed training data file.')
32 | parser.add_argument('--dev_data', default='multinli_1.0/dev_mismatch_preprocessed.msgpack',
33 |                     help='path to preprocessed validation data file.')
34 | parser.add_argument('--test_data', default='multinli_1.0/dev_match_preprocessed.msgpack',
35 |                     help='path to preprocessed testing (dev set 2) data file.')
36 | 
37 | parser.add_argument('--MTLSTM_path', default='glove/MT-LSTM.pth')
38 | parser.add_argument('--model_dir', default='models',
39 |                     help='path to store saved models.')
40 | parser.add_argument('--save_all', dest="save_best_only", action='store_false',
41 |                     help='save all models in addition to the best.')
42 | parser.add_argument('--do_not_save', action='store_true', help='don\'t save any model')
43 | parser.add_argument('--seed', type=int, default=1023,
44 |                     help='random seed for data shuffling, dropout, etc.')
45 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
46 |                     help='whether to use GPU acceleration.')
47 | # training
48 | parser.add_argument('-e', '--epoches', type=int, default=20)
49 | parser.add_argument('-bs', '--batch_size', type=int, default=32)
50 | parser.add_argument('-op', '--optimizer', default='adamax',
51 |                     help='supported optimizers: adamax, adam.')
52 | parser.add_argument('-gc', '--grad_clipping', type=float, default=10)
53 | parser.add_argument('-tp', '--tune_partial', type=int, default=1000,
54 |                     help='finetune top-x embeddings (including <PAD>, <UNK>).')
55 | parser.add_argument('--fix_embeddings', action='store_true',
56 |                     help='if true, `tune_partial` will be ignored.')
57 | # model
58 | parser.add_argument('--number_of_class', type=int, default=3)
59 | parser.add_argument('--final_merge', default='linear_self_attn')
60 | 
61 | parser.add_argument('--hidden_size', type=int, default=125)
62 | parser.add_argument('--enc_rnn_layers', type=int, default=2, help="Encoding RNN layers")
63 | parser.add_argument('--inf_rnn_layers', type=int, default=2, help="Inference RNN layers")
64 | parser.add_argument('--full_att_type', type=int, default=2)
65 | 
66 | parser.add_argument('--pos_size', type=int, default=56,
67 |                     help='how many kinds of POS tags.')
68 | parser.add_argument('--pos_dim', type=int, default=12,
69 |                     help='the embedding dimension for POS tags.')
70 | parser.add_argument('--ner_size', type=int, default=19,
71 |                     help='how many kinds of named entity tags.')
72 | parser.add_argument('--ner_dim', type=int, default=8,
73 |                     help='the embedding dimension for named entity tags.')
74 | 
75 | parser.add_argument('--no_seq_dropout', dest='do_seq_dropout', action='store_false')
76 | parser.add_argument('--my_dropout_p', type=float, default=0.3)
77 | parser.add_argument('--dropout_emb', type=float, default=0.3)
78 | parser.add_argument('--dropout_EM', type=float, default=0.6)
79 | 
80 | args = parser.parse_args()
81 | 
82 | if args.name != '':
83 |     args.model_dir = args.model_dir + '_' + args.name
84 |     args.log_file = os.path.join(os.path.dirname(args.log_file), 'output_' + args.name + '.log')
85 | 
86 | # set model dir
87 | model_dir = args.model_dir
88 | os.makedirs(model_dir, exist_ok=True)
89 | model_dir = os.path.abspath(model_dir)
90 | 
91 | # set random seed
92 | random.seed(args.seed)
93 | np.random.seed(args.seed)
94 | torch.manual_seed(args.seed)
95 | if args.cuda:
96 |     torch.cuda.manual_seed_all(args.seed)
97 | 
98 | # setup logger
99 | log = logging.getLogger(__name__)
100 | log.setLevel(logging.DEBUG)
101 | 
102 | ch = logging.StreamHandler(sys.stdout)
103 | ch.setLevel(logging.INFO)
104 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
105 | ch.setFormatter(formatter)
106 | log.addHandler(ch)
107 | 
108 | def main():
109 |     log.info('[program starts.]')
110 |     opt = vars(args)  # changing opt will change args
111 |     train, train_embedding, opt = load_train_data(opt, args.train_meta, args.train_data)
112 |     dev, dev_embedding, dev_ans = load_eval_data(opt, args.dev_data)
113 |     test, test_embedding, test_ans = load_eval_data(opt, args.test_data)
114 |     log.info('[Data loaded.]')
115 | 
116 |     model = FusionNet_Model(opt, train_embedding)
117 |     if args.cuda: model.cuda()
118 |     log.info("[dev] Total number of params: {}".format(model.total_param))
119 | 
120 |     best_acc = 0.0
121 | 
122 |     for epoch in range(1, 1 + args.epoches):
123 |         log.warning('Epoch {}'.format(epoch))
124 | 
125 |         # train
126 |         batches = BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
127 |         start = datetime.now()
128 |         for i, batch in enumerate(batches):
129 |             model.update(batch)
130 |             if i % args.log_per_updates == 0:
131 |                 log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
132 |                     model.updates, model.train_loss.avg,
133 |                     str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
134 | 
135 |         # dev eval
136 |         model.setup_eval_embed(dev_embedding)
137 |         if args.cuda: model.cuda()
138 | 
139 |         batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
140 |         predictions = []
141 |         for batch in batches:
142 |             predictions.extend(model.predict(batch))
143 |         acc = sum([x == y for x, y in zip(predictions, dev_ans)]) / len(dev_ans) * 100.0
144 | 
145 |         # test (or dev 2) eval
146 |         model.setup_eval_embed(test_embedding)
147 |         if args.cuda: model.cuda()
148 | 
149 |         batches = BatchGen(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
150 |         predictions = []
151 |         for batch in batches:
152 |             predictions.extend(model.predict(batch))
153 |         corr_acc = sum([x == y for x, y in zip(predictions, test_ans)]) / len(test_ans) * 100.0
154 | 
155 |         # save for predict
156 |         if not args.do_not_save:
157 |             if args.save_best_only:
158 |                 if (acc + corr_acc)/2 > best_acc:
159 |                     model_file 
= os.path.join(model_dir, 'best_model.pt') 160 | model.save_for_predict(model_file, epoch) 161 | log.info('[new best model saved.]') 162 | else: 163 | model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch)) 164 | model.save_for_predict(model_file, epoch) 165 | if (acc + corr_acc)/2 > best_acc: 166 | copyfile( 167 | os.path.join(model_dir, model_file), 168 | os.path.join(model_dir, 'best_model.pt')) 169 | log.info('[new best model saved.]') 170 | if (acc + corr_acc)/2 > best_acc: 171 | best_acc = (acc + corr_acc)/2 172 | 173 | log.warning("Epoch {0} - dev Acc: {1:.3f}, dev2 Acc: {2:.3f} (best Acc: {3:.3f})".format(epoch, acc, corr_acc, best_acc)) 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /prepro.py: -------------------------------------------------------------------------------- 1 | import re 2 | import spacy 3 | import msgpack 4 | import unicodedata 5 | import numpy as np 6 | import argparse 7 | import collections 8 | import os.path 9 | import multiprocessing 10 | import logging 11 | import random 12 | from general_utils import normalize_text, build_embedding, load_glove_vocab, pre_proc, feature_gen, token2id, process_jsonlines 13 | 14 | # Fixed Parameters for MultiNLI_1.0 15 | trn_file = 'multinli_1.0/multinli_1.0_train.jsonl' 16 | trn_meta_msgpack = 'multinli_1.0/train_meta.msgpack' 17 | trn_data_msgpack = 'multinli_1.0/train_data.msgpack' 18 | 19 | dev_file = 'multinli_1.0/multinli_1.0_dev_mismatched.jsonl' 20 | dev_msgpack = 'multinli_1.0/dev_mismatch_preprocessed.msgpack' 21 | 22 | tst_file = 'multinli_1.0/multinli_1.0_dev_matched.jsonl' 23 | tst_msgpack = 'multinli_1.0/dev_match_preprocessed.msgpack' 24 | 25 | # Parameters 26 | parser = argparse.ArgumentParser( 27 | description='Preprocess the data.' 28 | ) 29 | parser.add_argument('--wv_file', default='glove/glove.840B.300d.txt', 30 | help='path to word vector file.') 31 | parser.add_argument('--wv_dim', type=int, default=300, 32 | help='word vector dimension.') 33 | parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), 34 | help='number of threads for preprocessing.') 35 | parser.add_argument('--seed', type=int, default=1023, 36 | help='random seed for data shuffling, embedding init, etc.') 37 | 38 | args = parser.parse_args() 39 | wv_file = args.wv_file 40 | wv_dim = args.wv_dim 41 | nlp = spacy.load('en', parser=False) 42 | 43 | random.seed(args.seed) 44 | np.random.seed(args.seed) 45 | 46 | #================================================================ 47 | #=========================== GloVe ============================== 48 | #================================================================ 49 | 50 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG, 51 | datefmt='%m/%d/%Y %I:%M:%S') 52 | log = logging.getLogger(__name__) 53 | 54 | log.info('start data preparing... 
(using {} threads)'.format(args.threads))
55 | 
56 | glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
57 | log.info('glove loaded.')
58 | 
59 | #===============================================================
60 | #=================== Work on training data =====================
61 | #===============================================================
62 | 
63 | train = process_jsonlines(trn_file)
64 | log.info('train jsonline data flattened.')
65 | 
66 | trP_iter = (pre_proc(p) for p in train.P)
67 | trH_iter = (pre_proc(h) for h in train.H)
68 | trP_docs = [doc for doc in nlp.pipe(
69 |     trP_iter, batch_size=64, n_threads=args.threads)]
70 | trH_docs = [doc for doc in nlp.pipe(
71 |     trH_iter, batch_size=64, n_threads=args.threads)]
72 | 
73 | # tokens
74 | trP_tokens = [[normalize_text(w.text) for w in doc] for doc in trP_docs]
75 | trH_tokens = [[normalize_text(w.text) for w in doc] for doc in trH_docs]
76 | log.info('All tokens for training are obtained.')
77 | 
78 | # features
79 | trP_tags, trP_ents, trP_features = feature_gen(trP_docs, trH_docs)
80 | trH_tags, trH_ents, trH_features = feature_gen(trH_docs, trP_docs)
81 | log.info('features for training are generated.')
82 | 
83 | def build_train_vocab(A, B):  # vocabulary will also be sorted accordingly
84 |     counter = collections.Counter(w for doc in A + B for w in doc)
85 |     vocab = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True)
86 | 
87 |     total = sum(counter.values())
88 |     matched = sum(counter[t] for t in vocab)
89 |     log.info('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
90 |         len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
91 |     vocab.insert(0, "<PAD>")
92 |     vocab.insert(1, "<UNK>")
93 |     return vocab
94 | 
95 | # vocab
96 | tr_vocab = build_train_vocab(trH_tokens, trP_tokens)
97 | trP_ids = token2id(trP_tokens, tr_vocab, unk_id=1)
98 | trH_ids = token2id(trH_tokens, tr_vocab, unk_id=1)
99 | 
100 | # tags
101 | vocab_tag = [''] + list(nlp.tagger.labels)
102 | trP_tag_ids = token2id(trP_tags, vocab_tag)
103 | trH_tag_ids = token2id(trH_tags, vocab_tag)
104 | 
105 | # entities
106 | vocab_ent = list(set([ent for sent in trP_ents+trH_ents for ent in sent]))
107 | trP_ent_ids = token2id(trP_ents, vocab_ent)
108 | trH_ent_ids = token2id(trH_ents, vocab_ent)
109 | 
110 | log.info('Found {} POS tags.'.format(len(vocab_tag)))
111 | log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
112 | log.info('vocabulary for training is built.')
113 | 
114 | tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)
115 | log.info('got embedding matrix for training.')
116 | 
117 | meta = {
118 |     'vocab': tr_vocab,
119 |     'embedding': tr_embedding.tolist()
120 | }
121 | with open(trn_meta_msgpack, 'wb') as f:
122 |     msgpack.dump(meta, f, encoding='utf8')
123 | 
124 | result = {
125 |     'premise_ids': trP_ids,
126 |     'premise_features': trP_features,  # exact match, tf
127 |     'premise_tags': trP_tag_ids,  # POS tagging
128 |     'premise_ents': trP_ent_ids,  # Entity recognition
129 |     'hypothesis_ids': trH_ids,
130 |     'hypothesis_features': trH_features,  # exact match, tf
131 |     'hypothesis_tags': trH_tag_ids,  # POS tagging
132 |     'hypothesis_ents': trH_ent_ids,  # Entity recognition
133 |     'answers': train.label
134 | }
135 | with open(trn_data_msgpack, 'wb') as f:
136 |     msgpack.dump(result, f, encoding='utf8')
137 | 
138 | log.info('saved training to disk.')
139 | 
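# A sketch of what is now on disk (assuming the default paths above):
#   train_meta.msgpack -> {'vocab': ['<PAD>', '<UNK>', 'the', ...],
#                          'embedding': [[...wv_dim floats...], ...]}
#   train_data.msgpack -> {'premise_ids': [[17, 4, ...], ...], ...,
#                          'answers': ['neutral', 'entailment', ...]}
# Ids 0 and 1 are reserved for padding and unknown words, respectively.
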
140 | #==========================================================
141 | #=================== Work on dev & test ====================
142 | #==========================================================
143 | 
144 | def preprocess_eval_data(filename, output_msgpack):
145 |     EvalData = process_jsonlines(filename)
146 | 
147 |     filename = os.path.basename(filename)
148 |     log.info(filename + ' flattened.')
149 | 
150 |     EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
151 |     EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
152 |     EvalDataP_docs = [doc for doc in nlp.pipe(
153 |         EvalDataP_iter, batch_size=64, n_threads=args.threads)]
154 |     EvalDataH_docs = [doc for doc in nlp.pipe(
155 |         EvalDataH_iter, batch_size=64, n_threads=args.threads)]
156 | 
157 |     # tokens
158 |     EvalDataP_tokens = [[normalize_text(w.text) for w in doc] for doc in EvalDataP_docs]
159 |     EvalDataH_tokens = [[normalize_text(w.text) for w in doc] for doc in EvalDataH_docs]
160 |     log.info('All tokens for ' + filename + ' are obtained.')
161 | 
162 |     # features
163 |     EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(EvalDataP_docs, EvalDataH_docs)
164 |     EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(EvalDataH_docs, EvalDataP_docs)
165 |     log.info('features for ' + filename + ' are generated.')
166 | 
167 |     def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
168 |         existing_vocab = set(tr_vocab)
169 |         new_vocab = list(set([w for doc in A + B for w in doc if w not in existing_vocab and w in glove_vocab]))
170 |         vocab = tr_vocab + new_vocab
171 |         log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
172 |         return vocab
173 | 
174 |     # vocab
175 |     EvalData_vocab = build_EvalData_vocab(EvalDataP_tokens, EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
176 |     EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
177 |     EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)
178 | 
179 |     # tags
180 |     EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
181 |     EvalDataH_tag_ids = token2id(EvalDataH_tags, vocab_tag)  # vocab_tag same as training
182 | 
183 |     # entities
184 |     EvalDataP_ent_ids = token2id(EvalDataP_ents, vocab_ent)  # vocab_ent same as training
185 |     EvalDataH_ent_ids = token2id(EvalDataH_ents, vocab_ent)  # vocab_ent same as training
186 |     log.info('vocabulary for ' + filename + ' is built.')
187 | 
188 |     EvalData_embedding = build_embedding(wv_file, EvalData_vocab, wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
189 |     log.info('got embedding matrix for ' + filename)
190 | 
191 |     result = {
192 |         'premise_ids': EvalDataP_ids,
193 |         'premise_features': EvalDataP_features,  # exact match, tf
194 |         'premise_tags': EvalDataP_tag_ids,  # POS tagging
195 |         'premise_ents': EvalDataP_ent_ids,  # Entity recognition
196 |         'hypothesis_ids': EvalDataH_ids,
197 |         'hypothesis_features': EvalDataH_features,  # exact match, tf
198 |         'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
199 |         'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
200 |         'vocab': EvalData_vocab,
201 |         'embedding': EvalData_embedding.tolist(),
202 |         'answers': EvalData.label
203 |     }
204 |     with open(output_msgpack, 'wb') as f:
205 |         msgpack.dump(result, f)
206 | 
207 |     log.info('saved ' + output_msgpack + ' to disk.')
208 | 
209 | preprocess_eval_data(dev_file, dev_msgpack)
210 | preprocess_eval_data(tst_file, tst_msgpack)
--------------------------------------------------------------------------------
/general_utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import sys
 4 | import random
 5 | import string
 6 | import logging
 7 | import argparse
 8 | import unicodedata
 9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter
12 | import torch
13 | import msgpack
14 | import jsonlines
15 | import numpy as np
16 | 
17 | #===========================================================================
18 | #================= All for preprocessing the NLI data set ==================
19 | #===========================================================================
20 | 
21 | def normalize_text(text):
22 |     return unicodedata.normalize('NFD', text)
23 | 
24 | def load_glove_vocab(file, wv_dim):
25 |     vocab = set()
26 |     with open(file, encoding="utf8") as f:
27 |         for line in f:
28 |             elems = line.split()
29 |             token = normalize_text(''.join(elems[0:-wv_dim]))
30 |             vocab.add(token)
31 |     return vocab
32 | 
33 | def space_extend(matchobj):
34 |     return ' ' + matchobj.group(0) + ' '
35 | 
36 | def pre_proc(text):
37 |     # make hyphens, spaces clean
38 |     text = re.sub(u'-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/', space_extend, text)
39 |     text = text.strip(' \n')
40 |     text = re.sub('\s+', ' ', text)
41 |     return text
42 | 
43 | class SNLIData:
44 |     def __init__(self, label, sent1, sent2):
45 |         self.label = label
46 |         self.P = sent1  # Premise
47 |         self.H = sent2  # Hypothesis
48 | 
49 | def process_jsonlines(data_file):
50 |     with jsonlines.open(data_file) as reader:
51 |         snli_label = []
52 |         snli_sent1 = []
53 |         snli_sent2 = []
54 |         for obj in reader:
55 |             if obj['gold_label'] != '-':
56 |                 snli_label.append(obj['gold_label'])
57 |                 snli_sent1.append(obj['sentence1'])
58 |                 snli_sent2.append(obj['sentence2'])
59 |     return SNLIData(snli_label, snli_sent1, snli_sent2)
60 | 
61 | def feature_gen(A_docs, B_docs):
62 |     A_tags = [[w.tag_ for w in doc] for doc in A_docs]
63 |     A_ents = [[w.ent_type_ for w in doc] for doc in A_docs]
64 |     A_features = []
65 | 
66 |     for textA, textB in zip(A_docs, B_docs):
67 |         counter_ = Counter(w.text.lower() for w in textA)
68 |         total = sum(counter_.values())
69 |         term_freq = [counter_[w.text.lower()] / total for w in textA]
70 | 
71 |         question_word = {w.text for w in textB}
72 |         question_lower = {w.text.lower() for w in textB}
73 |         question_lemma = {w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower() for w in textB}
74 |         match_origin = [w.text in question_word for w in textA]
75 |         match_lower = [w.text.lower() in question_lower for w in textA]
76 |         match_lemma = [(w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()) in question_lemma for w in textA]
77 |         A_features.append(list(zip(term_freq, match_origin, match_lower, match_lemma)))
78 | 
79 |     return A_tags, A_ents, A_features
80 | 
81 | def build_embedding(embed_file, targ_vocab, wv_dim):
82 |     vocab_size = len(targ_vocab)
83 |     emb = np.random.uniform(-1, 1, (vocab_size, wv_dim))
84 |     emb[0] = 0  # the <PAD> embedding should be all 0 (using broadcast)
85 | 
86 |     w2id = {w: i for i, w in enumerate(targ_vocab)}
87 |     with open(embed_file, encoding="utf8") as f:
88 |         for line in f:
89 |             elems = line.split()
90 |             token = normalize_text(''.join(elems[0:-wv_dim]))
91 |             if token in w2id:
92 |                 emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
93 |     return emb
94 | 
95 | def token2id(docs, vocab, unk_id=None):
96 |     w2id = {w: i for i, w in enumerate(vocab)}
97 |     ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
98 |     return ids
99 | 
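# Illustrative sketch (not part of the pipeline): with vocab = ['<PAD>', '<UNK>', 'a', 'cat'],
#   token2id([['a', 'cat'], ['a', 'dog']], vocab, unk_id=1)  =>  [[2, 3], [2, 1]]
# i.e. out-of-vocabulary tokens ('dog') map to unk_id, while id 0 is reserved
# for padding positions.
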
100 | #===========================================================================
101 | #=================== Load Training and Evaluation data =====================
102 | #===========================================================================
103 | 
104 | def text2class(ans):
105 |     if ans == "neutral": return 0
106 |     if ans == "entailment": return 1
107 |     if ans == "contradiction": return 2
108 |     raise ValueError('unknown label: ' + ans)
109 | 
110 | def load_train_data(opt, train_meta, train_data):
111 |     with open(train_meta, 'rb') as f:
112 |         meta = msgpack.load(f, encoding='utf8')
113 |     embedding = torch.Tensor(meta['embedding'])
114 |     opt['vocab_size'] = embedding.size(0)
115 |     opt['embedding_dim'] = embedding.size(1)
116 | 
117 |     with open(train_data, 'rb') as f:
118 |         data = msgpack.load(f, encoding='utf8')
119 |     opt['num_features'] = len(data['premise_features'][0][0])
120 | 
121 |     train = list(zip(  # list() due to lazy evaluation of zip
122 |         data['premise_ids'],
123 |         data['premise_features'],
124 |         data['premise_tags'],
125 |         data['premise_ents'],
126 |         data['hypothesis_ids'],
127 |         data['hypothesis_features'],
128 |         data['hypothesis_tags'],
129 |         data['hypothesis_ents'],
130 |         [text2class(ans) for ans in data['answers']]
131 |     ))
132 |     return train, embedding, opt
133 | 
134 | def load_eval_data(opt, eval_data):  # can be extended to true test set
135 |     with open(eval_data, 'rb') as f:
136 |         data = msgpack.load(f, encoding='utf8')
137 |     embedding = torch.Tensor(data['embedding'])
138 | 
139 |     assert opt['embedding_dim'] == embedding.size(1)
140 |     assert opt['num_features'] == len(data['premise_features'][0][0])
141 | 
142 |     eval_set = list(zip(
143 |         data['premise_ids'],
144 |         data['premise_features'],
145 |         data['premise_tags'],
146 |         data['premise_ents'],
147 |         data['hypothesis_ids'],
148 |         data['hypothesis_features'],
149 |         data['hypothesis_tags'],
150 |         data['hypothesis_ents']
151 |     ))
152 |     return eval_set, embedding, [text2class(ans) for ans in data['answers']]
153 | 
154 | #===========================================================================
155 | #================ For batch generation (train & predict) ===================
156 | #===========================================================================
157 | 
158 | class BatchGen:
159 |     def __init__(self, data, batch_size, gpu, evaluation=False):
160 |         '''
161 |         input:
162 |             data - list of lists
163 |             batch_size - int
164 |         '''
165 |         self.batch_size = batch_size
166 |         self.eval = evaluation
167 |         self.gpu = gpu
168 | 
169 |         # random shuffle for training
170 |         if not evaluation:
171 |             indices = list(range(len(data)))
172 |             random.shuffle(indices)
173 |             data = [data[i] for i in indices]  # apply the shuffled order
174 |         # chunk into batches (if i + batch_size > data.size(0), it's fine)
175 |         data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
176 |         self.data = data
177 | 
178 |     def __len__(self):
179 |         return len(self.data)
180 | 
181 |     def __iter__(self):
182 |         for batch in self.data:
183 |             batch_size = len(batch)
184 |             batch = list(zip(*batch))
185 |             if self.eval:
186 |                 assert len(batch) == 8
187 |             else:
188 |                 assert len(batch) == 9  # + answer
189 | 
190 |             P_len = max(len(x) for x in batch[0])
191 |             H_len = max(len(x) for x in batch[4])
192 |             feature_len = len(batch[1][0][0])
193 | 
194 |             # Premise Tokens
195 |             P_id = torch.LongTensor(batch_size, P_len).fill_(0)
196 |             for i, doc in enumerate(batch[0]):
197 |                 P_id[i, :len(doc)] = torch.LongTensor(doc)
198 | 
199 |             # Premise Features
200 |             P_feature = torch.Tensor(batch_size, P_len, feature_len).fill_(0)
201 |             for i, doc in enumerate(batch[1]):
202 |                 for j, feature in enumerate(doc):
203 |                     P_feature[i, j, :] = torch.Tensor(feature)
204 | 
205 |             # Premise PoS
206 |             P_tag = 
torch.LongTensor(batch_size, P_len).fill_(0) 207 | for i, doc in enumerate(batch[2]): 208 | P_tag[i, :len(doc)] = torch.LongTensor(doc) 209 | 210 | # Premise NER 211 | P_ent = torch.LongTensor(batch_size, P_len).fill_(0) 212 | for i, doc in enumerate(batch[3]): 213 | P_ent[i, :len(doc)] = torch.LongTensor(doc) 214 | 215 | # Hypothesis Tokens 216 | H_id = torch.LongTensor(batch_size, H_len).fill_(0) 217 | for i, doc in enumerate(batch[4]): 218 | H_id[i, :len(doc)] = torch.LongTensor(doc) 219 | 220 | # Hypothesis Features 221 | H_feature = torch.Tensor(batch_size, H_len, feature_len).fill_(0) 222 | for i, doc in enumerate(batch[5]): 223 | for j, feature in enumerate(doc): 224 | H_feature[i, j, :] = torch.Tensor(feature) 225 | 226 | # Hypothesis PoS 227 | H_tag = torch.LongTensor(batch_size, H_len).fill_(0) 228 | for i, doc in enumerate(batch[6]): 229 | H_tag[i, :len(doc)] = torch.LongTensor(doc) 230 | 231 | # Hypothesis NER 232 | H_ent = torch.LongTensor(batch_size, H_len).fill_(0) 233 | for i, doc in enumerate(batch[7]): 234 | H_ent[i, :len(doc)] = torch.LongTensor(doc) 235 | 236 | # Premise, Hypothesis Masks 237 | P_mask = torch.eq(P_id, 0) 238 | H_mask = torch.eq(H_id, 0) 239 | 240 | # Label: neutral (0), entailment (1), contradiction (2) 241 | if not self.eval: 242 | label = torch.LongTensor(batch[8]) 243 | 244 | if self.gpu: # page locked memory for async data transfer 245 | P_id = P_id.pin_memory() 246 | P_feature = P_feature.pin_memory() 247 | P_tag = P_tag.pin_memory() 248 | P_ent = P_ent.pin_memory() 249 | 250 | H_id = H_id.pin_memory() 251 | H_feature = H_feature.pin_memory() 252 | H_tag = H_tag.pin_memory() 253 | H_ent = H_ent.pin_memory() 254 | 255 | P_mask = P_mask.pin_memory() 256 | H_mask = H_mask.pin_memory() 257 | 258 | if self.eval: 259 | yield (P_id, P_feature, P_tag, P_ent, P_mask, 260 | H_id, H_feature, H_tag, H_ent, H_mask) 261 | else: 262 | yield (P_id, P_feature, P_tag, P_ent, P_mask, 263 | H_id, H_feature, H_tag, H_ent, H_mask, label) 264 | -------------------------------------------------------------------------------- /FusionModel/FusionNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from . 
import layers
 5 | 
 6 | class FusionNet(nn.Module):
 7 |     """Network for the FusionNet Module."""
 8 |     def __init__(self, opt, embedding=None, padding_idx=0):
 9 |         super(FusionNet, self).__init__()
10 | 
11 |         # Input size to RNN: word emb + CoVe emb + POS emb + NER emb (manual features enter as aux input)
12 |         input_size = 0
13 | 
14 |         layers.set_my_dropout_prob(opt['my_dropout_p'])
15 |         layers.set_seq_dropout(opt['do_seq_dropout'])
16 | 
17 |         # Word embeddings
18 |         self.embedding = nn.Embedding(opt['vocab_size'],
19 |                                       opt['embedding_dim'],
20 |                                       padding_idx=padding_idx)
21 |         if embedding is not None:
22 |             self.embedding.weight.data = embedding
23 |             if opt['fix_embeddings'] or opt['tune_partial'] == 0:
24 |                 opt['fix_embeddings'] = True
25 |                 opt['tune_partial'] = 0
26 |                 for p in self.embedding.parameters():
27 |                     p.requires_grad = False
28 |             else:
29 |                 assert opt['tune_partial'] < embedding.size(0)
30 |                 fixed_embedding = embedding[opt['tune_partial']:]
31 |                 # a persistent buffer for the nn.Module
32 |                 self.register_buffer('fixed_embedding', fixed_embedding)
33 |                 self.fixed_embedding = fixed_embedding
34 |         embedding_dim = opt['embedding_dim']
35 |         input_size += embedding_dim
36 |         # Contextualized embeddings
37 |         self.CoVe = layers.MTLSTM(opt, embedding)
38 |         input_size += self.CoVe.output_size
39 |         # POS embeddings
40 |         self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
41 |         input_size += opt['pos_dim']
42 |         # NER embeddings
43 |         self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
44 |         input_size += opt['ner_dim']
45 | 
46 |         if opt['full_att_type'] == 2:
47 |             aux_input = opt['num_features']
48 |         else:
49 |             aux_input = 1
50 | 
51 |         # Setup the vector size for [premise, hypothesis]
52 |         # they will be modified in the following code
53 |         cur_hidden_size = input_size
54 |         print('Initially, the vector_size is {} (+ {})'.format(cur_hidden_size, aux_input))
55 | 
56 |         # RNN premise encoder
57 |         self.P_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['enc_rnn_layers'], aux_size = aux_input)
58 |         # RNN hypothesis encoder
59 |         self.H_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['enc_rnn_layers'], aux_size = aux_input)
60 |         cur_hidden_size = opt['hidden_size'] * 2
61 | 
62 |         # Output sizes of rnn encoders
63 |         print('After Input LSTM, the vector_size is [', cur_hidden_size, '] *', opt['enc_rnn_layers'])
64 | 
65 |         # Multi-level Fusion
66 |         if opt['full_att_type'] == 0:
67 |             self.full_attn_P = layers.FullAttention(cur_hidden_size, cur_hidden_size, 1)
68 |             self.full_attn_H = layers.FullAttention(cur_hidden_size, cur_hidden_size, 1)
69 |         elif opt['full_att_type'] == 1:
70 |             self.full_attn_P = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size, cur_hidden_size, 1)
71 |             self.full_attn_H = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size, cur_hidden_size, 1)
72 |         elif opt['full_att_type'] == 2:
73 |             self.full_attn_P = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size,
74 |                                                     opt['enc_rnn_layers'] * cur_hidden_size, opt['enc_rnn_layers'])
75 |             self.full_attn_H = layers.FullAttention(input_size + opt['enc_rnn_layers'] * cur_hidden_size,
76 |                                                     opt['enc_rnn_layers'] * cur_hidden_size, opt['enc_rnn_layers'])
77 |         else:
78 |             raise NotImplementedError('full_att_type = %s' % opt['full_att_type'])
79 |         cur_hidden_size = self.full_attn_P.output_size * 2
80 | 
81 |         # RNN premise inference
82 |         self.P_infer_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['inf_rnn_layers'])
83 |         # RNN hypothesis inference
84 
self.H_infer_rnn = layers.RNNEncoder(cur_hidden_size, opt['hidden_size'], opt['inf_rnn_layers']) 85 | cur_hidden_size = opt['hidden_size'] * 2 * opt['inf_rnn_layers'] 86 | 87 | print('Before answer finding, hidden size is', cur_hidden_size) 88 | 89 | # Question merging 90 | if opt['final_merge'] == 'linear_self_attn': 91 | self.self_attn_P = layers.LinearSelfAttn(cur_hidden_size) 92 | self.self_attn_H = layers.LinearSelfAttn(cur_hidden_size) 93 | elif opt['final_merge'] != 'avg': 94 | raise NotImplementedError('final_merge = %s' % opt['final_merge']) 95 | 96 | self.classifier = layers.MLPFunc(cur_hidden_size * 4, cur_hidden_size, opt['number_of_class']) 97 | 98 | # Store config 99 | self.opt = opt 100 | 101 | def forward(self, x1, x1_f, x1_pos, x1_ner, x1_mask, x2, x2_f, x2_pos, x2_ner, x2_mask): 102 | """Inputs: 103 | x1 = premise word indices [batch * len_1] 104 | x1_f = premise word features indices [batch * len_1 * nfeat] 105 | x1_pos = premise POS tags [batch * len_1] 106 | x1_ner = premise entity tags [batch * len_1] 107 | x1_mask = premise padding mask [batch * len_1] 108 | x2 = hypothesis word indices [batch * len_2] 109 | x2_f = hypothesis word features indices [batch * len_2 * nfeat] 110 | x2_pos = hypothesis POS tags [batch * len_2] 111 | x2_ner = hypothesis entity tags [batch * len_2] 112 | x2_mask = hypothesis padding mask [batch * len_2] 113 | """ 114 | # Prepare premise and hypothesis input 115 | Prnn_input_list = [] 116 | Hrnn_input_list = [] 117 | 118 | # Word embeddings 119 | emb = self.embedding if self.training else self.eval_embed 120 | x1_emb, x2_emb = emb(x1), emb(x2) 121 | # Dropout on embeddings 122 | if self.opt['dropout_emb'] > 0: 123 | x1_emb = layers.dropout(x1_emb, p=self.opt['dropout_emb'], training=self.training) 124 | x2_emb = layers.dropout(x2_emb, p=self.opt['dropout_emb'], training=self.training) 125 | Prnn_input_list.append(x1_emb) 126 | Hrnn_input_list.append(x2_emb) 127 | 128 | # Contextualized embeddings 129 | _, x1_cove = self.CoVe(x1, x1_mask) 130 | _, x2_cove = self.CoVe(x2, x2_mask) 131 | if self.opt['dropout_emb'] > 0: 132 | x1_cove = layers.dropout(x1_cove, p=self.opt['dropout_emb'], training=self.training) 133 | x2_cove = layers.dropout(x2_cove, p=self.opt['dropout_emb'], training=self.training) 134 | Prnn_input_list.append(x1_cove) 135 | Hrnn_input_list.append(x2_cove) 136 | 137 | # POS embeddings 138 | x1_pos_emb = self.pos_embedding(x1_pos) 139 | x2_pos_emb = self.pos_embedding(x2_pos) 140 | Prnn_input_list.append(x1_pos_emb) 141 | Hrnn_input_list.append(x2_pos_emb) 142 | 143 | # NER embeddings 144 | x1_ner_emb = self.ner_embedding(x1_ner) 145 | x2_ner_emb = self.ner_embedding(x2_ner) 146 | Prnn_input_list.append(x1_ner_emb) 147 | Hrnn_input_list.append(x2_ner_emb) 148 | 149 | x1_input = torch.cat(Prnn_input_list, 2) 150 | x2_input = torch.cat(Hrnn_input_list, 2) 151 | 152 | # Now the features are ready 153 | # x1_input: [batch_size, doc_len, input_size] 154 | # x2_input: [batch_size, doc_len, input_size] 155 | 156 | if self.opt['full_att_type'] == 2: 157 | x1_f = layers.dropout(x1_f, p=self.opt['dropout_EM'], training=self.training) 158 | x2_f = layers.dropout(x2_f, p=self.opt['dropout_EM'], training=self.training) 159 | Paux_input, Haux_input = x1_f, x2_f 160 | else: 161 | Paux_input = x1_f[:, :, 0].contiguous().view(x1_f.size(0), x1_f.size(1), 1) 162 | Haux_input = x2_f[:, :, 0].contiguous().view(x2_f.size(0), x2_f.size(1), 1) 163 | 164 | # Encode premise with RNN 165 | P_abstr_ls = self.P_rnn(x1_input, x1_mask, aux_input=Paux_input) 166 
| # Encode hypothesis with RNN 167 | H_abstr_ls = self.H_rnn(x2_input, x2_mask, aux_input=Haux_input) 168 | 169 | # Fusion 170 | if self.opt['full_att_type'] == 0: 171 | P_atts = P_abstr_ls[-1].contiguous() 172 | H_atts = H_abstr_ls[-1].contiguous() 173 | P_xs = P_abstr_ls[-1].contiguous() 174 | H_xs = H_abstr_ls[-1].contiguous() 175 | elif self.opt['full_att_type'] == 1: 176 | P_atts = torch.cat([x1_input] + P_abstr_ls, 2) 177 | H_atts = torch.cat([x2_input] + H_abstr_ls, 2) 178 | P_xs = P_abstr_ls[-1].contiguous() 179 | H_xs = H_abstr_ls[-1].contiguous() 180 | elif self.opt['full_att_type'] == 2: 181 | P_atts = torch.cat([x1_input] + P_abstr_ls, 2) 182 | H_atts = torch.cat([x2_input] + H_abstr_ls, 2) 183 | P_xs = torch.cat(P_abstr_ls, 2) 184 | H_xs = torch.cat(H_abstr_ls, 2) 185 | aP_xs = self.full_attn_P(P_atts, H_atts, P_xs, H_xs, x2_mask) 186 | aH_xs = self.full_attn_H(H_atts, P_atts, H_xs, P_xs, x1_mask) 187 | P_hiddens = torch.cat([P_xs, aP_xs], 2) 188 | H_hiddens = torch.cat([H_xs, aH_xs], 2) 189 | 190 | # Inference on premise and hypothesis 191 | P_hiddens = torch.cat(self.P_infer_rnn(P_hiddens, x1_mask), 2) 192 | H_hiddens = torch.cat(self.H_infer_rnn(H_hiddens, x2_mask), 2) 193 | 194 | # Merge hiddens for answer classification 195 | if self.opt['final_merge'] == 'avg': 196 | P_merge_weights = layers.uniform_weights(P_hiddens, x1_mask) 197 | H_merge_weights = layers.uniform_weights(H_hiddens, x2_mask) 198 | elif self.opt['final_merge'] == 'linear_self_attn': 199 | P_merge_weights = self.self_attn_P(P_hiddens, x1_mask) 200 | H_merge_weights = self.self_attn_H(H_hiddens, x2_mask) 201 | P_avg_hidden = layers.weighted_avg(P_hiddens, P_merge_weights) 202 | H_avg_hidden = layers.weighted_avg(H_hiddens, H_merge_weights) 203 | P_max_hidden = torch.max(P_hiddens, 1)[0] 204 | H_max_hidden = torch.max(H_hiddens, 1)[0] 205 | 206 | # Predict scores for different classes 207 | scores = self.classifier(torch.cat([P_avg_hidden, H_avg_hidden, P_max_hidden, H_max_hidden], 1)) 208 | 209 | return scores # -inf to inf 210 | -------------------------------------------------------------------------------- /FusionModel/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from torch.nn.parameter import Parameter 8 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 9 | from torch.nn.utils.rnn import pack_padded_sequence as pack 10 | 11 | # ------------------------------------------------------------------------------ 12 | # Neural Modules 13 | # ------------------------------------------------------------------------------ 14 | 15 | def set_seq_dropout(option): # option = True or False 16 | global do_seq_dropout 17 | do_seq_dropout = option 18 | 19 | def set_my_dropout_prob(p): # p between 0 to 1 20 | global my_dropout_p 21 | my_dropout_p = p 22 | 23 | def seq_dropout(x, p=0, training=False): 24 | """ 25 | x: batch * len * input_size 26 | """ 27 | if training == False or p == 0: 28 | return x 29 | dropout_mask = Variable(1.0 / (1-p) * torch.bernoulli((1-p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)), requires_grad=False) 30 | return dropout_mask.unsqueeze(1).expand_as(x) * x 31 | 32 | def dropout(x, p=0, training=False): 33 | """ 34 | x: (batch * len * input_size) or (any other shape) 35 | """ 36 | if do_seq_dropout and len(x.size()) == 3: # if x is (batch * len * input_size) 37 | return 
seq_dropout(x, p=p, training=training) 38 | else: 39 | return F.dropout(x, p=p, training=training) 40 | 41 | class RNNEncoder(nn.Module): 42 | def __init__(self, input_size, hidden_size, num_layers, rnn_type=nn.LSTM, aux_size=0): 43 | super(RNNEncoder, self).__init__() 44 | self.num_layers = num_layers 45 | self.rnns = nn.ModuleList() 46 | for i in range(num_layers): 47 | input_size_ = (input_size + 2 * hidden_size * i) 48 | if i == 0: input_size_ += aux_size 49 | self.rnns.append(rnn_type(input_size_, hidden_size, num_layers=1, bidirectional=True)) 50 | 51 | def forward(self, x, x_mask, aux_input=None): 52 | # Transpose batch and sequence dims 53 | x = x.transpose(0, 1) 54 | if aux_input is not None: 55 | aux_input = aux_input.transpose(0, 1) 56 | 57 | # Encode all layers 58 | hiddens = [x] 59 | for i in range(self.num_layers): 60 | rnn_input = torch.cat(hiddens, 2) 61 | if i == 0 and aux_input is not None: 62 | rnn_input = torch.cat([rnn_input, aux_input], 2) 63 | 64 | # Apply dropout to input 65 | if my_dropout_p > 0: 66 | rnn_input = dropout(rnn_input, p=my_dropout_p, training=self.training) 67 | # Forward 68 | rnn_output = self.rnns[i](rnn_input)[0] 69 | hiddens.append(rnn_output) 70 | 71 | # Transpose back 72 | hiddens = [h.transpose(0, 1) for h in hiddens] 73 | return hiddens[1:] 74 | 75 | class MTLSTM(nn.Module): 76 | def __init__(self, opt, embedding=None, padding_idx=0): 77 | """Initialize an MTLSTM 78 | 79 | Arguments: 80 | embedding (Float Tensor): If not None, initialize embedding matrix with specified embedding vectors 81 | """ 82 | super(MTLSTM, self).__init__() 83 | 84 | self.embedding = nn.Embedding(opt['vocab_size'], opt['embedding_dim'], padding_idx=padding_idx) 85 | if embedding is not None: 86 | self.embedding.weight.data = embedding 87 | 88 | state_dict = torch.load(opt['MTLSTM_path']) 89 | self.rnn1 = nn.LSTM(300, 300, num_layers=1, bidirectional=True) 90 | self.rnn2 = nn.LSTM(600, 300, num_layers=1, bidirectional=True) 91 | 92 | state_dict1 = dict([(name, param.data) if isinstance(param, Parameter) else (name, param) 93 | for name, param in state_dict.items() if '0' in name]) 94 | state_dict2 = dict([(name.replace('1', '0'), param.data) if isinstance(param, Parameter) else (name.replace('1', '0'), param) 95 | for name, param in state_dict.items() if '1' in name]) 96 | self.rnn1.load_state_dict(state_dict1) 97 | self.rnn2.load_state_dict(state_dict2) 98 | 99 | for p in self.embedding.parameters(): 100 | p.requires_grad = False 101 | for p in self.rnn1.parameters(): 102 | p.requires_grad = False 103 | for p in self.rnn2.parameters(): 104 | p.requires_grad = False 105 | 106 | self.output_size = 600 107 | 108 | def setup_eval_embed(self, eval_embed, padding_idx=0): 109 | """Allow evaluation vocabulary size to be greater than training vocabulary size 110 | 111 | Arguments: 112 | eval_embed (Float Tensor): Initialize eval_embed to be the specified embedding vectors 113 | """ 114 | self.eval_embed = nn.Embedding(eval_embed.size(0), eval_embed.size(1), padding_idx = padding_idx) 115 | self.eval_embed.weight.data = eval_embed 116 | 117 | for p in self.eval_embed.parameters(): 118 | p.requires_grad = False 119 | 120 | def forward(self, x_idx, x_mask): 121 | """A pretrained MT-LSTM (McCann et. al. 2017). 122 | This LSTM was trained with 300d 840B GloVe on the WMT 2017 machine translation dataset. 123 | 124 | Arguments: 125 | x_idx (Long Tensor): a Long Tensor of size (batch * len). 126 | x_mask (Byte Tensor): a Byte Tensor of mask for the input tensor (batch * len). 
127 | """ 128 | emb = self.embedding if self.training else self.eval_embed 129 | x_hiddens = emb(x_idx) 130 | 131 | lengths = x_mask.data.eq(0).long().sum(1).squeeze() 132 | lens, indices = torch.sort(lengths, 0, True) 133 | 134 | output1, _ = self.rnn1(pack(x_hiddens[indices], lens.tolist(), batch_first=True)) 135 | output2, _ = self.rnn2(output1) 136 | 137 | output1 = unpack(output1, batch_first=True)[0] 138 | output2 = unpack(output2, batch_first=True)[0] 139 | 140 | _, _indices = torch.sort(indices, 0) 141 | output1 = output1[_indices] 142 | output2 = output2[_indices] 143 | 144 | return output1, output2 145 | 146 | # Attention layer 147 | class FullAttention(nn.Module): 148 | def __init__(self, full_size, hidden_size, num_level): 149 | super(FullAttention, self).__init__() 150 | assert(hidden_size % num_level == 0) 151 | self.full_size = full_size 152 | self.hidden_size = hidden_size 153 | self.attsize_per_lvl = hidden_size // num_level 154 | self.num_level = num_level 155 | self.linear = nn.Linear(full_size, hidden_size, bias=False) 156 | self.linear_final = Parameter(torch.ones(1, hidden_size), requires_grad = True) 157 | self.output_size = hidden_size 158 | print("Full Attention: (atten. {} -> {}, take {}) x {}".format(self.full_size, self.attsize_per_lvl, hidden_size // num_level, self.num_level)) 159 | 160 | def forward(self, x1_att, x2_att, x1, x2, x2_mask): 161 | """ 162 | x1_att: batch * len1 * full_size 163 | x2_att: batch * len2 * full_size 164 | x1: batch * len1 * hidden_size 165 | x2: batch * len2 * hidden_size 166 | x2_mask: batch * len2 167 | """ 168 | x1_att = dropout(x1_att, p=my_dropout_p, training=self.training) 169 | x2_att = dropout(x2_att, p=my_dropout_p, training=self.training) 170 | 171 | x1_key = F.relu(self.linear(x1_att.view(-1, self.full_size))) 172 | x2_key = F.relu(self.linear(x2_att.view(-1, self.full_size))) 173 | final_v = self.linear_final.expand_as(x2_key) 174 | x2_key = final_v * x2_key 175 | 176 | x1_rep = x1_key.view(-1, x1.size(1), self.num_level, self.attsize_per_lvl).transpose(1, 2).contiguous().view(-1, x1.size(1), self.attsize_per_lvl) 177 | x2_rep = x2_key.view(-1, x2.size(1), self.num_level, self.attsize_per_lvl).transpose(1, 2).contiguous().view(-1, x2.size(1), self.attsize_per_lvl) 178 | 179 | scores = x1_rep.bmm(x2_rep.transpose(1, 2)).view(-1, self.num_level, x1.size(1), x2.size(1)) # batch * num_level * len1 * len2 180 | 181 | x2_mask = x2_mask.unsqueeze(1).unsqueeze(2).expand_as(scores) 182 | scores.data.masked_fill_(x2_mask.data, -float('inf')) 183 | 184 | alpha_flat = F.softmax(scores.view(-1, x2.size(1))) 185 | alpha = alpha_flat.view(-1, x1.size(1), x2.size(1)) 186 | 187 | size_per_level = self.hidden_size // self.num_level 188 | atten_seq = alpha.bmm(x2.contiguous().view(-1, x2.size(1), self.num_level, size_per_level).transpose(1, 2).contiguous().view(-1, x2.size(1), size_per_level)) 189 | 190 | return atten_seq.view(-1, self.num_level, x1.size(1), size_per_level).transpose(1, 2).contiguous().view(-1, x1.size(1), self.hidden_size) 191 | 192 | # For summarizing a set of vectors into a single vector 193 | class LinearSelfAttn(nn.Module): 194 | """Self attention over a sequence: 195 | * o_i = softmax(Wx_i) for x_i in X. 
196 | """ 197 | def __init__(self, input_size): 198 | super(LinearSelfAttn, self).__init__() 199 | self.linear = nn.Linear(input_size, 1) 200 | 201 | def forward(self, x, x_mask): 202 | """ 203 | x = batch * len * hdim 204 | x_mask = batch * len 205 | """ 206 | x = dropout(x, p=my_dropout_p, training=self.training) 207 | 208 | x_flat = x.contiguous().view(-1, x.size(-1)) 209 | scores = self.linear(x_flat).view(x.size(0), x.size(1)) 210 | scores.data.masked_fill_(x_mask.data, -float('inf')) 211 | alpha = F.softmax(scores) 212 | return alpha 213 | 214 | # Answer finding 215 | class MLPFunc(nn.Module): 216 | """ 217 | A multi-layer perceptron function for x: o = v'tanh(Wx+b). 218 | """ 219 | def __init__(self, input_size, hidden_size, num_class): 220 | super(MLPFunc, self).__init__() 221 | self.linear = nn.Linear(input_size, hidden_size) 222 | self.linear_final = nn.Linear(hidden_size, num_class, bias=False) 223 | 224 | def forward(self, x): 225 | """ 226 | x = batch * input_size 227 | """ 228 | x = dropout(x, p=my_dropout_p, training=self.training) 229 | h = F.tanh(self.linear(x)) 230 | h = dropout(h, p=my_dropout_p, training=self.training) 231 | o = self.linear_final(h) 232 | return o # batch * num_classes 233 | 234 | # ------------------------------------------------------------------------------ 235 | # Functional 236 | # ------------------------------------------------------------------------------ 237 | 238 | # by default in PyTorch, +-*/ are all element-wise 239 | def uniform_weights(x, x_mask): # used in lego_reader.py 240 | """Return uniform weights over non-masked input.""" 241 | alpha = Variable(torch.ones(x.size(0), x.size(1))) 242 | if x.data.is_cuda: 243 | alpha = alpha.cuda() 244 | alpha = alpha * x_mask.eq(0).float() 245 | alpha = alpha / alpha.sum(1).expand(alpha.size()) 246 | return alpha 247 | 248 | def weighted_avg(x, weights): # used in lego_reader.py 249 | """ x = batch * len * d 250 | weights = batch * len 251 | """ 252 | return weights.unsqueeze(1).bmm(x).squeeze(1) 253 | --------------------------------------------------------------------------------