├── ubuntu_data └── command_description.npy ├── util.py ├── requirements.txt ├── README.md ├── evaluation.py ├── preprocess.py ├── run_models.py ├── data.py └── models.py /ubuntu_data/command_description.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SmartDataAnalytics/AK-DE-biGRU/HEAD/ubuntu_data/command_description.npy -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def save_model(model, name): 6 | if not os.path.exists('models/'): 7 | os.makedirs('models/') 8 | 9 | torch.save(model.state_dict(), 'models/{}.bin'.format(name)) 10 | 11 | 12 | def load_model(model, name, gpu=True): 13 | if gpu: 14 | model.load_state_dict(torch.load('models/{}.bin'.format(name))) 15 | else: 16 | model.load_state_dict(torch.load('models/{}.bin'.format(name), map_location=lambda storage, loc: storage)) 17 | 18 | return model 19 | 20 | 21 | def clip_gradient_threshold(model, min, max): 22 | for p in model.parameters(): 23 | if p.grad is not None: 24 | p.grad.data.clamp_(min, max) 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto==2.48.0 2 | boto3==1.6.6 3 | botocore==1.9.6 4 | bz2file==0.98 5 | certifi==2018.1.18 6 | cffi==1.11.5 7 | chardet==3.0.4 8 | cycler==0.10.0 9 | decorator==4.2.1 10 | docutils==0.14 11 | gensim==3.4.0 12 | h5py==2.7.1 13 | hickle==2.0.5 14 | idna==2.6 15 | ipdb==0.11 16 | ipython==6.2.1 17 | ipython-genutils==0.2.0 18 | jedi==0.11.1 19 | jmespath==0.9.3 20 | kiwisolver==1.0.1 21 | matplotlib==2.2.2 22 | nltk==3.2.5 23 | numpy==1.14.1 24 | olefile==0.45.1 25 | pandas==0.22.0 26 | parso==0.1.1 27 | pexpect==4.4.0 28 | pickleshare==0.7.4 29 | Pillow==5.0.0 30 | prompt-toolkit==1.0.15 31 | ptyprocess==0.5.2 32 | pycparser==2.18 33 | Pygments==2.2.0 34 | pyparsing==2.2.0 35 | python-dateutil==2.6.1 36 | pytz==2018.3 37 | regex==2018.2.21 38 | requests==2.18.4 39 | s3transfer==0.1.13 40 | scipy==1.0.0 41 | simplegeneric==0.8.1 42 | six==1.11.0 43 | smart-open==1.5.6 44 | Theano==1.0.1 45 | torch==0.3.1.post2 46 | torchtext==0.2.1 47 | torchvision==0.2.0 48 | tqdm==4.19.8 49 | traitlets==4.3.2 50 | urllib3==1.22 51 | wcwidth==0.1.7 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Attention and external Knowledge augmented Dual Encoder with bi-directional GRU (AK-DE-biGRU) 2 | 3 | Code for implementing the paper : "[Improving Response Selection in Multi-turn Dialogue Systems by Incorporating Domain Knowledge](https://arxiv.org/pdf/1809.03194.pdf)" 4 | 5 | ## Getting Started 6 | 7 | We use python version 3.6.4 8 | Install the requirements.txt file and install pytorch version: "0.3.1.post2" 9 | 10 | ### Prerequisites 11 | 12 | Download the pre-processed files from Wu et. al, from here: https://www.dropbox.com/s/2fdn26rj6h9bpvl/ubuntu%20data.zip?dl=0and save it in ubuntu_data. 
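The fastText step described below produces a `fast_text_200.vec` file that then has to be packed into the `fast_text_200_v.npy` matrix read by data.py: row i of the matrix holds the vector of the word whose id is i in `ubuntu_data/vocab.txt`. A minimal sketch of that conversion, assuming gensim's `KeyedVectors` can read the `.vec` output and that vocabulary words missing from it fall back to small random vectors, could look like this:

```python
# build_embeddings.py -- illustrative sketch, not part of the original repository.
import numpy as np
from gensim.models import KeyedVectors

# fastText's skipgram command writes fast_text_200.vec in word2vec text format
vecs = KeyedVectors.load_word2vec_format('fast_text_200.vec')

# vocab.txt maps "word<TAB>id", exactly as preprocess.py reads it
w2id = {}
with open('ubuntu_data/vocab.txt') as f:
    for line in f:
        word, idx = line.rstrip('\n').split('\t')
        w2id[word] = int(idx)

dim = 200
# assumption: words unseen by fastText get small random vectors
mat = np.random.uniform(-0.1, 0.1, (max(w2id.values()) + 1, dim)).astype(np.float32)
for word, idx in w2id.items():
    if word in vecs:
        mat[idx] = vecs[word]

# data.py loads this exact filename when use_fasttext=True
np.save('ubuntu_data/fast_text_200_v.npy', mat)
```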
13 | Run: python ./preprocess.py
14 | to create the required preprocessed dataset.
15 | The resulting ubuntu_data/dataset_1M.pkl file is read by data.py.
16 | Use the train.txt file to train a fastText model with the fastText library (https://github.com/facebookresearch/fastText):
17 | ```
18 | ./fasttext skipgram -input train.txt -dim 200 -output fast_text_200
19 | ```
20 | Save the resulting vectors into a numpy array whose row index corresponds to the word_id from the vocabulary dictionary and whose row contains the fastText vector for that word.
21 | Copy the file to the ubuntu_data directory.
22 | 
23 | Download the command_description.npy file provided and copy it to the ubuntu_data directory.
24 | 
25 | ## Running the model
26 | 
27 | The AK-DE-biGRU model can be run as:
28 | ```
29 | python -u run_models.py --h_dim 300 --mb_size 32 --n_epoch 20 --gpu --lr 0.0001
30 | ```
31 | 
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | import numpy as np
4 | import scipy.stats as st
5 | from tqdm import tqdm
6 | 
7 | 
8 | def recall_at_k_np(scores, ks=[1, 2, 3, 4, 5]):
9 |     """
10 |     Evaluate recall@k.
11 |     :param scores: sigmoid scores, one row of 10 candidates per context
12 |     :param ks: cutoff values k
13 |     :return: list of recall@k values, one per k
14 |     """
15 |     # sort the scores in descending order
16 |     sorted_idxs = np.argsort(-scores, axis=1)
17 |     ranks = (sorted_idxs == 0).argmax(1)  # rank of the ground-truth response (column 0)
18 |     recalls = [np.mean(ranks+1 <= k) for k in ks]
19 |     return recalls
20 | 
21 | 
22 | def eval_model(model, dataset, mode='valid', gpu=False, no_tqdm=False):
23 |     """
24 |     Evaluation for DKE-GRU and AddGRU models.
25 |     :param model: model to evaluate
26 |     :param dataset: UDCv1 dataset wrapper
27 |     :param mode: 'valid' or 'test'
28 |     :param gpu: whether scores live on the GPU and must be moved to the CPU
29 |     :param no_tqdm: disable the tqdm progress bar
30 |     :return: list of recall@k values
31 |     """
32 |     model.eval()
33 |     scores = []
34 | 
35 |     assert mode in ['valid', 'test']
36 | 
37 |     data_iter = dataset.get_iter(mode)
38 | 
39 |     if not no_tqdm:
40 |         data_iter = tqdm(data_iter)
41 |         data_iter.set_description_str('Evaluation')
42 |         n_data = dataset.n_valid if mode == 'valid' else dataset.n_test
43 |         data_iter.total = n_data // dataset.batch_size
44 | 
45 |     for mb in data_iter:
46 |         context, response, y, cm, rm, key_r, key_mask_r = mb
47 | 
48 |         # Get scores
49 |         scores_mb = F.sigmoid(model(context, response, cm, rm, key_r, key_mask_r))  # Adapt this line when running different models.
50 |         scores_mb = scores_mb.cpu() if gpu else scores_mb
51 |         scores.append(scores_mb.data.numpy())
52 | 
53 |     scores = np.concatenate(scores)
54 | 
55 |     # Handle the case when the number of examples is not divisible by 10
56 |     mod = scores.shape[0] % 10
57 |     scores = scores[:-mod if mod != 0 else None]
58 | 
59 |     scores = scores.reshape(-1, 10)  # 1-in-10 candidate ranking
60 |     recall_at_ks = [r for r in recall_at_k_np(scores)]
61 | 
62 |     return recall_at_ks
63 | 
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pickle
3 | from collections import defaultdict
4 | 
5 | def getw2id(word, w2id):
6 |     """
7 |     Get the id of a word from the dictionary, falling back to the unknown token.
8 |     :param word: word string
9 |     :param w2id: word-to-id dictionary
10 |     :return: integer word id
11 |     """
12 |     try:
13 |         return w2id[word]
14 |     except KeyError:
15 |         return w2id['**unknown**']
16 | 
17 | def get_values(file, get_c_d=False, w2id=None):
18 |     """
19 |     Get labels, contexts and responses from a data file.
20 | :param file: filel name 21 | :param get_c_d: 22 | :return: 23 | """ 24 | data = open(file, 'r').readlines() 25 | data = [sent.split('\n')[0].split('\t') for sent in data] 26 | chars = [] 27 | y = [int(a[0]) for a in data] 28 | c = [' __EOS__ '.join(a[1:-1]).split() for a in data] 29 | c = [[getw2id(w, w2id) for w in s] for s in c] 30 | r = [a[-1].split() for a in data] 31 | r = [[getw2id(w, w2id) for w in s] for s in r] 32 | if get_c_d: 33 | for word in c: 34 | sent = ' '.join(word) 35 | for char in sent: 36 | chars.append(char) 37 | chars = set(chars) 38 | return y, c, r, dict(zip(chars, range(len(chars)))) 39 | else: 40 | return y, c, r 41 | 42 | 43 | if __name__ == '__main__': 44 | #load the vocab file 45 | vocab = open('ubuntu_data/vocab.txt', 'r').readlines() 46 | w2id = {} 47 | for word in vocab: 48 | w = word.split('\n')[0].split('\t') 49 | w2id[w[0]] =int(w[1]) 50 | 51 | train, test, valid = {}, {}, {} 52 | train['y'], train['c'], train['r'] = get_values('ubuntu_data/train.txt', get_c_d=False, w2id=w2id) 53 | test['y'], test['c'], test['r'] = get_values('ubuntu_data/test.txt', w2id=w2id) 54 | valid['y'], valid['c'], valid['r'] = get_values('ubuntu_data/valid.txt', w2id=w2id) 55 | #char_vocab = defaultdict(float) 56 | dataset = train, valid, test 57 | pickle.dump(dataset, open('ubuntu_data/dataset_1M.pkl', 'wb')) -------------------------------------------------------------------------------- /run_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | import numpy as np 5 | from data import UDCv1 6 | from evaluation import eval_model 7 | from util import save_model, clip_gradient_threshold, load_model 8 | from models import biGRU, A_DE_bigRU, AK_DE_biGRU, Add_GRU 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | parser = argparse.ArgumentParser( 13 | description='UDC Experiment Runner' 14 | ) 15 | 16 | parser.add_argument('--gpu', default=False, action='store_true', 17 | help='whether to run in the GPU') 18 | parser.add_argument('--h_dim', type=int, default=100, metavar='', 19 | help='hidden dimension (default: 100)') 20 | parser.add_argument('--lr', type=float, default=1e-3, metavar='', 21 | help='learning rate (default: 1e-3)') 22 | parser.add_argument('--emb_drop', type=float, default=0.3, metavar='', 23 | help='embedding dropout (default: 0.3)') 24 | parser.add_argument('--mb_size', type=int, default=128, metavar='', 25 | help='size of minibatch (default: 128)') 26 | parser.add_argument('--n_epoch', type=int, default=500, metavar='', 27 | help='number of iterations (default: 500)') 28 | parser.add_argument('--randseed', type=int, default=123, metavar='', 29 | help='random seed (default: 123)') 30 | parser.add_argument('--no_tqdm', default=False, action='store_true', 31 | help='disable tqdm progress bar') 32 | parser.add_argument('--early_stop', type=int, default=3, 33 | help='early stopping') 34 | args = parser.parse_args() 35 | 36 | # Set random seed 37 | np.random.seed(args.randseed) 38 | torch.manual_seed(args.randseed) 39 | 40 | if args.gpu: 41 | torch.cuda.manual_seed(args.randseed) 42 | 43 | max_seq_len = 320 44 | model_name = 'AK_DE_biGRU' 45 | #dataset 46 | udc = UDCv1('ubuntu_data', batch_size=args.mb_size, use_mask=True, 47 | max_seq_len=max_seq_len, gpu=args.gpu, use_fasttext=True) 48 | #model definition 49 | model = AK_DE_biGRU( 50 | udc.emb_dim, udc.vocab_size, args.h_dim, udc.vectors, 0, args.gpu 51 | ) 52 | #optimizer 53 | 
solver = optim.Adam(model.parameters(), lr=args.lr) 54 | 55 | if args.gpu: 56 | model.cuda() 57 | 58 | 59 | def run_model(): 60 | """ 61 | Training method 62 | :return: 63 | """ 64 | best_val = 0.0 65 | recall1s = [] 66 | for epoch in range(args.n_epoch): 67 | print('\n\n-------------------------------------------') 68 | print('Epoch-{}'.format(epoch)) 69 | print('-------------------------------------------') 70 | 71 | model.train() 72 | 73 | train_iter = enumerate(udc.get_iter('train')) 74 | 75 | if not args.no_tqdm: 76 | train_iter = tqdm(train_iter) 77 | train_iter.set_description_str('Training') 78 | train_iter.total = udc.n_train // udc.batch_size 79 | 80 | for it, mb in train_iter: 81 | context, response, y, cm, rm, key_r, key_mask_r = mb 82 | output = model(context, response, cm, rm, key_r, key_mask_r) # Appropriate this line while running different models 83 | loss = F.binary_cross_entropy_with_logits(output, y) 84 | 85 | loss.backward() 86 | solver.step() 87 | solver.zero_grad() 88 | 89 | # Validation 90 | recall_at_ks = eval_model( 91 | model, udc, 'valid', gpu=args.gpu, no_tqdm=args.no_tqdm 92 | ) 93 | 94 | print('Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}' 95 | .format(loss.data[0], recall_at_ks[0], recall_at_ks[1], recall_at_ks[4])) 96 | recall_1 = recall_at_ks[0] 97 | # if epoch > 10: 98 | # eval_test() 99 | 100 | if best_val == 0.0: 101 | save_model(model, model_name) 102 | best_val = recall_1 103 | recall1s.append(recall_1) 104 | else: 105 | if recall_1 > best_val: 106 | best_val = recall_1 107 | print ("Saving model for recall@1:" + str(recall_1)) 108 | save_model(model, model_name) 109 | else: 110 | print ("Not saving, best accuracy so far:" + str(best_val)) 111 | #Early stopping 112 | if recall_1 < np.max(recall1s[-args.early_stop:]): 113 | break 114 | 115 | 116 | def eval_test(model): 117 | ''' 118 | Evaluation 119 | :param model: 120 | :return: 121 | ''' 122 | print('\n\nEvaluating on test set...') 123 | print('-------------------------------') 124 | print('Loading the best model........') 125 | model = load_model(model, model_name) 126 | model.eval() 127 | recall_at_ks = eval_model( 128 | model, udc, 'test', gpu=args.gpu, no_tqdm=args.no_tqdm 129 | ) 130 | 131 | print('Recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}' 132 | .format(recall_at_ks[0], recall_at_ks[1], recall_at_ks[4])) 133 | 134 | 135 | if __name__ == '__main__': 136 | #run the models 137 | try: 138 | run_model() 139 | eval_test(model) 140 | except KeyboardInterrupt: 141 | eval_test(model) 142 | exit(0) 143 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext.vocab import Vocab, GloVe 3 | import torch 4 | from torch.autograd import Variable 5 | import re 6 | from collections import OrderedDict, Counter 7 | import numpy as np 8 | import pickle 9 | 10 | URL_TOK = '__url__' 11 | PATH_TOK = '__path__' 12 | 13 | 14 | class UDCv1: 15 | """ 16 | Wrapper for UDCv2 taken from: http://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/. 17 | Everything has been preprocessed and converted to numerical indexes. 
18 | """ 19 | 20 | def __init__(self, path, batch_size=256, max_seq_len=160, use_mask=False, gpu=True, use_fasttext=False): 21 | self.batch_size = batch_size 22 | self.max_seq_len_c = max_seq_len 23 | self.max_seq_len_r = int(max_seq_len/2) 24 | self.use_mask = use_mask 25 | self.gpu = gpu 26 | 27 | self.desc_len = 44 28 | #load the dataset pickle file 29 | with open(f'{path}/dataset_1M.pkl', 'rb') as f: 30 | dataset = pickle.load(f, encoding='ISO-8859-1') 31 | self.train, self.valid, self.test = dataset 32 | #load the fasttext vector 33 | if use_fasttext: 34 | vectors = np.load(f'{path}/fast_text_200_v.npy') 35 | #vectors = np.load(f'{path}/w2vec_200.npy') 36 | #man_vec = np.load(f'{path}/key_vec.npy') 37 | else: 38 | with open(f'{path}/W.pkl', 'rb') as f: 39 | vectors, _ = pickle.load(f, encoding='ISO-8859-1') 40 | #load the command description file 41 | self.ubuntu_cmd_vec = np.load(f'{path}/command_description.npy').item() 42 | #self.ubuntu_cmd_vec = np.load(f'{path}/man_dict_key.npy').item() 43 | 44 | print('Finished loading dataset!') 45 | 46 | self.n_train = len(self.train['y']) 47 | self.n_valid = len(self.valid['y']) 48 | self.n_test = len(self.test['y']) 49 | self.vectors = torch.from_numpy(vectors.astype(np.float32)) 50 | #self.man_vec = torch.from_numpy(man_vec.astype(np.float32)) 51 | 52 | self.vocab_size = self.vectors.size(0) 53 | self.emb_dim = self.vectors.size(1) 54 | 55 | def get_iter(self, dataset='train'): 56 | if dataset == 'train': 57 | dataset = self.train 58 | elif dataset == 'valid': 59 | dataset = self.valid 60 | else: 61 | dataset = self.test 62 | 63 | for i in range(0, len(dataset['y']), self.batch_size): 64 | c = dataset['c'][i:i+self.batch_size] 65 | r = dataset['r'][i:i+self.batch_size] 66 | y = dataset['y'][i:i+self.batch_size] 67 | 68 | 69 | c, r, y, c_mask, r_mask, key_r, key_mask_r = self._load_batch(c, r, y, self.batch_size) 70 | 71 | if self.use_mask: 72 | yield c, r, y, c_mask, r_mask, key_r, key_mask_r 73 | else: 74 | yield c, r, y 75 | 76 | 77 | def get_key(self, sentence, max_seq_len, max_len): 78 | """ 79 | get key mask 80 | :param sentence: 81 | :param max_len: 82 | :return: 83 | """ 84 | key_mask = np.zeros((max_seq_len)) 85 | keys = np.zeros((max_seq_len, max_len)) 86 | for j, word in enumerate(sentence): 87 | if int(word) in self.ubuntu_cmd_vec.keys(): 88 | keys[j] = self.ubuntu_cmd_vec[int(word)][:max_len] 89 | key_mask[j] = 1 90 | else: 91 | keys[j] = np.zeros((max_len)) 92 | return key_mask, keys 93 | 94 | 95 | def _load_batch(self, c, r, y, size): 96 | c_arr = np.zeros([size, self.max_seq_len_c], np.int) 97 | r_arr = np.zeros([size, self.max_seq_len_r], np.int) 98 | y_arr = np.zeros(size, np.float32) 99 | 100 | c_mask = np.zeros([size, self.max_seq_len_c], np.float32) 101 | r_mask = np.zeros([size, self.max_seq_len_r], np.float32) 102 | 103 | #key_c = np.zeros([size, self.max_seq_len_c, self.desc_len], np.float32) 104 | key_r = np.zeros([size, self.max_seq_len_r, self.desc_len], np.float32) 105 | 106 | #key_mask_c = np.zeros([size, self.max_seq_len_c], np.float32) 107 | key_mask_r = np.zeros([size, self.max_seq_len_r], np.float32) 108 | 109 | for j, (row_c, row_r, row_y) in enumerate(zip(c, r, y)): 110 | 111 | # Truncate 112 | row_c = row_c[:self.max_seq_len_c] 113 | row_r = row_r[:self.max_seq_len_r] 114 | 115 | c_arr[j, :len(row_c)] = row_c 116 | r_arr[j, :len(row_r)] = row_r 117 | y_arr[j] = float(row_y) 118 | 119 | 120 | c_mask[j, :len(row_c)] = 1 121 | r_mask[j, :len(row_r)] = 1 122 | 123 | #key_mask_c[j], key_c[j] = 
self.get_key(row_c, self.max_seq_len_c, self.desc_len) 124 | key_mask_r[j], key_r[j] = self.get_key(row_r, self.max_seq_len_r, self.desc_len) 125 | 126 | # Convert to PyTorch tensor 127 | c = Variable(torch.from_numpy(c_arr)) 128 | r = Variable(torch.from_numpy(r_arr)) 129 | y = Variable(torch.from_numpy(y_arr)) 130 | c_mask = Variable(torch.from_numpy(c_mask)) 131 | r_mask = Variable(torch.from_numpy(r_mask)) 132 | 133 | 134 | #key_mask_c = Variable(torch.from_numpy(key_mask_c), requires_grad = False) 135 | key_mask_r = Variable(torch.from_numpy(key_mask_r), requires_grad = False) 136 | 137 | #key_c = Variable(torch.from_numpy(key_c)).type(torch.LongTensor) 138 | key_r = Variable(torch.from_numpy(key_r)).type(torch.LongTensor) 139 | 140 | 141 | 142 | # Load to GPU 143 | if self.gpu: 144 | c, r, y = c.cuda(), r.cuda(), y.cuda() 145 | c_mask, r_mask = c_mask.cuda(), r_mask.cuda() 146 | #r_mask = r_mask.cuda() 147 | #key_c, key_mask_c, key_r, key_mask_r = key_c.cuda(), key_mask_c.cuda(), key_r.cuda(), key_mask_r.cuda() 148 | key_r, key_mask_r = key_r.cuda(), key_mask_r.cuda() 149 | 150 | return c, r, y, c_mask, r_mask, key_r, key_mask_r 151 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | #imports 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | import math 8 | 9 | 10 | class biGRU(nn.Module): 11 | 12 | def __init__(self, emb_dim, n_vocab, h_dim=256, pretrained_emb=None, gpu=False, emb_drop=0.6, pad_idx=0): 13 | super(biGRU, self).__init__() 14 | 15 | self.word_embed = nn.Embedding(n_vocab, emb_dim, padding_idx=pad_idx) 16 | 17 | if pretrained_emb is not None: 18 | self.word_embed.weight.data.copy_(pretrained_emb) 19 | 20 | self.rnn = nn.GRU( 21 | input_size=emb_dim, hidden_size=h_dim, 22 | num_layers=1, batch_first=True, bidirectional=True 23 | ) 24 | 25 | self.emb_drop = nn.Dropout(emb_drop) 26 | 27 | self.M = nn.Parameter(torch.FloatTensor(2*h_dim, 2*h_dim)) 28 | 29 | self.b = nn.Parameter(torch.FloatTensor([0])) 30 | 31 | self.init_params_() 32 | 33 | if gpu: 34 | self.cuda() 35 | 36 | def init_params_(self): 37 | nn.init.xavier_normal(self.M) 38 | 39 | # Set forget gate bias to 2 40 | size = self.rnn.bias_hh_l0.size(0) 41 | self.rnn.bias_hh_l0.data[size//4:size//2] = 2 42 | 43 | size = self.rnn.bias_ih_l0.size(0) 44 | self.rnn.bias_ih_l0.data[size//4:size//2] = 2 45 | 46 | def forward(self, x1, x2): 47 | """ 48 | Inputs: 49 | ------- 50 | x1, x2: seqs of words (batch_size, seq_len) 51 | 52 | Outputs: 53 | -------- 54 | o: vector of (batch_size) 55 | """ 56 | c, r = self.forward_enc(x1, x2) 57 | o = self.forward_fc(c, r) 58 | 59 | return o.view(-1) 60 | 61 | def forward_enc(self, x1, x2): 62 | """ 63 | x1, x2: seqs of words (batch_size, seq_len) 64 | """ 65 | # Both are (batch_size, seq_len, emb_dim) 66 | x1_emb = self.emb_drop(self.word_embed(x1)) 67 | x2_emb = self.emb_drop(self.word_embed(x2)) 68 | 69 | # Each is (1 x batch_size x h_dim) 70 | _, c = self.rnn(x1_emb) 71 | _, r = self.rnn(x2_emb) 72 | #concatenate both layers 73 | c = torch.cat([c[0], c[1]], dim=-1) 74 | r = torch.cat([r[0], r[1]], dim=-1) 75 | 76 | return c.squeeze(), r.squeeze() 77 | 78 | def forward_fc(self, c, r): 79 | """ 80 | c, r: tensor of (batch_size, h_dim) 81 | """ 82 | # (batch_size x 1 x h_dim) 83 | o = torch.mm(c, self.M).unsqueeze(1) 84 | # (batch_size x 1 x 1) 85 | o = 
torch.bmm(o, r.unsqueeze(2)) 86 | o = o + self.b 87 | 88 | return o 89 | 90 | 91 | class A_DE_bigRU(nn.Module): 92 | 93 | def __init__(self, emb_dim, n_vocab, h_dim=256, pretrained_emb=None, gpu=False, emb_drop=0.6, pad_idx=0): 94 | super(A_DE_bigRU, self).__init__() 95 | 96 | self.word_embed = nn.Embedding(n_vocab, emb_dim, padding_idx=pad_idx) 97 | 98 | if pretrained_emb is not None: 99 | self.word_embed.weight.data.copy_(pretrained_emb) 100 | 101 | self.rnn = nn.GRU( 102 | input_size=emb_dim, hidden_size=h_dim, 103 | num_layers=1, batch_first=True, bidirectional=True 104 | ) 105 | 106 | self.emb_drop = nn.Dropout(emb_drop) 107 | 108 | self.M = nn.Parameter(torch.FloatTensor(h_dim, h_dim)) 109 | 110 | self.b = nn.Parameter(torch.FloatTensor([0])) 111 | self.attn = nn.Linear(h_dim, h_dim) 112 | self.init_params_() 113 | 114 | if gpu: 115 | self.cuda() 116 | 117 | def init_params_(self): 118 | nn.init.xavier_normal(self.M) 119 | 120 | # Set forget gate bias to 2 121 | size = self.rnn.bias_hh_l0.size(0) 122 | self.rnn.bias_hh_l0.data[size//4:size//2] = 2 123 | 124 | size = self.rnn.bias_ih_l0.size(0) 125 | self.rnn.bias_ih_l0.data[size//4:size//2] = 2 126 | 127 | def forward(self, x1, x2, x1mask): 128 | """ 129 | Inputs: 130 | ------- 131 | x1, x2: seqs of words (batch_size, seq_len) 132 | 133 | Outputs: 134 | -------- 135 | o: vector of (batch_size) 136 | """ 137 | sc, c, r = self.forward_enc(x1, x2) 138 | c_attn = self.forward_attn(sc, r, x1mask) 139 | o = self.forward_fc(c_attn, r) 140 | 141 | return o.view(-1) 142 | 143 | def forward_enc(self, x1, x2): 144 | """ 145 | x1, x2: seqs of words (batch_size, seq_len) 146 | """ 147 | # Both are (batch_size, seq_len, emb_dim) 148 | x1_emb = self.emb_drop(self.word_embed(x1)) 149 | x2_emb = self.emb_drop(self.word_embed(x2)) 150 | 151 | # Each is (1 x batch_size x h_dim) 152 | sc, c = self.rnn(x1_emb) 153 | _, r = self.rnn(x2_emb) 154 | 155 | return sc, c.squeeze(), r.squeeze() 156 | 157 | def forward_attn(self, x1, x2, mask): 158 | """ 159 | attention 160 | :param x1: batch X seq_len X dim 161 | :return: 162 | """ 163 | max_len = x1.size(1) 164 | b_size = x1.size(0) 165 | 166 | x2 = x2.squeeze(0).unsqueeze(2) 167 | attn = self.attn(x1.contiguous().view(b_size*max_len, -1))# B, T,D -> B*T,D 168 | attn = attn.view(b_size, max_len, -1) # B,T,D 169 | attn_energies = (attn.bmm(x2).transpose(1, 2)) #B,T,D * B,D,1 --> B,1,T 170 | alpha = F.softmax(attn_energies.squeeze(1), dim=-1) # B, T 171 | alpha = alpha * mask # B, T 172 | alpha = alpha.unsqueeze(1) # B,1,T 173 | weighted_attn = alpha.bmm(x1) # B,T 174 | 175 | return weighted_attn.squeeze() 176 | 177 | def forward_fc(self, c, r): 178 | """ 179 | c, r: tensor of (batch_size, h_dim) 180 | """ 181 | # (batch_size x 1 x h_dim) 182 | o = torch.mm(c, self.M).unsqueeze(1) 183 | # (batch_size x 1 x 1) 184 | o = torch.bmm(o, r.unsqueeze(2)) 185 | o = o + self.b 186 | 187 | return o 188 | 189 | 190 | class biCGRU(nn.Module): 191 | 192 | def __init__(self, emb_dim, n_vocab, h_dim=256, pretrained_emb=None, gpu=False, emb_drop=0.6, pad_idx=0): 193 | super(biCGRU, self).__init__() 194 | 195 | self.word_embed = nn.Embedding(n_vocab, emb_dim, padding_idx=pad_idx) 196 | 197 | if pretrained_emb is not None: 198 | self.word_embed.weight.data.copy_(pretrained_emb) 199 | 200 | self.rnn = nn.GRU( 201 | input_size=emb_dim, hidden_size=h_dim, 202 | num_layers=1, batch_first=True, bidirectional=True 203 | ) 204 | 205 | self.emb_drop = nn.Dropout(emb_drop) 206 | 207 | self.M = nn.Parameter(torch.FloatTensor(2*h_dim, 
2*h_dim)) 208 | 209 | self.b = nn.Parameter(torch.FloatTensor([0])) 210 | self.attn = nn.Linear(2 * h_dim, 2 * h_dim) 211 | self.init_params_() 212 | 213 | if gpu: 214 | self.cuda() 215 | 216 | def init_params_(self): 217 | nn.init.xavier_normal(self.M) 218 | 219 | # Set forget gate bias to 2 220 | size = self.rnn.bias_hh_l0.size(0) 221 | self.rnn.bias_hh_l0.data[size//4:size//2] = 2 222 | 223 | size = self.rnn.bias_ih_l0.size(0) 224 | self.rnn.bias_ih_l0.data[size//4:size//2] = 2 225 | 226 | def forward(self, x1, x2, x1mask): 227 | """ 228 | Inputs: 229 | ------- 230 | x1, x2: seqs of words (batch_size, seq_len) 231 | 232 | Outputs: 233 | -------- 234 | o: vector of (batch_size) 235 | """ 236 | sc, c, r = self.forward_enc(x1, x2) 237 | c_attn = self.forward_attn(sc, r, x1mask) 238 | o = self.forward_fc(c_attn, r) 239 | 240 | return o.view(-1) 241 | 242 | def forward_enc(self, x1, x2): 243 | """ 244 | x1, x2: seqs of words (batch_size, seq_len) 245 | """ 246 | # Both are (batch_size, seq_len, emb_dim) 247 | x1_emb = self.emb_drop(self.word_embed(x1)) 248 | x2_emb = self.emb_drop(self.word_embed(x2)) 249 | 250 | # Each is (1 x batch_size x h_dim) 251 | sc, c = self.rnn(x1_emb) 252 | _, r = self.rnn(x2_emb) 253 | #concatenate both layers 254 | c = torch.cat([c[0], c[1]], dim=-1) 255 | r = torch.cat([r[0], r[1]], dim=-1) 256 | 257 | return sc, c.squeeze(), r.squeeze() 258 | 259 | def forward_attn(self, x1, x2, mask): 260 | """ 261 | attention 262 | :param x1: batch X seq_len X dim 263 | :return: 264 | """ 265 | max_len = x1.size(1) 266 | b_size = x1.size(0) 267 | 268 | x2 = x2.squeeze(0).unsqueeze(2) 269 | attn = self.attn(x1.contiguous().view(b_size*max_len, -1))# B, T,D -> B*T,D 270 | attn = attn.view(b_size, max_len, -1) # B,T,D 271 | attn_energies = (attn.bmm(x2).transpose(1, 2)) #B,T,D * B,D,1 --> B,1,T 272 | alpha = F.softmax(attn_energies.squeeze(1), dim=-1) # B, T 273 | alpha = alpha * mask # B, T 274 | alpha = alpha.unsqueeze(1) # B,1,T 275 | weighted_attn = alpha.bmm(x1) # B,T 276 | 277 | return weighted_attn.squeeze() 278 | 279 | def forward_fc(self, c, r): 280 | """ 281 | c, r: tensor of (batch_size, h_dim) 282 | """ 283 | # (batch_size x 1 x h_dim) 284 | o = torch.mm(c, self.M).unsqueeze(1) 285 | # (batch_size x 1 x 1) 286 | o = torch.bmm(o, r.unsqueeze(2)) 287 | o = o + self.b 288 | 289 | return o 290 | 291 | 292 | class Add_GRU(nn.Module): 293 | 294 | def __init__(self, emb_dim, n_vocab, h_dim=256, pretrained_emb=None, pad_idx=0, gpu=False, emb_drop=0.6, max_seq_len=160): 295 | super(Add_GRU, self).__init__() 296 | 297 | self.word_embed = nn.Embedding(n_vocab, emb_dim, padding_idx=pad_idx) 298 | #Load pre-trained embedding 299 | if pretrained_emb is not None: 300 | self.word_embed.weight.data.copy_(pretrained_emb) 301 | #size of description RNN 302 | self.desc_rnn_size = 100 303 | 304 | self.rnn = nn.GRU( 305 | input_size=emb_dim, hidden_size=h_dim, 306 | num_layers=1, batch_first=True, bidirectional=True 307 | ) 308 | self.rnn_desc = nn.GRU( 309 | input_size=emb_dim, hidden_size=self.desc_rnn_size, 310 | num_layers=1, batch_first=True, bidirectional=True 311 | ) 312 | 313 | self.h_dim = h_dim 314 | self.emb_dim = emb_dim 315 | self.emb_drop = nn.Dropout(emb_drop) 316 | self.max_seq_len = max_seq_len 317 | self.M = nn.Parameter(torch.FloatTensor(2*h_dim, 2*h_dim)) 318 | self.b = nn.Parameter(torch.FloatTensor([0])) 319 | self.attn = nn.Linear(2*h_dim, 2*h_dim) 320 | self.init_params_() 321 | self.tech_w = 0.0 322 | if gpu: 323 | self.cuda() 324 | 325 | def init_params_(self): 
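        # Note: nn.GRU biases are stored as three h_dim-sized blocks ordered
        # (reset, update, new); a GRU has no forget gate, so the size//4:size//2
        # slice used below covers only part of the update-gate block (and part
        # of the reset-gate block) of the 3*h_dim bias vector.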
326 | #Initializing parameters 327 | nn.init.xavier_normal(self.M) 328 | 329 | # Set forget gate bias to 2 330 | size = self.rnn.bias_hh_l0.size(0) 331 | self.rnn.bias_hh_l0.data[size//4:size//2] = 2 332 | 333 | size = self.rnn.bias_ih_l0.size(0) 334 | self.rnn.bias_ih_l0.data[size//4:size//2] = 2 335 | 336 | size = self.rnn_desc.bias_hh_l0.size(0) 337 | self.rnn_desc.bias_hh_l0.data[size//4:size//2] = 2 338 | 339 | size = self.rnn_desc.bias_ih_l0.size(0) 340 | self.rnn_desc.bias_ih_l0.data[size//4:size//2] = 2 341 | 342 | def forward(self, x1, x2, x1mask, x2mask, key_r, key_mask_r): 343 | """ 344 | Inputs: 345 | ------- 346 | x1, x2: seqs of words (batch_size, seq_len) 347 | 348 | Outputs: 349 | -------- 350 | o: vector of (batch_size) 351 | """ 352 | #Masking for attention in 353 | key_mask_r = key_mask_r.unsqueeze(2).repeat(1, 1, self.desc_rnn_size * 2) 354 | key_emb_r = self.get_weighted_key(key_r, key_mask_r) 355 | #get all states from gru 356 | sc, sr, c, r = self.forward_enc(x1, x2, key_emb_r) 357 | #getting values after applying attention 358 | c_attn = self.forward_attn(sc, r, x1mask) 359 | r_attn = self.forward_attn(sr, c, x2mask) 360 | #final output 361 | o = self.forward_fc(c_attn, r_attn) 362 | 363 | return o.view(-1) 364 | 365 | def get_weighted_key(self, key_r, key_mask_r): 366 | """ 367 | get the output from desc gru 368 | response_keys, response_mask: seqs of words (batch_size, seq_len) 369 | """ 370 | #batch_size 371 | b_s = key_r.size(0) 372 | #sequence length 373 | s_len = key_r.size(1) 374 | key_emb = self.emb_drop(self.word_embed(key_r.view(b_s * s_len, -1))) 375 | key_emb = self._forward(key_emb) 376 | key_emb_r = key_emb.view(b_s, s_len, -1) * key_mask_r 377 | del (key_emb, b_s, s_len) 378 | 379 | return key_emb_r 380 | 381 | def _forward(self, x): 382 | """ 383 | get description embeddings 384 | :param x: 385 | :return: 386 | """ 387 | _, h = self.rnn_desc(x) 388 | out = torch.cat([h[0], h[1]], dim=-1) 389 | 390 | return out.squeeze() 391 | 392 | def forward_enc(self, x1, x2, key_emb_r): 393 | """ 394 | x1, x2, key_emb: seqs of words (batch_size, seq_len) 395 | """ 396 | # Both are (batch_size, seq_len, emb_dim) 397 | b, s = x2.size(0), x2.size(1) 398 | x1_emb = self.emb_drop(self.word_embed(x1)) # B X S X E 399 | sc, c = self.rnn(x1_emb) 400 | c = torch.cat([c[0], c[1]], dim=-1) # concat the bi-directional hidden layers, shape = B X H 401 | 402 | x2_emb = self.emb_drop(self.word_embed(x2)) 403 | #adding the embeddings 404 | x2_emb = x2_emb + key_emb_r 405 | # Each is (1 x batch_size x h_dim) 406 | 407 | sr, r = self.rnn(x2_emb) 408 | 409 | r = torch.cat([r[0], r[1]], dim=-1) 410 | 411 | return sc, sr, c.squeeze(), r.squeeze() 412 | 413 | def forward_attn(self, x1, x2, mask): 414 | """ 415 | attention 416 | :param x1: batch X seq_len X dim 417 | :return: 418 | """ 419 | max_len = x1.size(1) 420 | b_size = x1.size(0) 421 | 422 | x2 = x2.squeeze(0).unsqueeze(2) 423 | attn = self.attn(x1.contiguous().view(b_size*max_len, -1))# B, T,D -> B*T,D 424 | attn = attn.view(b_size, max_len, -1) # B,T,D 425 | attn_energies = (attn.bmm(x2).transpose(1, 2)) #B,T,D * B,D,1 --> B,1,T 426 | alpha = F.softmax(attn_energies.squeeze(1), dim=-1) # B, T 427 | alpha = alpha * mask # B, T 428 | alpha = alpha.unsqueeze(1) # B,1,T 429 | weighted_attn = alpha.bmm(x1) # B,T 430 | 431 | return weighted_attn.squeeze() 432 | 433 | def forward_fc(self, c, r): 434 | """ 435 | dual encoder 436 | c, r: tensor of (batch_size, h_dim) 437 | """ 438 | o = torch.mm(c, self.M).unsqueeze(1) 439 | # 
(batch_size x 1 x 1) 440 | o = torch.bmm(o, r.unsqueeze(2)) 441 | o = o + self.b 442 | 443 | return o 444 | 445 | 446 | class AK_DE_biGRU(nn.Module): 447 | 448 | def __init__(self, emb_dim, n_vocab, h_dim=256, pretrained_emb=None, pad_idx=0, gpu=False, emb_drop=0.6, max_seq_len=160): 449 | super(AK_DE_biGRU, self).__init__() 450 | 451 | self.word_embed = nn.Embedding(n_vocab, emb_dim, padding_idx=pad_idx) 452 | #Load pre-trained embedding 453 | if pretrained_emb is not None: 454 | self.word_embed.weight.data.copy_(pretrained_emb) 455 | #size of description RNN 456 | self.desc_rnn_size = 100 457 | 458 | self.rnn = nn.GRU( 459 | input_size=emb_dim, hidden_size=h_dim, 460 | num_layers=1, batch_first=True, bidirectional=True 461 | ) 462 | 463 | self.rnn_desc = nn.GRU( 464 | input_size=emb_dim, hidden_size=self.desc_rnn_size, 465 | num_layers=1, batch_first=True, bidirectional=True 466 | ) 467 | 468 | self.h_dim = h_dim 469 | self.emb_dim = emb_dim 470 | self.emb_drop = nn.Dropout(emb_drop) 471 | self.max_seq_len = max_seq_len 472 | self.M = nn.Parameter(torch.FloatTensor(2*h_dim, 2*h_dim)) 473 | self.b = nn.Parameter(torch.FloatTensor([0])) 474 | self.Wc = nn.Parameter(torch.FloatTensor(2*h_dim, emb_dim)) 475 | self.We = nn.Parameter(torch.FloatTensor(emb_dim, emb_dim)) 476 | self.attn = nn.Linear(2*h_dim, 2*h_dim) 477 | self.init_params_() 478 | self.tech_w = 0.0 479 | if gpu: 480 | self.cuda() 481 | 482 | def init_params_(self): 483 | #Initializing parameters 484 | nn.init.xavier_normal(self.M) 485 | 486 | # Set forget gate bias to 2 487 | size = self.rnn.bias_hh_l0.size(0) 488 | self.rnn.bias_hh_l0.data[size//4:size//2] = 2 489 | 490 | size = self.rnn.bias_ih_l0.size(0) 491 | self.rnn.bias_ih_l0.data[size//4:size//2] = 2 492 | 493 | size = self.rnn_desc.bias_hh_l0.size(0) 494 | self.rnn_desc.bias_hh_l0.data[size//4:size//2] = 2 495 | 496 | size = self.rnn_desc.bias_ih_l0.size(0) 497 | self.rnn_desc.bias_ih_l0.data[size//4:size//2] = 2 498 | 499 | def forward(self, x1, x2, x1mask, x2mask, key_r, key_mask_r): 500 | """ 501 | Inputs: 502 | ------- 503 | x1, x2: seqs of words (batch_size, seq_len) 504 | 505 | Outputs: 506 | -------- 507 | o: vector of (batch_size) 508 | """ 509 | #Masking for attention in 510 | key_mask_r = key_mask_r.unsqueeze(2).repeat(1, 1, self.desc_rnn_size * 2) 511 | key_emb_r = self.get_weighted_key(key_r, key_mask_r) 512 | #get all states from gru 513 | sc, sr, c, r = self.forward_enc(x1, x2, key_emb_r) 514 | #getting values after applying attention 515 | c_attn = self.forward_attn(sc, r, x1mask) 516 | r_attn = self.forward_attn(sr, c, x2mask) 517 | #final output 518 | o = self.forward_fc(c_attn, r_attn) 519 | 520 | return o.view(-1) 521 | 522 | def get_weighted_key(self, key_r, key_mask_r): 523 | """ 524 | get the output from desc gru 525 | response_keys, response_mask: seqs of words (batch_size, seq_len) 526 | """ 527 | #batch_size 528 | b_s = key_r.size(0) 529 | #sequence length 530 | s_len = key_r.size(1) 531 | key_emb = self.emb_drop(self.word_embed(key_r.view(b_s * s_len, -1))) 532 | key_emb = self._forward(key_emb) 533 | key_emb_r = key_emb.view(b_s, s_len, -1) * key_mask_r 534 | del (key_emb, b_s, s_len) 535 | 536 | return key_emb_r 537 | 538 | def _forward(self, x): 539 | """ 540 | get description embeddings 541 | :param x: 542 | :return: 543 | """ 544 | _, h = self.rnn_desc(x) 545 | out = torch.cat([h[0], h[1]], dim=-1) 546 | 547 | return out.squeeze() 548 | 549 | def forward_enc(self, x1, x2, key_emb_r): 550 | """ 551 | x1, x2, key_emb: seqs of words 
(batch_size, seq_len)
552 |         """
553 |         # Word embeddings below are (batch_size, seq_len, emb_dim)
554 |         b, s = x2.size(0), x2.size(1)
555 |         x1_emb = self.emb_drop(self.word_embed(x1))  # B x S x E
556 |         sc, c = self.rnn(x1_emb)
557 |         c = torch.cat([c[0], c[1]], dim=-1)  # concat the bi-directional hidden states, shape = B x 2H
558 | 
559 |         c_k = c.unsqueeze(1).repeat(1, key_emb_r.size(1), 1)
560 | 
561 |         x2_emb = self.emb_drop(self.word_embed(x2))
562 |         # Equation 10: gate computed from the context summary and the description embeddings
563 |         alpha_k = F.softmax(torch.mm(c_k.view(b*s, -1), self.Wc).view(b, s, self.emb_dim) + torch.mm(key_emb_r.view(b*s, -1), self.We).view(b, s, self.emb_dim), dim=-1)
564 |         # Equation 11: mix response word embeddings with the description (knowledge) embeddings
565 |         x2_emb = (1 - alpha_k) * x2_emb + alpha_k * key_emb_r
566 |         # r below is (2 x batch_size x h_dim): the final hidden state of each direction
567 | 
568 |         sr, r = self.rnn(x2_emb)
569 | 
570 |         r = torch.cat([r[0], r[1]], dim=-1)
571 | 
572 |         return sc, sr, c.squeeze(), r.squeeze()
573 | 
574 |     def forward_attn(self, x1, x2, mask):
575 |         """
576 |         Attention over the encoder states.
577 |         :param x1: all GRU states, batch x seq_len x dim
578 |         :return: attended vector, batch x dim
579 |         """
580 |         max_len = x1.size(1)
581 |         b_size = x1.size(0)
582 | 
583 |         x2 = x2.squeeze(0).unsqueeze(2)
584 |         attn = self.attn(x1.contiguous().view(b_size*max_len, -1))  # B,T,D -> B*T,D
585 |         attn = attn.view(b_size, max_len, -1)  # B,T,D
586 |         attn_energies = (attn.bmm(x2).transpose(1, 2))  # B,T,D * B,D,1 --> B,1,T
587 |         alpha = F.softmax(attn_energies.squeeze(1), dim=-1)  # B,T
588 |         alpha = alpha * mask  # B,T
589 |         alpha = alpha.unsqueeze(1)  # B,1,T
590 |         weighted_attn = alpha.bmm(x1)  # B,1,D
591 | 
592 |         return weighted_attn.squeeze()
593 | 
594 |     def forward_fc(self, c, r):
595 |         """
596 |         Dual encoder bilinear scoring.
597 |         c, r: tensors of (batch_size, 2*h_dim)
598 |         """
599 |         o = torch.mm(c, self.M).unsqueeze(1)  # (batch_size x 1 x 2*h_dim)
600 |         # (batch_size x 1 x 1)
601 |         o = torch.bmm(o, r.unsqueeze(2))
602 |         o = o + self.b
603 | 
604 |         return o
605 | 
--------------------------------------------------------------------------------
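A quick way to sanity-check the model wiring without the full dataset is a single forward pass on random tensors. The following is an illustrative sketch only, not part of the repository; the torch 0.3-style Variable wrapping follows requirements.txt and the tensor shapes mimic what UDCv1._load_batch in data.py produces:

```python
# smoke_test.py -- illustrative sketch, not part of the original repository.
import torch
import torch.nn.functional as F
from torch.autograd import Variable

from models import AK_DE_biGRU

emb_dim, vocab_size, h_dim = 200, 5000, 300  # emb_dim must equal 2 * desc_rnn_size (= 200)
batch, len_c, len_r, desc_len = 4, 320, 160, 44

model = AK_DE_biGRU(emb_dim, vocab_size, h_dim, pretrained_emb=None, gpu=False)

# context / response word ids and their padding masks (all positions marked "real" here)
x1 = Variable(torch.LongTensor(batch, len_c).random_(0, vocab_size))
x2 = Variable(torch.LongTensor(batch, len_r).random_(0, vocab_size))
x1mask = Variable(torch.ones(batch, len_c))
x2mask = Variable(torch.ones(batch, len_r))

# per-response-token command-description word ids, plus a mask marking which
# response tokens actually have a description attached
key_r = Variable(torch.LongTensor(batch, len_r, desc_len).random_(0, vocab_size))
key_mask_r = Variable(torch.ones(batch, len_r))

logits = model(x1, x2, x1mask, x2mask, key_r, key_mask_r)  # shape: (batch,)
print(F.sigmoid(logits).size())
```

In training, these tensors come from UDCv1.get_iter and the logits are fed to F.binary_cross_entropy_with_logits, as in run_models.py.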