├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── df_dialogue_test.csv
│   ├── df_dialogue_train.csv
│   ├── df_dialogue_val.csv
│   ├── df_qgen_test.csv
│   ├── df_qgen_train.csv
│   └── df_qgen_val.csv
├── ded_detAttn
│   ├── bleu
│   │   └── README.md
│   ├── ded_detAttn.py
│   ├── logs.txt
│   ├── model_config.py
│   ├── predict.ipynb
│   └── train.py
├── utils
│   ├── __init__.py
│   ├── data_utils.py
│   └── eval_utils.py
├── ved_detAttn
│   ├── bleu
│   │   └── README.md
│   ├── detAttention_decoder
│   │   └── basic_decoder.py
│   ├── logs.txt
│   ├── model_config.py
│   ├── predict.ipynb
│   ├── train.py
│   └── ved_detAttn.py
├── ved_varAttn
│   ├── bleu
│   │   └── README.md
│   ├── logs.txt
│   ├── model_config.py
│   ├── predict.ipynb
│   ├── train.py
│   ├── varAttention_decoder
│   │   ├── attention_wrapper.py
│   │   ├── basic_decoder.py
│   │   └── decoder.py
│   └── ved_varAttn.py
├── w2v_generator.py
└── w2v_models
    └── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.ipynb_checkpoints
.idea
w2vmodel_dialogue.pkl
w2vmodel_dialogue.pkl.wv.syn0.npy
w2vmodel_qgen.pkl.syn1neg.npy
w2vmodel_dialogue.pkl.syn1neg.npy
w2vmodel_qgen.pkl
w2vmodel_qgen.pkl.wv.syn0.npy
events.*
*.ckpt.*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 variational-attention

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Variational Attention
![](https://img.shields.io/badge/python-3.6-brightgreen.svg) ![](https://img.shields.io/badge/tensorflow-1.3.0-orange.svg)

Implementation of 'Variational Attention for Sequence to Sequence Models' in TensorFlow.

## Overview
This package consists of 3 models, each of which has been organized into a separate folder:
1. Deterministic encoder-decoder with deterministic attention (`ded_detAttn`)
2. Variational encoder-decoder with deterministic attention (`ved_detAttn`)
3. Variational encoder-decoder with variational attention (`ved_varAttn`)

## Datasets
The proposed model and baselines have been evaluated on two experiments:
1. Neural Question Generation with the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset
2. Conversation Systems with the [Cornell Movie Dialogue](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html) dataset

The data has been preprocessed and the train-val-test split is provided in the `data/` directory.

## Requirements
- tensorflow-gpu==1.3.0
- Keras==2.0.8
- numpy==1.12.1
- pandas==0.22.0
- gensim==3.1.2
- nltk==3.2.3
- tqdm==4.19.1

## Instructions
1. Generate the word2vec model, required for initializing the word embeddings, specifying the dataset:
```
python w2v_generator.py --dataset qgen
```
2. Train the desired model after setting the configuration in `model_config.py`. For example,
```
cd ved_varAttn
vim model_config.py  # Make necessary edits
python train.py
```
 - The model checkpoints are stored in the `models/` directory and the TensorBoard summaries in the `summary_logs/` directory. As training progresses, the metrics on the validation set are dumped into `logs.txt` and the `bleu/` directory.
3. Evaluate the performance of the trained model. Refer to `predict.ipynb` to load the desired checkpoint, calculate the performance metrics (BLEU and diversity score) on the test set, and generate sample outputs.

--------------------------------------------------------------------------------
/ded_detAttn/bleu/README.md:
--------------------------------------------------------------------------------
BLEU score pickle files will be saved in this directory.

--------------------------------------------------------------------------------
/ded_detAttn/ded_detAttn.py:
--------------------------------------------------------------------------------
import sys
if '../' not in sys.path: sys.path.append('../')
import time
import pickle
import tensorflow as tf
import numpy as np
from utils import data_utils
from utils import eval_utils
from nltk.tokenize import word_tokenize
from tensorflow.python.layers.core import Dense


class DetSeq2SeqDetAttnModel(object):

    def __init__(self, config, encoder_embeddings_matrix, decoder_embeddings_matrix,
                 encoder_word_index, decoder_word_index):

        self.config = config

        self.lstm_hidden_units = config['lstm_hidden_units']
        self.embedding_size = config['embedding_size']
        self.num_layers = config['num_layers']

        self.encoder_vocab_size = config['encoder_vocab']
        self.decoder_vocab_size = config['decoder_vocab']

        self.encoder_num_tokens = config['encoder_num_tokens']
        self.decoder_num_tokens = config['decoder_num_tokens']

        self.dropout_keep_prob = config['dropout_keep_prob']

        self.initial_learning_rate = config['initial_learning_rate']
        self.learning_rate_decay = config['learning_rate_decay']
        self.min_learning_rate = config['min_learning_rate']

        self.batch_size = config['batch_size']
        self.epochs = config['n_epochs']

        self.encoder_embeddings_matrix = encoder_embeddings_matrix
        self.decoder_embeddings_matrix = decoder_embeddings_matrix
        self.encoder_word_index = encoder_word_index
        self.decoder_word_index = decoder_word_index
        self.encoder_idx_word = dict((i, word) for word, i in encoder_word_index.items())
        self.decoder_idx_word = dict((i, word) for word, i in decoder_word_index.items())

        self.logs_dir = config['logs_dir']
        self.model_checkpoint_dir = config['model_checkpoint_dir']
        self.bleu_path = config['bleu_path']

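        # The special-token ids fetched below are fixed by the vocabulary
        # built in utils/data_utils.tokenize_sequence, which reserves the
        # first four indices before any corpus word is added:
        #     word_index = {'PAD': 0, 'UNK': 1, 'GO': 2, 'EOS': 3, ...}
        # so self.pad is always 0 and self.eos is always 3.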
self.pad = self.decoder_word_index['PAD'] 51 | self.eos = self.decoder_word_index['EOS'] 52 | 53 | self.epoch_bleu_score_val = {'1': [], '2': [], '3': [], '4': []} 54 | self.log_str = [] 55 | 56 | self.build_model() 57 | 58 | def build_model(self): 59 | print("[INFO] Building Model ...") 60 | 61 | self.init_placeholders() 62 | self.embedding_layer() 63 | self.build_encoder() 64 | self.build_decoder() 65 | self.loss() 66 | self.optimize() 67 | self.summary() 68 | 69 | def init_placeholders(self): 70 | with tf.name_scope("model_inputs"): 71 | # Create palceholders for inputs to the model 72 | self.input_data = tf.placeholder(tf.int32, [self.batch_size, self.encoder_num_tokens], name='input') 73 | self.target_data = tf.placeholder(tf.int32, [self.batch_size, self.decoder_num_tokens], name='targets') 74 | self.lr = tf.placeholder(tf.float32, name='learning_rate', shape=()) 75 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') # Dropout Keep Probability 76 | self.source_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 77 | name='source_sentence_length') 78 | self.target_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 79 | name='target_sentence_length') 80 | 81 | def embedding_layer(self): 82 | with tf.name_scope("word_embeddings"): 83 | self.encoder_embeddings = tf.Variable( 84 | initial_value=np.array(self.encoder_embeddings_matrix, dtype=np.float32), 85 | dtype=tf.float32, trainable=False) 86 | self.enc_embed_input = tf.nn.embedding_lookup(self.encoder_embeddings, self.input_data) 87 | # self.enc_embed_input = tf.nn.dropout(self.enc_embed_input, keep_prob=self.keep_prob) 88 | 89 | with tf.name_scope("decoder_inputs"): 90 | self.decoder_embeddings = tf.Variable( 91 | initial_value=np.array(self.decoder_embeddings_matrix, dtype=np.float32), 92 | dtype=tf.float32, trainable=False) 93 | ending = tf.strided_slice(self.target_data, [0, 0], [self.batch_size, -1], [1, 1], 94 | name='slice_input') # Minus 1 implies everything till the last dim 95 | self.dec_input = tf.concat([tf.fill([self.batch_size, 1], self.decoder_word_index['GO']), ending], 1, 96 | name='dec_input') 97 | self.dec_embed_input = tf.nn.embedding_lookup(self.decoder_embeddings, self.dec_input) 98 | # self.dec_embed_input = tf.nn.dropout(self.dec_embed_input, keep_prob=self.keep_prob) 99 | 100 | def build_encoder(self): 101 | with tf.name_scope("encode"): 102 | for layer in range(self.num_layers): 103 | with tf.variable_scope('encoder_{}'.format(layer + 1)): 104 | cell_fw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 105 | cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=self.keep_prob) 106 | 107 | cell_bw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 108 | cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=self.keep_prob) 109 | 110 | self.enc_output, self.enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 111 | cell_bw, 112 | self.enc_embed_input, 113 | self.source_sentence_length, 114 | dtype=tf.float32) 115 | 116 | # Join outputs since we are using a bidirectional RNN 117 | self.h_N = tf.concat([self.enc_state[0][1], self.enc_state[1][1]], axis=-1, 118 | name='h_N') # Concatenated h from the fw and bw LSTMs 119 | self.c_N = tf.concat([self.enc_state[0][0], self.enc_state[1][0]], axis=-1, 120 | name='c_N') # Concatenated c from the fw and bw LSTMs 121 | 122 | self.init_state = tf.contrib.rnn.LSTMStateTuple(self.c_N, self.h_N) 123 | self.enc_outputs = tf.concat([self.enc_output[0], self.enc_output[1]], axis=-1, 
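            # Shape bookkeeping for the bidirectional encoder: with
            # lstm_hidden_units = H, h_N and c_N are [batch_size, 2H] and
            # enc_outputs is [batch_size, encoder_num_tokens, 2H]; this
            # doubled width is why the decoder cell below is built with
            # 2 * lstm_hidden_units units.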
name='encoder_outputs') 124 | 125 | def build_decoder(self): 126 | with tf.variable_scope("decode"): 127 | for layer in range(self.num_layers): 128 | with tf.variable_scope('decoder_{}'.format(layer + 1)): 129 | dec_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2 * self.lstm_hidden_units) 130 | dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, input_keep_prob=self.keep_prob) 131 | 132 | self.output_layer = Dense(self.decoder_vocab_size) 133 | 134 | attn_mech = tf.contrib.seq2seq.LuongAttention(2 * self.lstm_hidden_units, 135 | self.enc_outputs, 136 | memory_sequence_length=self.source_sentence_length) 137 | 138 | attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, self.lstm_hidden_units) 139 | 140 | self.init_state = attn_cell.zero_state(self.batch_size, tf.float32).clone(cell_state=self.init_state) 141 | 142 | with tf.name_scope("training_decoder"): 143 | training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.dec_embed_input, 144 | sequence_length=self.target_sentence_length, 145 | time_major=False) 146 | 147 | training_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, 148 | training_helper, 149 | initial_state=self.init_state, 150 | output_layer=self.output_layer) 151 | 152 | self.training_logits, _state, _len = tf.contrib.seq2seq.dynamic_decode(training_decoder, 153 | output_time_major=False, 154 | impute_finished=True, 155 | maximum_iterations=self.decoder_num_tokens) 156 | 157 | self.training_logits = tf.identity(self.training_logits.rnn_output, 'logits') 158 | 159 | with tf.name_scope("inference_decoder"): 160 | start_token = self.decoder_word_index['GO'] 161 | end_token = self.decoder_word_index['EOS'] 162 | 163 | start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [self.batch_size], 164 | name='start_tokens') 165 | 166 | inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.decoder_embeddings, 167 | start_tokens, 168 | end_token) 169 | 170 | inference_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, 171 | inference_helper, 172 | initial_state=self.init_state, 173 | output_layer=self.output_layer) 174 | 175 | self.inference_logits, _state, _len = tf.contrib.seq2seq.dynamic_decode(inference_decoder, 176 | output_time_major=False, 177 | impute_finished=True, 178 | maximum_iterations=self.decoder_num_tokens) 179 | 180 | self.inference_logits = tf.identity(self.inference_logits.sample_id, name='predictions') 181 | 182 | def loss(self): 183 | with tf.name_scope('losses'): 184 | # Create the weights for sequence_loss 185 | masks = tf.sequence_mask(self.target_sentence_length, self.decoder_num_tokens, dtype=tf.float32, name='masks') 186 | 187 | self.xent_loss = tf.contrib.seq2seq.sequence_loss( 188 | self.training_logits, 189 | self.target_data, 190 | weights=masks, average_across_batch=False) 191 | 192 | # L2-Regularization 193 | self.var_list = tf.trainable_variables() 194 | self.lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in self.var_list if 'bias' not in v.name]) * 0.001 195 | 196 | self.cost = tf.reduce_sum(self.xent_loss) + self.lossL2 197 | 198 | def optimize(self): 199 | # Optimizer 200 | with tf.name_scope('optimization'): 201 | optimizer = tf.train.AdamOptimizer(self.lr) 202 | 203 | # Gradient Clipping 204 | gradients = optimizer.compute_gradients(self.cost, var_list=self.var_list) 205 | capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None] 206 | self.train_op = optimizer.apply_gradients(capped_gradients) 207 | 208 | def summary(self): 209 | with 
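        # Recap of the objective assembled in loss() and optimize() above:
        # sequence_loss is per-timestep cross-entropy, masked so positions
        # beyond target_sentence_length contribute nothing; an L2 penalty
        # (coefficient 0.001, biases excluded) is added; and gradients are
        # clipped element-wise to [-5, 5] before the Adam update.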
tf.name_scope('summaries'): 210 | tf.summary.scalar('xent_loss', tf.reduce_sum(self.xent_loss)) 211 | tf.summary.scalar('l2_loss', tf.reduce_sum(self.lossL2)) 212 | tf.summary.scalar('total_loss', tf.reduce_sum(self.cost)) 213 | self.summary_op = tf.summary.merge_all() 214 | 215 | def train(self, x_train, y_train, x_val, y_val, true_val): 216 | 217 | print('[INFO] Training process started') 218 | 219 | learning_rate = self.initial_learning_rate 220 | iter_i = 0 221 | 222 | with tf.Session() as sess: 223 | sess.run(tf.global_variables_initializer()) 224 | 225 | writer = tf.summary.FileWriter(self.logs_dir, sess.graph) 226 | 227 | for epoch_i in range(1, self.epochs + 1): 228 | 229 | start_time = time.time() 230 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 231 | data_utils.get_batches(x_train, y_train, self.batch_size)): 232 | 233 | try: 234 | iter_i += 1 235 | 236 | _, _summary = sess.run( 237 | [self.train_op, self.summary_op], 238 | feed_dict={self.input_data: input_batch, 239 | self.target_data: output_batch, 240 | self.lr: learning_rate, 241 | self.source_sentence_length: source_sent_lengths, 242 | self.target_sentence_length: tar_sent_lengths, 243 | self.keep_prob: self.dropout_keep_prob, 244 | }) 245 | 246 | writer.add_summary(_summary, iter_i) 247 | 248 | except Exception as e: 249 | # print(iter_i, e) 250 | pass 251 | 252 | self.validate(sess, x_val, y_val, true_val) 253 | val_bleu_str = str(self.epoch_bleu_score_val['1'][epoch_i - 1]) + ' | ' \ 254 | + str(self.epoch_bleu_score_val['2'][epoch_i - 1]) + ' | ' \ 255 | + str(self.epoch_bleu_score_val['3'][epoch_i - 1]) + ' | ' \ 256 | + str(self.epoch_bleu_score_val['4'][epoch_i - 1]) 257 | 258 | # Reduce learning rate, but not below its minimum value 259 | learning_rate = np.max([self.min_learning_rate, learning_rate * self.learning_rate_decay]) 260 | 261 | saver = tf.train.Saver() 262 | saver.save(sess, self.model_checkpoint_dir + str(epoch_i) + ".ckpt") 263 | end_time = time.time() 264 | 265 | # Save the validation BLEU scores so far 266 | with open(self.bleu_path + '.pkl', 'wb') as f: 267 | pickle.dump(self.epoch_bleu_score_val, f) 268 | 269 | self.log_str.append('Epoch {:>3}/{} - Time {:>6.1f} BLEU: {}'.format(epoch_i, 270 | self.epochs, 271 | end_time - start_time, 272 | val_bleu_str)) 273 | with open('logs.txt', 'w') as f: 274 | f.write('\n'.join(self.log_str)) 275 | print(self.log_str[-1]) 276 | 277 | def validate(self, sess, x_val, y_val, true_val): 278 | # Calculate BLEU on validation data 279 | hypotheses_val = [] 280 | references_val = [] 281 | symbol=[] 282 | if self.config['experiment'] == 'qgen': 283 | symbol.append('?') 284 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 285 | data_utils.get_batches(x_val, y_val, self.batch_size)): 286 | answer_logits = sess.run(self.inference_logits, 287 | feed_dict={self.input_data: input_batch, 288 | self.source_sentence_length: source_sent_lengths, 289 | self.keep_prob: 1.0}) 290 | 291 | for k, pred in enumerate(answer_logits): 292 | hypotheses_val.append( 293 | word_tokenize(" ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol) 294 | references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])]) 295 | 296 | bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val) 297 | self.epoch_bleu_score_val['1'].append(bleu_scores[0]) 298 | self.epoch_bleu_score_val['2'].append(bleu_scores[1]) 299 | 
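        # epoch_bleu_score_val accumulates one BLEU-1..4 entry per epoch;
        # train() pickles this dict to bleu_path and formats it into the
        # 'Epoch k/N - Time t BLEU: b1 | b2 | b3 | b4' lines in logs.txt.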
self.epoch_bleu_score_val['3'].append(bleu_scores[2]) 300 | self.epoch_bleu_score_val['4'].append(bleu_scores[3]) 301 | 302 | def predict(self, checkpoint, x_test, y_test, true_test): 303 | pred_logits = [] 304 | hypotheses_test = [] 305 | references_test = [] 306 | symbol=[] 307 | if self.config['experiment'] == 'qgen': 308 | symbol.append('?') 309 | 310 | with tf.Session() as sess: 311 | sess.run(tf.global_variables_initializer()) 312 | saver = tf.train.Saver() 313 | saver.restore(sess, checkpoint) 314 | 315 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 316 | data_utils.get_batches(x_test, y_test, self.batch_size)): 317 | result = sess.run(self.inference_logits, feed_dict={self.input_data: input_batch, 318 | self.source_sentence_length: source_sent_lengths, 319 | self.keep_prob: 1.0}) 320 | 321 | pred_logits.extend(result) 322 | 323 | for k, pred in enumerate(result): 324 | hypotheses_test.append( 325 | word_tokenize(" ".join( 326 | [self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol) 327 | references_test.append([word_tokenize(true_test[batch_i * self.batch_size + k])]) 328 | 329 | bleu_scores = eval_utils.calculate_bleu_scores(references_test, hypotheses_test) 330 | 331 | print('BLEU 1 to 4 : {}'.format(' | '.join(map(str, bleu_scores)))) 332 | 333 | return pred_logits 334 | 335 | def show_output_sentences(self, preds, y_test, input_test, true_test): 336 | symbol=[] 337 | if self.config['experiment'] == 'qgen': 338 | symbol.append('?') 339 | for k, (pred, actual) in enumerate(zip(preds, y_test)): 340 | print('Input: {}'.format(input_test[k].strip())) 341 | print('Actual: {}'.format(true_test[k].strip())) 342 | print('Generated: {}\n'.format(" ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, self.eos]] + symbol))) 343 | -------------------------------------------------------------------------------- /ded_detAttn/logs.txt: -------------------------------------------------------------------------------- 1 | Epoch 1/3 - Time 340.2 BLEU: 29.37 | 12.63 | 6.61 | 3.63 2 | Epoch 2/3 - Time 363.8 BLEU: 30.84 | 14.33 | 7.95 | 4.57 3 | Epoch 3/3 - Time 368.2 BLEU: 32.25 | 15.47 | 8.78 | 5.14 -------------------------------------------------------------------------------- /ded_detAttn/model_config.py: -------------------------------------------------------------------------------- 1 | config = dict( 2 | 3 | experiment = 'qgen', # Experiment - either qgen or dialogue 4 | lstm_hidden_units = 100, # Number of hidden units for the LSTM 5 | embedding_size = 300, # Word embedding dimension 6 | num_layers = 1, # Number of LSTM layers 7 | 8 | encoder_vocab = 40000, # Vocabulary size on the encoder side # 30000 for dialogue 9 | decoder_vocab = 40000, # Vocabulary size on the decoder side # 30000 for dialogue 10 | encoder_num_tokens = 30, # Number of words/tokens in the input sequence # 20 for dialogue 11 | decoder_num_tokens = 20, # Number of words/tokens in the generated sequence 12 | 13 | dropout_keep_prob = 0.8, # Dropout keep probability 14 | initial_learning_rate = 0.005, # Initial learning rate 15 | learning_rate_decay = 0.75, # Learning rate decay 16 | min_learning_rate = 0.00001, # Minimum learning rate 17 | 18 | batch_size = 100, # Batch size 19 | n_epochs = 3, # Number of epochs 20 | 21 | logs_dir = 'summary_logs/det-seq2seq-det-attn', # Path to save summary information for Tensorboard 22 | model_checkpoint_dir = 'models/det-seq2seq-det-attn-', # Path to save model checkpoints 23 | bleu_path 
= 'bleu/det-seq2seq-det-attn', # Path to save model checkpoints 24 | w2v_dir = '../w2v_models/', # Word2Vec model directory 25 | data_dir = '../data/', # Directory to store data csv files 26 | 27 | load_checkpoint = 0, # Specify the trained model epoch/checkpoint number to be loaded for evaluation on test set, 0 means last saved checkpoint 28 | 29 | ) -------------------------------------------------------------------------------- /ded_detAttn/predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n", 11 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 12 | "\n", 13 | "import tensorflow as tf\n", 14 | "tf_config = tf.ConfigProto()\n", 15 | "tf_config.gpu_options.allow_growth=True \n", 16 | "sess = tf.Session(config=tf_config)\n", 17 | "\n", 18 | "import sys\n", 19 | "if not '../' in sys.path: sys.path.append('../')\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "\n", 24 | "from utils import data_utils\n", 25 | "from model_config import config\n", 26 | "from ded_detAttn import DetSeq2SeqDetAttnModel" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "print('[INFO] Preparing data for experiment: {}'.format(config['experiment']))\n", 36 | "if config['experiment'] == 'qgen':\n", 37 | " train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv')\n", 38 | " val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv')\n", 39 | " test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv')\n", 40 | " input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']])\n", 41 | " output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']])\n", 42 | " true_test = test_data['question']\n", 43 | " input_test = test_data['answer']\n", 44 | " filters = '!\"#$%&()*+,./:;<=>?@[\\\\]^`{|}~\\t\\n'\n", 45 | " w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl'\n", 46 | " \n", 47 | "elif config['experiment'] == 'dialogue':\n", 48 | " train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv')\n", 49 | " val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv')\n", 50 | " test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv')\n", 51 | " input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']])\n", 52 | " output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']])\n", 53 | " true_test = test_data['reply']\n", 54 | " input_test = test_data['line']\n", 55 | " filters = '!\"#$%&()*+/:;<=>@[\\\\]^`{|}~\\t\\n'\n", 56 | " w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl'\n", 57 | "\n", 58 | "else:\n", 59 | " print('Invalid experiment name specified!')\n", 60 | " \n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "print('[INFO] Tokenizing input and output sequences')\n", 70 | "x, input_word_index = data_utils.tokenize_sequence(input_sentences, \n", 71 | " filters, \n", 72 | " config['encoder_num_tokens'], \n", 73 | " config['encoder_vocab'])\n", 74 | "\n", 75 | "y, output_word_index = data_utils.tokenize_sequence(output_sentences, \n", 76 | " filters, \n", 77 | " 
config['decoder_num_tokens'], \n", 78 | " config['decoder_vocab'])\n", 79 | "\n", 80 | "print('[INFO] Split data into train-validation-test sets')\n", 81 | "x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, \n", 82 | " y, \n", 83 | " config['experiment'])\n", 84 | "\n", 85 | "encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, \n", 86 | " config['embedding_size'], \n", 87 | " w2v_path)\n", 88 | "\n", 89 | "decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, \n", 90 | " config['embedding_size'], \n", 91 | " w2v_path)\n", 92 | "\n", 93 | "# Re-calculate the vocab size based on the word_idx dictionary\n", 94 | "config['encoder_vocab'] = len(input_word_index)\n", 95 | "config['decoder_vocab'] = len(output_word_index)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "model = DetSeq2SeqDetAttnModel(config, \n", 105 | " encoder_embeddings_matrix, \n", 106 | " decoder_embeddings_matrix, \n", 107 | " input_word_index, \n", 108 | " output_word_index)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "if config['load_checkpoint'] != 0: \n", 125 | " checkpoint = config['model_checkpoint_dir'] + str(config['load_checkpoint']) + '.ckpt'\n", 126 | "else:\n", 127 | " checkpoint = tf.train.get_checkpoint_state(os.path.dirname('models/checkpoint')).model_checkpoint_path\n", 128 | "\n", 129 | "preds = model.predict(checkpoint, \n", 130 | " x_test, \n", 131 | " y_test, \n", 132 | " true_test, \n", 133 | " )" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "count = 100\n", 143 | "model.show_output_sentences(preds[:count], \n", 144 | " y_test[:count], \n", 145 | " input_test[:count], \n", 146 | " true_test[:count], \n", 147 | " )" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.1" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /ded_detAttn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | 6 | import tensorflow as tf 7 | 8 | tf_config = tf.ConfigProto() 9 | tf_config.gpu_options.allow_growth = True 10 | sess = tf.Session(config=tf_config) 11 | 12 | import sys 13 | 14 | if not '../' in sys.path: sys.path.append('../') 15 | 16 | import pandas as pd 17 | 18 | from utils import data_utils 19 | from model_config import config 20 | from ded_detAttn import DetSeq2SeqDetAttnModel 21 | 22 | 23 | def 
train_model(config): 24 | print('[INFO] Preparing data for experiment: {}'.format(config['experiment'])) 25 | if config['experiment'] == 'qgen': 26 | train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv') 27 | val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv') 28 | test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv') 29 | input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']]) 30 | output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']]) 31 | true_val = val_data['question'] 32 | filters = '!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n' 33 | w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl' 34 | 35 | elif config['experiment'] == 'dialogue': 36 | train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv') 37 | val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv') 38 | test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv') 39 | input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']]) 40 | output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']]) 41 | true_val = val_data['reply'] 42 | filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n' 43 | w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl' 44 | 45 | else: 46 | print('Invalid experiment name specified!') 47 | return 48 | 49 | print('[INFO] Tokenizing input and output sequences') 50 | x, input_word_index = data_utils.tokenize_sequence(input_sentences, 51 | filters, 52 | config['encoder_num_tokens'], 53 | config['encoder_vocab']) 54 | 55 | y, output_word_index = data_utils.tokenize_sequence(output_sentences, 56 | filters, 57 | config['decoder_num_tokens'], 58 | config['decoder_vocab']) 59 | 60 | print('[INFO] Split data into train-validation-test sets') 61 | x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, 62 | y, 63 | config['experiment']) 64 | 65 | encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, 66 | config['embedding_size'], 67 | w2v_path) 68 | 69 | decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, 70 | config['embedding_size'], 71 | w2v_path) 72 | 73 | # Re-calculate the vocab size based on the word_idx dictionary 74 | config['encoder_vocab'] = len(input_word_index) 75 | config['decoder_vocab'] = len(output_word_index) 76 | 77 | model = DetSeq2SeqDetAttnModel(config, 78 | encoder_embeddings_matrix, 79 | decoder_embeddings_matrix, 80 | input_word_index, 81 | output_word_index) 82 | 83 | model.train(x_train, y_train, x_val, y_val, true_val) 84 | 85 | 86 | if __name__ == '__main__': 87 | train_model(config) 88 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/variational-attention/tf-var-attention/3d1be49d569a1e18669767c4cc96b0d2c04b2f97/utils/__init__.py -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import gensim 4 | from nltk.tokenize import word_tokenize 5 | from keras.preprocessing.text import Tokenizer 6 | from keras.preprocessing.sequence import pad_sequences 7 | 8 | 9 | def tokenize_sequence(sentences, filters, max_num_words, max_vocab_size): 10 | """ 11 | Tokenizes a given input sequence of 
words. 12 | 13 | Args: 14 | sentences: List of sentences 15 | filters: List of filters/punctuations to omit (for Keras tokenizer) 16 | max_num_words: Number of words to be considered in the fixed length sequence 17 | max_vocab_size: Number of most frequently occurring words to be kept in the vocabulary 18 | 19 | Returns: 20 | x : List of padded/truncated indices created from list of sentences 21 | word_index: dictionary storing the word-to-index correspondence 22 | 23 | """ 24 | 25 | sentences = [' '.join(word_tokenize(s)[:max_num_words]) for s in sentences] 26 | 27 | tokenizer = Tokenizer(filters=filters) 28 | tokenizer.fit_on_texts(sentences) 29 | 30 | word_index = dict() 31 | word_index['PAD'] = 0 32 | word_index['UNK'] = 1 33 | word_index['GO'] = 2 34 | word_index['EOS'] = 3 35 | 36 | for i, word in enumerate(dict(tokenizer.word_index).keys()): 37 | word_index[word] = i + 4 38 | 39 | tokenizer.word_index = word_index 40 | x = tokenizer.texts_to_sequences(list(sentences)) 41 | 42 | for i, seq in enumerate(x): 43 | if any(t >= max_vocab_size for t in seq): 44 | seq = [t if t < max_vocab_size else word_index['UNK'] for t in seq] 45 | seq.append(word_index['EOS']) 46 | x[i] = seq 47 | 48 | x = pad_sequences(x, padding='post', truncating='post', maxlen=max_num_words, value=word_index['PAD']) 49 | 50 | word_index = {k: v for k, v in word_index.items() if v < max_vocab_size} 51 | 52 | return x, word_index 53 | 54 | 55 | def create_embedding_matrix(word_index, embedding_dim, w2v_path): 56 | """ 57 | Create the initial embedding matrix for TF Graph. 58 | 59 | Args: 60 | word_index: dictionary storing the word-to-index correspondence 61 | embedding_dim: word2vec dimension 62 | w2v_path: file path to the w2v pickle file 63 | 64 | Returns: 65 | embeddings_matrix : numpy 2d-array with word vectors 66 | 67 | """ 68 | w2v_model = gensim.models.Word2Vec.load(w2v_path) 69 | embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(word_index), embedding_dim)) 70 | for word, i in word_index.items(): 71 | try: 72 | embeddings_vector = w2v_model[word] 73 | embeddings_matrix[i] = embeddings_vector 74 | except KeyError: 75 | pass 76 | 77 | return embeddings_matrix 78 | 79 | 80 | def create_data_split(x, y, experiment): 81 | """ 82 | Create test-train split according to previously defined CSV files 83 | Depending on the experiment - qgen or dialogue 84 | 85 | Args: 86 | x: input sequence of indices 87 | y: output sequence of indices 88 | experiment: dialogue (conversation system) or qgen (question generation) task 89 | 90 | Returns: 91 | x_train, y_train, x_val, y_val, x_test, y_test: train val test split arrays 92 | 93 | """ 94 | 95 | if experiment == 'qgen': 96 | train_size = pd.read_csv('../data/df_qgen_train.csv').shape[0] 97 | val_size = pd.read_csv('../data/df_qgen_val.csv').shape[0] 98 | test_size = pd.read_csv('../data/df_qgen_test.csv').shape[0] 99 | elif experiment == 'dialogue': 100 | train_size = pd.read_csv('../data/df_dialogue_train.csv').shape[0] 101 | val_size = pd.read_csv('../data/df_dialogue_val.csv').shape[0] 102 | test_size = pd.read_csv('../data/df_dialogue_test.csv').shape[0] 103 | else: 104 | print('Invalid experiment name specified !') 105 | return 106 | 107 | train_indices = range(train_size) 108 | val_indices = range(train_size, train_size + val_size) 109 | test_indices = range(train_size + val_size, train_size + val_size + test_size) 110 | 111 | x_train = x[train_indices] 112 | y_train = y[train_indices] 113 | x_val = x[val_indices] 114 | y_val = y[val_indices] 115 | x_test = 
x[test_indices] 116 | y_test = y[test_indices] 117 | 118 | return x_train, y_train, x_val, y_val, x_test, y_test 119 | 120 | 121 | def get_batches(x, y, batch_size): 122 | """ 123 | Generate inputs and targets in a batch-wise fashion for feed-dict 124 | 125 | Args: 126 | x: entire source sequence array 127 | y: entire output sequence array 128 | batch_size: batch size 129 | 130 | Returns: 131 | x_batch, y_batch, source_sentence_length, target_sentence_length 132 | 133 | """ 134 | 135 | for batch_i in range(0, len(x) // batch_size): 136 | start_i = batch_i * batch_size 137 | x_batch = x[start_i:start_i + batch_size] 138 | y_batch = y[start_i:start_i + batch_size] 139 | 140 | source_sentence_length = [np.count_nonzero(seq) for seq in x_batch] 141 | target_sentence_length = [np.count_nonzero(seq) for seq in y_batch] 142 | 143 | yield x_batch, y_batch, source_sentence_length, target_sentence_length 144 | -------------------------------------------------------------------------------- /utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nltk.translate.bleu_score import corpus_bleu 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.probability import FreqDist 5 | 6 | 7 | def calculate_bleu_scores(references, hypotheses): 8 | """ 9 | Calculates BLEU 1-4 scores based on NLTK functionality 10 | 11 | Args: 12 | references: List of reference sentences 13 | hypotheses: List of generated sentences 14 | 15 | Returns: 16 | bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores 17 | 18 | """ 19 | bleu_1 = np.round(100 * corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2) 20 | bleu_2 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2) 21 | bleu_3 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2) 22 | bleu_4 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2) 23 | return bleu_1, bleu_2, bleu_3, bleu_4 24 | 25 | 26 | def calculate_ngram_diversity(corpus): 27 | """ 28 | Calculates unigram and bigram diversity 29 | 30 | Args: 31 | corpus: tokenized list of sentences sampled 32 | 33 | Returns: 34 | uni_diversity: distinct-1 score 35 | bi_diversity: distinct-2 score 36 | 37 | """ 38 | bigram_finder = BigramCollocationFinder.from_words(corpus) 39 | bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N 40 | 41 | dist = FreqDist(corpus) 42 | uni_diversity = len(dist) / len(corpus) 43 | 44 | return uni_diversity, bi_diversity 45 | 46 | 47 | def calculate_entropy(corpus): 48 | """ 49 | Calculates diversity in terms of entropy (using unigram probability) 50 | 51 | Args: 52 | corpus: tokenized list of sentences sampled 53 | 54 | Returns: 55 | ent: entropy on the sample sentence list 56 | 57 | """ 58 | fdist = FreqDist(corpus) 59 | total_len = len(corpus) 60 | ent = 0 61 | for k, v in fdist.items(): 62 | p = v / total_len 63 | 64 | ent += -p * np.log(p) 65 | 66 | return ent 67 | -------------------------------------------------------------------------------- /ved_detAttn/bleu/README.md: -------------------------------------------------------------------------------- 1 | BLEU score pickle files will be saved in this directory. 
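
For reference, a minimal sketch of reading one of these pickles back, assuming the default `bleu_path` of this model (`bleu/var-seq2seq-det-attn`, so the file here is `var-seq2seq-det-attn.pkl`). `train()` dumps a dict keyed by n-gram order `'1'`..`'4'`, with one BLEU value appended per epoch:

```python
import pickle

# Dict written by train() via pickle.dump(self.epoch_bleu_score_val, f)
with open('var-seq2seq-det-attn.pkl', 'rb') as f:
    epoch_bleu = pickle.load(f)  # {'1': [...], '2': [...], '3': [...], '4': [...]}

for epoch, scores in enumerate(zip(epoch_bleu['1'], epoch_bleu['2'],
                                   epoch_bleu['3'], epoch_bleu['4']), start=1):
    print('Epoch {}: BLEU 1-4 = {}'.format(epoch, ' | '.join(map(str, scores))))
```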
2 | 3 | -------------------------------------------------------------------------------- /ved_detAttn/detAttention_decoder/basic_decoder.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq.python.ops import decoder 4 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 5 | from tensorflow.python.framework import dtypes 6 | from tensorflow.python.framework import ops 7 | from tensorflow.python.framework import tensor_shape 8 | from tensorflow.python.layers import base as layers_base 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.util import nest 11 | 12 | __all__ = [ 13 | "BasicDecoderOutput", 14 | "BasicDecoder", 15 | ] 16 | 17 | 18 | class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))): 19 | pass 20 | 21 | 22 | class BasicDecoder(decoder.Decoder): 23 | """Basic sampling decoder.""" 24 | 25 | def __init__(self, cell, helper, initial_state, latent_vector, output_layer=None): 26 | """Initialize BasicDecoder. 27 | Args: 28 | cell: An `RNNCell` instance. 29 | helper: A `Helper` instance. 30 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 31 | The initial state of the RNNCell. 32 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 33 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 34 | to storing the result or sampling. 35 | Raises: 36 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 37 | """ 38 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 39 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 40 | if not isinstance(helper, helper_py.Helper): 41 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 42 | if (output_layer is not None and not isinstance(output_layer, layers_base.Layer)): 43 | raise TypeError("output_layer must be a Layer, received: %s" % type(output_layer)) 44 | self._cell = cell 45 | self._helper = helper 46 | self._initial_state = initial_state 47 | self._output_layer = output_layer 48 | self._latent_vector = latent_vector 49 | 50 | @property 51 | def batch_size(self): 52 | return self._helper.batch_size 53 | 54 | def _rnn_output_size(self): 55 | size = self._cell.output_size 56 | if self._output_layer is None: 57 | return size 58 | else: 59 | # To use layer's compute_output_shape, we need to convert the 60 | # RNNCell's output_size entries into shapes with an unknown 61 | # batch size. We then pass this through the layer's 62 | # compute_output_shape and read off all but the first (batch) 63 | # dimensions to get the output size of the rnn with the layer 64 | # applied to the top. 65 | output_shape_with_unknown_batch = nest.map_structure( 66 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 67 | size) 68 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 69 | output_shape_with_unknown_batch) 70 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 71 | 72 | @property 73 | def output_size(self): 74 | # Return the cell output and the id 75 | return BasicDecoderOutput( 76 | rnn_output=self._rnn_output_size(), 77 | sample_id=tensor_shape.TensorShape([])) 78 | 79 | @property 80 | def output_dtype(self): 81 | # Assume the dtype of the cell is the output_size structure 82 | # containing the input_state's first component's dtype. 
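        # Sizing note: initialize() and step() below concatenate the latent
        # vector to every decoder input, so with the settings in
        # model_config (embedding_size = 300, latent_dim = 100) the wrapped
        # cell consumes 400-dimensional inputs at each timestep.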
83 | # Return that structure and int32 (the id) 84 | dtype = nest.flatten(self._initial_state)[0].dtype 85 | return BasicDecoderOutput( 86 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 87 | dtypes.int32) 88 | 89 | def initialize(self, name=None): 90 | """Initialize the decoder. 91 | Args: 92 | name: Name scope for any created operations. 93 | Returns: 94 | `(finished, first_inputs, initial_state)`. 95 | """ 96 | # Concatenate the latent vector to the 1st input to the decoder LSTM, i.e, the embedding + latent vector 97 | return (self._helper.initialize()[0], tf.concat([self._helper.initialize()[1], self._latent_vector], axis=-1)) + (self._initial_state,) 98 | 99 | def step(self, time, inputs, state, name=None): 100 | """Perform a decoding step. 101 | Args: 102 | time: scalar `int32` tensor. 103 | inputs: A (structure of) input tensors. 104 | state: A (structure of) state tensors and TensorArrays. 105 | name: Name scope for any created operations. 106 | Returns: 107 | `(outputs, next_state, next_inputs, finished)`. 108 | """ 109 | with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)): 110 | cell_outputs, cell_state = self._cell(inputs, state) 111 | 112 | if self._output_layer is not None: 113 | cell_outputs = self._output_layer(cell_outputs) 114 | sample_ids = self._helper.sample( 115 | time=time, outputs=cell_outputs, state=cell_state) 116 | (finished, next_inputs, next_state) = self._helper.next_inputs( 117 | time=time, 118 | outputs=cell_outputs, 119 | state=cell_state, 120 | sample_ids=sample_ids) 121 | 122 | # Concatenate the latent vector to the predicted word's embedding 123 | next_inputs = tf.concat([next_inputs, self._latent_vector], axis=-1) 124 | 125 | outputs = BasicDecoderOutput(cell_outputs, sample_ids) 126 | 127 | return (outputs, next_state, next_inputs, finished) 128 | -------------------------------------------------------------------------------- /ved_detAttn/logs.txt: -------------------------------------------------------------------------------- 1 | Epoch 1/3 - Time 244.9 BLEU: 28.93 | 12.27 | 6.33 | 3.4 2 | Epoch 2/3 - Time 247.2 BLEU: 31.13 | 14.23 | 7.77 | 4.35 3 | Epoch 3/3 - Time 248.0 BLEU: 31.59 | 14.97 | 8.41 | 4.87 -------------------------------------------------------------------------------- /ved_detAttn/model_config.py: -------------------------------------------------------------------------------- 1 | config = dict( 2 | 3 | experiment = 'qgen', # Experiment - either qgen or dialogue 4 | lstm_hidden_units = 100, # Number of hidden units for the LSTM 5 | embedding_size = 300, # Word embedding dimension 6 | num_layers = 1, # Number of LSTM layers 7 | 8 | encoder_vocab = 40000, # Vocabulary size on the encoder side # 30000 for dialogue 9 | decoder_vocab = 40000, # Vocabulary size on the decoder side # 30000 for dialogue 10 | encoder_num_tokens = 30, # Number of words/tokens in the input sequence # 20 for dialogue 11 | decoder_num_tokens = 20, # Number of words/tokens in the generated sequence 12 | 13 | dropout_keep_prob = 0.8, # Dropout keep probability 14 | initial_learning_rate = 0.005, # Initial learning rate 15 | learning_rate_decay = 0.75, # Learning rate decay 16 | min_learning_rate = 0.00001, # Minimum learning rate 17 | 18 | latent_dim = 100, # Dimension of z-latent space 19 | word_dropout_keep_probability = 0.75, # 1.0 - Word dropout rate for the decoder 20 | z_temp = 1.0, # Sampling temperature to be multiplied with the standard deviation 21 | 22 | batch_size = 100, # Batch size # 128 for dialogue 23 | n_epochs = 3, 
# Number of epochs 24 | 25 | logs_dir = 'summary_logs/var-seq2seq-det-attn', # Path to save summary information for Tensorboard 26 | model_checkpoint_dir = 'models/var-seq2seq-det-attn-', # Path to save model checkpoints 27 | bleu_path = 'bleu/var-seq2seq-det-attn', # Path to save model checkpoints 28 | w2v_dir = '../w2v_models/', # Word2Vec model directory 29 | data_dir = '../data/', # Directory to store data csv files 30 | 31 | load_checkpoint = 0, # Specify the trained model epoch/checkpoint number to be loaded for evaluation on test set, 0 means last saved checkpoint 32 | 33 | ) -------------------------------------------------------------------------------- /ved_detAttn/predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "\n", 11 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 12 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 13 | "\n", 14 | "import tensorflow as tf\n", 15 | "\n", 16 | "config = tf.ConfigProto()\n", 17 | "config.gpu_options.allow_growth = True\n", 18 | "sess = tf.Session(config=config)\n", 19 | "\n", 20 | "import sys\n", 21 | "\n", 22 | "if not '../' in sys.path: sys.path.append('../')\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "\n", 26 | "from utils import data_utils\n", 27 | "from model_config import config\n", 28 | "from ved_detAttn import VarSeq2SeqDetAttnModel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "if config['experiment'] == 'qgen':\n", 38 | " print('[INFO] Preparing data for experiment: {}'.format(config['experiment']))\n", 39 | " train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv')\n", 40 | " val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv')\n", 41 | " test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv')\n", 42 | " input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']])\n", 43 | " output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']])\n", 44 | " true_test = test_data['question']\n", 45 | " input_test = test_data['answer']\n", 46 | " filters = '!\"#$%&()*+,./:;<=>?@[\\\\]^`{|}~\\t\\n'\n", 47 | " w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl'\n", 48 | " \n", 49 | "elif config['experiment'] == 'dialogue':\n", 50 | " train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv')\n", 51 | " val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv')\n", 52 | " test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv')\n", 53 | " input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']])\n", 54 | " output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']])\n", 55 | " true_test = test_data['reply']\n", 56 | " input_test = test_data['line']\n", 57 | " filters = '!\"#$%&()*+/:;<=>@[\\\\]^`{|}~\\t\\n'\n", 58 | " w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl'\n", 59 | "\n", 60 | "else:\n", 61 | " print('Invalid experiment name specified!')\n", 62 | " " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "print('[INFO] Tokenizing input and output sequences')\n", 72 | "x, input_word_index = data_utils.tokenize_sequence(input_sentences, \n", 73 | " 
filters, \n", 74 | " config['encoder_num_tokens'], \n", 75 | " config['encoder_vocab'])\n", 76 | "\n", 77 | "y, output_word_index = data_utils.tokenize_sequence(output_sentences, \n", 78 | " filters, \n", 79 | " config['decoder_num_tokens'], \n", 80 | " config['decoder_vocab'])\n", 81 | "\n", 82 | "print('[INFO] Split data into train-validation-test sets')\n", 83 | "x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, \n", 84 | " y, \n", 85 | " config['experiment'])\n", 86 | "\n", 87 | "encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, \n", 88 | " config['embedding_size'], \n", 89 | " w2v_path)\n", 90 | "\n", 91 | "decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, \n", 92 | " config['embedding_size'], \n", 93 | " w2v_path)\n", 94 | "\n", 95 | "# Re-calculate the vocab size based on the word_idx dictionary\n", 96 | "config['encoder_vocab'] = len(input_word_index)\n", 97 | "config['decoder_vocab'] = len(output_word_index)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "model = VarSeq2SeqDetAttnModel(config, \n", 107 | " encoder_embeddings_matrix, \n", 108 | " decoder_embeddings_matrix, \n", 109 | " input_word_index, \n", 110 | " output_word_index)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "if config['load_checkpoint'] != 0: \n", 127 | " checkpoint = config['model_checkpoint_dir'] + str(config['load_checkpoint']) + '.ckpt'\n", 128 | "else:\n", 129 | " checkpoint = tf.train.get_checkpoint_state(os.path.dirname('models/checkpoint')).model_checkpoint_path\n", 130 | "\n", 131 | "preds = model.predict(checkpoint, \n", 132 | " x_test, \n", 133 | " y_test, \n", 134 | " true_test, \n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "count = 100\n", 145 | "model.show_output_sentences(preds[:count], \n", 146 | " y_test[:count], \n", 147 | " input_test[:count], \n", 148 | " true_test[:count], \n", 149 | " )" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "model.get_diversity_metrics(checkpoint, x_test, y_test)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.1" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /ved_detAttn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 4 | os.environ["CUDA_VISIBLE_DEVICES"] = 
"0" 5 | 6 | import tensorflow as tf 7 | 8 | tf_config = tf.ConfigProto() 9 | tf_config.gpu_options.allow_growth = True 10 | sess = tf.Session(config=tf_config) 11 | 12 | import sys 13 | 14 | if not '../' in sys.path: sys.path.append('../') 15 | 16 | import pandas as pd 17 | 18 | from utils import data_utils 19 | from model_config import config 20 | from ved_detAttn import VarSeq2SeqDetAttnModel 21 | 22 | 23 | def train_model(config): 24 | print('[INFO] Preparing data for experiment: {}'.format(config['experiment'])) 25 | if config['experiment'] == 'qgen': 26 | train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv') 27 | val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv') 28 | test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv') 29 | input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']]) 30 | output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']]) 31 | true_val = val_data['question'] 32 | filters = '!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n' 33 | w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl' 34 | 35 | elif config['experiment'] == 'dialogue': 36 | train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv') 37 | val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv') 38 | test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv') 39 | input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']]) 40 | output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']]) 41 | true_val = val_data['reply'] 42 | filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n' 43 | w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl' 44 | 45 | else: 46 | print('Invalid experiment name specified!') 47 | return 48 | 49 | print('[INFO] Tokenizing input and output sequences') 50 | x, input_word_index = data_utils.tokenize_sequence(input_sentences, 51 | filters, 52 | config['encoder_num_tokens'], 53 | config['encoder_vocab']) 54 | 55 | y, output_word_index = data_utils.tokenize_sequence(output_sentences, 56 | filters, 57 | config['decoder_num_tokens'], 58 | config['decoder_vocab']) 59 | 60 | print('[INFO] Split data into train-validation-test sets') 61 | x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, 62 | y, 63 | config['experiment']) 64 | 65 | encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, 66 | config['embedding_size'], 67 | w2v_path) 68 | 69 | decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, 70 | config['embedding_size'], 71 | w2v_path) 72 | 73 | # Re-calculate the vocab size based on the word_idx dictionary 74 | config['encoder_vocab'] = len(input_word_index) 75 | config['decoder_vocab'] = len(output_word_index) 76 | 77 | model = VarSeq2SeqDetAttnModel(config, 78 | encoder_embeddings_matrix, 79 | decoder_embeddings_matrix, 80 | input_word_index, 81 | output_word_index) 82 | 83 | model.train(x_train, y_train, x_val, y_val, true_val) 84 | 85 | 86 | if __name__ == '__main__': 87 | train_model(config) 88 | -------------------------------------------------------------------------------- /ved_detAttn/ved_detAttn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if '../' not in sys.path: sys.path.append('../') 4 | import time 5 | import pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | from utils import data_utils 9 | from utils import eval_utils 10 | from tqdm import 
tqdm 11 | from nltk.tokenize import word_tokenize 12 | from tensorflow.python.layers.core import Dense 13 | from detAttention_decoder import basic_decoder 14 | 15 | 16 | class VarSeq2SeqDetAttnModel(object): 17 | 18 | def __init__(self, config, encoder_embeddings_matrix, decoder_embeddings_matrix, 19 | encoder_word_index, decoder_word_index): 20 | 21 | self.config = config 22 | 23 | self.lstm_hidden_units = config['lstm_hidden_units'] 24 | self.embedding_size = config['embedding_size'] 25 | self.latent_dim = config['latent_dim'] 26 | self.num_layers = config['num_layers'] 27 | 28 | self.encoder_vocab_size = config['encoder_vocab'] 29 | self.decoder_vocab_size = config['decoder_vocab'] 30 | 31 | self.encoder_num_tokens = config['encoder_num_tokens'] 32 | self.decoder_num_tokens = config['decoder_num_tokens'] 33 | 34 | self.dropout_keep_prob = config['dropout_keep_prob'] 35 | self.word_dropout_keep_probability = config['word_dropout_keep_probability'] 36 | self.z_temp = config['z_temp'] 37 | 38 | self.initial_learning_rate = config['initial_learning_rate'] 39 | self.learning_rate_decay = config['learning_rate_decay'] 40 | self.min_learning_rate = config['min_learning_rate'] 41 | 42 | self.batch_size = config['batch_size'] 43 | self.epochs = config['n_epochs'] 44 | 45 | self.encoder_embeddings_matrix = encoder_embeddings_matrix 46 | self.decoder_embeddings_matrix = decoder_embeddings_matrix 47 | self.encoder_word_index = encoder_word_index 48 | self.decoder_word_index = decoder_word_index 49 | self.encoder_idx_word = dict((i, word) for word, i in encoder_word_index.items()) 50 | self.decoder_idx_word = dict((i, word) for word, i in decoder_word_index.items()) 51 | 52 | self.logs_dir = config['logs_dir'] 53 | self.model_checkpoint_dir = config['model_checkpoint_dir'] 54 | self.bleu_path = config['bleu_path'] 55 | 56 | self.pad = self.decoder_word_index['PAD'] 57 | self.eos = self.decoder_word_index['EOS'] 58 | 59 | self.epoch_bleu_score_val = {'1': [], '2': [], '3': [], '4': []} 60 | self.log_str = [] 61 | 62 | self.build_model() 63 | 64 | def build_model(self): 65 | print("[INFO] Building Model ...") 66 | 67 | self.init_placeholders() 68 | self.embedding_layer() 69 | self.build_encoder() 70 | self.build_latent_space() 71 | self.build_decoder() 72 | self.loss() 73 | self.optimize() 74 | self.summary() 75 | 76 | def init_placeholders(self): 77 | with tf.name_scope("model_inputs"): 78 | # Create palceholders for inputs to the model 79 | self.input_data = tf.placeholder(tf.int32, [self.batch_size, self.encoder_num_tokens], name='input') 80 | self.target_data = tf.placeholder(tf.int32, [self.batch_size, self.decoder_num_tokens], name='targets') 81 | self.lr = tf.placeholder(tf.float32, name='learning_rate', shape=()) 82 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') # Dropout Keep Probability 83 | self.source_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 84 | name='source_sentence_length') 85 | self.target_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 86 | name='target_sentence_length') 87 | self.word_dropout_keep_prob = tf.placeholder(tf.float32, name='word_drop_keep_prob', shape=()) 88 | self.lambda_coeff = tf.placeholder(tf.float32, name='lambda_coeff', shape=()) 89 | self.z_temperature = tf.placeholder(tf.float32, name='z_temperature', shape=()) 90 | 91 | def embedding_layer(self): 92 | with tf.name_scope("word_embeddings"): 93 | self.encoder_embeddings = tf.Variable( 94 | initial_value=np.array(self.encoder_embeddings_matrix, 
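            # Note on the decoder-input block that follows: word dropout
            # keeps each target token with probability word_dropout_keep_prob
            # (0.75 in model_config) and replaces dropped ids with 0, i.e.
            # PAD, before the embedding lookup; the usual motivation is that
            # the decoder then cannot rely purely on teacher-forced words
            # and is pushed to exploit the latent vector z.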
dtype=np.float32), 95 | dtype=tf.float32, trainable=False) 96 | self.enc_embed_input = tf.nn.embedding_lookup(self.encoder_embeddings, self.input_data) 97 | # self.enc_embed_input = tf.nn.dropout(self.enc_embed_input, keep_prob=self.keep_prob) 98 | 99 | with tf.name_scope("decoder_inputs"): 100 | self.decoder_embeddings = tf.Variable( 101 | initial_value=np.array(self.decoder_embeddings_matrix, dtype=np.float32), 102 | dtype=tf.float32, trainable=False) 103 | keep = tf.where( 104 | tf.random_uniform([self.batch_size, self.decoder_num_tokens]) < self.word_dropout_keep_prob, 105 | tf.fill([self.batch_size, self.decoder_num_tokens], True), 106 | tf.fill([self.batch_size, self.decoder_num_tokens], False)) 107 | ending = tf.cast(keep, dtype=tf.int32) * self.target_data 108 | ending = tf.strided_slice(ending, [0, 0], [self.batch_size, -1], [1, 1], 109 | name='slice_input') # The -1 end index drops the last token, making room for the GO token prepended below 110 | self.dec_input = tf.concat([tf.fill([self.batch_size, 1], self.decoder_word_index['GO']), ending], 1, 111 | name='dec_input') 112 | self.dec_embed_input = tf.nn.embedding_lookup(self.decoder_embeddings, self.dec_input) 113 | # self.dec_embed_input = tf.nn.dropout(self.dec_embed_input, keep_prob=self.keep_prob) 114 | 115 | def build_encoder(self): 116 | with tf.name_scope("encode"): 117 | for layer in range(self.num_layers): 118 | with tf.variable_scope('encoder_{}'.format(layer + 1)): 119 | cell_fw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 120 | cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=self.keep_prob) 121 | 122 | cell_bw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 123 | cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=self.keep_prob) 124 | 125 | self.enc_output, self.enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 126 | cell_bw, 127 | self.enc_embed_input, 128 | self.source_sentence_length, 129 | dtype=tf.float32) 130 | 131 | # Join outputs since we are using a bidirectional RNN 132 | self.h_N = tf.concat([self.enc_state[0][1], self.enc_state[1][1]], axis=-1, 133 | name='h_N') # Concatenated h from the fw and bw LSTMs 134 | self.enc_outputs = tf.concat([self.enc_output[0], self.enc_output[1]], axis=-1, name='encoder_outputs') 135 | 136 | def build_latent_space(self): 137 | with tf.name_scope("latent_space"): 138 | self.z_mean = Dense(self.latent_dim, name='z_mean')(self.h_N) 139 | self.z_log_sigma = Dense(self.latent_dim, name='z_log_sigma')(self.h_N) 140 | 141 | self.z_vector = tf.identity(self.sample_gaussian(), name='z_vector') 142 | 143 | def sample_gaussian(self): 144 | """(Differentiably!)
draw sample from Gaussian with given shape, subject to random noise epsilon""" 145 | with tf.name_scope('sample_gaussian'): 146 | # reparameterization trick 147 | epsilon = tf.random_normal(tf.shape(self.z_log_sigma), name='epsilon') 148 | return self.z_mean + tf.scalar_mul(self.z_temperature, 149 | epsilon * tf.exp(self.z_log_sigma)) # N(mu, I * sigma**2) 150 | 151 | def calculate_kl_loss(self): 152 | """(Gaussian) Kullback-Leibler divergence KL(q||p), per training example""" 153 | # (tf.Tensor, tf.Tensor) -> tf.Tensor 154 | with tf.name_scope("KL_divergence"): 155 | # = -0.5 * (1 + log(sigma**2) - mu**2 - sigma**2) 156 | return -0.5 * tf.reduce_sum(1.0 + 2 * self.z_log_sigma - self.z_mean ** 2 - 157 | tf.exp(2 * self.z_log_sigma), 1) 158 | 159 | def build_decoder(self): 160 | with tf.variable_scope("decode"): 161 | for layer in range(self.num_layers): 162 | with tf.variable_scope('decoder_{}'.format(layer + 1)): 163 | dec_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2 * self.lstm_hidden_units) 164 | dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, input_keep_prob=self.keep_prob) 165 | 166 | self.output_layer = Dense(self.decoder_vocab_size) 167 | 168 | attn_mech = tf.contrib.seq2seq.LuongAttention(2 * self.lstm_hidden_units, 169 | self.enc_outputs, 170 | memory_sequence_length=self.source_sentence_length) 171 | 172 | attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, self.lstm_hidden_units) 173 | 174 | self.init_state = attn_cell.zero_state(self.batch_size, tf.float32) 175 | 176 | with tf.name_scope("training_decoder"): 177 | training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.dec_embed_input, 178 | sequence_length=self.target_sentence_length, 179 | time_major=False) 180 | 181 | training_decoder = basic_decoder.BasicDecoder(attn_cell, 182 | training_helper, 183 | initial_state=self.init_state, 184 | latent_vector=self.z_vector, 185 | output_layer=self.output_layer) 186 | 187 | self.training_logits, _state, _len = tf.contrib.seq2seq.dynamic_decode(training_decoder, 188 | output_time_major=False, 189 | impute_finished=True, 190 | maximum_iterations=self.decoder_num_tokens) 191 | 192 | self.training_logits = tf.identity(self.training_logits.rnn_output, 'logits') 193 | 194 | with tf.name_scope("inference_decoder"): 195 | start_token = self.decoder_word_index['GO'] 196 | end_token = self.decoder_word_index['EOS'] 197 | 198 | start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [self.batch_size], 199 | name='start_tokens') 200 | 201 | inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.decoder_embeddings, 202 | start_tokens, 203 | end_token) 204 | 205 | inference_decoder = basic_decoder.BasicDecoder(attn_cell, 206 | inference_helper, 207 | initial_state=self.init_state, 208 | latent_vector=self.z_vector, 209 | output_layer=self.output_layer) 210 | 211 | self.inference_logits, _state, _len = tf.contrib.seq2seq.dynamic_decode(inference_decoder, 212 | output_time_major=False, 213 | impute_finished=True, 214 | maximum_iterations=self.decoder_num_tokens) 215 | 216 | self.inference_logits = tf.identity(self.inference_logits.sample_id, name='predictions') 217 | 218 | def loss(self): 219 | with tf.name_scope('losses'): 220 | self.kl_loss = self.calculate_kl_loss() 221 | self.kl_loss = tf.scalar_mul(self.lambda_coeff, self.kl_loss) 222 | 223 | # Create the weights for sequence_loss 224 | masks = tf.sequence_mask(self.target_sentence_length, self.decoder_num_tokens, dtype=tf.float32, name='masks') 225 | 226 | self.xent_loss = 
tf.contrib.seq2seq.sequence_loss( 227 | self.training_logits, 228 | self.target_data, 229 | weights=masks, 230 | average_across_batch=False) 231 | 232 | # L2-Regularization 233 | self.var_list = tf.trainable_variables() 234 | self.lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in self.var_list if 'bias' not in v.name]) * 0.001 235 | 236 | self.cost = tf.reduce_sum(self.xent_loss + self.kl_loss) + self.lossL2 237 | 238 | def optimize(self): 239 | # Optimizer 240 | with tf.name_scope('optimization'): 241 | optimizer = tf.train.AdamOptimizer(self.lr) 242 | 243 | # Gradient Clipping 244 | gradients = optimizer.compute_gradients(self.cost, var_list=self.var_list) 245 | capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None] 246 | self.train_op = optimizer.apply_gradients(capped_gradients) 247 | 248 | def summary(self): 249 | with tf.name_scope('summaries'): 250 | tf.summary.scalar('xent_loss', tf.reduce_sum(self.xent_loss)) 251 | tf.summary.scalar('l2_loss', tf.reduce_sum(self.lossL2)) 252 | tf.summary.scalar("kl_loss", tf.reduce_sum(self.kl_loss)) 253 | tf.summary.scalar('total_loss', tf.reduce_sum(self.cost)) 254 | tf.summary.histogram("latent_vector", self.z_vector) 255 | tf.summary.histogram("latent_mean", self.z_mean) 256 | tf.summary.histogram("latent_log_sigma", self.z_log_sigma) 257 | self.summary_op = tf.summary.merge_all() 258 | 259 | def train(self, x_train, y_train, x_val, y_val, true_val): 260 | 261 | print('[INFO] Training process started') 262 | 263 | learning_rate = self.initial_learning_rate 264 | iter_i = 0 265 | lambda_val = 0.0 266 | 267 | with tf.Session() as sess: 268 | sess.run(tf.global_variables_initializer()) 269 | 270 | writer = tf.summary.FileWriter(self.logs_dir, sess.graph) 271 | 272 | for epoch_i in range(1, self.epochs + 1): 273 | 274 | start_time = time.time() 275 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 276 | data_utils.get_batches(x_train, y_train, self.batch_size)): 277 | 278 | try: 279 | iter_i += 1 280 | 281 | _, _summary = sess.run( 282 | [self.train_op, self.summary_op], 283 | feed_dict={self.input_data: input_batch, 284 | self.target_data: output_batch, 285 | self.lr: learning_rate, 286 | self.source_sentence_length: source_sent_lengths, 287 | self.target_sentence_length: tar_sent_lengths, 288 | self.keep_prob: self.dropout_keep_prob, 289 | self.lambda_coeff: lambda_val, 290 | self.z_temperature: self.z_temp, 291 | self.word_dropout_keep_prob: self.word_dropout_keep_probability 292 | }) 293 | 294 | writer.add_summary(_summary, iter_i) 295 | 296 | # KL annealing: ramp the KL coefficient up over the first 3000 iterations 297 | if iter_i <= 3000: 298 | lambda_val = np.round((np.tanh((iter_i - 4500) / 1000) + 1) / 2, decimals=6) 299 | 300 | except Exception as e: 301 | # Skip the offending batch, but surface the error instead of swallowing it silently 302 | print('[WARN] Skipping batch at iteration {}: {}'.format(iter_i, e)) 303 | 304 | self.validate(sess, x_val, y_val, true_val) 305 | val_bleu_str = str(self.epoch_bleu_score_val['1'][epoch_i - 1]) + ' | ' \ 306 | + str(self.epoch_bleu_score_val['2'][epoch_i - 1]) + ' | ' \ 307 | + str(self.epoch_bleu_score_val['3'][epoch_i - 1]) + ' | ' \ 308 | + str(self.epoch_bleu_score_val['4'][epoch_i - 1]) 309 | 310 | # Reduce learning rate, but not below its minimum value 311 | learning_rate = np.max([self.min_learning_rate, learning_rate * self.learning_rate_decay]) 312 | 313 | saver = tf.train.Saver() 314 | saver.save(sess, self.model_checkpoint_dir + str(epoch_i) + ".ckpt") 315 | end_time = time.time() 316 | 317 | # Save the validation BLEU scores so far 318 | with
open(self.bleu_path + '.pkl', 'wb') as f: 319 | pickle.dump(self.epoch_bleu_score_val, f) 320 | 321 | self.log_str.append('Epoch {:>3}/{} - Time {:>6.1f} BLEU: {}'.format(epoch_i, 322 | self.epochs, 323 | end_time - start_time, 324 | val_bleu_str)) 325 | with open('logs.txt', 'w') as f: 326 | f.write('\n'.join(self.log_str)) 327 | print(self.log_str[-1]) 328 | 329 | def validate(self, sess, x_val, y_val, true_val): 330 | # Calculate BLEU on validation data 331 | hypotheses_val = [] 332 | references_val = [] 333 | symbol=[] 334 | if self.config['experiment'] == 'qgen': 335 | symbol.append('?') 336 | 337 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 338 | data_utils.get_batches(x_val, y_val, self.batch_size)): 339 | answer_logits = sess.run(self.inference_logits, 340 | feed_dict={self.input_data: input_batch, 341 | self.source_sentence_length: source_sent_lengths, 342 | self.keep_prob: 1.0, 343 | self.word_dropout_keep_prob: 1.0, 344 | self.z_temperature: self.z_temp}) 345 | 346 | for k, pred in enumerate(answer_logits): 347 | hypotheses_val.append( 348 | word_tokenize( 349 | " ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol) 350 | references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])]) 351 | 352 | bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val) 353 | self.epoch_bleu_score_val['1'].append(bleu_scores[0]) 354 | self.epoch_bleu_score_val['2'].append(bleu_scores[1]) 355 | self.epoch_bleu_score_val['3'].append(bleu_scores[2]) 356 | self.epoch_bleu_score_val['4'].append(bleu_scores[3]) 357 | 358 | def predict(self, checkpoint, x_test, y_test, true_test): 359 | pred_logits = [] 360 | hypotheses_test = [] 361 | references_test = [] 362 | symbol=[] 363 | if self.config['experiment'] == 'qgen': 364 | symbol.append('?') 365 | 366 | with tf.Session() as sess: 367 | sess.run(tf.global_variables_initializer()) 368 | saver = tf.train.Saver() 369 | saver.restore(sess, checkpoint) 370 | 371 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 372 | data_utils.get_batches(x_test, y_test, self.batch_size)): 373 | result = sess.run(self.inference_logits, feed_dict={self.input_data: input_batch, 374 | self.source_sentence_length: source_sent_lengths, 375 | self.keep_prob: 1.0, 376 | self.word_dropout_keep_prob: 1.0, 377 | self.z_temperature: self.z_temp}) 378 | 379 | pred_logits.extend(result) 380 | 381 | for k, pred in enumerate(result): 382 | hypotheses_test.append( 383 | word_tokenize(" ".join( 384 | [self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol) 385 | references_test.append([word_tokenize(true_test[batch_i * self.batch_size + k])]) 386 | 387 | bleu_scores = eval_utils.calculate_bleu_scores(references_test, hypotheses_test) 388 | 389 | print('BLEU 1 to 4 : {}'.format(' | '.join(map(str, bleu_scores)))) 390 | 391 | return pred_logits 392 | 393 | def show_output_sentences(self, preds, y_test, input_test, true_test): 394 | symbol=[] 395 | if self.config['experiment'] == 'qgen': 396 | symbol.append('?') 397 | for k, (pred, actual) in enumerate(zip(preds, y_test)): 398 | print('Input: {}'.format(input_test[k].strip())) 399 | print('Actual: {}'.format(true_test[k].strip())) 400 | print('Generated: {}\n'.format( 401 | " ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, self.eos]] + symbol))) 402 | 403 | def get_diversity_metrics(self, checkpoint, x_test, 
y_test, num_samples=10, num_iterations = 3): 404 | 405 | x_test_repeated = np.repeat(x_test, num_samples, axis=0) 406 | y_test_repeated = np.repeat(y_test, num_samples, axis=0) 407 | 408 | entropy_list =[] 409 | uni_diversity = [] 410 | bi_diversity = [] 411 | 412 | with tf.Session() as sess: 413 | sess.run(tf.global_variables_initializer()) 414 | saver = tf.train.Saver() 415 | saver.restore(sess, checkpoint) 416 | 417 | for _ in tqdm(range(num_iterations)): 418 | total_ent = 0 419 | uni = 0 420 | bi = 0 421 | answer_logits = [] 422 | pred_sentences = [] 423 | 424 | for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate( 425 | data_utils.get_batches(x_test_repeated, y_test_repeated, self.batch_size)): 426 | result = sess.run(self.inference_logits, feed_dict={self.input_data: input_batch, 427 | self.source_sentence_length: source_sent_lengths, 428 | self.keep_prob: 1.0, 429 | self.word_dropout_keep_prob: 1.0, 430 | self.z_temperature: self.z_temp}) 431 | answer_logits.extend(result) 432 | 433 | for idx, (actual, pred) in enumerate(zip(y_test_repeated, answer_logits)): 434 | pred_sentences.append(" ".join([self.decoder_idx_word[i] for i in pred if i != self.pad][:-1])) 435 | 436 | if (idx + 1) % num_samples == 0: 437 | word_list = [word_tokenize(p) for p in pred_sentences] 438 | corpus = [item for sublist in word_list for item in sublist] 439 | total_ent += eval_utils.calculate_entropy(corpus) 440 | diversity_result = eval_utils.calculate_ngram_diversity(corpus) 441 | uni += diversity_result[0] 442 | bi += diversity_result[1] 443 | 444 | pred_sentences = [] 445 | 446 | entropy_list.append(total_ent / len(x_test)) 447 | uni_diversity.append(uni / len(x_test)) 448 | bi_diversity.append(bi / len(x_test)) 449 | 450 | print('Entropy = {:>.3f} | Distinct-1 = {:>.3f} | Distinct-2 = {:>.3f}'.format(np.mean(entropy_list), 451 | np.mean(uni_diversity), 452 | np.mean(bi_diversity))) 453 | -------------------------------------------------------------------------------- /ved_varAttn/bleu/README.md: -------------------------------------------------------------------------------- 1 | BLEU score pickle files will be saved in this directory. 
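During training, the per-epoch validation BLEU-1 to BLEU-4 scores are dumped here via `pickle`. A minimal sketch for loading them back (the file name assumes the default `bleu_path` prefix from `model_config.py`; adjust if you changed the config):

```python
import pickle

# The training loop saves a dict mapping n-gram order ('1'..'4')
# to the list of per-epoch validation BLEU scores.
with open('det-seq2seq-var-attn.pkl', 'rb') as f:
    scores = pickle.load(f)

for n in ('1', '2', '3', '4'):
    print('BLEU-{}: {}'.format(n, scores[n]))
```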
2 | 3 | -------------------------------------------------------------------------------- /ved_varAttn/logs.txt: -------------------------------------------------------------------------------- 1 | Epoch 1/3 - Time 421.0 BLEU: 28.44 | 12.14 | 6.33 | 3.36 2 | Epoch 2/3 - Time 424.0 BLEU: 30.65 | 13.99 | 7.69 | 4.35 3 | Epoch 3/3 - Time 342.6 BLEU: 31.58 | 15.04 | 8.5 | 4.92 -------------------------------------------------------------------------------- /ved_varAttn/model_config.py: -------------------------------------------------------------------------------- 1 | config = dict( 2 | 3 | experiment = 'qgen', # Experiment - either qgen or dialogue 4 | lstm_hidden_units = 100, # Number of hidden units for the LSTM 5 | embedding_size = 300, # Word embedding dimension 6 | num_layers = 1, # Number of LSTM layers 7 | 8 | encoder_vocab = 40000, # Vocabulary size on the encoder side # 30000 for dialogue 9 | decoder_vocab = 40000, # Vocabulary size on the decoder side # 30000 for dialogue 10 | encoder_num_tokens = 30, # Number of words/tokens in the input sequence # 20 for dialogue 11 | decoder_num_tokens = 20, # Number of words/tokens in the generated sequence 12 | 13 | dropout_keep_prob = 0.8, # Dropout keep probability 14 | initial_learning_rate = 0.005, # Initial learning rate 15 | learning_rate_decay = 0.75, # Learning rate decay 16 | min_learning_rate = 0.00001, # Minimum learning rate 17 | 18 | latent_dim = 100, # Dimension of z-latent space 19 | word_dropout_keep_probability = 0.75, # Word dropout keep probability for the decoder (= 1.0 - word dropout rate) 20 | z_temp = 1.0, # Sampling temperature to be multiplied with the standard deviation 21 | attention_temp = 1.0, # Sampling temperature for the attention/context vector 22 | use_hmean = True, # Whether to use N(h_mean, I) or N(0, I) as the prior 23 | gamma_val = 0.1, # Coefficient of the attention KL loss 24 | 25 | batch_size = 100, # Batch size # 128 for dialogue 26 | n_epochs = 3, # Number of epochs 27 | 28 | logs_dir = 'summary_logs/var-seq2seq-var-attn', # Path to save summary information for Tensorboard 29 | model_checkpoint_dir = 'models/var-seq2seq-var-attn-', # Path to save model checkpoints 30 | bleu_path = 'bleu/det-seq2seq-var-attn', # Path prefix for saving validation BLEU score pickles 31 | w2v_dir = '../w2v_models/', # Word2Vec model directory 32 | data_dir = '../data/', # Directory to store data csv files 33 | 34 | load_checkpoint = 0, # Specify the trained model epoch/checkpoint number to be loaded for evaluation on test set, 0 means last saved checkpoint 35 | 36 | ) -------------------------------------------------------------------------------- /ved_varAttn/predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "\n", 11 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 12 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 13 | "\n", 14 | "import tensorflow as tf\n", 15 | "\n", 16 | "config = tf.ConfigProto()\n", 17 | "config.gpu_options.allow_growth = True\n", 18 | "sess = tf.Session(config=config)\n", 19 | "\n", 20 | "import sys\n", 21 | "\n", 22 | "if not '../' in sys.path: sys.path.append('../')\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "\n", 26 | "from utils import data_utils\n", 27 | "from model_config import config\n", 28 | "from ved_varAttn import VarSeq2SeqVarAttnModel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null,
"metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "if config['experiment'] == 'qgen':\n", 38 | " print('[INFO] Preparing data for experiment: {}'.format(config['experiment']))\n", 39 | " train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv')\n", 40 | " val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv')\n", 41 | " test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv')\n", 42 | " input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']])\n", 43 | " output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']])\n", 44 | " true_test = test_data['question']\n", 45 | " input_test = test_data['answer']\n", 46 | " filters = '!\"#$%&()*+,./:;<=>?@[\\\\]^`{|}~\\t\\n'\n", 47 | " w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl'\n", 48 | " \n", 49 | "elif config['experiment'] == 'dialogue':\n", 50 | " train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv')\n", 51 | " val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv')\n", 52 | " test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv')\n", 53 | " input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']])\n", 54 | " output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']])\n", 55 | " true_test = test_data['reply']\n", 56 | " input_test = test_data['line']\n", 57 | " filters = '!\"#$%&()*+/:;<=>@[\\\\]^`{|}~\\t\\n'\n", 58 | " w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl'\n", 59 | "\n", 60 | "else:\n", 61 | " print('Invalid experiment name specified!')\n", 62 | " " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "print('[INFO] Tokenizing input and output sequences')\n", 72 | "x, input_word_index = data_utils.tokenize_sequence(input_sentences, \n", 73 | " filters, \n", 74 | " config['encoder_num_tokens'], \n", 75 | " config['encoder_vocab'])\n", 76 | "\n", 77 | "y, output_word_index = data_utils.tokenize_sequence(output_sentences, \n", 78 | " filters, \n", 79 | " config['decoder_num_tokens'], \n", 80 | " config['decoder_vocab'])\n", 81 | "\n", 82 | "print('[INFO] Split data into train-validation-test sets')\n", 83 | "x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, \n", 84 | " y, \n", 85 | " config['experiment'])\n", 86 | "\n", 87 | "encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, \n", 88 | " config['embedding_size'], \n", 89 | " w2v_path)\n", 90 | "\n", 91 | "decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, \n", 92 | " config['embedding_size'], \n", 93 | " w2v_path)\n", 94 | "\n", 95 | "# Re-calculate the vocab size based on the word_idx dictionary\n", 96 | "config['encoder_vocab'] = len(input_word_index)\n", 97 | "config['decoder_vocab'] = len(output_word_index)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "model = VarSeq2SeqVarAttnModel(config, \n", 107 | " encoder_embeddings_matrix, \n", 108 | " decoder_embeddings_matrix, \n", 109 | " input_word_index, \n", 110 | " output_word_index)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": 
[], 125 | "source": [ 126 | "if config['load_checkpoint'] != 0: \n", 127 | " checkpoint = config['model_checkpoint_dir'] + str(config['load_checkpoint']) + '.ckpt'\n", 128 | "else:\n", 129 | " checkpoint = tf.train.get_checkpoint_state(os.path.dirname('models/checkpoint')).model_checkpoint_path\n", 130 | "\n", 131 | "preds = model.predict(checkpoint, \n", 132 | " x_test, \n", 133 | " y_test, \n", 134 | " true_test, \n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "count = 100\n", 145 | "model.show_output_sentences(preds[:count], \n", 146 | " y_test[:count], \n", 147 | " input_test[:count], \n", 148 | " true_test[:count], \n", 149 | " )" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "model.get_diversity_metrics(checkpoint, x_test, y_test)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.1" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /ved_varAttn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | 6 | import tensorflow as tf 7 | 8 | tf_config = tf.ConfigProto() 9 | tf_config.gpu_options.allow_growth = True 10 | sess = tf.Session(config=tf_config) 11 | 12 | import sys 13 | 14 | if not '../' in sys.path: sys.path.append('../') 15 | 16 | import pandas as pd 17 | 18 | from utils import data_utils 19 | from model_config import config 20 | from ved_varAttn import VarSeq2SeqVarAttnModel 21 | 22 | 23 | def train_model(config): 24 | print('[INFO] Preparing data for experiment: {}'.format(config['experiment'])) 25 | if config['experiment'] == 'qgen': 26 | train_data = pd.read_csv(config['data_dir'] + 'df_qgen_train.csv') 27 | val_data = pd.read_csv(config['data_dir'] + 'df_qgen_val.csv') 28 | test_data = pd.read_csv(config['data_dir'] + 'df_qgen_test.csv') 29 | input_sentences = pd.concat([train_data['answer'], val_data['answer'], test_data['answer']]) 30 | output_sentences = pd.concat([train_data['question'], val_data['question'], test_data['question']]) 31 | true_val = val_data['question'] 32 | filters = '!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n' 33 | w2v_path = config['w2v_dir'] + 'w2vmodel_qgen.pkl' 34 | 35 | elif config['experiment'] == 'dialogue': 36 | train_data = pd.read_csv(config['data_dir'] + 'df_dialogue_train.csv') 37 | val_data = pd.read_csv(config['data_dir'] + 'df_dialogue_val.csv') 38 | test_data = pd.read_csv(config['data_dir'] + 'df_dialogue_test.csv') 39 | input_sentences = pd.concat([train_data['line'], val_data['line'], test_data['line']]) 40 | output_sentences = pd.concat([train_data['reply'], val_data['reply'], test_data['reply']]) 41 | true_val = val_data['reply'] 42 
| filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n' 43 | w2v_path = config['w2v_dir'] + 'w2vmodel_dialogue.pkl' 44 | 45 | else: 46 | print('Invalid experiment name specified!') 47 | return 48 | 49 | print('[INFO] Tokenizing input and output sequences') 50 | x, input_word_index = data_utils.tokenize_sequence(input_sentences, 51 | filters, 52 | config['encoder_num_tokens'], 53 | config['encoder_vocab']) 54 | 55 | y, output_word_index = data_utils.tokenize_sequence(output_sentences, 56 | filters, 57 | config['decoder_num_tokens'], 58 | config['decoder_vocab']) 59 | 60 | print('[INFO] Split data into train-validation-test sets') 61 | x_train, y_train, x_val, y_val, x_test, y_test = data_utils.create_data_split(x, 62 | y, 63 | config['experiment']) 64 | 65 | encoder_embeddings_matrix = data_utils.create_embedding_matrix(input_word_index, 66 | config['embedding_size'], 67 | w2v_path) 68 | 69 | decoder_embeddings_matrix = data_utils.create_embedding_matrix(output_word_index, 70 | config['embedding_size'], 71 | w2v_path) 72 | 73 | # Re-calculate the vocab size based on the word_idx dictionary 74 | config['encoder_vocab'] = len(input_word_index) 75 | config['decoder_vocab'] = len(output_word_index) 76 | 77 | model = VarSeq2SeqVarAttnModel(config, 78 | encoder_embeddings_matrix, 79 | decoder_embeddings_matrix, 80 | input_word_index, 81 | output_word_index) 82 | 83 | model.train(x_train, y_train, x_val, y_val, true_val) 84 | 85 | 86 | if __name__ == '__main__': 87 | train_model(config) 88 | -------------------------------------------------------------------------------- /ved_varAttn/varAttention_decoder/attention_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """A powerful dynamic attention wrapper object.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import functools 23 | import math 24 | 25 | import numpy as np 26 | 27 | import tensorflow as tf 28 | from tensorflow.python.framework import dtypes 29 | from tensorflow.python.framework import ops 30 | from tensorflow.python.framework import tensor_shape 31 | from tensorflow.python.layers import base as layers_base 32 | from tensorflow.python.layers import core as layers_core 33 | from tensorflow.python.ops import array_ops 34 | from tensorflow.python.ops import check_ops 35 | from tensorflow.python.ops import clip_ops 36 | from tensorflow.python.ops import functional_ops 37 | from tensorflow.python.ops import init_ops 38 | from tensorflow.python.ops import math_ops 39 | from tensorflow.python.ops import nn_ops 40 | from tensorflow.python.ops import random_ops 41 | from tensorflow.python.ops import rnn_cell_impl 42 | from tensorflow.python.ops import tensor_array_ops 43 | from tensorflow.python.ops import variable_scope 44 | from tensorflow.python.util import nest 45 | from tensorflow.python.layers.core import Dense 46 | 47 | __all__ = [ 48 | "AttentionMechanism", 49 | "AttentionWrapper", 50 | "AttentionWrapperState", 51 | "LuongAttention", 52 | "BahdanauAttention", 53 | "hardmax", 54 | "safe_cumprod", 55 | "monotonic_attention", 56 | "BahdanauMonotonicAttention", 57 | "LuongMonotonicAttention", 58 | ] 59 | 60 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors # pylint: disable=protected-access 61 | 62 | 63 | class AttentionMechanism(object): 64 | pass 65 | 66 | 67 | def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined): 68 | """Convert to tensor and possibly mask `memory`. 69 | 70 | Args: 71 | memory: `Tensor`, shaped `[batch_size, max_time, ...]`. 72 | memory_sequence_length: `int32` `Tensor`, shaped `[batch_size]`. 73 | check_inner_dims_defined: Python boolean. If `True`, the `memory` 74 | argument's shape is checked to ensure all but the two outermost 75 | dimensions are fully defined. 76 | 77 | Returns: 78 | A (possibly masked), checked, new `memory`. 79 | 80 | Raises: 81 | ValueError: If `check_inner_dims_defined` is `True` and not 82 | `memory.shape[2:].is_fully_defined()`. 
83 | """ 84 | memory = nest.map_structure( 85 | lambda m: ops.convert_to_tensor(m, name="memory"), memory) 86 | if memory_sequence_length is not None: 87 | memory_sequence_length = ops.convert_to_tensor( 88 | memory_sequence_length, name="memory_sequence_length") 89 | if check_inner_dims_defined: 90 | def _check_dims(m): 91 | if not m.get_shape()[2:].is_fully_defined(): 92 | raise ValueError("Expected memory %s to have fully defined inner dims, " 93 | "but saw shape: %s" % (m.name, m.get_shape())) 94 | 95 | nest.map_structure(_check_dims, memory) 96 | if memory_sequence_length is None: 97 | seq_len_mask = None 98 | else: 99 | seq_len_mask = array_ops.sequence_mask( 100 | memory_sequence_length, 101 | maxlen=array_ops.shape(nest.flatten(memory)[0])[1], 102 | dtype=nest.flatten(memory)[0].dtype) 103 | seq_len_batch_size = ( 104 | memory_sequence_length.shape[0].value 105 | or array_ops.shape(memory_sequence_length)[0]) 106 | 107 | def _maybe_mask(m, seq_len_mask): 108 | rank = m.get_shape().ndims 109 | rank = rank if rank is not None else array_ops.rank(m) 110 | extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32) 111 | m_batch_size = m.shape[0].value or array_ops.shape(m)[0] 112 | if memory_sequence_length is not None: 113 | message = ("memory_sequence_length and memory tensor batch sizes do not " 114 | "match.") 115 | with ops.control_dependencies([ 116 | check_ops.assert_equal( 117 | seq_len_batch_size, m_batch_size, message=message)]): 118 | seq_len_mask = array_ops.reshape( 119 | seq_len_mask, 120 | array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0)) 121 | return m * seq_len_mask 122 | else: 123 | return m 124 | 125 | return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory) 126 | 127 | 128 | def _maybe_mask_score(score, memory_sequence_length, score_mask_value): 129 | if memory_sequence_length is None: 130 | return score 131 | message = ("All values in memory_sequence_length must greater than zero.") 132 | with ops.control_dependencies( 133 | [check_ops.assert_positive(memory_sequence_length, message=message)]): 134 | score_mask = array_ops.sequence_mask( 135 | memory_sequence_length, maxlen=array_ops.shape(score)[1]) 136 | score_mask_values = score_mask_value * array_ops.ones_like(score) 137 | return array_ops.where(score_mask, score, score_mask_values) 138 | 139 | 140 | class _BaseAttentionMechanism(AttentionMechanism): 141 | """A base AttentionMechanism class providing common functionality. 142 | 143 | Common functionality includes: 144 | 1. Storing the query and memory layers. 145 | 2. Preprocessing and storing the memory. 146 | """ 147 | 148 | def __init__(self, 149 | query_layer, 150 | memory, 151 | probability_fn, 152 | memory_sequence_length=None, 153 | memory_layer=None, 154 | check_inner_dims_defined=True, 155 | score_mask_value=None, 156 | name=None): 157 | """Construct base AttentionMechanism class. 158 | 159 | Args: 160 | query_layer: Callable. Instance of `tf.layers.Layer`. The layer's depth 161 | must match the depth of `memory_layer`. If `query_layer` is not 162 | provided, the shape of `query` must match that of `memory_layer`. 163 | memory: The memory to query; usually the output of an RNN encoder. This 164 | tensor should be shaped `[batch_size, max_time, ...]`. 165 | probability_fn: A `callable`. Converts the score and previous alignments 166 | to probabilities. Its signature should be: 167 | `probabilities = probability_fn(score, previous_alignments)`. 
168 | memory_sequence_length (optional): Sequence lengths for the batch entries 169 | in memory. If provided, the memory tensor rows are masked with zeros 170 | for values past the respective sequence lengths. 171 | memory_layer: Instance of `tf.layers.Layer` (may be None). The layer's 172 | depth must match the depth of `query_layer`. 173 | If `memory_layer` is not provided, the shape of `memory` must match 174 | that of `query_layer`. 175 | check_inner_dims_defined: Python boolean. If `True`, the `memory` 176 | argument's shape is checked to ensure all but the two outermost 177 | dimensions are fully defined. 178 | score_mask_value: (optional): The mask value for score before passing into 179 | `probability_fn`. The default is -inf. Only used if 180 | `memory_sequence_length` is not None. 181 | name: Name to use when creating ops. 182 | """ 183 | if (query_layer is not None 184 | and not isinstance(query_layer, layers_base.Layer)): 185 | raise TypeError( 186 | "query_layer is not a Layer: %s" % type(query_layer).__name__) 187 | if (memory_layer is not None 188 | and not isinstance(memory_layer, layers_base.Layer)): 189 | raise TypeError( 190 | "memory_layer is not a Layer: %s" % type(memory_layer).__name__) 191 | self._query_layer = query_layer 192 | self._memory_layer = memory_layer 193 | self.dtype = memory_layer.dtype 194 | if not callable(probability_fn): 195 | raise TypeError("probability_fn must be callable, saw type: %s" % 196 | type(probability_fn).__name__) 197 | if score_mask_value is None: 198 | score_mask_value = dtypes.as_dtype( 199 | self._memory_layer.dtype).as_numpy_dtype(-np.inf) 200 | self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda 201 | probability_fn( 202 | _maybe_mask_score(score, memory_sequence_length, score_mask_value), 203 | prev)) 204 | with ops.name_scope( 205 | name, "BaseAttentionMechanismInit", nest.flatten(memory)): 206 | self._values = _prepare_memory( 207 | memory, memory_sequence_length, 208 | check_inner_dims_defined=check_inner_dims_defined) 209 | self._keys = ( 210 | self.memory_layer(self._values) if self.memory_layer # pylint: disable=not-callable 211 | else self._values) 212 | self._batch_size = ( 213 | self._keys.shape[0].value or array_ops.shape(self._keys)[0]) 214 | self._alignments_size = (self._keys.shape[1].value or 215 | array_ops.shape(self._keys)[1]) 216 | 217 | @property 218 | def memory_layer(self): 219 | return self._memory_layer 220 | 221 | @property 222 | def query_layer(self): 223 | return self._query_layer 224 | 225 | @property 226 | def values(self): 227 | return self._values 228 | 229 | @property 230 | def keys(self): 231 | return self._keys 232 | 233 | @property 234 | def batch_size(self): 235 | return self._batch_size 236 | 237 | @property 238 | def alignments_size(self): 239 | return self._alignments_size 240 | 241 | def initial_alignments(self, batch_size, dtype): 242 | """Creates the initial alignment values for the `AttentionWrapper` class. 243 | 244 | This is important for AttentionMechanisms that use the previous alignment 245 | to calculate the alignment at the next time step (e.g. monotonic attention). 246 | 247 | The default behavior is to return a tensor of all zeros. 248 | 249 | Args: 250 | batch_size: `int32` scalar, the batch_size. 251 | dtype: The `dtype`. 252 | 253 | Returns: 254 | A `dtype` tensor shaped `[batch_size, alignments_size]` 255 | (`alignments_size` is the values' `max_time`). 
256 | """ 257 | max_time = self._alignments_size 258 | return _zero_state_tensors(max_time, batch_size, dtype) 259 | 260 | 261 | def _luong_score(query, keys, scale): 262 | """Implements Luong-style (multiplicative) scoring function. 263 | 264 | This attention has two forms. The first is standard Luong attention, 265 | as described in: 266 | 267 | Minh-Thang Luong, Hieu Pham, Christopher D. Manning. 268 | "Effective Approaches to Attention-based Neural Machine Translation." 269 | EMNLP 2015. https://arxiv.org/abs/1508.04025 270 | 271 | The second is the scaled form inspired partly by the normalized form of 272 | Bahdanau attention. 273 | 274 | To enable the second form, call this function with `scale=True`. 275 | 276 | Args: 277 | query: Tensor, shape `[batch_size, num_units]` to compare to keys. 278 | keys: Processed memory, shape `[batch_size, max_time, num_units]`. 279 | scale: Whether to apply a scale to the score function. 280 | 281 | Returns: 282 | A `[batch_size, max_time]` tensor of unnormalized score values. 283 | 284 | Raises: 285 | ValueError: If `key` and `query` depths do not match. 286 | """ 287 | depth = query.get_shape()[-1] 288 | key_units = keys.get_shape()[-1] 289 | if depth != key_units: 290 | raise ValueError( 291 | "Incompatible or unknown inner dimensions between query and keys. " 292 | "Query (%s) has units: %s. Keys (%s) have units: %s. " 293 | "Perhaps you need to set num_units to the keys' dimension (%s)?" 294 | % (query, depth, keys, key_units, key_units)) 295 | dtype = query.dtype 296 | 297 | # Reshape from [batch_size, depth] to [batch_size, 1, depth] 298 | # for matmul. 299 | query = array_ops.expand_dims(query, 1) 300 | 301 | # Inner product along the query units dimension. 302 | # matmul shapes: query is [batch_size, 1, depth] and 303 | # keys is [batch_size, max_time, depth]. 304 | # the inner product is asked to **transpose keys' inner shape** to get a 305 | # batched matmul on: 306 | # [batch_size, 1, depth] . [batch_size, depth, max_time] 307 | # resulting in an output shape of: 308 | # [batch_time, 1, max_time]. 309 | # we then squeeze out the center singleton dimension. 310 | score = math_ops.matmul(query, keys, transpose_b=True) 311 | score = array_ops.squeeze(score, [1]) 312 | 313 | if scale: 314 | # Scalar used in weight scaling 315 | g = variable_scope.get_variable( 316 | "attention_g", dtype=dtype, initializer=1.) 317 | score = g * score 318 | return score 319 | 320 | 321 | class LuongAttention(_BaseAttentionMechanism): 322 | """Implements Luong-style (multiplicative) attention scoring. 323 | 324 | This attention has two forms. The first is standard Luong attention, 325 | as described in: 326 | 327 | Minh-Thang Luong, Hieu Pham, Christopher D. Manning. 328 | "Effective Approaches to Attention-based Neural Machine Translation." 329 | EMNLP 2015. https://arxiv.org/abs/1508.04025 330 | 331 | The second is the scaled form inspired partly by the normalized form of 332 | Bahdanau attention. 333 | 334 | To enable the second form, construct the object with parameter 335 | `scale=True`. 336 | """ 337 | 338 | def __init__(self, 339 | num_units, 340 | memory, 341 | memory_sequence_length=None, 342 | scale=False, 343 | probability_fn=None, 344 | score_mask_value=None, 345 | dtype=None, 346 | name="LuongAttention"): 347 | """Construct the AttentionMechanism mechanism. 348 | 349 | Args: 350 | num_units: The depth of the attention mechanism. 351 | memory: The memory to query; usually the output of an RNN encoder. 
This 352 | tensor should be shaped `[batch_size, max_time, ...]`. 353 | memory_sequence_length: (optional) Sequence lengths for the batch entries 354 | in memory. If provided, the memory tensor rows are masked with zeros 355 | for values past the respective sequence lengths. 356 | scale: Python boolean. Whether to scale the energy term. 357 | probability_fn: (optional) A `callable`. Converts the score to 358 | probabilities. The default is @{tf.nn.softmax}. Other options include 359 | @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. 360 | Its signature should be: `probabilities = probability_fn(score)`. 361 | score_mask_value: (optional) The mask value for score before passing into 362 | `probability_fn`. The default is -inf. Only used if 363 | `memory_sequence_length` is not None. 364 | dtype: The data type for the memory layer of the attention mechanism. 365 | name: Name to use when creating ops. 366 | """ 367 | # For LuongAttention, we only transform the memory layer; thus 368 | # num_units **must** match the expected query depth. 369 | if probability_fn is None: 370 | probability_fn = nn_ops.softmax 371 | if dtype is None: 372 | dtype = dtypes.float32 373 | wrapped_probability_fn = lambda score, _: probability_fn(score) 374 | super(LuongAttention, self).__init__( 375 | query_layer=None, 376 | memory_layer=layers_core.Dense( 377 | num_units, name="memory_layer", use_bias=False, dtype=dtype), 378 | memory=memory, 379 | probability_fn=wrapped_probability_fn, 380 | memory_sequence_length=memory_sequence_length, 381 | score_mask_value=score_mask_value, 382 | name=name) 383 | self._num_units = num_units 384 | self._scale = scale 385 | self._name = name 386 | 387 | def __call__(self, query, previous_alignments): 388 | """Score the query based on the keys and values. 389 | 390 | Args: 391 | query: Tensor of dtype matching `self.values` and shape 392 | `[batch_size, query_depth]`. 393 | previous_alignments: Tensor of dtype matching `self.values` and shape 394 | `[batch_size, alignments_size]` 395 | (`alignments_size` is memory's `max_time`). 396 | 397 | Returns: 398 | alignments: Tensor of dtype matching `self.values` and shape 399 | `[batch_size, alignments_size]` (`alignments_size` is memory's 400 | `max_time`). 401 | """ 402 | with variable_scope.variable_scope(None, "luong_attention", [query]): 403 | score = _luong_score(query, self._keys, self._scale) 404 | alignments = self._probability_fn(score, previous_alignments) 405 | return alignments 406 | 407 | 408 | def _bahdanau_score(processed_query, keys, normalize): 409 | """Implements Bahdanau-style (additive) scoring function. 410 | 411 | This attention has two forms. The first is Bahdanau attention, 412 | as described in: 413 | 414 | Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio. 415 | "Neural Machine Translation by Jointly Learning to Align and Translate." 416 | ICLR 2015. https://arxiv.org/abs/1409.0473 417 | 418 | The second is the normalized form. This form is inspired by the 419 | weight normalization article: 420 | 421 | Tim Salimans, Diederik P. Kingma. 422 | "Weight Normalization: A Simple Reparameterization to Accelerate 423 | Training of Deep Neural Networks." 424 | https://arxiv.org/abs/1602.07868 425 | 426 | To enable the second form, set `normalize=True`. 427 | 428 | Args: 429 | processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys. 430 | keys: Processed memory, shape `[batch_size, max_time, num_units]`. 431 | normalize: Whether to normalize the score function.
432 | 433 | Returns: 434 | A `[batch_size, max_time]` tensor of unnormalized score values. 435 | """ 436 | dtype = processed_query.dtype 437 | # Get the number of hidden units from the trailing dimension of keys 438 | num_units = keys.shape[2].value or array_ops.shape(keys)[2] 439 | # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting. 440 | processed_query = array_ops.expand_dims(processed_query, 1) 441 | v = variable_scope.get_variable( 442 | "attention_v", [num_units], dtype=dtype) 443 | if normalize: 444 | # Scalar used in weight normalization 445 | g = variable_scope.get_variable( 446 | "attention_g", dtype=dtype, 447 | initializer=math.sqrt((1. / num_units))) 448 | # Bias added prior to the nonlinearity 449 | b = variable_scope.get_variable( 450 | "attention_b", [num_units], dtype=dtype, 451 | initializer=init_ops.zeros_initializer()) 452 | # normed_v = g * v / ||v|| 453 | normed_v = g * v * math_ops.rsqrt( 454 | math_ops.reduce_sum(math_ops.square(v))) 455 | return math_ops.reduce_sum( 456 | normed_v * math_ops.tanh(keys + processed_query + b), [2]) 457 | else: 458 | return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2]) 459 | 460 | 461 | class BahdanauAttention(_BaseAttentionMechanism): 462 | """Implements Bahdanau-style (additive) attention. 463 | 464 | This attention has two forms. The first is Bahdanau attention, 465 | as described in: 466 | 467 | Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio. 468 | "Neural Machine Translation by Jointly Learning to Align and Translate." 469 | ICLR 2015. https://arxiv.org/abs/1409.0473 470 | 471 | The second is the normalized form. This form is inspired by the 472 | weight normalization article: 473 | 474 | Tim Salimans, Diederik P. Kingma. 475 | "Weight Normalization: A Simple Reparameterization to Accelerate 476 | Training of Deep Neural Networks." 477 | https://arxiv.org/abs/1602.07868 478 | 479 | To enable the second form, construct the object with parameter 480 | `normalize=True`. 481 | """ 482 | 483 | def __init__(self, 484 | num_units, 485 | memory, 486 | memory_sequence_length=None, 487 | normalize=False, 488 | probability_fn=None, 489 | score_mask_value=None, 490 | dtype=None, 491 | name="BahdanauAttention"): 492 | """Construct the Attention mechanism. 493 | 494 | Args: 495 | num_units: The depth of the query mechanism. 496 | memory: The memory to query; usually the output of an RNN encoder. This 497 | tensor should be shaped `[batch_size, max_time, ...]`. 498 | memory_sequence_length (optional): Sequence lengths for the batch entries 499 | in memory. If provided, the memory tensor rows are masked with zeros 500 | for values past the respective sequence lengths. 501 | normalize: Python boolean. Whether to normalize the energy term. 502 | probability_fn: (optional) A `callable`. Converts the score to 503 | probabilities. The default is @{tf.nn.softmax}. Other options include 504 | @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. 505 | Its signature should be: `probabilities = probability_fn(score)`. 506 | score_mask_value: (optional): The mask value for score before passing into 507 | `probability_fn`. The default is -inf. Only used if 508 | `memory_sequence_length` is not None. 509 | dtype: The data type for the query and memory layers of the attention 510 | mechanism. 511 | name: Name to use when creating ops. 
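    Example:
      A minimal, illustrative sketch; `encoder_outputs`, `source_lengths`,
      and `decoder_cell` are assumed placeholders from a typical seq2seq
      setup, and the depth of 256 is arbitrary.

        attn_mech = BahdanauAttention(
            num_units=256,
            memory=encoder_outputs,
            memory_sequence_length=source_lengths)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attn_mech, attention_layer_size=256)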
512 | """ 513 | if probability_fn is None: 514 | probability_fn = nn_ops.softmax 515 | if dtype is None: 516 | dtype = dtypes.float32 517 | wrapped_probability_fn = lambda score, _: probability_fn(score) 518 | super(BahdanauAttention, self).__init__( 519 | query_layer=layers_core.Dense( 520 | num_units, name="query_layer", use_bias=False, dtype=dtype), 521 | memory_layer=layers_core.Dense( 522 | num_units, name="memory_layer", use_bias=False, dtype=dtype), 523 | memory=memory, 524 | probability_fn=wrapped_probability_fn, 525 | memory_sequence_length=memory_sequence_length, 526 | score_mask_value=score_mask_value, 527 | name=name) 528 | self._num_units = num_units 529 | self._normalize = normalize 530 | self._name = name 531 | 532 | def __call__(self, query, previous_alignments): 533 | """Score the query based on the keys and values. 534 | 535 | Args: 536 | query: Tensor of dtype matching `self.values` and shape 537 | `[batch_size, query_depth]`. 538 | previous_alignments: Tensor of dtype matching `self.values` and shape 539 | `[batch_size, alignments_size]` 540 | (`alignments_size` is memory's `max_time`). 541 | 542 | Returns: 543 | alignments: Tensor of dtype matching `self.values` and shape 544 | `[batch_size, alignments_size]` (`alignments_size` is memory's 545 | `max_time`). 546 | """ 547 | with variable_scope.variable_scope(None, "bahdanau_attention", [query]): 548 | processed_query = self.query_layer(query) if self.query_layer else query 549 | score = _bahdanau_score(processed_query, self._keys, self._normalize) 550 | alignments = self._probability_fn(score, previous_alignments) 551 | return alignments 552 | 553 | 554 | def safe_cumprod(x, *args, **kwargs): 555 | """Computes cumprod of x in logspace using cumsum to avoid underflow. 556 | 557 | The cumprod function and its gradient can result in numerical instabilities 558 | when its argument has very small and/or zero values. As long as the argument 559 | is all positive, we can instead compute the cumulative product as 560 | exp(cumsum(log(x))). This function can be called identically to tf.cumprod. 561 | 562 | Args: 563 | x: Tensor to take the cumulative product of. 564 | *args: Passed on to cumsum; these are identical to those in cumprod. 565 | **kwargs: Passed on to cumsum; these are identical to those in cumprod. 566 | Returns: 567 | Cumulative product of x. 568 | """ 569 | with ops.name_scope(None, "SafeCumprod", [x]): 570 | x = ops.convert_to_tensor(x, name="x") 571 | tiny = np.finfo(x.dtype.as_numpy_dtype).tiny 572 | return math_ops.exp(math_ops.cumsum( 573 | math_ops.log(clip_ops.clip_by_value(x, tiny, 1)), *args, **kwargs)) 574 | 575 | 576 | def monotonic_attention(p_choose_i, previous_attention, mode): 577 | """Compute monotonic attention distribution from choosing probabilities. 578 | 579 | Monotonic attention implies that the input sequence is processed in an 580 | explicitly left-to-right manner when generating the output sequence. In 581 | addition, once an input sequence element is attended to at a given output 582 | timestep, elements occurring before it cannot be attended to at subsequent 583 | output timesteps. This function generates attention distributions according 584 | to these assumptions. For more information, see ``Online and Linear-Time 585 | Attention by Enforcing Monotonic Alignments''. 586 | 587 | Args: 588 | p_choose_i: Probability of choosing input sequence/memory element i. Should 589 | be of shape (batch_size, input_sequence_length), and should all be in the 590 | range [0, 1]. 
591 | previous_attention: The attention distribution from the previous output 592 | timestep. Should be of shape (batch_size, input_sequence_length). For 593 | the first output timestep, previous_attention[n] should be [1, 0, 0, ..., 594 | 0] for all n in [0, ... batch_size - 1]. 595 | mode: How to compute the attention distribution. Must be one of 596 | 'recursive', 'parallel', or 'hard'. 597 | * 'recursive' uses tf.scan to recursively compute the distribution. 598 | This is slowest but is exact, general, and does not suffer from 599 | numerical instabilities. 600 | * 'parallel' uses parallelized cumulative-sum and cumulative-product 601 | operations to compute a closed-form solution to the recurrence 602 | relation defining the attention distribution. This makes it more 603 | efficient than 'recursive', but it requires numerical checks which 604 | make the distribution non-exact. This can be a problem in particular 605 | when input_sequence_length is long and/or p_choose_i has entries very 606 | close to 0 or 1. 607 | * 'hard' requires that the probabilities in p_choose_i are all either 0 608 | or 1, and subsequently uses a more efficient and exact solution. 609 | 610 | Returns: 611 | A tensor of shape (batch_size, input_sequence_length) representing the 612 | attention distributions for each sequence in the batch. 613 | 614 | Raises: 615 | ValueError: mode is not one of 'recursive', 'parallel', 'hard'. 616 | """ 617 | # Force things to be tensors 618 | p_choose_i = ops.convert_to_tensor(p_choose_i, name="p_choose_i") 619 | previous_attention = ops.convert_to_tensor( 620 | previous_attention, name="previous_attention") 621 | if mode == "recursive": 622 | # Use .shape[0].value when it's not None, or fall back on symbolic shape 623 | batch_size = p_choose_i.shape[0].value or array_ops.shape(p_choose_i)[0] 624 | # Compute [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]] 625 | shifted_1mp_choose_i = array_ops.concat( 626 | [array_ops.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1) 627 | # Compute attention distribution recursively as 628 | # q[i] = (1 - p_choose_i[i])*q[i - 1] + previous_attention[i] 629 | # attention[i] = p_choose_i[i]*q[i] 630 | attention = p_choose_i * array_ops.transpose(functional_ops.scan( 631 | # Need to use reshape to remind TF of the shape between loop iterations 632 | lambda x, yz: array_ops.reshape(yz[0] * x + yz[1], (batch_size,)), 633 | # Loop variables yz[0] and yz[1] 634 | [array_ops.transpose(shifted_1mp_choose_i), 635 | array_ops.transpose(previous_attention)], 636 | # Initial value of x is just zeros 637 | array_ops.zeros((batch_size,)))) 638 | elif mode == "parallel": 639 | # safe_cumprod computes cumprod in logspace with numeric checks 640 | cumprod_1mp_choose_i = safe_cumprod(1 - p_choose_i, axis=1, exclusive=True) 641 | # Compute recurrence relation solution 642 | attention = p_choose_i * cumprod_1mp_choose_i * math_ops.cumsum( 643 | previous_attention / 644 | # Clip cumprod_1mp to avoid divide-by-zero 645 | clip_ops.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), axis=1) 646 | elif mode == "hard": 647 | # Remove any probabilities before the index chosen last time step 648 | p_choose_i *= math_ops.cumsum(previous_attention, axis=1) 649 | # Now, use exclusive cumprod to remove probabilities after the first 650 | # chosen index, like so: 651 | # p_choose_i = [0, 0, 0, 1, 1, 0, 1, 1] 652 | # cumprod(1 - p_choose_i, exclusive=True) = [1, 1, 1, 1, 0, 0, 0, 0] 653 | # Product of above: [0, 0, 0, 1, 0, 0, 0, 0] 654 | attention =
p_choose_i * math_ops.cumprod( 655 | 1 - p_choose_i, axis=1, exclusive=True) 656 | else: 657 | raise ValueError("mode must be 'recursive', 'parallel', or 'hard'.") 658 | return attention 659 | 660 | 661 | def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode, 662 | seed=None): 663 | """Attention probability function for monotonic attention. 664 | 665 | Takes in unnormalized attention scores, adds pre-sigmoid noise to encourage 666 | the model to make discrete attention decisions, passes them through a sigmoid 667 | to obtain "choosing" probabilities, and then calls monotonic_attention to 668 | obtain the attention distribution. For more information, see 669 | 670 | Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, 671 | "Online and Linear-Time Attention by Enforcing Monotonic Alignments." 672 | ICML 2017. https://arxiv.org/abs/1704.00784 673 | 674 | Args: 675 | score: Unnormalized attention scores, shape `[batch_size, alignments_size]` 676 | previous_alignments: Previous attention distribution, shape 677 | `[batch_size, alignments_size]` 678 | sigmoid_noise: Standard deviation of pre-sigmoid noise. Setting this larger 679 | than 0 will encourage the model to produce large attention scores, 680 | effectively making the choosing probabilities discrete and the resulting 681 | attention distribution one-hot. It should be set to 0 at test-time, and 682 | when hard attention is not desired. 683 | mode: How to compute the attention distribution. Must be one of 684 | 'recursive', 'parallel', or 'hard'. See the docstring for 685 | `tf.contrib.seq2seq.monotonic_attention` for more information. 686 | seed: (optional) Random seed for pre-sigmoid noise. 687 | 688 | Returns: 689 | A `[batch_size, alignments_size]`-shape tensor corresponding to the 690 | resulting attention distribution. 691 | """ 692 | # Optionally add pre-sigmoid noise to the scores 693 | if sigmoid_noise > 0: 694 | noise = random_ops.random_normal(array_ops.shape(score), dtype=score.dtype, 695 | seed=seed) 696 | score += sigmoid_noise * noise 697 | # Compute "choosing" probabilities from the attention scores 698 | if mode == "hard": 699 | # When mode is hard, use a hard sigmoid 700 | p_choose_i = math_ops.cast(score > 0, score.dtype) 701 | else: 702 | p_choose_i = math_ops.sigmoid(score) 703 | # Convert from choosing probabilities to attention distribution 704 | return monotonic_attention(p_choose_i, previous_alignments, mode) 705 | 706 | 707 | class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism): 708 | """Base attention mechanism for monotonic attention. 709 | 710 | Simply overrides the initial_alignments function to provide a Dirac 711 | distribution, which is needed in order for the monotonic attention 712 | distributions to have the correct behavior. 713 | """ 714 | 715 | def initial_alignments(self, batch_size, dtype): 716 | """Creates the initial alignment values for the monotonic attentions. 717 | 718 | Initializes to Dirac distributions, i.e. [1, 0, 0, ...memory length..., 0] 719 | for all entries in the batch. 720 | 721 | Args: 722 | batch_size: `int32` scalar, the batch_size. 723 | dtype: The `dtype`. 724 | 725 | Returns: 726 | A `dtype` tensor shaped `[batch_size, alignments_size]` 727 | (`alignments_size` is the values' `max_time`).
728 | """ 729 | max_time = self._alignments_size 730 | return array_ops.one_hot( 731 | array_ops.zeros((batch_size,), dtype=dtypes.int32), max_time, 732 | dtype=dtype) 733 | 734 | 735 | class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism): 736 | """Monotonic attention mechanism with Bahdanau-style energy function. 737 | 738 | This type of attention enforces a monotonic constraint on the attention 739 | distributions; that is, once the model attends to a given point in the memory 740 | it can't attend to any prior points at subsequent output timesteps. It 741 | achieves this by using the _monotonic_probability_fn instead of softmax to 742 | construct its attention distributions. Since the attention scores are passed 743 | through a sigmoid, a learnable scalar bias parameter is applied after the 744 | score function and before the sigmoid. Otherwise, it is equivalent to 745 | BahdanauAttention. This approach is proposed in 746 | 747 | Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, 748 | "Online and Linear-Time Attention by Enforcing Monotonic Alignments." 749 | ICML 2017. https://arxiv.org/abs/1704.00784 750 | """ 751 | 752 | def __init__(self, 753 | num_units, 754 | memory, 755 | memory_sequence_length=None, 756 | normalize=False, 757 | score_mask_value=None, 758 | sigmoid_noise=0., 759 | sigmoid_noise_seed=None, 760 | score_bias_init=0., 761 | mode="parallel", 762 | dtype=None, 763 | name="BahdanauMonotonicAttention"): 764 | """Construct the Attention mechanism. 765 | 766 | Args: 767 | num_units: The depth of the query mechanism. 768 | memory: The memory to query; usually the output of an RNN encoder. This 769 | tensor should be shaped `[batch_size, max_time, ...]`. 770 | memory_sequence_length (optional): Sequence lengths for the batch entries 771 | in memory. If provided, the memory tensor rows are masked with zeros 772 | for values past the respective sequence lengths. 773 | normalize: Python boolean. Whether to normalize the energy term. 774 | score_mask_value: (optional): The mask value for score before passing into 775 | `probability_fn`. The default is -inf. Only used if 776 | `memory_sequence_length` is not None. 777 | sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring 778 | for `_monotonic_probability_fn` for more information. 779 | sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise. 780 | score_bias_init: Initial value for score bias scalar. It's recommended to 781 | initialize this to a negative value when the length of the memory is 782 | large. 783 | mode: How to compute the attention distribution. Must be one of 784 | 'recursive', 'parallel', or 'hard'. See the docstring for 785 | `tf.contrib.seq2seq.monotonic_attention` for more information. 786 | dtype: The data type for the query and memory layers of the attention 787 | mechanism. 788 | name: Name to use when creating ops.
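    Example (a minimal construction sketch, not taken from this repository;
    `encoder_outputs` and `source_lengths` are assumed placeholders):

    ```python
    attention_mechanism = BahdanauMonotonicAttention(
        num_units=128,
        memory=encoder_outputs,                 # [batch_size, max_time, depth]
        memory_sequence_length=source_lengths,
        sigmoid_noise=1.0,  # > 0 during training to encourage discrete choices
        mode="parallel")
    ```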
789 | """ 790 | # Set up the monotonic probability fn with supplied parameters 791 | if dtype is None: 792 | dtype = dtypes.float32 793 | wrapped_probability_fn = functools.partial( 794 | _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode, 795 | seed=sigmoid_noise_seed) 796 | super(BahdanauMonotonicAttention, self).__init__( 797 | query_layer=layers_core.Dense( 798 | num_units, name="query_layer", use_bias=False, dtype=dtype), 799 | memory_layer=layers_core.Dense( 800 | num_units, name="memory_layer", use_bias=False, dtype=dtype), 801 | memory=memory, 802 | probability_fn=wrapped_probability_fn, 803 | memory_sequence_length=memory_sequence_length, 804 | score_mask_value=score_mask_value, 805 | name=name) 806 | self._num_units = num_units 807 | self._normalize = normalize 808 | self._name = name 809 | self._score_bias_init = score_bias_init 810 | 811 | def __call__(self, query, previous_alignments): 812 | """Score the query based on the keys and values. 813 | 814 | Args: 815 | query: Tensor of dtype matching `self.values` and shape 816 | `[batch_size, query_depth]`. 817 | previous_alignments: Tensor of dtype matching `self.values` and shape 818 | `[batch_size, alignments_size]` 819 | (`alignments_size` is memory's `max_time`). 820 | 821 | Returns: 822 | alignments: Tensor of dtype matching `self.values` and shape 823 | `[batch_size, alignments_size]` (`alignments_size` is memory's 824 | `max_time`). 825 | """ 826 | with variable_scope.variable_scope( 827 | None, "bahdanau_monotonic_attention", [query]): 828 | processed_query = self.query_layer(query) if self.query_layer else query 829 | score = _bahdanau_score(processed_query, self._keys, self._normalize) 830 | score_bias = variable_scope.get_variable( 831 | "attention_score_bias", dtype=processed_query.dtype, 832 | initializer=self._score_bias_init) 833 | score += score_bias 834 | alignments = self._probability_fn(score, previous_alignments) 835 | return alignments 836 | 837 | 838 | class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism): 839 | """Monotonic attention mechanism with Luong-style energy function. 840 | 841 | This type of attention enforces a monotonic constraint on the attention 842 | distributions; that is, once the model attends to a given point in the memory 843 | it can't attend to any prior points at subsequent output timesteps. It 844 | achieves this by using the _monotonic_probability_fn instead of softmax to 845 | construct its attention distributions. Otherwise, it is equivalent to 846 | LuongAttention. This approach is proposed in 847 | 848 | Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, 849 | "Online and Linear-Time Attention by Enforcing Monotonic Alignments." 850 | ICML 2017. https://arxiv.org/abs/1704.00784 851 | """ 852 | 853 | def __init__(self, 854 | num_units, 855 | memory, 856 | memory_sequence_length=None, 857 | scale=False, 858 | score_mask_value=None, 859 | sigmoid_noise=0., 860 | sigmoid_noise_seed=None, 861 | score_bias_init=0., 862 | mode="parallel", 863 | dtype=None, 864 | name="LuongMonotonicAttention"): 865 | """Construct the Attention mechanism. 866 | 867 | Args: 868 | num_units: The depth of the query mechanism. 869 | memory: The memory to query; usually the output of an RNN encoder. This 870 | tensor should be shaped `[batch_size, max_time, ...]`. 871 | memory_sequence_length (optional): Sequence lengths for the batch entries 872 | in memory. If provided, the memory tensor rows are masked with zeros 873 | for values past the respective sequence lengths. 874 | scale: Python boolean. Whether to scale the energy term. 875 | score_mask_value: (optional): The mask value for score before passing into 876 | `probability_fn`. The default is -inf. Only used if 877 | `memory_sequence_length` is not None. 878 | sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring 879 | for `_monotonic_probability_fn` for more information. 880 | sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise. 881 | score_bias_init: Initial value for score bias scalar. It's recommended to 882 | initialize this to a negative value when the length of the memory is 883 | large. 884 | mode: How to compute the attention distribution. Must be one of 885 | 'recursive', 'parallel', or 'hard'. See the docstring for 886 | `tf.contrib.seq2seq.monotonic_attention` for more information. 887 | dtype: The data type for the query and memory layers of the attention 888 | mechanism. 889 | name: Name to use when creating ops. 890 | """ 891 | # Set up the monotonic probability fn with supplied parameters 892 | if dtype is None: 893 | dtype = dtypes.float32 894 | wrapped_probability_fn = functools.partial( 895 | _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode, 896 | seed=sigmoid_noise_seed) 897 | super(LuongMonotonicAttention, self).__init__( 898 | query_layer=layers_core.Dense( 899 | num_units, name="query_layer", use_bias=False, dtype=dtype), 900 | memory_layer=layers_core.Dense( 901 | num_units, name="memory_layer", use_bias=False, dtype=dtype), 902 | memory=memory, 903 | probability_fn=wrapped_probability_fn, 904 | memory_sequence_length=memory_sequence_length, 905 | score_mask_value=score_mask_value, 906 | name=name) 907 | self._num_units = num_units 908 | self._scale = scale 909 | self._score_bias_init = score_bias_init 910 | self._name = name 911 | 912 | def __call__(self, query, previous_alignments): 913 | """Score the query based on the keys and values. 914 | 915 | Args: 916 | query: Tensor of dtype matching `self.values` and shape 917 | `[batch_size, query_depth]`. 918 | previous_alignments: Tensor of dtype matching `self.values` and shape 919 | `[batch_size, alignments_size]` 920 | (`alignments_size` is memory's `max_time`). 921 | 922 | Returns: 923 | alignments: Tensor of dtype matching `self.values` and shape 924 | `[batch_size, alignments_size]` (`alignments_size` is memory's 925 | `max_time`). 926 | """ 927 | with variable_scope.variable_scope(None, "luong_monotonic_attention", 928 | [query]): 929 | score = _luong_score(query, self._keys, self._scale) 930 | score_bias = variable_scope.get_variable( 931 | "attention_score_bias", dtype=query.dtype, 932 | initializer=self._score_bias_init) 933 | score += score_bias 934 | alignments = self._probability_fn(score, previous_alignments) 935 | return alignments 936 | 937 | 938 | class AttentionWrapperState( 939 | collections.namedtuple("AttentionWrapperState", 940 | ("cell_state", "attention", "time", "alignments", 941 | "alignment_history"))): 942 | """`namedtuple` storing the state of an `AttentionWrapper`. 943 | 944 | Contains: 945 | 946 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 947 | step. 948 | - `attention`: The attention emitted at the previous time step. 949 | - `time`: int32 scalar containing the current time step.
950 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 951 | emitted at the previous time step for each attention mechanism. 952 | - `alignment_history`: (if enabled) a single or tuple of `TensorArray`(s) 953 | containing alignment matrices from all time steps for each attention 954 | mechanism. Call `stack()` on each to convert to a `Tensor`. 955 | """ 956 | 957 | def clone(self, **kwargs): 958 | """Clone this object, overriding components provided by kwargs. 959 | 960 | Example: 961 | 962 | ```python 963 | initial_state = attention_wrapper.zero_state(dtype=..., batch_size=...) 964 | initial_state = initial_state.clone(cell_state=encoder_state) 965 | ``` 966 | 967 | Args: 968 | **kwargs: Any properties of the state object to replace in the returned 969 | `AttentionWrapperState`. 970 | 971 | Returns: 972 | A new `AttentionWrapperState` whose properties are the same as 973 | this one, except any overridden properties as provided in `kwargs`. 974 | """ 975 | return super(AttentionWrapperState, self)._replace(**kwargs) 976 | 977 | 978 | def hardmax(logits, name=None): 979 | """Returns batched one-hot vectors. 980 | 981 | The depth index containing the `1` is that of the maximum logit value. 982 | 983 | Args: 984 | logits: A batch tensor of logit values. 985 | name: Name to use when creating ops. 986 | Returns: 987 | A batched one-hot tensor. 988 | """ 989 | with ops.name_scope(name, "Hardmax", [logits]): 990 | logits = ops.convert_to_tensor(logits, name="logits") 991 | if logits.get_shape()[-1].value is not None: 992 | depth = logits.get_shape()[-1].value 993 | else: 994 | depth = array_ops.shape(logits)[-1] 995 | return array_ops.one_hot( 996 | math_ops.argmax(logits, -1), depth, dtype=logits.dtype) 997 | 998 | 999 | def _compute_attention(attention_mechanism, cell_output, previous_alignments, 1000 | attention_layer, temperature, use_hmean): 1001 | """Computes the attention and alignments for a given attention_mechanism.""" 1002 | alignments = attention_mechanism( 1003 | cell_output, previous_alignments=previous_alignments) 1004 | 1005 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 1006 | expanded_alignments = array_ops.expand_dims(alignments, 1) 1007 | # Context is the inner product of alignments and values along the 1008 | # memory time dimension. 1009 | # alignments shape is 1010 | # [batch_size, 1, memory_time] 1011 | # attention_mechanism.values shape is 1012 | # [batch_size, memory_time, memory_size] 1013 | # the batched matmul is over memory_time, so the output shape is 1014 | # [batch_size, 1, memory_size]. 1015 | # we then squeeze out the singleton dim. 
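  # (Fork-specific change) After the deterministic context is computed below, it
  # is treated as the mean of a Gaussian posterior: a log-sigma is predicted from
  # it through two Dense layers, a context vector is sampled with the
  # reparameterization trick (noise scaled by `temperature`), and a per-timestep
  # KL term is returned alongside the attention output.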
1016 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 1017 | context = array_ops.squeeze(context, [1]) 1018 | 1019 | ## Get context vector mean and log standard deviation 1020 | c_dim = context.get_shape()[-1] # The dimension of the context vector 1021 | c_mean = tf.identity(context, name='c_mean') 1022 | c_log_sigma_intermediate = Dense(c_dim, activation=tf.tanh, name='c_log_sigma_intermediate')(context) 1023 | c_log_sigma = Dense(c_dim, name='c_log_sigma')(c_log_sigma_intermediate) 1024 | 1025 | ## Sample from the gaussian distribution 1026 | epsilon = tf.random_normal(tf.shape(c_log_sigma), name="epsilon") 1027 | context_sampled = c_mean + tf.scalar_mul(temperature, epsilon * tf.exp(c_log_sigma)) 1028 | 1029 | if use_hmean: 1030 | ## Prior mean is mean pooling of encoder outputs: Take sum and divide by num of unmasked tokens 1031 | h_source_mean = tf.reduce_sum(attention_mechanism.values, axis=1) / tf.cast( 1032 | tf.count_nonzero(tf.reduce_sum(attention_mechanism.values, axis=-1), axis=1, keep_dims=True), 1033 | dtype=tf.float32) 1034 | 1035 | ## Calculate KL Loss for the context vector for h_mean prior 1036 | c_kl_loss = -0.5 * tf.reduce_sum(1.0 + 2 * c_log_sigma - (c_mean - h_source_mean) ** 2 - tf.exp(2 * c_log_sigma), 1, 1037 | name="c_kl_loss") 1038 | else: 1039 | ## Calculate KL Loss for the context vector for zero prior 1040 | c_kl_loss = -0.5 * tf.reduce_sum( 1041 | 1.0 + 2 * c_log_sigma - (c_mean) ** 2 - tf.exp(2 * c_log_sigma), 1, 1042 | name="c_kl_loss") 1043 | 1044 | # Alternative KL loss calculation b/w two gaussian distributions 1045 | # prior_mean = tf.zeros(shape=c_mean.shape) 1046 | # prior_sigma = tf.ones(shape=c_mean.shape) 1047 | # posterior_dist = tf.contrib.distributions.Normal(loc=c_mean, scale=tf.exp(c_log_sigma)) 1048 | # prior_dist = tf.contrib.distributions.Normal(loc=prior_mean, scale=prior_sigma) 1049 | # c_kl_loss = tf.reduce_sum(tf.contrib.distributions.kl_divergence(posterior_dist, prior_dist), axis=-1, name="c_kl_loss") 1050 | 1051 | if attention_layer is not None: 1052 | attention = attention_layer(array_ops.concat([cell_output, context_sampled], 1)) 1053 | else: 1054 | attention = context_sampled 1055 | 1056 | return attention, alignments, c_kl_loss 1057 | 1058 | 1059 | class AttentionWrapper(rnn_cell_impl.RNNCell): 1060 | """Wraps another `RNNCell` with attention. 1061 | """ 1062 | 1063 | def __init__(self, 1064 | cell, 1065 | attention_mechanism, 1066 | temperature=1.0, 1067 | use_hmean = True, 1068 | attention_layer_size=None, 1069 | alignment_history=False, 1070 | cell_input_fn=None, 1071 | output_attention=True, 1072 | initial_cell_state=None, 1073 | name=None): 1074 | """Construct the `AttentionWrapper`. 1075 | 1076 | **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in 1077 | `AttentionWrapper`, then you must ensure that: 1078 | 1079 | - The encoder output has been tiled to `beam_width` via 1080 | @{tf.contrib.seq2seq.tile_batch} (NOT `tf.tile`). 1081 | - The `batch_size` argument passed to the `zero_state` method of this 1082 | wrapper is equal to `true_batch_size * beam_width`. 1083 | - The initial state created with `zero_state` above contains a 1084 | `cell_state` value containing properly tiled final state from the 1085 | encoder. 
1086 | 1087 | An example: 1088 | 1089 | ``` 1090 | tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch( 1091 | encoder_outputs, multiplier=beam_width) 1092 | tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch( 1093 | encoder_final_state, multiplier=beam_width) 1094 | tiled_sequence_length = tf.contrib.seq2seq.tile_batch( 1095 | sequence_length, multiplier=beam_width) 1096 | attention_mechanism = MyFavoriteAttentionMechanism( 1097 | num_units=attention_depth, 1098 | memory=tiled_encoder_outputs, 1099 | memory_sequence_length=tiled_sequence_length) 1100 | attention_cell = AttentionWrapper(cell, attention_mechanism, ...) 1101 | decoder_initial_state = attention_cell.zero_state( 1102 | dtype, batch_size=true_batch_size * beam_width) 1103 | decoder_initial_state = decoder_initial_state.clone( 1104 | cell_state=tiled_encoder_final_state) 1105 | ``` 1106 | 1107 | Args: 1108 | cell: An instance of `RNNCell`. 1109 | attention_mechanism: A list of `AttentionMechanism` instances or a single 1110 | instance. 1111 | attention_layer_size: A list of Python integers or a single Python 1112 | integer, the depth of the attention (output) layer(s). If None 1113 | (default), use the context as attention at each time step. Otherwise, 1114 | feed the context and cell output into the attention layer to generate 1115 | attention at each time step. If attention_mechanism is a list, 1116 | attention_layer_size must be a list of the same length. 1117 | alignment_history: Python boolean, whether to store alignment history 1118 | from all time steps in the final output state (currently stored as a 1119 | time major `TensorArray` on which you must call `stack()`). 1120 | cell_input_fn: (optional) A `callable`. The default is: 1121 | `lambda inputs, attention: array_ops.concat([inputs, attention], -1)`. 1122 | output_attention: Python bool. If `True` (default), the output at each 1123 | time step is the attention value. This is the behavior of Luong-style 1124 | attention mechanisms. If `False`, the output at each time step is 1125 | the output of `cell`. This is the behavior of Bahdanau-style 1126 | attention mechanisms. In both cases, the `attention` tensor is 1127 | propagated to the next time step via the state and is used there. 1128 | This flag only controls whether the attention mechanism is propagated 1129 | up to the next cell in an RNN stack or to the top RNN output. 1130 | initial_cell_state: The initial state value to use for the cell when 1131 | the user calls `zero_state()`. Note that if this value is provided 1132 | now, and the user uses a `batch_size` argument of `zero_state` which 1133 | does not match the batch size of `initial_cell_state`, proper 1134 | behavior is not guaranteed. 1135 | name: Name to use when creating ops. 1136 | 1137 | Raises: 1138 | TypeError: `attention_layer_size` is not None and (`attention_mechanism` 1139 | is a list but `attention_layer_size` is not; or vice versa). 1140 | ValueError: if `attention_layer_size` is not None, `attention_mechanism` 1141 | is a list, and its length does not match that of `attention_layer_size`.
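      temperature: (fork-specific, not in stock TensorFlow) Scalar multiplier
        applied to the sampling noise when drawing the variational context
        vector in `_compute_attention`.
      use_hmean: (fork-specific) Python bool. If `True`, the prior for the
        context vector is N(h_mean_src, I), where h_mean_src is the
        mean-pooled encoder output, instead of N(0, I).

    A minimal usage sketch mirroring `ved_varAttn.py` (`dec_cell`, `attn_mech`,
    `attention_temp`, and `hidden_units` are placeholders assumed to be
    defined elsewhere):

    ```python
    attn_cell = AttentionWrapper(dec_cell,
                                 attn_mech,
                                 temperature=attention_temp,
                                 use_hmean=True,
                                 attention_layer_size=hidden_units)
    ```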
1142 | """ 1143 | 1144 | super(AttentionWrapper, self).__init__(name=name) 1145 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 1146 | raise TypeError( 1147 | "cell must be an RNNCell, saw type: %s" % type(cell).__name__) 1148 | if isinstance(attention_mechanism, (list, tuple)): 1149 | self._is_multi = True 1150 | attention_mechanisms = attention_mechanism 1151 | for attention_mechanism in attention_mechanisms: 1152 | if not isinstance(attention_mechanism, AttentionMechanism): 1153 | raise TypeError( 1154 | "attention_mechanism must contain only instances of " 1155 | "AttentionMechanism, saw type: %s" 1156 | % type(attention_mechanism).__name__) 1157 | else: 1158 | self._is_multi = False 1159 | if not isinstance(attention_mechanism, AttentionMechanism): 1160 | raise TypeError( 1161 | "attention_mechanism must be an AttentionMechanism or list of " 1162 | "multiple AttentionMechanism instances, saw type: %s" 1163 | % type(attention_mechanism).__name__) 1164 | attention_mechanisms = (attention_mechanism,) 1165 | 1166 | if cell_input_fn is None: 1167 | cell_input_fn = ( 1168 | lambda inputs, attention: array_ops.concat([inputs, attention], -1)) 1169 | else: 1170 | if not callable(cell_input_fn): 1171 | raise TypeError( 1172 | "cell_input_fn must be callable, saw type: %s" 1173 | % type(cell_input_fn).__name__) 1174 | 1175 | if attention_layer_size is not None: 1176 | attention_layer_sizes = tuple( 1177 | attention_layer_size 1178 | if isinstance(attention_layer_size, (list, tuple)) 1179 | else (attention_layer_size,)) 1180 | if len(attention_layer_sizes) != len(attention_mechanisms): 1181 | raise ValueError( 1182 | "If provided, attention_layer_size must contain exactly one " 1183 | "integer per attention_mechanism, saw: %d vs %d" 1184 | % (len(attention_layer_sizes), len(attention_mechanisms))) 1185 | self._attention_layers = tuple( 1186 | layers_core.Dense( 1187 | attention_layer_size, 1188 | name="attention_layer", 1189 | use_bias=False, 1190 | dtype=attention_mechanisms[i].dtype) 1191 | for i, attention_layer_size in enumerate(attention_layer_sizes)) 1192 | self._attention_layer_size = sum(attention_layer_sizes) 1193 | else: 1194 | self._attention_layers = None 1195 | self._attention_layer_size = sum( 1196 | attention_mechanism.values.get_shape()[-1].value 1197 | for attention_mechanism in attention_mechanisms) 1198 | 1199 | self._cell = cell 1200 | self._attention_mechanisms = attention_mechanisms 1201 | self._cell_input_fn = cell_input_fn 1202 | self._output_attention = output_attention 1203 | self._alignment_history = alignment_history 1204 | self._temperature = temperature 1205 | self._use_hmean = use_hmean # Use N(h_mean_src, I) as prior instead of N(0, I) 1206 | with ops.name_scope(name, "AttentionWrapperInit"): 1207 | if initial_cell_state is None: 1208 | self._initial_cell_state = None 1209 | else: 1210 | final_state_tensor = nest.flatten(initial_cell_state)[-1] 1211 | state_batch_size = ( 1212 | final_state_tensor.shape[0].value 1213 | or array_ops.shape(final_state_tensor)[0]) 1214 | error_message = ( 1215 | "When constructing AttentionWrapper %s: " % self._base_name + 1216 | "Non-matching batch sizes between the memory " 1217 | "(encoder output) and initial_cell_state. Are you using " 1218 | "the BeamSearchDecoder? 
You may need to tile your initial state " 1219 | "via the tf.contrib.seq2seq.tile_batch function with argument " 1220 | "multiplier=beam_width.") 1221 | with ops.control_dependencies( 1222 | self._batch_size_checks(state_batch_size, error_message)): 1223 | self._initial_cell_state = nest.map_structure( 1224 | lambda s: array_ops.identity(s, name="check_initial_cell_state"), 1225 | initial_cell_state) 1226 | 1227 | def _batch_size_checks(self, batch_size, error_message): 1228 | return [check_ops.assert_equal(batch_size, 1229 | attention_mechanism.batch_size, 1230 | message=error_message) 1231 | for attention_mechanism in self._attention_mechanisms] 1232 | 1233 | def _item_or_tuple(self, seq): 1234 | """Returns `seq` as tuple or the singular element. 1235 | 1236 | Which is returned is determined by how the AttentionMechanism(s) were passed 1237 | to the constructor. 1238 | 1239 | Args: 1240 | seq: A non-empty sequence of items or generator. 1241 | 1242 | Returns: 1243 | Either the values in the sequence as a tuple if AttentionMechanism(s) 1244 | were passed to the constructor as a sequence or the singular element. 1245 | """ 1246 | t = tuple(seq) 1247 | if self._is_multi: 1248 | return t 1249 | else: 1250 | return t[0] 1251 | 1252 | @property 1253 | def output_size(self): 1254 | if self._output_attention: 1255 | return self._attention_layer_size 1256 | else: 1257 | return self._cell.output_size 1258 | 1259 | @property 1260 | def state_size(self): 1261 | """The `state_size` property of `AttentionWrapper`. 1262 | 1263 | Returns: 1264 | An `AttentionWrapperState` tuple containing shapes used by this object. 1265 | """ 1266 | return AttentionWrapperState( 1267 | cell_state=self._cell.state_size, 1268 | time=tensor_shape.TensorShape([]), 1269 | attention=self._attention_layer_size, 1270 | alignments=self._item_or_tuple( 1271 | a.alignments_size for a in self._attention_mechanisms), 1272 | alignment_history=self._item_or_tuple( 1273 | () for _ in self._attention_mechanisms)) # sometimes a TensorArray 1274 | 1275 | def zero_state(self, batch_size, dtype): 1276 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 1277 | 1278 | **NOTE** Please see the initializer documentation for details of how 1279 | to call `zero_state` if using an `AttentionWrapper` with a 1280 | `BeamSearchDecoder`. 1281 | 1282 | Args: 1283 | batch_size: `0D` integer tensor: the batch size. 1284 | dtype: The internal state data type. 1285 | 1286 | Returns: 1287 | An `AttentionWrapperState` tuple containing zeroed out tensors and, 1288 | possibly, empty `TensorArray` objects. 1289 | 1290 | Raises: 1291 | ValueError: (or, possibly at runtime, InvalidArgument), if 1292 | `batch_size` does not match the output size of the encoder passed 1293 | to the wrapper object at initialization time. 1294 | """ 1295 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 1296 | if self._initial_cell_state is not None: 1297 | cell_state = self._initial_cell_state 1298 | else: 1299 | cell_state = self._cell.zero_state(batch_size, dtype) 1300 | error_message = ( 1301 | "When calling zero_state of AttentionWrapper %s: " % self._base_name + 1302 | "Non-matching batch sizes between the memory " 1303 | "(encoder output) and the requested batch size. Are you using " 1304 | "the BeamSearchDecoder? 
If so, make sure your encoder output has " 1305 | "been tiled to beam_width via tf.contrib.seq2seq.tile_batch, and " 1306 | "the batch_size= argument passed to zero_state is " 1307 | "batch_size * beam_width.") 1308 | with ops.control_dependencies( 1309 | self._batch_size_checks(batch_size, error_message)): 1310 | cell_state = nest.map_structure( 1311 | lambda s: array_ops.identity(s, name="checked_cell_state"), 1312 | cell_state) 1313 | return AttentionWrapperState( 1314 | cell_state=cell_state, 1315 | time=array_ops.zeros([], dtype=dtypes.int32), 1316 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 1317 | dtype), 1318 | alignments=self._item_or_tuple( 1319 | attention_mechanism.initial_alignments(batch_size, dtype) 1320 | for attention_mechanism in self._attention_mechanisms), 1321 | alignment_history=self._item_or_tuple( 1322 | tensor_array_ops.TensorArray(dtype=dtype, size=0, 1323 | dynamic_size=True) 1324 | if self._alignment_history else () 1325 | for _ in self._attention_mechanisms)) 1326 | 1327 | def call(self, inputs, state): 1328 | """Perform a step of attention-wrapped RNN. 1329 | 1330 | - Step 1: Mix the `inputs` and previous step's `attention` output via 1331 | `cell_input_fn`. 1332 | - Step 2: Call the wrapped `cell` with this input and its previous state. 1333 | - Step 3: Score the cell's output with `attention_mechanism`. 1334 | - Step 4: Calculate the alignments by passing the score through the 1335 | `normalizer`. 1336 | - Step 5: Calculate the context vector as the inner product between the 1337 | alignments and the attention_mechanism's values (memory). 1338 | - Step 6: Calculate the attention output by concatenating the cell output 1339 | and context through the attention layer (a linear layer with 1340 | `attention_layer_size` outputs). 1341 | 1342 | Args: 1343 | inputs: (Possibly nested tuple of) Tensor, the input at this time step. 1344 | state: An instance of `AttentionWrapperState` containing 1345 | tensors from the previous time step. 1346 | 1347 | Returns: 1348 | A tuple `(attention_or_cell_output, next_state)`, where: 1349 | 1350 | - `attention_or_cell_output` depending on `output_attention`. 1351 | - `next_state` is an instance of `AttentionWrapperState` 1352 | containing the state calculated at this time step. 1353 | 1354 | Raises: 1355 | TypeError: If `state` is not an instance of `AttentionWrapperState`. 1356 | """ 1357 | if not isinstance(state, AttentionWrapperState): 1358 | raise TypeError("Expected state to be instance of AttentionWrapperState. " 1359 | "Received type %s instead." % type(state)) 1360 | 1361 | # Step 1: Calculate the true inputs to the cell based on the 1362 | # previous attention value. 1363 | cell_inputs = self._cell_input_fn(inputs, state.attention) 1364 | cell_state = state.cell_state 1365 | cell_output, next_cell_state = self._cell(cell_inputs, cell_state) 1366 | 1367 | cell_batch_size = ( 1368 | cell_output.shape[0].value or array_ops.shape(cell_output)[0]) 1369 | error_message = ( 1370 | "When applying AttentionWrapper %s: " % self.name + 1371 | "Non-matching batch sizes between the memory " 1372 | "(encoder output) and the query (decoder output). Are you using " 1373 | "the BeamSearchDecoder? 
You may need to tile your memory input via " 1374 | "the tf.contrib.seq2seq.tile_batch function with argument " 1375 | "multiplier=beam_width.") 1376 | with ops.control_dependencies( 1377 | self._batch_size_checks(cell_batch_size, error_message)): 1378 | cell_output = array_ops.identity( 1379 | cell_output, name="checked_cell_output") 1380 | 1381 | if self._is_multi: 1382 | previous_alignments = state.alignments 1383 | previous_alignment_history = state.alignment_history 1384 | else: 1385 | previous_alignments = [state.alignments] 1386 | previous_alignment_history = [state.alignment_history] 1387 | 1388 | all_alignments = [] 1389 | all_attentions = [] 1390 | all_histories = [] 1391 | ## Obtain c_kl_loss (for the current timestep of decoder) 1392 | for i, attention_mechanism in enumerate(self._attention_mechanisms): 1393 | attention, alignments, c_kl_loss = _compute_attention( 1394 | attention_mechanism, cell_output, previous_alignments[i], 1395 | self._attention_layers[i] if self._attention_layers else None, self._temperature, self._use_hmean) 1396 | alignment_history = previous_alignment_history[i].write( 1397 | state.time, alignments) if self._alignment_history else () 1398 | 1399 | all_alignments.append(alignments) 1400 | all_histories.append(alignment_history) 1401 | all_attentions.append(attention) 1402 | 1403 | attention = array_ops.concat(all_attentions, 1) 1404 | next_state = AttentionWrapperState( 1405 | time=state.time + 1, 1406 | cell_state=next_cell_state, 1407 | attention=attention, 1408 | alignments=self._item_or_tuple(all_alignments), 1409 | alignment_history=self._item_or_tuple(all_histories)) 1410 | 1411 | if self._output_attention: 1412 | return attention, next_state, c_kl_loss 1413 | else: 1414 | return cell_output, next_state, c_kl_loss 1415 | 1416 | -------------------------------------------------------------------------------- /ved_varAttn/varAttention_decoder/basic_decoder.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import tensorflow as tf 4 | 5 | from . import decoder 6 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 7 | from tensorflow.python.framework import dtypes 8 | from tensorflow.python.framework import ops 9 | from tensorflow.python.framework import tensor_shape 10 | from tensorflow.python.layers import base as layers_base 11 | from tensorflow.python.ops import rnn_cell_impl 12 | from tensorflow.python.util import nest 13 | 14 | __all__ = [ 15 | "BasicDecoderOutput", 16 | "BasicDecoder", 17 | ] 18 | 19 | 20 | class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))): 21 | pass 22 | 23 | 24 | class BasicDecoder(decoder.Decoder): 25 | """Basic sampling decoder.""" 26 | 27 | def __init__(self, cell, helper, initial_state, latent_vector, output_layer=None): 28 | """Initialize BasicDecoder. 29 | Args: 30 | cell: An `RNNCell` instance. 31 | helper: A `Helper` instance. 32 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 33 | The initial state of the RNNCell. 34 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 35 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 36 | to storing the result or sampling. 37 | Raises: 38 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
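      latent_vector: (fork-specific) A `[batch_size, latent_dim]` tensor that
        is concatenated to the decoder input embedding at the first and every
        subsequent timestep (see `initialize` and `step`).

    A minimal construction sketch mirroring `ved_varAttn.py` (`attn_cell`,
    `training_helper`, `init_state`, `z_vector`, and `output_layer` are
    assumed to be built elsewhere):

    ```python
    training_decoder = BasicDecoder(attn_cell,
                                    training_helper,
                                    initial_state=init_state,
                                    latent_vector=z_vector,
                                    output_layer=output_layer)
    ```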
39 | """ 40 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 41 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 42 | if not isinstance(helper, helper_py.Helper): 43 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 44 | if (output_layer is not None and not isinstance(output_layer, layers_base.Layer)): 45 | raise TypeError("output_layer must be a Layer, received: %s" % type(output_layer)) 46 | self._cell = cell 47 | self._helper = helper 48 | self._initial_state = initial_state 49 | self._output_layer = output_layer 50 | self._latent_vector = latent_vector 51 | self._intermediate_context_kl_loss = tf.zeros(shape=(helper.batch_size,)) # shape of (batch_size,) 52 | # CHANGE-1: Variable to keep accumulating the c_kl_losses from each timestep 53 | 54 | @property 55 | def batch_size(self): 56 | return self._helper.batch_size 57 | 58 | def _rnn_output_size(self): 59 | size = self._cell.output_size 60 | if self._output_layer is None: 61 | return size 62 | else: 63 | # To use layer's compute_output_shape, we need to convert the 64 | # RNNCell's output_size entries into shapes with an unknown 65 | # batch size. We then pass this through the layer's 66 | # compute_output_shape and read off all but the first (batch) 67 | # dimensions to get the output size of the rnn with the layer 68 | # applied to the top. 69 | output_shape_with_unknown_batch = nest.map_structure( 70 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 71 | size) 72 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 73 | output_shape_with_unknown_batch) 74 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 75 | 76 | @property 77 | def output_size(self): 78 | # Return the cell output and the id 79 | return BasicDecoderOutput( 80 | rnn_output=self._rnn_output_size(), 81 | sample_id=tensor_shape.TensorShape([])) 82 | 83 | @property 84 | def output_dtype(self): 85 | # Assume the dtype of the cell is the output_size structure 86 | # containing the input_state's first component's dtype. 87 | # Return that structure and int32 (the id) 88 | dtype = nest.flatten(self._initial_state)[0].dtype 89 | return BasicDecoderOutput( 90 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 91 | dtypes.int32) 92 | 93 | def initialize(self, name=None): 94 | """Initialize the decoder. 95 | Args: 96 | name: Name scope for any created operations. 97 | Returns: 98 | `(finished, first_inputs, initial_state)`. 99 | """ 100 | # Concatenate the latent vector to the 1st input to the decoder LSTM, i.e., the embedding + latent vector 101 | finished, first_inputs = self._helper.initialize() # call the helper once; calling it twice creates duplicate ops 102 | return (finished, tf.concat([first_inputs, self._latent_vector], axis=-1), self._initial_state) 103 | 104 | def step(self, time, inputs, state, name=None): 105 | """Perform a decoding step. 106 | Args: 107 | time: scalar `int32` tensor. 108 | inputs: A (structure of) input tensors. 109 | state: A (structure of) state tensors and TensorArrays. 110 | name: Name scope for any created operations. 111 | Returns: 112 | `(outputs, next_state, next_inputs, finished, c_kl_loss)`.
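      The returned `c_kl_loss` is the running sum of per-timestep context KL
      losses accumulated in `_intermediate_context_kl_loss`.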
113 | """ 114 | with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)): 115 | cell_outputs, cell_state, c_kl_loss = self._cell(inputs, state) 116 | # Accumulate the context KL loss from token at the current decoder step 117 | self._intermediate_context_kl_loss += c_kl_loss 118 | c_kl_loss = self._intermediate_context_kl_loss 119 | 120 | if self._output_layer is not None: 121 | cell_outputs = self._output_layer(cell_outputs) 122 | sample_ids = self._helper.sample( 123 | time=time, outputs=cell_outputs, state=cell_state) 124 | (finished, next_inputs, next_state) = self._helper.next_inputs( 125 | time=time, 126 | outputs=cell_outputs, 127 | state=cell_state, 128 | sample_ids=sample_ids) 129 | 130 | # Concatenate the latent vector to the predicted word's embedding 131 | next_inputs = tf.concat([next_inputs, self._latent_vector], axis=-1) 132 | 133 | outputs = BasicDecoderOutput(cell_outputs, sample_ids) 134 | return (outputs, next_state, next_inputs, finished, c_kl_loss) -------------------------------------------------------------------------------- /ved_varAttn/varAttention_decoder/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Seq2seq layer operations for use in neural networks.""" 16 | 17 | import abc 18 | import six 19 | 20 | import tensorflow as tf 21 | from tensorflow.python.framework import constant_op 22 | from tensorflow.python.framework import dtypes 23 | from tensorflow.python.framework import ops 24 | from tensorflow.python.framework import tensor_shape 25 | from tensorflow.python.framework import tensor_util 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.ops import control_flow_ops 28 | from tensorflow.python.ops import math_ops 29 | from tensorflow.python.ops import rnn 30 | from tensorflow.python.ops import tensor_array_ops 31 | from tensorflow.python.ops import variable_scope 32 | from tensorflow.python.util import nest 33 | 34 | __all__ = ["Decoder", "dynamic_decode"] 35 | 36 | _transpose_batch_time = rnn._transpose_batch_time # pylint: disable=protected-access 37 | 38 | 39 | @six.add_metaclass(abc.ABCMeta) 40 | class Decoder(object): 41 | """An RNN Decoder abstract interface object. 42 | 43 | Concepts used by this interface: 44 | - `inputs`: (structure of) tensors and TensorArrays that is passed as input to 45 | the RNNCell composing the decoder, at each time step. 46 | - `state`: (structure of) tensors and TensorArrays that is passed to the 47 | RNNCell instance as the state. 48 | - `finished`: boolean tensor telling whether each sequence in the batch is 49 | finished. 50 | - `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at each 51 | time step. 
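  - `c_kl_loss` (fork-specific): a `[batch_size]` tensor of accumulated context
    KL loss, threaded through `dynamic_decode` and returned as its fourth
    output.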
52 | """ 53 | 54 | @property 55 | def batch_size(self): 56 | """The batch size of input values.""" 57 | raise NotImplementedError 58 | 59 | @property 60 | def output_size(self): 61 | """A (possibly nested tuple of...) integer[s] or `TensorShape` object[s].""" 62 | raise NotImplementedError 63 | 64 | @property 65 | def output_dtype(self): 66 | """A (possibly nested tuple of...) dtype[s].""" 67 | raise NotImplementedError 68 | 69 | @abc.abstractmethod 70 | def initialize(self, name=None): 71 | """Called before any decoding iterations. 72 | 73 | This method must compute initial input values and initial state. 74 | 75 | Args: 76 | name: Name scope for any created operations. 77 | 78 | Returns: 79 | `(finished, initial_inputs, initial_state)`: initial values of 80 | 'finished' flags, inputs and state. 81 | """ 82 | raise NotImplementedError 83 | 84 | @abc.abstractmethod 85 | def step(self, time, inputs, state, name=None): 86 | """Called per step of decoding (but only once for dynamic decoding). 87 | 88 | Args: 89 | time: Scalar `int32` tensor. Current step number. 90 | inputs: RNNCell input (possibly nested tuple of) tensor[s] for this time 91 | step. 92 | state: RNNCell state (possibly nested tuple of) tensor[s] from previous 93 | time step. 94 | name: Name scope for any created operations. 95 | 96 | Returns: 97 | `(outputs, next_state, next_inputs, finished, c_kl_loss)`: `outputs` is an object 98 | containing the decoder output, `next_state` is a (structure of) state 99 | tensors and TensorArrays, `next_inputs` is the tensor that should be used 100 | as input for the next step, `finished` is a boolean tensor telling whether 101 | the sequence is complete, for each sequence in the batch, and `c_kl_loss` is the accumulated context KL loss. 102 | """ 103 | raise NotImplementedError 104 | 105 | def finalize(self, outputs, final_state, sequence_lengths): 106 | raise NotImplementedError 107 | 108 | @property 109 | def tracks_own_finished(self): 110 | """Describes whether the Decoder keeps track of finished states. 111 | 112 | Most decoders will emit a true/false `finished` value independently 113 | at each time step. In this case, the `dynamic_decode` function keeps track 114 | of which batch entries are already finished, and performs a logical OR to 115 | insert new batches to the finished set. 116 | 117 | Some decoders, however, shuffle batches / beams between time steps and 118 | `dynamic_decode` will mix up the finished state across these entries because 119 | it does not track the reshuffle across time steps. In this case, it is 120 | up to the decoder to declare that it will keep track of its own finished 121 | state by setting this property to `True`. 122 | 123 | Returns: 124 | Python bool. 125 | """ 126 | return False 127 | 128 | 129 | def _create_zero_outputs(size, dtype, batch_size): 130 | """Create a zero outputs Tensor structure.""" 131 | 132 | def _t(s): 133 | return (s if isinstance(s, ops.Tensor) else constant_op.constant( 134 | tensor_shape.TensorShape(s).as_list(), 135 | dtype=dtypes.int32, 136 | name="zero_suffix_shape")) 137 | 138 | def _create(s, d): 139 | return array_ops.zeros( 140 | array_ops.concat( 141 | ([batch_size], _t(s)), axis=0), dtype=d) 142 | 143 | return nest.map_structure(_create, size, dtype) 144 | 145 | 146 | def dynamic_decode(decoder, 147 | output_time_major=False, 148 | impute_finished=False, 149 | maximum_iterations=None, 150 | parallel_iterations=32, 151 | swap_memory=False, 152 | scope=None): 153 | """Perform dynamic decoding with `decoder`.
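  (In this fork, an accumulated context KL loss is threaded through the decode
  loop as an extra `tf.while_loop` variable and returned as a fourth output,
  `final_context_kl_loss`.) A minimal call sketch mirroring `ved_varAttn.py`
  (`my_decoder` and `max_len` are placeholders):

  ```python
  outputs, state, lengths, c_kl = dynamic_decode(my_decoder,
                                                 output_time_major=False,
                                                 impute_finished=True,
                                                 maximum_iterations=max_len)
  ```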
154 | 155 | Calls initialize() once and step() repeatedly on the Decoder object. 156 | 157 | Args: 158 | decoder: A `Decoder` instance. 159 | output_time_major: Python boolean. Default: `False` (batch major). If 160 | `True`, outputs are returned as time major tensors (this mode is faster). 161 | Otherwise, outputs are returned as batch major tensors (this adds extra 162 | time to the computation). 163 | impute_finished: Python boolean. If `True`, then states for batch 164 | entries which are marked as finished get copied through and the 165 | corresponding outputs get zeroed out. This causes some slowdown at 166 | each time step, but ensures that the final state and outputs have 167 | the correct values and that backprop ignores time steps that were 168 | marked as finished. 169 | maximum_iterations: `int32` scalar, maximum allowed number of decoding 170 | steps. Default is `None` (decode until the decoder is fully done). 171 | parallel_iterations: Argument passed to `tf.while_loop`. 172 | swap_memory: Argument passed to `tf.while_loop`. 173 | scope: Optional variable scope to use. 174 | 175 | Returns: 176 | `(final_outputs, final_state, final_sequence_lengths, final_context_kl_loss)`. 177 | 178 | Raises: 179 | TypeError: if `decoder` is not an instance of `Decoder`. 180 | ValueError: if `maximum_iterations` is provided but is not a scalar. 181 | """ 182 | if not isinstance(decoder, Decoder): 183 | raise TypeError("Expected decoder to be type Decoder, but saw: %s" % 184 | type(decoder)) 185 | 186 | with variable_scope.variable_scope(scope, "decoder") as varscope: 187 | # Properly cache variable values inside the while_loop 188 | if varscope.caching_device is None: 189 | varscope.set_caching_device(lambda op: op.device) 190 | 191 | if maximum_iterations is not None: 192 | maximum_iterations = ops.convert_to_tensor( 193 | maximum_iterations, dtype=dtypes.int32, name="maximum_iterations") 194 | if maximum_iterations.get_shape().ndims != 0: 195 | raise ValueError("maximum_iterations must be a scalar") 196 | 197 | initial_finished, initial_inputs, initial_state = decoder.initialize() 198 | # Initial value of zero for c_kl_loss 199 | initial_context_kl_loss = tf.zeros(shape=(decoder.batch_size,), 200 | name="initial_context_kl_loss") 201 | 202 | zero_outputs = _create_zero_outputs(decoder.output_size, 203 | decoder.output_dtype, 204 | decoder.batch_size) 205 | 206 | if maximum_iterations is not None: 207 | initial_finished = math_ops.logical_or( 208 | initial_finished, 0 >= maximum_iterations) 209 | initial_sequence_lengths = array_ops.zeros_like( 210 | initial_finished, dtype=dtypes.int32) 211 | initial_time = constant_op.constant(0, dtype=dtypes.int32) 212 | 213 | def _shape(batch_size, from_shape): 214 | if not isinstance(from_shape, tensor_shape.TensorShape): 215 | return tensor_shape.TensorShape(None) 216 | else: 217 | batch_size = tensor_util.constant_value( 218 | ops.convert_to_tensor( 219 | batch_size, name="batch_size")) 220 | return tensor_shape.TensorShape([batch_size]).concatenate(from_shape) 221 | 222 | def _create_ta(s, d): 223 | return tensor_array_ops.TensorArray( 224 | dtype=d, 225 | size=0, 226 | dynamic_size=True, 227 | element_shape=_shape(decoder.batch_size, s)) 228 | 229 | initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size, 230 | decoder.output_dtype) 231 | 232 | def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs, 233 | finished, unused_sequence_lengths, unused_c_kl_loss): 234 | return math_ops.logical_not(math_ops.reduce_all(finished)) 235 | 236 | def body(time, outputs_ta, state, inputs, finished, sequence_lengths, c_kl_loss): 237 | """Internal while_loop body. 238 | 239 | Args: 240 | time: scalar int32 tensor. 241 | outputs_ta: structure of TensorArray. 242 | state: (structure of) state tensors and TensorArrays. 243 | inputs: (structure of) input tensors. 244 | finished: bool tensor (keeping track of what's finished). 245 | sequence_lengths: int32 tensor (keeping track of time of finish). 246 | c_kl_loss: float32 tensor, the context KL loss accumulated so far. 247 | 248 | Returns: 249 | `(time + 1, outputs_ta, next_state, next_inputs, next_finished, 250 | next_sequence_lengths, context_kl_loss)`. 251 | """ 252 | # Receive accumulated c_kl_loss and pass to next iteration 253 | (next_outputs, decoder_state, next_inputs, 254 | decoder_finished, context_kl_loss) = decoder.step(time, inputs, state) 255 | if decoder.tracks_own_finished: 256 | next_finished = decoder_finished 257 | else: 258 | next_finished = math_ops.logical_or(decoder_finished, finished) 259 | if maximum_iterations is not None: 260 | next_finished = math_ops.logical_or( 261 | next_finished, time + 1 >= maximum_iterations) 262 | next_sequence_lengths = array_ops.where( 263 | math_ops.logical_and(math_ops.logical_not(finished), next_finished), 264 | array_ops.fill(array_ops.shape(sequence_lengths), time + 1), 265 | sequence_lengths) 266 | 267 | nest.assert_same_structure(state, decoder_state) 268 | nest.assert_same_structure(outputs_ta, next_outputs) 269 | nest.assert_same_structure(inputs, next_inputs) 270 | 271 | # Zero out output values past finish 272 | if impute_finished: 273 | emit = nest.map_structure( 274 | lambda out, zero: array_ops.where(finished, zero, out), 275 | next_outputs, 276 | zero_outputs) 277 | else: 278 | emit = next_outputs 279 | 280 | # Copy through states past finish 281 | def _maybe_copy_state(new, cur): 282 | # TensorArrays and scalar states get passed through.
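      # (A TensorArray cannot be selected with array_ops.where, and scalar
      # state entries such as `time` have no per-batch dimension, so both are
      # returned unchanged; any other state keeps its previous value for
      # finished sequences.)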
283 | if isinstance(cur, tensor_array_ops.TensorArray): 284 | pass_through = True 285 | else: 286 | new.set_shape(cur.shape) 287 | pass_through = (new.shape.ndims == 0) 288 | return new if pass_through else array_ops.where(finished, cur, new) 289 | 290 | if impute_finished: 291 | next_state = nest.map_structure( 292 | _maybe_copy_state, decoder_state, state) 293 | else: 294 | next_state = decoder_state 295 | 296 | outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out), 297 | outputs_ta, emit) 298 | return (time + 1, outputs_ta, next_state, next_inputs, next_finished, 299 | next_sequence_lengths, context_kl_loss) 300 | 301 | res = control_flow_ops.while_loop( 302 | condition, 303 | body, 304 | loop_vars=[ 305 | initial_time, initial_outputs_ta, initial_state, initial_inputs, 306 | initial_finished, initial_sequence_lengths, initial_context_kl_loss, 307 | ], 308 | parallel_iterations=parallel_iterations, 309 | swap_memory=swap_memory) 310 | 311 | final_outputs_ta = res[1] 312 | final_state = res[2] 313 | final_sequence_lengths = res[5] 314 | final_context_kl_loss = res[6] 315 | 316 | final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta) 317 | 318 | try: 319 | final_outputs, final_state = decoder.finalize( 320 | final_outputs, final_state, final_sequence_lengths) 321 | except NotImplementedError: 322 | pass 323 | 324 | if not output_time_major: 325 | final_outputs = nest.map_structure(_transpose_batch_time, final_outputs) 326 | 327 | return final_outputs, final_state, final_sequence_lengths, final_context_kl_loss 328 | 329 | 330 | -------------------------------------------------------------------------------- /ved_varAttn/ved_varAttn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if '../' not in sys.path: sys.path.append('../') 4 | import time 5 | import pickle 6 | import tensorflow as tf 7 | import numpy as np 8 | from utils import data_utils 9 | from utils import eval_utils 10 | from tqdm import tqdm 11 | from nltk.tokenize import word_tokenize 12 | from tensorflow.python.layers.core import Dense 13 | from varAttention_decoder import basic_decoder 14 | from varAttention_decoder import decoder 15 | from varAttention_decoder import attention_wrapper 16 | 17 | 18 | class VarSeq2SeqVarAttnModel(object): 19 | 20 | def __init__(self, config, encoder_embeddings_matrix, decoder_embeddings_matrix, 21 | encoder_word_index, decoder_word_index): 22 | 23 | self.config = config 24 | 25 | self.lstm_hidden_units = config['lstm_hidden_units'] 26 | self.embedding_size = config['embedding_size'] 27 | self.latent_dim = config['latent_dim'] 28 | self.num_layers = config['num_layers'] 29 | 30 | self.encoder_vocab_size = config['encoder_vocab'] 31 | self.decoder_vocab_size = config['decoder_vocab'] 32 | 33 | self.encoder_num_tokens = config['encoder_num_tokens'] 34 | self.decoder_num_tokens = config['decoder_num_tokens'] 35 | 36 | self.dropout_keep_prob = config['dropout_keep_prob'] 37 | self.word_dropout_keep_probability = config['word_dropout_keep_probability'] 38 | self.z_temp = config['z_temp'] 39 | self.attention_temp = config['attention_temp'] 40 | self.use_hmean = config['use_hmean'] 41 | self.gamma_val = config['gamma_val'] 42 | 43 | self.initial_learning_rate = config['initial_learning_rate'] 44 | self.learning_rate_decay = config['learning_rate_decay'] 45 | self.min_learning_rate = config['min_learning_rate'] 46 | 47 | self.batch_size = config['batch_size'] 48 | self.epochs = config['n_epochs'] 49 | 50 | 
self.encoder_embeddings_matrix = encoder_embeddings_matrix 51 | self.decoder_embeddings_matrix = decoder_embeddings_matrix 52 | self.encoder_word_index = encoder_word_index 53 | self.decoder_word_index = decoder_word_index 54 | self.encoder_idx_word = dict((i, word) for word, i in encoder_word_index.items()) 55 | self.decoder_idx_word = dict((i, word) for word, i in decoder_word_index.items()) 56 | 57 | self.logs_dir = config['logs_dir'] 58 | self.model_checkpoint_dir = config['model_checkpoint_dir'] 59 | self.bleu_path = config['bleu_path'] 60 | 61 | self.pad = self.decoder_word_index['PAD'] 62 | self.eos = self.decoder_word_index['EOS'] 63 | 64 | self.epoch_bleu_score_val = {'1': [], '2': [], '3': [], '4': []} 65 | self.log_str = [] 66 | 67 | self.build_model() 68 | 69 | def build_model(self): 70 | print("[INFO] Building Model ...") 71 | 72 | self.init_placeholders() 73 | self.embedding_layer() 74 | self.build_encoder() 75 | self.build_latent_space() 76 | self.build_decoder() 77 | self.loss() 78 | self.optimize() 79 | self.summary() 80 | 81 | def init_placeholders(self): 82 | with tf.name_scope("model_inputs"): 83 | # Create placeholders for inputs to the model 84 | self.input_data = tf.placeholder(tf.int32, [self.batch_size, self.encoder_num_tokens], name='input') 85 | self.target_data = tf.placeholder(tf.int32, [self.batch_size, self.decoder_num_tokens], name='targets') 86 | self.lr = tf.placeholder(tf.float32, name='learning_rate', shape=()) 87 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') # Dropout Keep Probability 88 | self.source_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 89 | name='source_sentence_length') 90 | self.target_sentence_length = tf.placeholder(tf.int32, shape=(self.batch_size,), 91 | name='target_sentence_length') 92 | self.word_dropout_keep_prob = tf.placeholder(tf.float32, name='word_drop_keep_prob', shape=()) 93 | self.lambda_coeff = tf.placeholder(tf.float32, name='lambda_coeff', shape=()) 94 | self.gamma_coeff = tf.placeholder(tf.float32, name='gamma_coeff', shape=()) 95 | self.z_temperature = tf.placeholder(tf.float32, name='z_temperature', shape=()) 96 | self.attention_temperature = tf.placeholder(tf.float32, name='attention_temperature', shape=()) 97 | 98 | def embedding_layer(self): 99 | with tf.name_scope("word_embeddings"): 100 | self.encoder_embeddings = tf.Variable( 101 | initial_value=np.array(self.encoder_embeddings_matrix, dtype=np.float32), 102 | dtype=tf.float32, trainable=False) 103 | self.enc_embed_input = tf.nn.embedding_lookup(self.encoder_embeddings, self.input_data) 104 | # self.enc_embed_input = tf.nn.dropout(self.enc_embed_input, keep_prob=self.keep_prob) 105 | 106 | with tf.name_scope("decoder_inputs"): 107 | self.decoder_embeddings = tf.Variable( 108 | initial_value=np.array(self.decoder_embeddings_matrix, dtype=np.float32), 109 | dtype=tf.float32, trainable=False) 110 | 111 | keep = tf.where( 112 | tf.random_uniform([self.batch_size, self.decoder_num_tokens]) < self.word_dropout_keep_prob, 113 | tf.fill([self.batch_size, self.decoder_num_tokens], True), 114 | tf.fill([self.batch_size, self.decoder_num_tokens], False)) 115 | ending = tf.cast(keep, dtype=tf.int32) * self.target_data 116 | ending = tf.strided_slice(ending, [0, 0], [self.batch_size, -1], [1, 1], 117 | name='slice_input') # Minus 1 implies everything till the last dim 118 | self.dec_input = tf.concat([tf.fill([self.batch_size, 1], self.decoder_word_index['GO']), ending], 1, 119 | name='dec_input') 120 | self.dec_embed_input = 
tf.nn.embedding_lookup(self.decoder_embeddings, self.dec_input) 121 | # self.dec_embed_input = tf.nn.dropout(self.dec_embed_input, keep_prob=self.keep_prob) 122 | 123 | def build_encoder(self): 124 | with tf.name_scope("encode"): 125 | for layer in range(self.num_layers): 126 | with tf.variable_scope('encoder_{}'.format(layer + 1)): 127 | cell_fw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 128 | cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=self.keep_prob) 129 | 130 | cell_bw = tf.contrib.rnn.LayerNormBasicLSTMCell(self.lstm_hidden_units) 131 | cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=self.keep_prob) 132 | 133 | self.enc_output, self.enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 134 | cell_bw, 135 | self.enc_embed_input, 136 | self.source_sentence_length, 137 | dtype=tf.float32) 138 | 139 | # Join outputs since we are using a bidirectional RNN 140 | self.h_N = tf.concat([self.enc_state[0][1], self.enc_state[1][1]], axis=-1, 141 | name='h_N') # Concatenated h from the fw and bw LSTMs 142 | self.enc_outputs = tf.concat([self.enc_output[0], self.enc_output[1]], axis=-1, name='encoder_outputs') 143 | 144 | def build_latent_space(self): 145 | with tf.name_scope("latent_space"): 146 | self.z_mean = Dense(self.latent_dim, name='z_mean')(self.h_N) 147 | self.z_log_sigma = Dense(self.latent_dim, name='z_log_sigma')(self.h_N) 148 | 149 | self.z_vector = tf.identity(self.sample_gaussian(), name='z_vector') 150 | 151 | def sample_gaussian(self): 152 | """(Differentiably!) draw sample from Gaussian with given shape, subject to random noise epsilon""" 153 | with tf.name_scope('sample_gaussian'): 154 | # reparameterization trick 155 | epsilon = tf.random_normal(tf.shape(self.z_log_sigma), name='epsilon') 156 | return self.z_mean + tf.scalar_mul(self.z_temperature, 157 | epsilon * tf.exp(self.z_log_sigma)) # N(mu, I * sigma**2) 158 | 159 | def calculate_kl_loss(self): 160 | """(Gaussian) Kullback-Leibler divergence KL(q||p), per training example""" 161 | # (tf.Tensor, tf.Tensor) -> tf.Tensor 162 | with tf.name_scope("KL_divergence"): 163 | # = -0.5 * (1 + log(sigma**2) - mu**2 - sigma**2) 164 | return -0.5 * tf.reduce_sum(1.0 + 2 * self.z_log_sigma - self.z_mean ** 2 - 165 | tf.exp(2 * self.z_log_sigma), 1) 166 | 167 | def build_decoder(self): 168 | with tf.variable_scope("decode"): 169 | for layer in range(self.num_layers): 170 | with tf.variable_scope('decoder_{}'.format(layer + 1)): 171 | dec_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2 * self.lstm_hidden_units) 172 | dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, input_keep_prob=self.keep_prob) 173 | 174 | self.output_layer = Dense(self.decoder_vocab_size) 175 | 176 | attn_mech = attention_wrapper.LuongAttention(2 * self.lstm_hidden_units, 177 | self.enc_outputs, 178 | memory_sequence_length=self.source_sentence_length) 179 | 180 | attn_cell = attention_wrapper.AttentionWrapper(dec_cell, attn_mech, self.attention_temperature, self.use_hmean, self.lstm_hidden_units) 181 | 182 | self.init_state = attn_cell.zero_state(self.batch_size, tf.float32) 183 | 184 | with tf.name_scope("training_decoder"): 185 | training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.dec_embed_input, 186 | sequence_length=self.target_sentence_length, 187 | time_major=False) 188 | 189 | training_decoder = basic_decoder.BasicDecoder(attn_cell, 190 | training_helper, 191 | initial_state=self.init_state, 192 | latent_vector=self.z_vector, 193 | output_layer=self.output_layer) 194 | 195 | 

    def build_decoder(self):
        # attention_wrapper, basic_decoder and decoder used below are the project's
        # customised seq2seq modules (the varAttention_decoder/ package) and need to
        # be imported at the top of this file, e.g.:
        #     from varAttention_decoder import attention_wrapper, basic_decoder, decoder
        with tf.variable_scope("decode"):
            for layer in range(self.num_layers):
                with tf.variable_scope('decoder_{}'.format(layer + 1)):
                    dec_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(2 * self.lstm_hidden_units)
                    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, input_keep_prob=self.keep_prob)

            self.output_layer = Dense(self.decoder_vocab_size)

            attn_mech = attention_wrapper.LuongAttention(2 * self.lstm_hidden_units,
                                                         self.enc_outputs,
                                                         memory_sequence_length=self.source_sentence_length)

            attn_cell = attention_wrapper.AttentionWrapper(dec_cell,
                                                           attn_mech,
                                                           self.attention_temperature,
                                                           self.use_hmean,
                                                           self.lstm_hidden_units)

            self.init_state = attn_cell.zero_state(self.batch_size, tf.float32)

            with tf.name_scope("training_decoder"):
                training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.dec_embed_input,
                                                                    sequence_length=self.target_sentence_length,
                                                                    time_major=False)

                training_decoder = basic_decoder.BasicDecoder(attn_cell,
                                                              training_helper,
                                                              initial_state=self.init_state,
                                                              latent_vector=self.z_vector,
                                                              output_layer=self.output_layer)

                self.training_logits, _state, _len, self.c_kl_batch_train = decoder.dynamic_decode(
                    training_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.decoder_num_tokens)

                self.training_logits = tf.identity(self.training_logits.rnn_output, 'logits')

            with tf.name_scope("inference_decoder"):
                start_token = self.decoder_word_index['GO']
                end_token = self.decoder_word_index['EOS']

                start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [self.batch_size],
                                       name='start_tokens')

                inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.decoder_embeddings,
                                                                            start_tokens,
                                                                            end_token)

                inference_decoder = basic_decoder.BasicDecoder(attn_cell,
                                                               inference_helper,
                                                               initial_state=self.init_state,
                                                               latent_vector=self.z_vector,
                                                               output_layer=self.output_layer)

                self.inference_logits, _state, _len, self.c_kl_batch_inf = decoder.dynamic_decode(
                    inference_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.decoder_num_tokens)

                self.inference_logits = tf.identity(self.inference_logits.sample_id, name='predictions')

            self.c_kl_batch_train = tf.div(self.c_kl_batch_train,
                                           tf.cast(self.target_sentence_length,
                                                   dtype=tf.float32))  # Divide by respective target seq lengths

    def loss(self):
        with tf.name_scope('losses'):
            self.kl_loss = self.calculate_kl_loss()
            self.kl_loss = tf.scalar_mul(self.lambda_coeff, self.kl_loss)

            self.context_kl_loss = tf.scalar_mul(self.gamma_coeff * self.lambda_coeff, self.c_kl_batch_train)

            # Create the weights for sequence_loss
            masks = tf.sequence_mask(self.target_sentence_length, self.decoder_num_tokens,
                                     dtype=tf.float32, name='masks')

            self.xent_loss = tf.contrib.seq2seq.sequence_loss(
                self.training_logits,
                self.target_data,
                weights=masks,
                average_across_batch=False)

            # L2 regularization on all trainable weights (biases excluded)
            self.var_list = tf.trainable_variables()
            self.lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in self.var_list if 'bias' not in v.name]) * 0.001

            self.cost = tf.reduce_sum(self.xent_loss + self.kl_loss + self.context_kl_loss) + self.lossL2

    def optimize(self):
        # Optimizer
        with tf.name_scope('optimization'):
            optimizer = tf.train.AdamOptimizer(self.lr)

            # Gradient clipping
            gradients = optimizer.compute_gradients(self.cost, var_list=self.var_list)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gradients)

    def summary(self):
        with tf.name_scope('summaries'):
            tf.summary.scalar('xent_loss', tf.reduce_sum(self.xent_loss))
            tf.summary.scalar('l2_loss', tf.reduce_sum(self.lossL2))
            tf.summary.scalar("kl_loss", tf.reduce_sum(self.kl_loss))
            tf.summary.scalar("context_kl_loss", tf.reduce_sum(self.context_kl_loss))
            tf.summary.scalar('total_loss', tf.reduce_sum(self.cost))
            tf.summary.histogram("latent_vector", self.z_vector)
            tf.summary.histogram("latent_mean", self.z_mean)
            tf.summary.histogram("latent_log_sigma", self.z_log_sigma)
            self.summary_op = tf.summary.merge_all()
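The `masks` tensor built in `loss()` zeroes out the cross-entropy contribution of padded decoder positions. A tiny standalone demonstration (with illustrative lengths) of what `tf.sequence_mask` produces:

```
import tensorflow as tf

# Two target sequences with true lengths 3 and 5, padded to 6 decoder steps
lengths = tf.constant([3, 5])
masks = tf.sequence_mask(lengths, 6, dtype=tf.float32)

with tf.Session() as sess:
    print(sess.run(masks))
    # [[1. 1. 1. 0. 0. 0.]
    #  [1. 1. 1. 1. 1. 0.]]
```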

    def train(self, x_train, y_train, x_val, y_val, true_val):

        print('[INFO] Training process started')

        learning_rate = self.initial_learning_rate
        iter_i = 0
        lambda_val = 0.0  # KL weight, annealed upwards during training

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            writer = tf.summary.FileWriter(self.logs_dir, sess.graph)
            saver = tf.train.Saver()  # Created once, outside the epoch loop

            for epoch_i in range(1, self.epochs + 1):

                start_time = time.time()
                for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                        data_utils.get_batches(x_train, y_train, self.batch_size)):

                    try:
                        iter_i += 1

                        _, _summary = sess.run(
                            [self.train_op, self.summary_op],
                            feed_dict={self.input_data: input_batch,
                                       self.target_data: output_batch,
                                       self.lr: learning_rate,
                                       self.source_sentence_length: source_sent_lengths,
                                       self.target_sentence_length: tar_sent_lengths,
                                       self.keep_prob: self.dropout_keep_prob,
                                       self.lambda_coeff: lambda_val,
                                       self.z_temperature: self.z_temp,
                                       self.word_dropout_keep_prob: self.word_dropout_keep_probability,
                                       self.attention_temperature: self.attention_temp,
                                       self.gamma_coeff: self.gamma_val
                                       })

                        writer.add_summary(_summary, iter_i)

                        # KL annealing: ramp lambda up on a tanh schedule until iteration 3000
                        if iter_i <= 3000:
                            lambda_val = np.round((np.tanh((iter_i - 4500) / 1000) + 1) / 2, decimals=6)

                    except Exception as e:
                        print('[WARN] Skipping batch at iteration {}: {}'.format(iter_i, e))

                self.validate(sess, x_val, y_val, true_val)
                val_bleu_str = str(self.epoch_bleu_score_val['1'][epoch_i - 1]) + ' | ' \
                               + str(self.epoch_bleu_score_val['2'][epoch_i - 1]) + ' | ' \
                               + str(self.epoch_bleu_score_val['3'][epoch_i - 1]) + ' | ' \
                               + str(self.epoch_bleu_score_val['4'][epoch_i - 1])

                # Decay the learning rate, but never below its minimum value
                learning_rate = np.max([self.min_learning_rate, learning_rate * self.learning_rate_decay])

                saver.save(sess, self.model_checkpoint_dir + str(epoch_i) + ".ckpt")
                end_time = time.time()

                # Save the validation BLEU scores so far
                with open(self.bleu_path + '.pkl', 'wb') as f:
                    pickle.dump(self.epoch_bleu_score_val, f)

                self.log_str.append('Epoch {:>3}/{} - Time {:>6.1f} BLEU: {}'.format(epoch_i,
                                                                                     self.epochs,
                                                                                     end_time - start_time,
                                                                                     val_bleu_str))
                with open('logs.txt', 'w') as f:
                    f.write('\n'.join(self.log_str))
                print(self.log_str[-1])
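The annealing branch above follows a tanh schedule: the KL weight stays near zero for the first several hundred iterations, rises smoothly, and is frozen once iteration 3000 is reached. A quick standalone check of the values it produces:

```
import numpy as np

for iter_i in [1, 1000, 2000, 3000]:
    lambda_val = np.round((np.tanh((iter_i - 4500) / 1000) + 1) / 2, decimals=6)
    print(iter_i, lambda_val)

# 1    0.000124
# 1000 0.000911
# 2000 0.006693
# 3000 0.047426  <- the weight keeps this value for the rest of training
```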

    def validate(self, sess, x_val, y_val, true_val):
        # Calculate BLEU on validation data
        hypotheses_val = []
        references_val = []
        symbol = []
        if self.config['experiment'] == 'qgen':
            symbol.append('?')  # Generated questions are terminated with a question mark

        for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                data_utils.get_batches(x_val, y_val, self.batch_size)):
            answer_logits = sess.run(self.inference_logits,
                                     feed_dict={self.input_data: input_batch,
                                                self.source_sentence_length: source_sent_lengths,
                                                self.keep_prob: 1.0,
                                                self.word_dropout_keep_prob: 1.0,
                                                self.z_temperature: self.z_temp,
                                                self.attention_temperature: self.attention_temp})

            for k, pred in enumerate(answer_logits):
                hypotheses_val.append(
                    word_tokenize(
                        " ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
                references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

        bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
        self.epoch_bleu_score_val['1'].append(bleu_scores[0])
        self.epoch_bleu_score_val['2'].append(bleu_scores[1])
        self.epoch_bleu_score_val['3'].append(bleu_scores[2])
        self.epoch_bleu_score_val['4'].append(bleu_scores[3])

    def predict(self, checkpoint, x_test, y_test, true_test):
        pred_logits = []
        hypotheses_test = []
        references_test = []
        symbol = []
        if self.config['experiment'] == 'qgen':
            symbol.append('?')

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint)

            for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                    data_utils.get_batches(x_test, y_test, self.batch_size)):
                result = sess.run(self.inference_logits, feed_dict={self.input_data: input_batch,
                                                                    self.source_sentence_length: source_sent_lengths,
                                                                    self.keep_prob: 1.0,
                                                                    self.word_dropout_keep_prob: 1.0,
                                                                    self.z_temperature: self.z_temp,
                                                                    self.attention_temperature: self.attention_temp})

                pred_logits.extend(result)

                for k, pred in enumerate(result):
                    hypotheses_test.append(
                        word_tokenize(" ".join(
                            [self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
                    references_test.append([word_tokenize(true_test[batch_i * self.batch_size + k])])

            bleu_scores = eval_utils.calculate_bleu_scores(references_test, hypotheses_test)

            print('BLEU 1 to 4 : {}'.format(' | '.join(map(str, bleu_scores))))

        return pred_logits

    def show_output_sentences(self, preds, y_test, input_test, true_test):
        symbol = []
        if self.config['experiment'] == 'qgen':
            symbol.append('?')
        for k, (pred, actual) in enumerate(zip(preds, y_test)):
            print('Input: {}'.format(input_test[k].strip()))
            print('Actual: {}'.format(true_test[k].strip()))
            print('Generated: {}\n'.format(
                " ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, self.eos]] + symbol)))
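`eval_utils.calculate_bleu_scores` is expected to return corpus-level BLEU-1 through BLEU-4, the four values appended above. A hypothetical standalone equivalent built on NLTK (already in the requirements); the exact smoothing and rounding inside `utils/eval_utils.py` may differ:

```
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu_scores(references, hypotheses):
    """references: one list of tokenized reference(s) per hypothesis;
    hypotheses: list of tokenized predictions. Returns [BLEU-1, ..., BLEU-4] in %."""
    scores = []
    for n in range(1, 5):
        # Cumulative n-gram weights, e.g. (0.5, 0.5, 0, 0) for BLEU-2
        weights = tuple(1.0 / n for _ in range(n)) + tuple(0.0 for _ in range(4 - n))
        scores.append(np.round(100 * corpus_bleu(references, hypotheses, weights=weights), 2))
    return scores

references = [[['what', 'is', 'the', 'capital', 'of', 'france', '?']]]
hypotheses = [['what', 'is', 'the', 'capital', '?']]
print(calculate_bleu_scores(references, hypotheses))  # approx. [67.03, 58.05, 53.21, 47.4]
```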

    def get_diversity_metrics(self, checkpoint, x_test, y_test, num_samples=10, num_iterations=3):
        # Decode num_samples outputs for every test example; diversity is then
        # scored within each group of num_samples generations.
        # Assumes `from tqdm import tqdm` at the top of this file.
        x_test_repeated = np.repeat(x_test, num_samples, axis=0)
        y_test_repeated = np.repeat(y_test, num_samples, axis=0)

        entropy_list = []
        uni_diversity = []
        bi_diversity = []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint)

            for _ in tqdm(range(num_iterations)):
                total_ent = 0
                uni = 0
                bi = 0
                answer_logits = []
                pred_sentences = []

                for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                        data_utils.get_batches(x_test_repeated, y_test_repeated, self.batch_size)):
                    result = sess.run(self.inference_logits, feed_dict={self.input_data: input_batch,
                                                                        self.source_sentence_length: source_sent_lengths,
                                                                        self.keep_prob: 1.0,
                                                                        self.word_dropout_keep_prob: 1.0,
                                                                        self.z_temperature: self.z_temp,
                                                                        self.attention_temperature: self.attention_temp})
                    answer_logits.extend(result)

                for idx, (actual, pred) in enumerate(zip(y_test_repeated, answer_logits)):
                    pred_sentences.append(" ".join([self.decoder_idx_word[i] for i in pred if i != self.pad][:-1]))

                    if (idx + 1) % num_samples == 0:
                        word_list = [word_tokenize(p) for p in pred_sentences]
                        corpus = [item for sublist in word_list for item in sublist]
                        total_ent += eval_utils.calculate_entropy(corpus)
                        diversity_result = eval_utils.calculate_ngram_diversity(corpus)
                        uni += diversity_result[0]
                        bi += diversity_result[1]

                        pred_sentences = []

                entropy_list.append(total_ent / len(x_test))
                uni_diversity.append(uni / len(x_test))
                bi_diversity.append(bi / len(x_test))

        print('Entropy = {:>.3f} | Distinct-1 = {:>.3f} | Distinct-2 = {:>.3f}'.format(np.mean(entropy_list),
                                                                                       np.mean(uni_diversity),
                                                                                       np.mean(bi_diversity)))
--------------------------------------------------------------------------------
/w2v_generator.py:
--------------------------------------------------------------------------------
import os
import argparse
import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize


W2V_DIR = 'w2v_models/'
DATA_DIR = 'data/'

parser = argparse.ArgumentParser(description='Create word2vec embeddings for the specified dataset')
parser.add_argument('-d', '--dataset', help='Specify dataset: either qgen or dialogue', required=True)
args = vars(parser.parse_args())


def main():
    if not os.path.exists(W2V_DIR):
        os.mkdir(W2V_DIR)

    all_files = os.listdir(DATA_DIR)

    if args['dataset'] == 'dialogue':
        files = [f for f in all_files if 'dialogue' in f]
    elif args['dataset'] == 'qgen':
        files = [f for f in all_files if 'qgen' in f]
    else:
        print('Invalid argument! Expected --dataset qgen or --dataset dialogue')
        return

    df_list = pd.concat(load_data(files))
    df_list.reset_index(inplace=True, drop=True)
    # Element-wise concatenation of the source (1st) and target (2nd) text columns
    data = list(df_list.iloc[:, 0] + df_list.iloc[:, 1])
    create_w2v(data)
    print('Word2Vec created successfully for {}'.format(args['dataset']))


def load_data(files):
    df_list = []
    for f in files:
        df_list.append(pd.read_csv(DATA_DIR + f))

    return df_list


def create_w2v(sentences):
    np.random.shuffle(sentences)
    sentences = [word_tokenize(s) for s in sentences]
    w2v_model = gensim.models.Word2Vec(sentences,
                                       size=300,
                                       min_count=1,
                                       iter=50)
    w2v_model.save(W2V_DIR + 'w2vmodel_' + args['dataset'] + '.pkl')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/w2v_models/README.md:
--------------------------------------------------------------------------------
Word2Vec models will be created in this directory.
--------------------------------------------------------------------------------
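Once `w2v_generator.py` has been run, the saved model feeds the frozen embedding layers of the three models. Below is a hypothetical sketch, using the gensim 3.x API, of loading it and building an `encoder_embeddings_matrix`-style array; the PAD-at-index-0 convention and the unknown-word handling are illustrative assumptions, not the exact logic of `utils/data_utils.py`:

```
import numpy as np
import gensim

w2v = gensim.models.Word2Vec.load('w2v_models/w2vmodel_qgen.pkl')

def build_embedding_matrix(word_index, embedding_size=300):
    # Assumes row 0 is reserved for PAD; words missing from the
    # word2vec vocabulary keep small random vectors
    matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_size))
    matrix[0] = np.zeros(embedding_size)
    for word, idx in word_index.items():
        if word in w2v.wv:
            matrix[idx] = w2v.wv[word]
    return matrix

# word_index would normally come from the tokenizer built in utils/data_utils.py
example_word_index = {'what': 1, 'is': 2, 'squad': 3}
print(build_embedding_matrix(example_word_index).shape)  # (4, 300)
```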