├── LICENSE ├── LICENSE.md ├── README.md ├── V1 ├── __init__.py ├── data_preprocess.py ├── main_gen.py ├── model.py ├── modules.py ├── train_test_eval.py └── utils.py ├── V2 ├── __init__.py ├── data_preprocess.py ├── main_gen.py ├── model.py ├── modules.py ├── train_test_eval.py └── utils.py └── _config.yml /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Copyright (C) 2019 Kevin Sylla 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pointer_Generator_Summarizer 2 | 3 | 4 | The pointer generator is a deep neural network built for abstractive summarizations. 
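At each decoding step, the model mixes the decoder's vocabulary distribution with the attention (copy) distribution over the source tokens, weighted by a generation probability p_gen, and it scores an extended vocabulary that also contains the in-article OOV words. The snippet below is a minimal NumPy sketch of that mixing for a single example and a single step; the names and shapes are illustrative only (the TensorFlow counterpart in this repo is `_calc_final_dist` in `utils.py`, and the mechanism follows the paper linked below).

```python
import numpy as np

def final_distribution(p_gen, vocab_dist, attn_dist, enc_extended_ids, num_oovs):
    """Illustrative sketch of the pointer-generator output mix for one decoder step.

    p_gen            : scalar in [0, 1], probability of generating from the vocabulary
    vocab_dist       : [vocab_size] softmax over the fixed vocabulary
    attn_dist        : [enc_len] attention weights over the source tokens
    enc_extended_ids : [enc_len] source ids; in-article OOVs get ids >= vocab_size
    num_oovs         : number of in-article OOV words for this example
    """
    final = np.zeros(len(vocab_dist) + num_oovs)
    final[:len(vocab_dist)] = p_gen * vocab_dist                    # generation part
    np.add.at(final, enc_extended_ids, (1.0 - p_gen) * attn_dist)   # copy part
    return final                                                    # sums to 1

# Toy example: vocabulary of 5 words, a 4-token article whose last token is OOV (extended id 5).
vocab_dist = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
attn_dist = np.array([0.5, 0.2, 0.2, 0.1])
print(final_distribution(0.7, vocab_dist, attn_dist, np.array([2, 0, 3, 5]), num_oovs=1))
```

Words that appear only in the source article still receive probability mass (here the OOV token at extended id 5), which is what lets the model copy rare names and numbers into the summary.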
5 | For more information on this model, you can check out the original paper here: https://arxiv.org/pdf/1704.04368 6 | You can also read this blog post by the author: http://www.abigailsee.com/2017/04/16/taming-rnns-for-better-summarization.html 7 | 8 | My collaborator Stephane Belemkoabga (https://github.com/steph1793) and I re-implemented this model in TensorFlow for our research project. This neural net will be our baseline model. 9 | We will run experiments with this model and propose a new architecture based on it. 10 | 11 | In this project, you can: 12 | - train models 13 | - test * 14 | - evaluate * 15 | 16 | * : the main methods for testing and evaluation are not done yet; we will release them very soon. 17 | 18 | This project reads .bin format files. For our experiments, we work with the CNN and DailyMail datasets. 19 | You can download the preprocessed files here: 20 | https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail 21 | 22 | Or do the preprocessing yourself with this repository: 23 | https://github.com/abisee/cnn-dailymail 24 | -------------------------------------------------------------------------------- /V1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIKevin/Pointer_Generator_Summarizer/bd8f89b1c0a3c65adef3b12c2f7d06c3e4688638/V1/__init__.py -------------------------------------------------------------------------------- /V1/data_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Data_preprocess.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | 7 | # DATA 8 | """ 9 | 10 | import numpy as np 11 | import glob 12 | import random 13 | import struct 14 | import csv 15 | from tensorflow.core.example import example_pb2 16 | import tensorflow as tf 17 | 18 | from threading import Thread 19 | from queue import Queue 20 | import time 21 | import threading 22 | 23 | """## Vocabulary""" 24 | 25 | SENTENCE_START = '<s>' 26 | SENTENCE_END = '</s>' 27 | 28 | PAD_TOKEN = '[PAD]' 29 | UNKNOWN_TOKEN = '[UNK]' 30 | START_DECODING = '[START]' 31 | STOP_DECODING = '[STOP]' 32 | 33 | class Vocab: 34 | 35 | def __init__(self, vocab_file, max_size): 36 | 37 | self.word2id = {UNKNOWN_TOKEN : 0, PAD_TOKEN : 1, START_DECODING : 2, STOP_DECODING : 3} 38 | self.id2word = {0 : UNKNOWN_TOKEN, 1 : PAD_TOKEN, 2 : START_DECODING, 3 : STOP_DECODING} 39 | self.count = 4 40 | 41 | with open(vocab_file, 'r') as f: 42 | for line in f: 43 | pieces = line.split() 44 | if len(pieces) != 2 : 45 | print('Warning : incorrectly formatted line in vocabulary file : %s\n' % line) 46 | continue 47 | 48 | w = pieces[0] 49 | if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]: 50 | raise Exception('<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w) 51 | 52 | if w in self.word2id: 53 | raise Exception('Duplicated word in vocabulary file: %s' % w) 54 | 55 | self.word2id[w] = self.count 56 | self.id2word[self.count] = w 57 | self.count += 1 58 | if max_size != 0 and self.count >= max_size: 59 | print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self.count)) 60 | break 61 | 62 | print("Finished constructing vocabulary of %i total words.
Last word added: %s" % (self.count, self.id2word[self.count-1])) 63 | 64 | 65 | def word_to_id(self, word): 66 | if word not in self.word2id: 67 | return self.word2id[UNKNOWN_TOKEN] 68 | return self.word2id[word] 69 | 70 | def id_to_word(self, word_id): 71 | if word_id not in self.id2word: 72 | raise ValueError('Id not found in vocab: %d' % word_id) 73 | return self.id2word[word_id] 74 | 75 | def size(self): 76 | return self.count 77 | 78 | 79 | 80 | """## Data helpers""" 81 | 82 | def article_to_ids(article_words, vocab): 83 | ids = [] 84 | oovs = [] 85 | unk_id = vocab.word_to_id(UNKNOWN_TOKEN) 86 | for w in article_words: 87 | i = vocab.word_to_id(w) 88 | if i == unk_id: # If w is OOV 89 | if w not in oovs: # Add to list of OOVs 90 | oovs.append(w) 91 | oov_num = oovs.index(w) # This is 0 for the first article OOV, 1 for the second article OOV... 92 | ids.append(vocab.size() + oov_num) # This is e.g. 50000 for the first article OOV, 50001 for the second... 93 | else: 94 | ids.append(i) 95 | return ids, oovs 96 | 97 | 98 | def abstract_to_ids(abstract_words, vocab, article_oovs): 99 | ids = [] 100 | unk_id = vocab.word_to_id(UNKNOWN_TOKEN) 101 | for w in abstract_words: 102 | i = vocab.word_to_id(w) 103 | if i == unk_id: # If w is an OOV word 104 | if w in article_oovs: # If w is an in-article OOV 105 | vocab_idx = vocab.size() + article_oovs.index(w) # Map to its temporary article OOV number 106 | ids.append(vocab_idx) 107 | else: # If w is an out-of-article OOV 108 | ids.append(unk_id) # Map to the UNK token id 109 | else: 110 | ids.append(i) 111 | return ids 112 | 113 | 114 | 115 | def output_to_words(id_list, vocab, article_oovs): 116 | words = [] 117 | for i in id_list: 118 | try: 119 | w = vocab.id_to_word(i) # might be [UNK] 120 | except ValueError as e: # w is OOV 121 | assert article_oovs is not None, "Error: model produced a word ID that isn't in the vocabulary. This should not happen in baseline (no pointer-generator) mode" 122 | article_oov_idx = i - vocab.size() 123 | try: 124 | w = article_oovs[article_oov_idx] 125 | except ValueError as e: # i doesn't correspond to an article oov 126 | raise ValueError('Error: model produced word ID %i which corresponds to article OOV %i but this example only has %i article OOVs' % (i, article_oov_idx, len(article_oovs))) 127 | words.append(w) 128 | return words 129 | 130 | 131 | 132 | def abstract_to_sents(abstract): 133 | """Splits abstract text from datafile into list of sentences. 
134 | Args: 135 | abstract: string containing and tags for starts and ends of sentences 136 | Returns: 137 | sents: List of sentence strings (no tags)""" 138 | cur = 0 139 | sents = [] 140 | while True: 141 | try: 142 | start_p = abstract.index(SENTENCE_START, cur) 143 | end_p = abstract.index(SENTENCE_END, start_p + 1) 144 | cur = end_p + len(SENTENCE_END) 145 | sents.append(abstract[start_p+len(SENTENCE_START):end_p]) 146 | except ValueError as e: # no more sentences 147 | return sents 148 | 149 | 150 | 151 | def example_generator(data_path, hpm): 152 | while True: 153 | filelist = glob.glob(data_path) # get the list of datafiles 154 | assert filelist, ('Error: Empty filelist at %s' % data_path) # check filelist isn't empty 155 | if hpm['singlepass']: 156 | filelist = sorted(filelist) 157 | else: 158 | random.shuffle(filelist) 159 | for f in filelist: 160 | reader = open(f, 'rb') 161 | while True: 162 | len_bytes = reader.read(8) 163 | if not len_bytes: break # finished reading this file 164 | str_len = struct.unpack('q', len_bytes)[0] 165 | example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0] 166 | yield example_pb2.Example.FromString(example_str) 167 | if hpm['singlepass'] or hpm['finished']: 168 | print("example_generator completed reading all datafiles. No more data.") 169 | break 170 | 171 | 172 | 173 | """# Batcher""" 174 | 175 | class Example(object): 176 | """Class representing a train/val/test example for text summarization.""" 177 | def __init__(self, article, abstract_sentences, vocab, hpm): 178 | """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. 179 | Args: 180 | article: source text; a string. each token is separated by a single space. 181 | abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. 
182 | vocab: Vocabulary object 183 | hps: hyperparameters 184 | """ 185 | self.hpm = hpm 186 | 187 | # Get ids of special tokens 188 | start_decoding = vocab.word_to_id(START_DECODING) 189 | stop_decoding = vocab.word_to_id(STOP_DECODING) 190 | 191 | # Process the article 192 | article_words = article.split() 193 | if len(article_words) > hpm['max_enc_len']: 194 | article_words = article_words[:hpm['max_enc_len']] 195 | self.enc_len = len(article_words) # store the length after truncation but before padding 196 | self.enc_input = [vocab.word_to_id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token 197 | 198 | # Process the abstract 199 | abstract = ' '.join(abstract_sentences) # string 200 | abstract_words = abstract.split() # list of strings 201 | abs_ids = [vocab.word_to_id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token 202 | 203 | # Get the decoder input sequence and target sequence 204 | self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hpm['max_dec_len'], start_decoding, stop_decoding) 205 | self.dec_len = len(self.dec_input) 206 | 207 | # If using pointer-generator mode, we need to store some extra info 208 | if hpm['pointer_gen']: 209 | # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves 210 | self.enc_input_extend_vocab, self.article_oovs = article_to_ids(article_words, vocab) 211 | 212 | # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id 213 | abs_ids_extend_vocab = abstract_to_ids(abstract_words, vocab, self.article_oovs) 214 | 215 | # Overwrite decoder target sequence so it uses the temp article OOV ids 216 | _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hpm['max_dec_len'], start_decoding, stop_decoding) 217 | 218 | # Store the original strings 219 | self.original_article = article 220 | self.original_abstract = abstract 221 | self.original_abstract_sents = abstract_sentences 222 | 223 | 224 | def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id): 225 | """Given the reference summary as a sequence of tokens, return the input sequence for the decoder, and the target sequence which we will use to calculate loss. The sequence will be truncated if it is longer than max_len. The input sequence must start with the start_id and the target sequence must end with the stop_id (but not if it's been truncated). 
226 | Args: 227 | sequence: List of ids (integers) 228 | max_len: integer 229 | start_id: integer 230 | stop_id: integer 231 | Returns: 232 | inp: sequence length <=max_len starting with start_id 233 | target: sequence same length as input, ending with stop_id only if there was no truncation 234 | """ 235 | inp = [start_id] + sequence[:] 236 | target = sequence[:] 237 | if len(inp) > max_len: # truncate 238 | inp = inp[:max_len] 239 | target = target[:max_len] # no end_token 240 | else: # no truncation 241 | target.append(stop_id) # end token 242 | assert len(inp) == len(target) 243 | return inp, target 244 | 245 | 246 | def pad_decoder_inp_targ(self, max_len, pad_id): 247 | """Pad decoder input and target sequences with pad_id up to max_len.""" 248 | while len(self.dec_input) < max_len: 249 | self.dec_input.append(pad_id) 250 | while len(self.target) < max_len: 251 | self.target.append(pad_id) 252 | 253 | 254 | def pad_encoder_input(self, max_len, pad_id): 255 | """Pad the encoder input sequence with pad_id up to max_len.""" 256 | while len(self.enc_input) < max_len: 257 | self.enc_input.append(pad_id) 258 | if self.hpm['pointer_gen']: 259 | while len(self.enc_input_extend_vocab) < max_len: 260 | self.enc_input_extend_vocab.append(pad_id) 261 | 262 | 263 | 264 | 265 | class Batch(object): 266 | """Class representing a minibatch of train/val/test examples for text summarization.""" 267 | 268 | def __init__(self, example_list, hpm, vocab): 269 | """Turns the example_list into a Batch object. 270 | Args: 271 | example_list: List of Example objects 272 | hpm: hyperparameters 273 | vocab: Vocabulary object 274 | """ 275 | self.pad_id = vocab.word_to_id(PAD_TOKEN) # id of the PAD token used to pad sequences 276 | self.init_encoder_seq(example_list, hpm) # initialize the input to the encoder 277 | self.init_decoder_seq(example_list, hpm) # initialize the input and targets for the decoder 278 | self.store_orig_strings(example_list) # store the original strings 279 | 280 | def init_encoder_seq(self, example_list, hpm): 281 | """Initializes the following: 282 | self.enc_batch: 283 | numpy array of shape (batch_size, <=max_enc_steps) containing integer ids (all OOVs represented by UNK id), padded to length of longest sequence in the batch 284 | self.enc_lens: 285 | numpy array of shape (batch_size) containing integers. The (truncated) length of each encoder input sequence (pre-padding). 286 | self.enc_padding_mask: 287 | numpy array of shape (batch_size, <=max_enc_steps), containing 1s and 0s. 1s correspond to real tokens in enc_batch and target_batch; 0s correspond to padding. 288 | If hps.pointer_gen, additionally initializes the following: 289 | self.max_art_oovs: 290 | maximum number of in-article OOVs in the batch 291 | self.art_oovs: 292 | list of list of in-article OOVs (strings), for each example in the batch 293 | self.enc_batch_extend_vocab: 294 | Same as self.enc_batch, but in-article OOVs are represented by their temporary article OOV number. 295 | """ 296 | # Determine the maximum length of the encoder input sequence in this batch 297 | max_enc_seq_len = max([ex.enc_len for ex in example_list]) 298 | 299 | # Pad the encoder input sequences up to the length of the longest sequence 300 | for ex in example_list: 301 | ex.pad_encoder_input(max_enc_seq_len, self.pad_id) 302 | 303 | # Initialize the numpy arrays 304 | # Note: our enc_batch can have different length (second dimension) for each batch because we use dynamic_rnn for the encoder. 
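    # Illustrative example (w11, w21, ... are placeholder word ids, not real values):
    # with batch_size=2 and truncated article lengths [3, 5], max_enc_seq_len is 5 and
    # the arrays built below look like:
    #   enc_batch        = [[w11, w12, w13, pad, pad],
    #                       [w21, w22, w23, w24, w25]]   # integer word ids, padded per batch
    #   enc_lens         = [3, 5]                        # lengths before padding
    #   enc_padding_mask = [[1., 1., 1., 0., 0.],
    #                       [1., 1., 1., 1., 1.]]        # 1 = real token, 0 = padding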
305 | self.enc_batch = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.int32) 306 | self.enc_lens = np.zeros((hpm['batch_size']), dtype=np.int32) 307 | self.enc_padding_mask = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.float32) 308 | 309 | # Fill in the numpy arrays 310 | for i, ex in enumerate(example_list): 311 | self.enc_batch[i, :] = ex.enc_input[:] 312 | self.enc_lens[i] = ex.enc_len 313 | for j in range(ex.enc_len): 314 | self.enc_padding_mask[i][j] = 1 315 | 316 | # For pointer-generator mode, need to store some extra info 317 | if hpm['pointer_gen']: 318 | # Determine the max number of in-article OOVs in this batch 319 | self.max_art_oovs = max([len(ex.article_oovs) for ex in example_list]) 320 | # Store the in-article OOVs themselves 321 | self.art_oovs = [ex.article_oovs for ex in example_list] 322 | # Store the version of the enc_batch that uses the article OOV ids 323 | self.enc_batch_extend_vocab = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.int32) 324 | for i, ex in enumerate(example_list): 325 | self.enc_batch_extend_vocab[i, :] = ex.enc_input_extend_vocab[:] 326 | 327 | def init_decoder_seq(self, example_list, hpm): 328 | """Initializes the following: 329 | self.dec_batch: 330 | numpy array of shape (batch_size, max_dec_steps), containing integer ids as input for the decoder, padded to max_dec_steps length. 331 | self.target_batch: 332 | numpy array of shape (batch_size, max_dec_steps), containing integer ids for the target sequence, padded to max_dec_steps length. 333 | self.dec_padding_mask: 334 | numpy array of shape (batch_size, max_dec_steps), containing 1s and 0s. 1s correspond to real tokens in dec_batch and target_batch; 0s correspond to padding. 335 | """ 336 | # Pad the inputs and targets 337 | for ex in example_list: 338 | ex.pad_decoder_inp_targ(hpm['max_dec_len'], self.pad_id) 339 | 340 | # Initialize the numpy arrays. 341 | # Note: our decoder inputs and targets must be the same length for each batch (second dimension = max_dec_steps) because we do not use a dynamic_rnn for decoding. However I believe this is possible, or will soon be possible, with Tensorflow 1.0, in which case it may be best to upgrade to that. 
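    # Illustrative example (w1, w2, w3 are placeholder word ids): for a reference
    # summary [w1, w2, w3] with max_dec_len=5, get_dec_inp_targ_seqs plus padding give:
    #   dec_batch[i]        = [START, w1, w2, w3, PAD]   # decoder input (teacher forcing)
    #   target_batch[i]     = [w1, w2, w3, STOP, PAD]    # shifted by one step, ends with STOP
    #   dec_padding_mask[i] = [1., 1., 1., 1., 0.]       # STOP counts in the loss, PAD does not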
342 | self.dec_batch = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.int32) 343 | self.target_batch = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.int32) 344 | self.dec_padding_mask = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.float32) 345 | 346 | # Fill in the numpy arrays 347 | for i, ex in enumerate(example_list): 348 | self.dec_batch[i, :] = ex.dec_input[:] 349 | self.target_batch[i, :] = ex.target[:] 350 | for j in range(ex.dec_len): 351 | self.dec_padding_mask[i][j] = 1 352 | 353 | def store_orig_strings(self, example_list): 354 | """Store the original article and abstract strings in the Batch object""" 355 | self.original_articles = [ex.original_article for ex in example_list] # list of lists 356 | self.original_abstracts = [ex.original_abstract for ex in example_list] # list of lists 357 | self.original_abstracts_sents = [ex.original_abstract_sents for ex in example_list] # list of list of lists 358 | 359 | 360 | 361 | 362 | class Batcher(): 363 | 364 | def __init__(self,data_path, hpm, vocab): 365 | self.hpm = hpm 366 | self.vocab = vocab 367 | self.max_examples_buffer_len = hpm['examples_max_buffer_len'] 368 | self.max_batch_buffer_len = hpm['batch_max_buffer_len'] 369 | self.max_batch_bucket_len = hpm['max_batch_bucket_len'] 370 | self.gen = self.thread_safe_generator(self.generator(example_generator(data_path, hpm))) 371 | self.num_fill_examples_threads = 4 372 | self.num_fill_batches_threads = 4 373 | self.elements_queue = Queue(self.max_examples_buffer_len) 374 | self.batch_queue = Queue(self.max_batch_buffer_len) 375 | self.launch_watch_threads() 376 | 377 | 378 | class thread_safe_generator(object): 379 | def __init__(self, gen): 380 | self.gen = gen 381 | self.lock = threading.Lock() 382 | 383 | def __next__(self): 384 | with self.lock: 385 | return next(self.gen) 386 | 387 | 388 | def generator(self, example_gen): 389 | while True : 390 | e = next(example_gen) 391 | try: 392 | article_text = e.features.feature['article'].bytes_list.value[0].decode() 393 | abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode() 394 | except ValueError: 395 | tf.logging.error('Failed to get article or abstract from example') 396 | continue 397 | if len(article_text) == 0 : 398 | tf.logging.warning('Found an example with empty article text. 
Skipping it.') 399 | 400 | else: 401 | yield (article_text, abstract_text) 402 | 403 | 404 | 405 | def fill_examples_queue(self): 406 | while True: 407 | try: 408 | article, abstract = next(self.gen) 409 | abst = [sent.strip() for sent in abstract_to_sents(abstract)] 410 | ex = Example(article, abst,self.vocab, self.hpm) 411 | self.elements_queue.put(ex) 412 | except : 413 | break 414 | 415 | 416 | 417 | def fill_batch_queue(self): 418 | while True: 419 | try: 420 | if not self.hpm['decode']: 421 | batch = [] 422 | for _ in range(self.hpm['batch_size']*self.hpm['max_batch_bucket_len']): 423 | batch.append(self.elements_queue.get()) 424 | 425 | batch = sorted(batch, key=lambda x : x.enc_len) 426 | batches= [] 427 | i = 0 428 | while i+self.hpm['batch_size'] <= len(batch): 429 | batches.append(batch[i:i+self.hpm['batch_size']]) 430 | i = i + self.hpm['batch_size'] 431 | 432 | if i < len(batch): 433 | batches.append(batch[i:len(batch)]) 434 | 435 | if not self.hpm['singlepass']: 436 | random.shuffle(batches) 437 | 438 | for b in batches: 439 | # here again we crete batch object before doing pushing it to the batch queue 440 | self.batch_queue.put(Batch(b, self.hpm, self.vocab)) 441 | else: 442 | ex = self.elements_queue.get() 443 | b = [ex for _ in range(self.hpm['batch_size'])] 444 | self.batch_queue.put(Batch(b, self.hpm, self.vocab)) 445 | 446 | except : 447 | break 448 | 449 | def launch_watch_threads(self): 450 | 451 | self.elements_queue_threads = [] 452 | for i in range(self.num_fill_examples_threads): 453 | self.elements_queue_threads.append(Thread(target=self.fill_examples_queue)) 454 | self.elements_queue_threads[-1].setDaemon(True) 455 | self.elements_queue_threads[-1].start() 456 | 457 | 458 | self.batch_queue_threads = [] 459 | for j in range(self.num_fill_batches_threads): 460 | self.batch_queue_threads.append(Thread(target = self.fill_batch_queue)) 461 | self.batch_queue_threads[-1].setDaemon(True) 462 | self.batch_queue_threads[-1].start() 463 | 464 | 465 | def watch(): 466 | while True: 467 | time.sleep(60) 468 | for id, t in enumerate(self.elements_queue_threads): 469 | if not t.is_alive() : 470 | print("thread dead") 471 | new_t = Thread(target = self.fill_batch_queue) 472 | self.elements_queue_threads[id] = new_t 473 | new_t.daemon = True 474 | new_t.start() 475 | 476 | for id, t in enumerate(self.batch_queue_threads): 477 | if not t.is_alive() : 478 | print("batch thread dead") 479 | new_t = Thread(target=self.fill_batch_queue) 480 | self.batch_queue_threads[id] = new_t 481 | new_t.setDaemon(True) 482 | new_t.start() 483 | 484 | if not self.hpm['singlepass'] : 485 | self.watcher = Thread(target = watch) 486 | self.watcher.setDaemon(True) 487 | self.watcher.start() 488 | 489 | 490 | 491 | 492 | def next_batch(self): 493 | 494 | if self.batch_queue.qsize() ==0: 495 | tf.logging.warning('Bucket input queue is empty when calling next_batch. Bucket queue size: %i, Input queue size: %i', self.batch_queue.qsize(), self.elements_queue.qsize()) 496 | if self.hpm['singlepass'] or self.hpm['finished']: 497 | tf.logging.info("Finished reading dataset in single_pass mode.") 498 | return None 499 | return self.batch_queue.get() 500 | 501 | -------------------------------------------------------------------------------- /V1/main_gen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_main.ipynb 3 | 4 | Automatically generated by Colaboratory. 
5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | import os 13 | import glob 14 | 15 | from data_preprocess import Vocab 16 | from data_preprocess import Batcher 17 | from data_preprocess import output_to_words 18 | 19 | from model import SummarizationModel 20 | 21 | from train_test_eval import get_config 22 | from train_test_eval import run_training 23 | from train_test_eval import restore_model 24 | from train_test_eval import total_num_params 25 | 26 | hpm={"hidden_size": 256 , 27 | "emb_size": 128, 28 | "attn_hidden_size":512, 29 | 30 | "batch_size":24 , 31 | 'beam_size':4, 32 | 33 | "max_enc_len": 400, 34 | 'max_dec_len':100, 35 | 'min_dec_steps':35, 36 | 'max_dec_steps':100, 37 | 38 | 39 | "pointer_gen":True, 40 | "coverage":True, 41 | "add_coverage":False, 42 | 43 | "training":True, 44 | 'decode':False, 45 | 'eval' : False, 46 | 47 | 48 | 'vocab_size':50000, 49 | 50 | 'examples_max_buffer_len' : 40, 51 | 'batch_max_buffer_len': 10, 52 | 'max_batch_bucket_len':5 , 53 | 54 | 'finished':False, 55 | 'singlepass':False, 56 | 57 | 'max_grad_norm':0.8, 58 | 'adagrad_init_acc':0.1, 59 | 'learning_rate':0.15, 60 | 'rand_unif_init_mag':0.02, 61 | 'trunc_norm_init_std':1e-4, 62 | 'cov_loss_weight':1.0, 63 | 64 | 'decode_using_prev' : True 65 | } 66 | 67 | 68 | vocab_path = "/content/gdrive/My Drive/cnn_stories/vocab" 69 | data_path = "/content/gdrive/My Drive/cnn_stories/train2/*" 70 | checkpoint_dir = "/content/gdrive/My Drive/pointer_gen/checkpoints/" 71 | model_path = "/content/gdrive/My Drive/pointer_gen/checkpoints/model.ckpt-33001" 72 | logdir = "/content/gdrive/My Drive/pointer_gen/logdir" 73 | GAN_gen_checkpoint = "/content/gdrive/My Drive/pointer_gen/GAN_gen_checkpoint/GAN_gen_checkpoint.ckpt" 74 | training_steps = 35000 75 | 76 | tf.logging.info('Vocab and Batcher creation') 77 | vocab = Vocab(vocab_path, hpm['vocab_size']) 78 | batcher = Batcher(data_path, hpm, vocab) 79 | 80 | 81 | def build_graph(): 82 | tf.reset_default_graph() 83 | tf.logging.info('Building the model.') 84 | if hpm['decode'] or hpm['decode_using_prev']: 85 | hpm['max_dec_len'] = 1 86 | mod = SummarizationModel(hpm) 87 | tf.logging.info('Building the graph.') 88 | mod.add_placeholder() 89 | 90 | device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0" 91 | with tf.device(device): 92 | mod.build_graph() 93 | if hpm['training'] or hpm['eval']: 94 | tf.logging.info('Adding training ops.') 95 | mod.add_loss() 96 | mod.add_train_op(device) 97 | if hpm['decode']: 98 | assert mod.hpm['batch_size'] == mod.hpm['beam_size'] 99 | mod.add_top_k_likely_outputs() 100 | 101 | if hpm['decode_using_prev']: 102 | mod.add_loss() 103 | #mod.add_top_k_likely_outputs() 104 | #mod.add_prob_logits_samples() 105 | return mod 106 | 107 | 108 | 109 | def main(): 110 | 111 | mod = build_graph() 112 | 113 | if hpm['eval']: 114 | pass 115 | 116 | if hpm['decode']: 117 | s = tf.Session(config=get_config()) 118 | init = tf.global_variables_initializer() 119 | s.run(init) 120 | restore_model(s, hpm, model_path=model_path, check_path = checkpoint_dir) 121 | tf.logging.info(mod.beam_decode(s, batcher.next_batch(), vocab)) 122 | # and then we can call the beam_decode of the model to decode th summary (will be implemented later) 123 | 124 | if hpm['training']: 125 | tf.logging.info('Starting training.') 126 | try: 127 | run_training(mod, batcher, hpm, training_steps, checkpoint_dir, logdir) 128 | except KeyboardInterrupt: 129 | tf.logging.info('stop training.') 130 | 131 | 
if hpm['decode_using_prev']: 132 | tf.logging.info('Creating the generator for the GAN') 133 | with tf.Session(config=get_config()) as s: 134 | init = tf.global_variables_initializer() 135 | s.run(init) 136 | restore_model(s,hpm, model_path=model_path, check_path=checkpoint_dir) 137 | saver = tf.train.Saver() 138 | saver_path = saver.save(s, GAN_gen_checkpoint) 139 | tf.logging.info(saver_path) 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | 145 | -------------------------------------------------------------------------------- /V1/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_model.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/11cNRDFW5_4tCVGjX-5L1OTuS7Bi9lq5R 8 | """ 9 | 10 | import numpy as np 11 | import random 12 | import tensorflow as tf 13 | import tensorflow.nn as nn 14 | 15 | from modules import Encoder 16 | from modules import Attention_decoder 17 | from utils import _mask_and_avg 18 | 19 | 20 | class SummarizationModel(): 21 | """ 22 | The pointer generator model 23 | Args: 24 | hpm : hyperparameters 25 | """ 26 | 27 | def __init__(self, hpm): 28 | self.hpm = hpm 29 | 30 | # parameters initializer objetcs 31 | self.rand_unif_init = tf.random_uniform_initializer(-self.hpm['rand_unif_init_mag'], self.hpm['rand_unif_init_mag'], seed=123) 32 | self.rand_norm_init = tf.truncated_normal_initializer(stddev=self.hpm['trunc_norm_init_std']) 33 | 34 | # encoder and attentional decoder objects 35 | self.encoder = Encoder(self.hpm, self.rand_unif_init, self.rand_norm_init) 36 | self.decoder = Attention_decoder(self.hpm, self.rand_unif_init, self.rand_norm_init) 37 | 38 | # a global step counter for the training 39 | self.step = tf.train.get_or_create_global_step() 40 | 41 | 42 | 43 | 44 | def add_placeholder(self): 45 | """ Adding placeholders to the model """ 46 | 47 | with tf.variable_scope("placeholder"): 48 | self.enc_batch = tf.placeholder(tf.int32, [self.hpm['batch_size'], None], name='enc_batch') # encoder input sequences (the 2nd dimension -max_enc_len- 49 | # of the shape is None because it varies with the batch) 50 | self.enc_mask = tf.placeholder(tf.float32, [self.hpm['batch_size'], None], name='enc_mask') # encoder input sequences masks 51 | self.enc_lens = tf.placeholder(tf.int32, [self.hpm['batch_size']], 'enc_lens') # lengths of the input sequences 52 | 53 | if self.hpm['pointer_gen']: 54 | self.enc_extend_vocab = tf.placeholder(tf.int32, [self.hpm['batch_size'], None], 'enc_extend_vocab') # encoder input sequences with oovs ids 55 | self.max_art_oovs = tf.placeholder(tf.int32, [], 'max_art_oovs') # maximum number of oovs for the current batch 56 | 57 | self.dec_batch = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='dec_batch') # decoder input sequences (max_dec_len = 1 in decode mode) 58 | self.dec_target = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='target_batch') 59 | self.dec_mask = tf.placeholder(tf.float32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='dec_mask') # decoder input masks tensors 60 | 61 | 62 | 63 | 64 | 65 | def build_graph(self): 66 | """ Graph building method""" 67 | with tf.variable_scope("embedding"): 68 | 69 | inp_embed = tf.get_variable('inp_embed', [self.hpm['vocab_size'], self.hpm['emb_size']], dtype=tf.float32) # encoder input embeddings 70 | dec_embed = 
tf.get_variable('dec_embed', [self.hpm['vocab_size'], self.hpm['emb_size']], dtype=tf.float32) # decoder input embeddings 71 | 72 | # we lookup the encoder input in the embedding matrix 73 | inps = tf.nn.embedding_lookup(inp_embed, self.enc_batch) # shape : [batch_size, , embed_size] 74 | # we lookup the decoder input in the embedding matrix 75 | dec = tf.transpose(self.dec_batch, perm=[1,0]) 76 | dec_inps = tf.nn.embedding_lookup(dec_embed, dec) # shape : [max_dec_len, batch_size, embed_size] 77 | # we add the encoder ops 78 | self.enc_outputs, self.dec_state = self.encoder(inps, self.enc_lens) 79 | 80 | 81 | 82 | self.cov_vec = tf.zeros(shape=[self.hpm['batch_size'],tf.shape(self.enc_outputs)[1] ] , dtype=tf.float32, name="cov_vec") 83 | # we add the decoder ops 84 | self.enc_outputs = tf.identity(self.enc_outputs, "enc_outputs") 85 | self.dec_state = tf.identity(self.dec_state, "dec_state") 86 | self.dec_state = tf.contrib.rnn.LSTMStateTuple(self.dec_state[0],self.dec_state[1]) 87 | 88 | self.returns = self.decoder(self.enc_outputs, self.enc_mask,self.dec_state, dec_inps, self.max_art_oovs , self.enc_extend_vocab, self.cov_vec) 89 | 90 | self.returns['last_context_vector'] = tf.identity(self.returns['last_context_vector'],name="last_context_vector") 91 | 92 | self.returns['attention_vec'] = tf.identity(self.returns['attention_vec'], name="attention_vec") 93 | 94 | #self.returns['coverage'] = tf.identity(self.returns['coverage'] , name="coverage") 95 | self.returns['p_gen'] = tf.identity(self.returns['p_gen'], name="p_gen") 96 | 97 | self.returns['coverage'] = tf.identity(self.returns['coverage'], "coverage") 98 | 99 | self.returns['dec_state'] = tf.identity(self.returns['dec_state'], 'new_dec_state') 100 | self.returns['dec_state'] = tf.contrib.rnn.LSTMStateTuple(self.returns['dec_state'][0], self.returns['dec_state'][1]) 101 | 102 | self.returns['output'] = tf.identity(self.returns['output'], "logits") 103 | 104 | if self.hpm['decode_using_prev']: 105 | self.returns['argmax_seqs'] = tf.identity(self.returns['argmax_seqs'], "argmax_seqs") 106 | self.returns['argmax_log_probs'] = tf.identity(self.returns['argmax_log_probs'], "argmax_log_probs") 107 | self.returns['samples_seqs'] = tf.identity(self.returns['samples_seqs'], "samples_seqs") 108 | self.returns['samples_log_probs'] = tf.identity(self.returns['samples_log_probs'], "samples_log_probs") 109 | 110 | 111 | 112 | 113 | 114 | def make_feed_dict(self, batch): 115 | """ 116 | Args: 117 | batch : Batch Object 118 | Return: 119 | A dictionary to feed the model during training 120 | """ 121 | feed_dict = {} 122 | 123 | feed_dict[self.enc_batch] = batch.enc_batch 124 | feed_dict[self.enc_mask] = batch.enc_padding_mask 125 | feed_dict[self.enc_lens] = batch.enc_lens 126 | 127 | if self.hpm['pointer_gen']: 128 | feed_dict[self.enc_extend_vocab] = batch.enc_batch_extend_vocab 129 | feed_dict[self.max_art_oovs] = batch.max_art_oovs 130 | 131 | feed_dict[self.dec_batch] = batch.dec_batch 132 | feed_dict[self.dec_target] = batch.target_batch 133 | feed_dict[self.dec_mask] = batch.dec_padding_mask 134 | 135 | return feed_dict 136 | 137 | 138 | 139 | def add_loss(self): 140 | """ We add the loss computation op """ 141 | with tf.variable_scope('loss'): 142 | 143 | if self.hpm['pointer_gen']: #if pointer_gen we apply the cross_entropy function ourselves: 144 | # we compute the log of the predicted probability of the target target word (this is the probability we must maximize) 145 | loss_per_step = [] 146 | batch_nums = tf.range(0, 
limit=self.hpm['batch_size']) # shape (batch_size) 147 | for dec_step, dist in enumerate(tf.unstack(self.returns['output'])): 148 | targets = self.dec_target[:,dec_step] # The indices of the target words. shape (batch_size) 149 | indices = tf.stack( (batch_nums, targets), axis=1) # shape (batch_size, 2) 150 | gold_probs = tf.gather_nd(dist, indices) # shape (batch_size). prob of correct words on this step 151 | losses = -tf.log(gold_probs) 152 | loss_per_step.append(losses) 153 | 154 | self.loss = _mask_and_avg(loss_per_step, self.dec_mask) # we drop the loss of the pad tokens 155 | 156 | else: 157 | self.loss = tf.contrib.seq2seq.sequence_loss(tf.stack(self.returns['output'], axis=1), self.dec_batch, self.dec_mask) 158 | #if not pointer_gen, we compute the softmax, and the sequence to squence cross_entropy loss with this helper function 159 | 160 | tf.summary.scalar('loss', self.loss) 161 | self.total_loss = self.loss 162 | if self.hpm['coverage']: 163 | 164 | # nested function 165 | def coverage_loss(self): 166 | """ coverage loss computation""" 167 | covlosses = [] 168 | coverage = tf.zeros_like(tf.unstack(self.returns['attention_vec'][0])) 169 | for a in tf.unstack(self.returns['attention_vec']): # a in an attention vector at time step t 170 | covloss = tf.reduce_sum(tf.minimum(a, coverage ), 1) 171 | covlosses.append(covloss) 172 | coverage += a 173 | coverage_loss = _mask_and_avg(covlosses, self.enc_mask) # we drop the pad tokens loss and compute the avg loss 174 | return coverage_loss 175 | 176 | self.coverage_loss = coverage_loss(self) 177 | self.coverage_loss = tf.identity(self.coverage_loss, name="coverage_loss") 178 | if self.hpm['add_coverage']: 179 | tf.summary.scalar('coverage_loss', self.coverage_loss) 180 | if self.hpm['add_coverage']: 181 | self.total_loss += self.hpm['cov_loss_weight']* self.coverage_loss # we weight the coverage loss and add it to thhe total loss 182 | # the total loss = seq2seq_loss + coverage_loss (if coverage = True) 183 | tf.summary.scalar('total_loss', self.total_loss) 184 | 185 | self.loss = tf.identity(self.loss, name="loss") 186 | self.total_loss = tf.identity(self.total_loss, name="total_loss") 187 | 188 | 189 | 190 | def add_train_op(self, device): 191 | """We add the training op to the graph""" 192 | loss_to_minimize = self.total_loss 193 | variables = tf.trainable_variables() # we recover all the trainable parameters 194 | gradients = tf.gradients(loss_to_minimize, variables, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE ) # we compute the gradients of the loss with respect to all the parameters (backpropagation) 195 | 196 | with tf.device(device): 197 | grads, global_norm = tf.clip_by_global_norm(gradients, self.hpm['max_grad_norm']) # we clip the gradients 198 | 199 | optimizer = tf.train.AdagradOptimizer(self.hpm['learning_rate'], initial_accumulator_value=self.hpm['adagrad_init_acc'], ) # we create the optimizer object 200 | with tf.device(device): 201 | self.train_op = optimizer.apply_gradients(zip(grads, variables), name='train_step', global_step=self.step) # Gradient descent (we update the parameters) 202 | # this is the training op 203 | 204 | self.summaries = tf.summary.merge_all() 205 | 206 | 207 | 208 | def setSession(self, sess): 209 | """ we set a session for the training""" 210 | self.sess = sess 211 | 212 | 213 | 214 | def train(self, batch): 215 | """We run the train op""" 216 | feed_dict = self.make_feed_dict(batch) 217 | to_return = {'train_op':self.train_op, 218 | 'loss':self.loss, 219 | 'global_step':self.step, 
220 | 'summaries' : self.summaries} 221 | if (self.hpm['coverage']): 222 | to_return['coverage_loss'] = self.coverage_loss 223 | 224 | return self.sess.run(to_return, feed_dict) 225 | 226 | 227 | 228 | 229 | def add_top_k_likely_outputs(self): 230 | """We add an op to the graph that computes the top k output probabilities and their ids, used during decoding""" 231 | assert len(tf.unstack(self.returns['output'])) == 1 232 | top_k_probs, self.top_k_ids= tf.nn.top_k(self.returns['output'][0], self.hpm['beam_size']*2) 233 | self.top_k_log_probs = tf.log(top_k_probs, name="top_k_log_probs") 234 | self.top_k_ids = tf.identity(self.top_k_ids, name="top_k_ids") 235 | # we compute the log of the probalities (given the size of the vocabulary, the probaility are generally very small, it is better then to use their log) 236 | 237 | 238 | 239 | def add_prob_logits_samples(self): 240 | outputs = tf.unstack(self.returns['output']) 241 | batch_nums = tf.range(0, limit=self.hpm['batch_size'], dtype=tf.int64) 242 | argmax_seqs = [] 243 | argmax_seqs_log_probs = [] 244 | for i , x in enumerate(outputs): 245 | max_ids = tf.argmax(x, axis=-1) 246 | indices = tf.stack((batch_nums, max_ids), axis = -1) 247 | log_probs = tf.gather_nd(x, indices) 248 | argmax_seqs.append(max_ids) 249 | argmax_seqs_log_probs.append(log_probs) 250 | 251 | 252 | self.outputs = self.returns['output'] 253 | if not self.hpm['pointer_gen']: 254 | self.outputs = tf.softmax(self.outputs) 255 | 256 | self.argmax_seqs = tf.stack(argmax_seqs, name='argmax_seqs') 257 | self.argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs, name='argmax_seqs_log_probs') 258 | 259 | sampler = tf.distributions.Categorical(logits=outputs) 260 | self.samples = sampler.sample(name='samples') 261 | self.samples = tf.identity(self.samples, name='samples') 262 | self.samples_log_probs = sampler.log_prob(self.samples, name="samples_log_probs") 263 | self.samples_log_probs = tf.identity(self.samples_log_probs, name="samples_log_probs") 264 | 265 | 266 | 267 | def decode_onestep(self, sess, batch, enc_outputs, dec_state, dec_input, cov_vec): 268 | """ 269 | Method to decode the output step by step (used for beamSearch decoding) 270 | Args: 271 | sess : tf.Session object 272 | batch : current batch, shape = [beam_size, 1, vocab_size( + max_oov_len if pointer_gen)] (for the beam search decoding, batch_size = beam_size) 273 | enc_outputs : hiddens outputs computed by the encoder LSTM 274 | dec_state : beam_size-many list of decoder previous state, LSTMStateTuple objects, shape = [beam_size, 2, hidden_size] 275 | dec_input : decoder_input, the previous decoded batch_size-many words, shape = [beam_size, embed_size] 276 | cov_vec : beam_size-many list of previous coverage vector 277 | Returns: A dictionary of the results of all the ops computations (see below for more details) 278 | """ 279 | 280 | # dec_state is a batch_size-many list of LSTMStateTuple objects 281 | # we have to transform it to one LSTMStateTuple object where c and h have shape : [beam_size, hidden_size] 282 | cells = [np.expand_dims(state.c, axis=0) for state in dec_state] 283 | hiddens = [np.expand_dims(state.h, axis=0) for state in dec_state] 284 | new_c = np.concatenate(cells, axis=0) 285 | new_h = np.concatenate(hiddens, axis=0) 286 | new_dec_in_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h) 287 | 288 | # dictionary of all the ops that will be computed 289 | to_return = {'last_context_vector' : self.returns['last_context_vector'], # list of the previous context_vectors , shape : [beam_size, 2 x 
hidden_size] 290 | 'dec_state' : self.returns['dec_state'], # beam_size-many list of LSTMStateTuple cells, where c and h have shape : [hidden_size] 291 | 'top_k_ids' : self.top_k_ids, # top (2x xbeam_size) ids of the most liikely words to appear at the current time step 292 | 'top_k_log_probs' : self.top_k_log_probs, # top (2x xbeam_size) probabilities of the most liikely words to appear at the current time step 293 | 'attention_vec':self.returns['attention_vec']} # beam_size-many list of attention vectors, shape : [1, beam_size, max_enc_len] 294 | 295 | if self.hpm['coverage']: 296 | to_return['coverage'] = self.returns['coverage'] # beam_size-many list of coverage vectors , shape : [batch_size, max_enc_len] 297 | if self.hpm['pointer_gen']: 298 | to_return['p_gen'] = self.returns['p_gen'] # shape : [beam_size, 1] 299 | 300 | to_feed = {self.enc_outputs : enc_outputs, 301 | self.enc_mask : batch.enc_padding_mask, 302 | self.dec_batch : np.transpose(np.array([dec_input])), #shape : [beam_size, 1] 303 | self.dec_state : new_dec_in_state} 304 | 305 | if self.hpm['pointer_gen']: 306 | to_feed[self.enc_extend_vocab] = batch.enc_batch_extend_vocab 307 | to_feed[self.max_art_oovs] = batch.max_art_oovs 308 | 309 | if self.hpm['coverage']: 310 | to_feed[self.cov_vec] = cov_vec 311 | 312 | results = sess.run(to_return, to_feed) 313 | states = results['dec_state'] 314 | results['dec_state'] = [tf.contrib.rnn.LSTMStateTuple(states.c[i,:], states.h[i,:]) for i in range(self.hpm['beam_size'])] 315 | #we transform dec_state into a list of LSTMStateTuple objects, an LSTMStateTuple for each likely word 316 | 317 | return results 318 | 319 | 320 | def beam_decode(self, sess, batch, vocab): 321 | 322 | # nested class 323 | class Hypothesis: 324 | """ Class designed to hold hypothesises throughout the beamSearch decoding """ 325 | def __init__(self, tokens, log_probs, state, attn_dists, p_gens, coverage): 326 | self.tokens = tokens # list of all the tokens from time 0 to the current time step t 327 | self.log_probs = log_probs # list of the log probabilities of the tokens of the tokens 328 | self.state = state # decoder state after the last token decoding 329 | self.attn_dists = attn_dists # attention dists of all the tokens 330 | self.p_gens = p_gens # generation probability of all the tokens 331 | self.coverage = coverage # coverage at the current time step t 332 | 333 | def extend(self, token, log_prob, state, attn_dist, p_gen, coverage): 334 | """Method to extend the current hypothesis by adding the next decoded toekn and all the informations associated with it""" 335 | return Hypothesis(tokens = self.tokens + [token], # we add the decoded token 336 | log_probs = self.log_probs + [log_prob], # we add the log prob of the decoded token 337 | state = state, # we update the state 338 | attn_dists = self.attn_dists + [attn_dist], # we add the attention dist of the decoded token 339 | p_gens = self.p_gens + [p_gen], # we add the p_gen 340 | coverage = coverage) # we update the coverage 341 | 342 | @property 343 | def latest_token(self): 344 | return self.tokens[-1] 345 | 346 | @property 347 | def tot_log_prob(self): 348 | return sum(self.log_probs) 349 | 350 | @property 351 | def avg_log_prob(self): 352 | return self.tot_log_prob/len(self.tokens) 353 | 354 | # end of the nested class 355 | 356 | # We run the encoder once and then we use the results to decode each time step token 357 | enc_outputs, dec_in_state = sess.run([self.enc_outputs, self.dec_state], {self.enc_batch : batch.enc_batch, 358 | self.enc_mask 
: batch.enc_padding_mask, 359 | self.enc_lens : batch.enc_lens}) 360 | # Initial Hypothesises (beam_size many list) 361 | hyps = [Hypothesis(tokens=[vocab.word_to_id('[START]')], # we initalize all the beam_size hypothesises with the token start 362 | log_probs = [0.0], # Initial log prob = 0 363 | state = tf.contrib.rnn.LSTMStateTuple(dec_in_state.c[0], dec_in_state.h[0]), #initial dec_state (we will use only the first dec_state because they're initially the same) 364 | attn_dists=[], 365 | p_gens = [], 366 | coverage=np.zeros([enc_outputs.shape[1]]) # we init the coverage vector to zero 367 | ) for _ in range(self.hpm['batch_size'])] # batch_size == beam_size 368 | 369 | results = [] # list to hold the top beam_size hypothesises 370 | steps=0 # initial step 371 | 372 | while steps < self.hpm['max_dec_steps'] and len(results) < self.hpm['beam_size'] : 373 | latest_tokens = [h.latest_token for h in hyps] # latest token for each hypothesis , shape : [beam_size] 374 | latest_tokens = [t if t in range(self.hpm['vocab_size']) else vocab.word_to_id('[UNK]') for t in latest_tokens] # we replace all the oov is by the unknown token 375 | states = [h.state for h in hyps] # we collect the last states for each hypothesis 376 | 377 | if self.hpm['coverage']: 378 | prev_coverage = [h.coverage for h in hyps] 379 | else: 380 | prev_coverage = None 381 | 382 | # we decode the top likely 2 x beam_size tokens tokens at time step t for each hypothesis 383 | returns = self.decode_onestep(sess, batch, enc_outputs, states, latest_tokens, prev_coverage) 384 | topk_ids, topk_log_probs, new_states, attn_dists = returns['top_k_ids'], returns['top_k_log_probs'], returns['dec_state'], returns['attention_vec'] 385 | if self.hpm['pointer_gen']: 386 | p_gens = returns['p_gen'] 387 | if self.hpm['coverage']: 388 | new_coverage = returns['coverage'] 389 | 390 | attn_dists = np.squeeze(attn_dists) # shape : [beam_size, max_enc_len] 391 | if self.hpm['pointer_gen']: 392 | p_gens = np.squeeze(p_gens) # shape : [beam_size] 393 | 394 | all_hyps = [] 395 | num_orig_hyps = 1 if steps ==0 else len(hyps) 396 | for i in range(num_orig_hyps): 397 | h, new_state, attn_dist, p_gen, new_coverage_i = hyps[i], new_states[i], attn_dists[i], p_gens[i], new_coverage[i] 398 | 399 | for j in range(self.hpm['beam_size']*2): 400 | # we extend each hypothesis with each of the top k tokens (this gives 2 x beam_size new hypothesises for each of the beam_size old hypothesises) 401 | new_hyp = h.extend(token=topk_ids[i,j], 402 | log_prob=topk_log_probs[i,j], 403 | state = new_state, 404 | attn_dist=attn_dist, 405 | p_gen=p_gen, 406 | coverage=new_coverage_i) 407 | all_hyps.append(new_hyp) 408 | 409 | # in the following lines, we sort all the hypothesises, and select only the beam_size most likely hypothesises 410 | hyps = [] 411 | sorted_hyps = sorted(all_hyps, key=lambda h: h.avg_log_prob, reverse=True) 412 | for h in sorted_hyps: 413 | if h.latest_token == vocab.word_to_id('[STOP]'): 414 | if steps >= self.hpm['min_dec_steps']: 415 | results.append(h) 416 | else: 417 | hyps.append(h) 418 | if len(hyps) == self.hpm['beam_size'] or len(results) == self.hpm['beam_size']: 419 | break 420 | 421 | steps += 1 422 | 423 | if len(results)==0: 424 | results=hyps 425 | 426 | # At the end of the loop we return the most likely hypothesis, which holds the most likely ouput sequence, given the input fed to the model 427 | hyps_sorted = sorted(results, key=lambda h: h.avg_log_prob, reverse=True) 428 | return hyps_sorted[0] 
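
# ---------------------------------------------------------------------------
# Illustrative appendix (not part of the original model): a minimal NumPy sketch
# of the coverage penalty computed in SummarizationModel.add_loss above. At each
# decoder step the coverage vector is the running sum of all previous attention
# distributions, and the step penalty is sum_i min(a_t[i], coverage_t[i]). This
# sketch ignores batching and padding masks; all names here are illustrative.
# ---------------------------------------------------------------------------
def coverage_loss_sketch(attn_dists):
    """attn_dists: array of shape [dec_steps, enc_len], one attention dist per decoder step."""
    coverage = np.zeros_like(attn_dists[0])
    covlosses = []
    for a in attn_dists:
        covlosses.append(np.minimum(a, coverage).sum())  # overlap with what was already attended
        coverage += a                                     # accumulate attention so far
    return np.mean(covlosses)                             # average penalty over decoder steps

if __name__ == "__main__":
    # Attending twice to the same source position is penalized; spreading attention is not.
    focused = np.array([[0.9, 0.1, 0.0], [0.9, 0.1, 0.0]])
    spread = np.array([[0.9, 0.1, 0.0], [0.0, 0.1, 0.9]])
    print(coverage_loss_sketch(focused), ">", coverage_loss_sketch(spread))  # 0.5 > 0.05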
-------------------------------------------------------------------------------- /V1/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_modules.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | 13 | from utils import Linear 14 | from utils import apply_mask_normalize 15 | from utils import _mask_and_avg 16 | from utils import _calc_final_dist 17 | 18 | class Encoder(): 19 | """ A simple encoder class to encode the input via Bidirectional LSTM 20 | Args: 21 | hpm : hyperparameters 22 | rand_unif_init : Initializer Object (random uniform) to initialize LSTMs parameters 23 | rand_norm_init : Initializer object (truncate normal) to initialize weights and biases for linear transf. 24 | """ 25 | def __init__(self, hpm, rand_unif_init, rand_norm_init): 26 | self.hpm= hpm 27 | self.rand_unif_init = rand_unif_init 28 | self.rand_norm_init = rand_norm_init 29 | 30 | with tf.variable_scope('encoder'): 31 | self.lstm_cell_fw = tf.contrib.rnn.LSTMCell(self.hpm["hidden_size"], 32 | state_is_tuple= True, initializer=self.rand_unif_init) # forward lstm cell 33 | self.lstm_cell_bw = tf.contrib.rnn.LSTMCell(self.hpm["hidden_size"], 34 | state_is_tuple= True, initializer=self.rand_unif_init) # backward lstm cell 35 | 36 | self.w_c = Linear(self.hpm['hidden_size'], True, "reduce_c", self.rand_norm_init) # Parameters for the concatenated state linear transf. 37 | self.w_h = Linear(self.hpm['hidden_size'], True, 'reduce_h', self.rand_norm_init) # Parameters for the concatenated hidden output linear transf. 38 | 39 | 40 | 41 | def __call__(self, encoder_inputs, seq_lens): 42 | """ Call method for the encoding feedforward 43 | Args: 44 | encoder_inpputs : 3D tensor, shape : [batch_size, max_enc_len, embed_size] 45 | seq_lens : 1D tensor, lengths of the sequences (without padding) in the batch, shape : [batch_size] 46 | 47 | Returns: 48 | encoder_outputs : 3D tensor, output of the bidirectional dynamic rnn, shape : [batch_size, None, 2*hidden_size] (None because the max seq len vary with the batch) 49 | new state : tuple object made of two tensors : c => state, h=> last hidden output, shape : [2,batch_size, hidden_size] 50 | """ 51 | with tf.variable_scope('encoder', reuse = tf.AUTO_REUSE): 52 | (encoder_outputs, (fw_st,bw_st)) = tf.nn.bidirectional_dynamic_rnn( 53 | self.lstm_cell_fw, self.lstm_cell_bw, encoder_inputs, 54 | dtype=tf.float32, swap_memory=True, 55 | sequence_length = seq_lens) 56 | 57 | encoder_outputs=tf.concat(encoder_outputs, axis= 2) 58 | 59 | old_c= tf.concat(values=[fw_st.c,bw_st.c], axis= 1) # we concatenate the forward and backward state, shape: [batch_size, 2*hidden_size] 60 | old_h= tf.concat(values=[fw_st.h,bw_st.h], axis= 1) # we concatenate the forwarrd and backward last hidden output, shape : [batch_size, 2*hidden_size] 61 | new_c= tf.nn.relu(self.w_c(old_c)) # linear transformation + relu activation, shape : [batch_size, hidden_size] 62 | new_h= tf.nn.relu(self.w_h(old_h)) # same as above 63 | 64 | return encoder_outputs, tf.contrib.rnn.LSTMStateTuple(new_c,new_h) 65 | 66 | 67 | 68 | class Decoder(): 69 | """ 70 | A simple decoder class made of a unidirectional LSTM cell which decodes the next word given a previous one, a context vector and a previous state 71 | Args : 72 | hpm : hyperparameters 73 | rand_unif_init : Initializer Object (random uniform) to initialize LSTM 
parameters 74 | """ 75 | def __init__(self,hpm,rand_unif_init): 76 | self.hpm= hpm 77 | self.rand_unif_init = rand_unif_init 78 | 79 | with tf.variable_scope('decoder'): 80 | self.lstm_cell= tf.contrib.rnn.LSTMCell(self.hpm["hidden_size"], 81 | state_is_tuple= True, initializer=self.rand_unif_init) # unidirectional lstm cell 82 | 83 | 84 | def __call__(self, dec_inputs, prev_state): 85 | """ Feedforward method for the simple decoder 86 | 87 | Args: 88 | dec_inputs : 2D tensor, list of words time step t for each sequence in the batch, shape = [batch_size, embed_size] 89 | prev_state : tuple object made of two vectors : c => state, h => last hidden output, shape : [2, batch_size, hidden_size] 90 | 91 | Returns: 92 | decoder_outputs : 2D tensor, shape = [batch_size, hidden_size] 93 | curr_st : current state of the decoder, shape : [2, batch_size, hidden_size] 94 | """ 95 | with tf.variable_scope('decoder', reuse = tf.AUTO_REUSE): 96 | decoder_outputs, curr_st= tf.nn.dynamic_rnn(self.lstm_cell, dec_inputs, 97 | dtype= tf.float32, initial_state=prev_state, swap_memory= True, time_major=True) 98 | return decoder_outputs, curr_st 99 | 100 | 101 | 102 | class Attention_decoder(): 103 | """ 104 | An attentional based encoder-decoder model (bhadanau attention, additive style) 105 | Args: 106 | hpm : hyperparameters 107 | rand_unif_init : Initializer Object (random uniform) to initialize LSTMs parameters 108 | rand_norm_init : Initializer object (truncate normal) to initialize weights and biases for linear transf. 109 | 110 | """ 111 | def __init__(self,hpm, rand_unif_init, rand_norm_init ): 112 | self.rand_unif_init = rand_unif_init 113 | self.rand_norm_init = rand_norm_init 114 | self.hpm=hpm 115 | 116 | with tf.variable_scope('attention_decoder', reuse = tf.AUTO_REUSE): 117 | self.decoder= Decoder(self.hpm, self.rand_unif_init) # simple decoder object (unidirecitional lstm) 118 | 119 | # Almost all the parameters (weights and biases) for the linear transformations (see below in the call method) 120 | 121 | self.w_h = Linear(self.hpm['attn_hidden_size'], True, "h") 122 | self.w_s = Linear(self.hpm['attn_hidden_size'], True, "s" ) 123 | self.v = Linear(1, False, 'V') 124 | 125 | self.w_dec = Linear(self.hpm['emb_size'],True, "dec_inp") 126 | self.w_out = Linear(self.hpm['vocab_size'], True, 'out') 127 | 128 | if self.hpm['pointer_gen']: 129 | self.w_c_reduce = Linear(1, True, 'c_reduce') 130 | self.w_s_reduce = Linear(1, True, 's_reduce') 131 | self.w_i_reduce = Linear(1, True, 'i_reduce') 132 | 133 | 134 | 135 | def __call__(self, enc_outputs, enc_mask, enc_state, decoder_inputs,batch_max_oov_len = None, encoder_input_with_oov = None, cov_vec=None): 136 | """ 137 | Attentional feedforward graph . 138 | We call this method once during training for each batch, and max_dec_len times for decode mode. 139 | 140 | Args: 141 | enc_outputs : 3D tensor, encoder outputs, shape : [batch_size, batch_max_enc_len, 2*hidden_size] 142 | enc_mask : 2D tensor, encoder sequence mask, shape : [batch_size, batch_max_enc_len] 143 | decoder_inputs: 3D tensor, decoder inputs, shape : [batch_size, max_dec_len, embed_size] 144 | batch_max_oov_len : Integer, Maximum number of oov for the current batch, (None if pointer_gen = False) 145 | encoder_input_with_oov : 2D tensor, encoder input with oovs ids, shape : [batch_size, batch_max_enc_len] 146 | 147 | !!! 
NB : batch_max_enc_len is None when we build graph, and vary during the feedforward with the current batch treated, 148 | it is the maximum length of sequences of the current batch 149 | 150 | Returns : A dictionary 151 | output : list max_dec_en of 2D tensors of shape [batch_size, vocab_size + batch_max_oov_len (if pointer_gen)] 152 | last_context_vector : 2D tensor, shape : [batch_size, 2*hidden_size], this will be useful in the decode mode 153 | dec_state : 2D tensor, decoder last state, shape : [2, batch_size, hidden_size] 154 | p_gen : max_dec_len-many list of 1D tensors of length[batch_size] (only if pointer_gen is true) 155 | attention_vec : max_dec_len-many list of 2D tensors of shape [batch_size, batch_max_enc_len] (only if coverage is true) 156 | """ 157 | 158 | if(self.hpm["pointer_gen"]): 159 | p_gens=[] # if pointer gen, we add an array to store the probability of each word in the sequences to be generated or pointed on 160 | 161 | attn_dists = [] # array to store the attention distributions over the enc seq 162 | dec_state = enc_state # we init the decoder state with the encoder last state 163 | outputs=[] # array to store the final probability distributions (decoded sequence) 164 | dec_inp = tf.unstack(decoder_inputs) # we unstack the decoder input to be able to enumerate over this tensor 165 | 166 | if self.hpm['decode_using_prev']: 167 | argmax_arr = [] 168 | samples_arr = [] 169 | argmax_logprob_arr = [] 170 | samples_logprob_arr = [] 171 | 172 | # nested function 173 | def attention(dec_state, cov_vec=None): 174 | """ 175 | Attention mechanism 176 | 177 | Args: 178 | dec_state : previous state of the decoder. shape : [2, batch_size, hidden_size]. For the first step, it corresponds to the encoder last state 179 | cov_vec : only if coverage is True (default None). shape : [batch_size, ]. The previous coverage vector. 180 | 181 | Returns: 182 | attn_vec : 2D tensor, the attention vector at time step t. shape : [batch_size, ] 183 | context_vector : 2D tensor, shape: [batch_size, 2*hidden_size] 184 | cov_vec : 2D tensor, shape : [batch_size, ], the current coverage vector 185 | """ 186 | if(self.hpm["coverage"]): 187 | with tf.variable_scope('coverage', reuse = tf.AUTO_REUSE ): 188 | w_c = tf.get_variable("w_c", [1,1,1,self.hpm['attn_hidden_size']]) # we add additional parameters for the coverage vector linear transf. 189 | 190 | cov_features = tf.expand_dims(tf.expand_dims(cov_vec, axis=2),axis=2) # given that the encoder max length is unknown and variable, we cannot just apply a 191 | cov_features = tf.nn.conv2d(cov_features, w_c, [1,1,1,1], "SAME") # linear transformation as above. To avoid this issue, we can apply a convolution layer 192 | # which will transform the cov vector as a simple linear transf. would. 193 | 194 | # e = V*tanh(w_h*h + w_s*s + w_c*c ) (the last term, only is coverage = True) 195 | # attention weights all over the encoder input sequence 196 | # shape : [batch_size, , 1] 197 | e=tf.nn.tanh(self.w_h(enc_outputs) + 198 | tf.expand_dims(self.w_s(dec_state.c), axis=1) + 199 | tf.squeeze(cov_features, [2])) 200 | else: 201 | e=tf.nn.tanh(self.w_h(enc_outputs) + 202 | tf.expand_dims(self.w_s(dec_state.c), axis=1)) 203 | e = self.v(e) 204 | 205 | # we take off the last dimension which equals 1 206 | e = tf.reshape(e, [ e.get_shape().as_list()[0], -1]) # shape : [batch_size, ] 207 | 208 | 209 | attn_vec = tf.nn.softmax(e, axis=-1) # we apply a softmax on the attention weights to normalize them and obtain the attention vector. 
210 | attn_vec = apply_mask_normalize(attn_vec, enc_mask) # Given that the input is padded with token, the attentions weights over those tokens 211 | # are not relevant, we apply the encoder input masks on the attention vectors to drop those 'irrelevant' attention weights 212 | # and finally we re-normalize the attention weights to obtain probability distributions 213 | 214 | # context vector computation 215 | # we multiply the encoder outputs by the attention vector weigths (a weight for each output vector, when we consider only one sequence for the example) 216 | weighted_enc_outputs = tf.multiply(enc_outputs, tf.expand_dims(attn_vec, axis=-1)) # context vector at time step t, shape : [batch_size, ] 217 | context_vec = tf.reduce_sum(weighted_enc_outputs, axis=1) 218 | 219 | if self.hpm['coverage']: 220 | cov_vec = cov_vec + attn_vec # we update the coverage 221 | 222 | return attn_vec, context_vec, cov_vec 223 | # end of nested function 224 | 225 | with tf.variable_scope('attention_decoder', reuse = tf.AUTO_REUSE): 226 | # we compute the initial context vector 227 | _ , context_vec, _ = attention( dec_state, cov_vec) 228 | timesteps = self.hpm['max_dec_len'] 229 | decoder_input = dec_inp[0] 230 | a=0 231 | if not self.hpm['decode_using_prev']: 232 | a = 1 233 | for i in range (a,timesteps): 234 | # for each item in the decoder inputs (this loops only once for decode mode) 235 | 236 | # concatenation of input (previous word) and context vector at timestep t 237 | new_dec_inp = tf.concat([decoder_input, context_vec], axis = -1) # shape : [batch_size, embed_size+2*hidden_size] 238 | new_dec_inp = self.w_dec(new_dec_inp) #shape : [batch_size, embed_size] 239 | 240 | # We apply the LSTM decoder on the new input 241 | dec_output, dec_state = self.decoder(tf.expand_dims(new_dec_inp, axis=0), dec_state) # dec_output shape : [1, batch_size, hidden_size] 242 | # dec_state shape : [2, batch_size, hidden_size] (2 for the state c and the last hidden output h) 243 | # attention vector of the current step, context vector for the next step 244 | # we update the coverage vector 245 | attn_vec, context_vec, cov_vec = attention( dec_state, cov_vec) 246 | attn_dists.append(attn_vec) 247 | 248 | dec_output = tf.reshape(dec_output, [-1, dec_output.get_shape().as_list()[-1]]) # shape : [batch_size, hidden_size] 249 | dec_output = self.w_out(dec_output) # shape : [batch_size, vocab_size] 250 | vocab_dist = dec_output 251 | 252 | if not self.hpm['pointer_gen']: 253 | outputs.append(vocab_dist) # we do not apply yet the softmax function because this function is integrated in some futures ops like the loss function 254 | else: 255 | # if pointer_gen=True, we need to compute the softmax function because of the scatter op with the attention distribution 256 | outputs.append(tf.nn.softmax(dec_output, axis=-1)) 257 | state = tf.concat([dec_state.c, dec_state.h], axis=1) 258 | 259 | #p_gen computation with the current concatenated state, context vector and the decoder input 260 | p_gen = tf.nn.sigmoid(self.w_c_reduce(context_vec)+ 261 | self.w_s_reduce(state )+ 262 | self.w_i_reduce(new_dec_inp)) # shape : [batch_size, 1] 263 | p_gens.append(p_gen) 264 | 265 | 266 | 267 | if self.hpm['pointer_gen']: 268 | # we apply the scatter op between the output distibutions (over the vocabulary) with the attention distributions 269 | outputs = _calc_final_dist(encoder_input_with_oov, outputs, attn_dists, p_gens, batch_max_oov_len, self.hpm) 270 | 271 | 272 | if not self.hpm['decode_using_prev']: 273 | decoder_input = dec_inp[i] 
274 | else: 275 | 276 | batch_nums = tf.range(0, limit=self.hpm['batch_size'], dtype=tf.int64) 277 | argmax_seqs = [] 278 | argmax_seqs_log_probs = [] 279 | for i , x in enumerate(outputs): 280 | max_ids = tf.argmax(x, axis=-1) 281 | indices = tf.stack((batch_nums, max_ids), axis = -1) 282 | log_probs = tf.gather_nd(x, indices) 283 | argmax_seqs.append(max_ids) 284 | argmax_seqs_log_probs.append(log_probs) 285 | 286 | 287 | soft_outputs = tf.stack(outputs) 288 | if not self.hpm['pointer_gen']: 289 | soft_outputs = tf.softmax(soft_outputs) 290 | 291 | argmax_seqs = tf.stack(argmax_seqs) 292 | argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs) 293 | 294 | sampler = tf.distributions.Categorical(logits=soft_outputs) 295 | samples = sampler.sample() 296 | samples_log_probs = sampler.log_prob(samples) 297 | samples_log_probs = tf.identity(samples_log_probs) 298 | 299 | argmax_arr.append(argmax_seqs) 300 | argmax_logprob_arr.append(argmax_seqs_log_probs) 301 | samples_arr.append(samples) 302 | samples_logprob_arr.append(samples_log_probs) 303 | 304 | decoder_input = samples 305 | 306 | if self.hpm['decode_using_prev']: 307 | argmax_arr = tf.stack(argmax_arr) 308 | argmax_logprob_arr = tf.stack(argmax_logprob_arr) 309 | samples_arr = tf.stack(samples_arr) 310 | samples_logprob_arr = tf.stack(samples_logprob_arr) 311 | 312 | dic = { 'output':outputs, 'last_context_vector':context_vec, 'dec_state':dec_state, 'attention_vec':attn_dists} 313 | if(self.hpm['pointer_gen']): 314 | dic['p_gen'] = p_gens 315 | if(self.hpm['coverage']): 316 | dic['coverage'] = cov_vec 317 | 318 | if self.hpm['decode_using_prev']: 319 | dic.update({ 320 | "argmax_seqs" : argmax_arr, 321 | "argmax_log_probs" : argmax_logprob_arr, 322 | "samples_seqs" : samples_arr, 323 | "samples_log_probs" : samples_logprob_arr 324 | }) 325 | 326 | return dic -------------------------------------------------------------------------------- /V1/train_test_eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_train_test_eval.ipynb 3 | 4 | Automatically generated by Colaboratory. 
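
For reference, the additive (Bahdanau-style) attention computed in the attention() closure of V1/modules.py above can be written in a few lines of NumPy: e_ti = v^T tanh(W_h h_i + W_s s_t), mask the padded positions, renormalize, and take the attention-weighted sum of the encoder outputs as the context vector. The shapes and random values below are invented for illustration; the real graph uses the Linear helpers from utils.py and a convolution for the coverage term, which is omitted here.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch_size, max_enc_len, hidden, attn_hidden = 2, 5, 4, 3
rng = np.random.default_rng(0)
enc_outputs = rng.normal(size=(batch_size, max_enc_len, 2 * hidden))  # encoder states h_i
dec_state_c = rng.normal(size=(batch_size, hidden))                   # decoder state s_t
enc_mask = np.array([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]], dtype=np.float32)              # 0s mark padding

W_h = rng.normal(size=(2 * hidden, attn_hidden))
W_s = rng.normal(size=(hidden, attn_hidden))
v = rng.normal(size=(attn_hidden, 1))

e = np.tanh(enc_outputs @ W_h + (dec_state_c @ W_s)[:, None, :]) @ v   # [batch, enc_len, 1]
attn = softmax(e.squeeze(-1), axis=-1) * enc_mask                      # drop attention on pad tokens
attn = attn / attn.sum(axis=-1, keepdims=True)                         # re-normalize after masking
context = (enc_outputs * attn[..., None]).sum(axis=1)                  # [batch, 2*hidden]
print(attn.shape, context.shape)   # (2, 5) (2, 8)
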
5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | import time 13 | 14 | tf.logging.set_verbosity(tf.logging.INFO) 15 | 16 | def get_config(): 17 | """Returns config for tf.session""" 18 | config = tf.ConfigProto(allow_soft_placement=True) 19 | config.gpu_options.allow_growth = True 20 | return config 21 | 22 | 23 | def run_training(model, batcher, hpm, training_steps,check_dir, logdir): 24 | 25 | with tf.train.MonitoredTrainingSession(checkpoint_dir = check_dir, 26 | hooks = [tf.train.StopAtStepHook(last_step=training_steps)], 27 | save_summaries_steps = None, save_summaries_secs= None, 28 | save_checkpoint_steps=1000, scaffold=tf.train.Scaffold(saver=tf.train.Saver(max_to_keep=3)), 29 | config = get_config()) as sess: 30 | 31 | #sess = tf.Session(config=get_config()) 32 | #restore_model(sess, hpm, model_path=check_dir+"model.ckpt-2200") 33 | writer = tf.summary.FileWriter(logdir, sess.graph) 34 | model.setSession(sess) 35 | while not sess.should_stop(): 36 | #while True: 37 | t0=time.time() 38 | batch = batcher.next_batch() 39 | results = model.train(batch) 40 | t1=time.time() 41 | 42 | if hpm['add_coverage']: 43 | coverage_loss= results['coverage_loss'] 44 | tf.logging.info('step : %d, seconds : %.3f, loss : %f, coverage loss: %f', results['global_step'], t1-t0, results['loss'], coverage_loss) 45 | else: 46 | tf.logging.info('step : %d, seconds : %.3f, loss : %f', results['global_step'], t1-t0, results['loss']) 47 | 48 | 49 | 50 | if not np.isfinite(results['loss']): 51 | raise Exception('loss is not finite. Stopping!') 52 | summaries = results['summaries'] 53 | writer.add_summary(summary=summaries, global_step=results['global_step']) 54 | if results['global_step'] %50==0: 55 | writer.flush() 56 | 57 | 58 | 59 | 60 | 61 | def restore_model(sess, hpm, model_path=None, check_path=None): 62 | assert ( model_path or check_path) 63 | saver = tf.train.Saver() 64 | try: 65 | if model_path: 66 | saver.restore(sess, model_path) 67 | return True 68 | else: 69 | saver.restore(sess, tf.train.latest_checkpoint(check_path)) 70 | return True 71 | except Exception as e: 72 | tf.logging.error(e) 73 | tf.logging.warning("Cannot restore model !!!") 74 | return False 75 | 76 | def total_num_params(): 77 | total_parameters = 0 78 | for variable in tf.trainable_variables(): 79 | # shape is an array of tf.Dimension 80 | shape = variable.get_shape() 81 | print(variable) 82 | print("shape :", shape) 83 | variable_parameters = 1 84 | for dim in shape: 85 | variable_parameters *= dim.value 86 | print("parameters : ",variable_parameters) 87 | total_parameters += variable_parameters 88 | return total_parameters -------------------------------------------------------------------------------- /V1/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """utils.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | 13 | 14 | class Linear(): 15 | '''Class of object that apply a linear transformation of a 3D or 2D tensor''' 16 | ''' 17 | Args : 18 | output_size : Integer. The final size of the last dimension of the tensor after linear transformation 19 | bias : Boolean. If true, we add a bias vector to the tensor after linear transformation 20 | name : String. 
Name of the parameters 21 | init : Initializer object for the weight parameters 22 | ''' 23 | def __init__(self, output_size, bias, name, init=None): 24 | self.output_size = output_size 25 | self.bias = bias 26 | self.name = name 27 | self.init = init 28 | 29 | '''The call method to apply a linear tranformation when we call the Linear object (see the method linear below)''' 30 | def __call__(self, inp): 31 | return self.linear(inp) 32 | 33 | ''' Method for the linear transformation ''' 34 | def linear(self, inp): 35 | ''' 36 | Args: 37 | inp : 2D or 3D tensor 38 | Returns: 39 | a tensor with the same shape as the input, except the last dimension which equals output_size 40 | ''' 41 | inp_shape = inp.get_shape().as_list() # list of the dimensions of the input tensor 42 | 43 | weights= tf.get_variable(name = "w_"+self.name, shape =[inp_shape[-1], self.output_size], initializer=self.init) # weight w : shape = [, output_size] 44 | if self.bias: 45 | biais = tf.get_variable(name="b_"+self.name, shape = self.output_size, initializer=self.init) # bias : shape = [output_size] 46 | else: 47 | biais = 0 48 | 49 | if len(inp_shape) == 2: 50 | return tf.matmul(inp, weights)+biais 51 | elif len(inp_shape) == 3: 52 | inp2 = tf.reshape(inp, [-1, inp_shape[-1]]) 53 | out = tf.matmul(inp2, weights)+biais 54 | return tf.reshape(out, [inp_shape[0], -1, self.output_size]) 55 | else: 56 | raise Exception("3D or 2D tensors waited !!!") # we raise an exception if the the tensor is not a 2D or 3D tensor 57 | 58 | 59 | 60 | def apply_mask_normalize( vec, mask): 61 | """ Applies mask to values and normalize them 62 | Args: 63 | vec : a list length max_dec_steps containing arrays shape : [batch_size, ] 64 | """ 65 | v = tf.multiply(vec, tf.cast(mask, tf.float32)) 66 | return tf.divide(v, tf.reduce_sum(v,axis=1, keepdims=True)) 67 | 68 | 69 | 70 | def _mask_and_avg( values, padding_mask): 71 | """Applies mask to values then returns overall average (a scalar) 72 | Args: 73 | values: a list length max_dec_steps containing arrays shape (batch_size). 74 | padding_mask: tensor shape (batch_size, max_dec_steps) containing 1s and 0s. 75 | 76 | Returns: 77 | a scalar 78 | """ 79 | dec_lens = tf.reduce_sum(padding_mask, axis=1) # shape batch_size. float32 80 | values_per_step = [v * padding_mask[:,dec_step] for dec_step,v in enumerate(values)] 81 | values_per_ex = sum(values_per_step)/dec_lens # shape (batch_size); normalized value for each batch member 82 | return tf.reduce_mean(values_per_ex) # overall average 83 | 84 | 85 | 86 | 87 | def _calc_final_dist( _enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, hpm): 88 | """Calculate the final distribution, for the pointer-generator model 89 | 90 | Args: 91 | vocab_dists: The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file. 92 | attn_dists: The attention distributions. List length max_dec_steps of (batch_size, attn_len) arrays 93 | 94 | Returns: 95 | final_dists: The final distributions. List length max_dec_steps of (batch_size, extended_vsize) arrays. 
96 | """ 97 | with tf.variable_scope('final_distribution'): 98 | # Multiply vocab dists by p_gen and attention dists by (1-p_gen) 99 | vocab_dists = [p_gen * dist for (p_gen,dist) in zip(p_gens, vocab_dists)] 100 | attn_dists = [(1-p_gen) * dist for (p_gen,dist) in zip(p_gens, attn_dists)] 101 | 102 | # Concatenate some zeros to each vocabulary dist, to hold the probabilities for in-article OOV words 103 | extended_vsize = hpm['vocab_size'] + batch_oov_len # the maximum (over the batch) size of the extended vocabulary 104 | extra_zeros = tf.zeros((hpm['batch_size'], batch_oov_len )) 105 | vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in vocab_dists] # list length max_dec_steps of shape (batch_size, extended_vsize) 106 | 107 | # Project the values in the attention distributions onto the appropriate entries in the final distributions 108 | # This means that if a_i = 0.1 and the ith encoder word is w, and w has index 500 in the vocabulary, then we add 0.1 onto the 500th entry of the final distribution 109 | # This is done for each decoder timestep. 110 | # This is fiddly; we use tf.scatter_nd to do the projection 111 | batch_nums = tf.range(0, limit=hpm['batch_size']) # shape (batch_size) 112 | batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1) 113 | attn_len = tf.shape(_enc_batch_extend_vocab)[1] # number of states we attend over 114 | batch_nums = tf.tile(batch_nums, [1, attn_len]) # shape (batch_size, attn_len) 115 | indices = tf.stack( (batch_nums, _enc_batch_extend_vocab), axis=2) # shape (batch_size, enc_t, 2) 116 | shape = [hpm['batch_size'], extended_vsize] 117 | attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in attn_dists] # list length max_dec_steps (batch_size, extended_vsize) 118 | 119 | # Add the vocab distributions and the copy distributions together to get the final distributions 120 | # final_dists is a list length max_dec_steps; each entry is a tensor shape (batch_size, extended_vsize) giving the final distribution for that decoder timestep 121 | # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore. 122 | final_dists = [vocab_dist + copy_dist for (vocab_dist,copy_dist) in zip(vocab_dists_extended, attn_dists_projected)] 123 | 124 | return final_dists -------------------------------------------------------------------------------- /V2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIKevin/Pointer_Generator_Summarizer/bd8f89b1c0a3c65adef3b12c2f7d06c3e4688638/V2/__init__.py -------------------------------------------------------------------------------- /V2/data_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Data_preprocess.ipynb 3 | 4 | Automatically generated by Colaboratory. 
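
As a worked illustration of what _calc_final_dist above does for a single decoder step, the sketch below mixes a vocabulary distribution and an attention distribution with p_gen and scatter-adds the copy probabilities onto extended-vocabulary ids. All the numbers, the tiny vocabulary size and the source ids are invented; the real op works on batched TensorFlow tensors via tf.scatter_nd.

import numpy as np

vocab_size, batch_oov_len = 5, 2                           # tiny vocabulary plus 2 in-article OOVs
p_gen = 0.8
vocab_dist = np.array([0.1, 0.4, 0.2, 0.2, 0.1])           # P_vocab over the fixed vocabulary
attn_dist = np.array([0.5, 0.3, 0.2])                      # attention over 3 source positions
enc_extend_vocab = np.array([1, 5, 6])                     # source ids; 5 and 6 are temporary OOV ids

final = np.zeros(vocab_size + batch_oov_len)
final[:vocab_size] = p_gen * vocab_dist                    # generator part, zero-padded for OOV slots
np.add.at(final, enc_extend_vocab, (1 - p_gen) * attn_dist)  # copy part, scattered by source id
print(final.round(3))                 # [0.08 0.42 0.16 0.16 0.08 0.06 0.04]
print(np.isclose(final.sum(), 1.0))   # True: the mixture is still a probability distribution
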
5 | 6 | 7 | # DATA 8 | """ 9 | 10 | import numpy as np 11 | import glob 12 | import random 13 | import struct 14 | import csv 15 | from tensorflow.core.example import example_pb2 16 | import tensorflow as tf 17 | 18 | from threading import Thread 19 | from queue import Queue 20 | import time 21 | import threading 22 | 23 | """## Vocabulary""" 24 | 25 | SENTENCE_START = '' 26 | SENTENCE_END = '' 27 | 28 | PAD_TOKEN = '[PAD]' 29 | UNKNOWN_TOKEN = '[UNK]' 30 | START_DECODING = '[START]' 31 | STOP_DECODING = '[STOP]' 32 | 33 | class Vocab: 34 | 35 | def __init__(self, vocab_file, max_size): 36 | 37 | self.word2id = {UNKNOWN_TOKEN : 0, PAD_TOKEN : 1, START_DECODING : 2, STOP_DECODING : 3} 38 | self.id2word = {0 : UNKNOWN_TOKEN, 1 : PAD_TOKEN, 2 : START_DECODING, 3 : STOP_DECODING} 39 | self.count = 4 40 | 41 | with open(vocab_file, 'r') as f: 42 | for line in f: 43 | pieces = line.split() 44 | if len(pieces) != 2 : 45 | print('Warning : incorrectly formatted line in vocabulary file : %s\n' % line) 46 | continue 47 | 48 | w = pieces[0] 49 | if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]: 50 | raise Exception(', , [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w) 51 | 52 | if w in self.word2id: 53 | raise Exception('Duplicated word in vocabulary file: %s' % w) 54 | 55 | self.word2id[w] = self.count 56 | self.id2word[self.count] = w 57 | self.count += 1 58 | if max_size != 0 and self.count >= max_size: 59 | print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self.count)) 60 | break 61 | 62 | print("Finished constructing vocabulary of %i total words. Last word added: %s" % (self.count, self.id2word[self.count-1])) 63 | 64 | 65 | def word_to_id(self, word): 66 | if word not in self.word2id: 67 | return self.word2id[UNKNOWN_TOKEN] 68 | return self.word2id[word] 69 | 70 | def id_to_word(self, word_id): 71 | if word_id not in self.id2word: 72 | raise ValueError('Id not found in vocab: %d' % word_id) 73 | return self.id2word[word_id] 74 | 75 | def size(self): 76 | return self.count 77 | 78 | 79 | 80 | """## Data helpers""" 81 | 82 | def article_to_ids(article_words, vocab): 83 | ids = [] 84 | oovs = [] 85 | unk_id = vocab.word_to_id(UNKNOWN_TOKEN) 86 | for w in article_words: 87 | i = vocab.word_to_id(w) 88 | if i == unk_id: # If w is OOV 89 | if w not in oovs: # Add to list of OOVs 90 | oovs.append(w) 91 | oov_num = oovs.index(w) # This is 0 for the first article OOV, 1 for the second article OOV... 92 | ids.append(vocab.size() + oov_num) # This is e.g. 50000 for the first article OOV, 50001 for the second... 
93 | else: 94 | ids.append(i) 95 | return ids, oovs 96 | 97 | 98 | def abstract_to_ids(abstract_words, vocab, article_oovs): 99 | ids = [] 100 | unk_id = vocab.word_to_id(UNKNOWN_TOKEN) 101 | for w in abstract_words: 102 | i = vocab.word_to_id(w) 103 | if i == unk_id: # If w is an OOV word 104 | if w in article_oovs: # If w is an in-article OOV 105 | vocab_idx = vocab.size() + article_oovs.index(w) # Map to its temporary article OOV number 106 | ids.append(vocab_idx) 107 | else: # If w is an out-of-article OOV 108 | ids.append(unk_id) # Map to the UNK token id 109 | else: 110 | ids.append(i) 111 | return ids 112 | 113 | 114 | 115 | def output_to_words(id_list, vocab, article_oovs): 116 | words = [] 117 | for i in id_list: 118 | try: 119 | w = vocab.id_to_word(i) # might be [UNK] 120 | except ValueError as e: # w is OOV 121 | assert article_oovs is not None, "Error: model produced a word ID that isn't in the vocabulary. This should not happen in baseline (no pointer-generator) mode" 122 | article_oov_idx = i - vocab.size() 123 | try: 124 | w = article_oovs[article_oov_idx] 125 | except ValueError as e: # i doesn't correspond to an article oov 126 | raise ValueError('Error: model produced word ID %i which corresponds to article OOV %i but this example only has %i article OOVs' % (i, article_oov_idx, len(article_oovs))) 127 | words.append(w) 128 | return words 129 | 130 | 131 | 132 | def abstract_to_sents(abstract): 133 | """Splits abstract text from datafile into list of sentences. 134 | Args: 135 | abstract: string containing and tags for starts and ends of sentences 136 | Returns: 137 | sents: List of sentence strings (no tags)""" 138 | cur = 0 139 | sents = [] 140 | while True: 141 | try: 142 | start_p = abstract.index(SENTENCE_START, cur) 143 | end_p = abstract.index(SENTENCE_END, start_p + 1) 144 | cur = end_p + len(SENTENCE_END) 145 | sents.append(abstract[start_p+len(SENTENCE_START):end_p]) 146 | except ValueError as e: # no more sentences 147 | return sents 148 | 149 | 150 | 151 | def example_generator(data_path, hpm): 152 | while True: 153 | filelist = glob.glob(data_path) # get the list of datafiles 154 | assert filelist, ('Error: Empty filelist at %s' % data_path) # check filelist isn't empty 155 | if hpm['singlepass']: 156 | filelist = sorted(filelist) 157 | else: 158 | random.shuffle(filelist) 159 | for f in filelist: 160 | reader = open(f, 'rb') 161 | while True: 162 | len_bytes = reader.read(8) 163 | if not len_bytes: break # finished reading this file 164 | str_len = struct.unpack('q', len_bytes)[0] 165 | example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0] 166 | yield example_pb2.Example.FromString(example_str) 167 | if hpm['singlepass'] or hpm['finished']: 168 | print("example_generator completed reading all datafiles. No more data.") 169 | break 170 | 171 | 172 | 173 | """# Batcher""" 174 | 175 | class Example(object): 176 | """Class representing a train/val/test example for text summarization.""" 177 | def __init__(self, article, abstract_sentences, vocab, hpm): 178 | """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. 179 | Args: 180 | article: source text; a string. each token is separated by a single space. 181 | abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. 
182 | vocab: Vocabulary object 183 | hps: hyperparameters 184 | """ 185 | self.hpm = hpm 186 | 187 | # Get ids of special tokens 188 | start_decoding = vocab.word_to_id(START_DECODING) 189 | stop_decoding = vocab.word_to_id(STOP_DECODING) 190 | 191 | # Process the article 192 | article_words = article.split() 193 | if len(article_words) > hpm['max_enc_len']: 194 | article_words = article_words[:hpm['max_enc_len']] 195 | self.enc_len = len(article_words) # store the length after truncation but before padding 196 | self.enc_input = [vocab.word_to_id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token 197 | 198 | # Process the abstract 199 | abstract = ' '.join(abstract_sentences) # string 200 | abstract_words = abstract.split() # list of strings 201 | abs_ids = [vocab.word_to_id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token 202 | 203 | # Get the decoder input sequence and target sequence 204 | self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hpm['max_dec_len'], start_decoding, stop_decoding) 205 | self.dec_len = len(self.dec_input) 206 | 207 | # If using pointer-generator mode, we need to store some extra info 208 | if hpm['pointer_gen']: 209 | # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves 210 | self.enc_input_extend_vocab, self.article_oovs = article_to_ids(article_words, vocab) 211 | 212 | # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id 213 | abs_ids_extend_vocab = abstract_to_ids(abstract_words, vocab, self.article_oovs) 214 | 215 | # Overwrite decoder target sequence so it uses the temp article OOV ids 216 | _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hpm['max_dec_len'], start_decoding, stop_decoding) 217 | 218 | # Store the original strings 219 | self.original_article = article 220 | self.original_abstract = abstract 221 | self.original_abstract_sents = abstract_sentences 222 | 223 | 224 | def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id): 225 | """Given the reference summary as a sequence of tokens, return the input sequence for the decoder, and the target sequence which we will use to calculate loss. The sequence will be truncated if it is longer than max_len. The input sequence must start with the start_id and the target sequence must end with the stop_id (but not if it's been truncated). 
226 | Args: 227 | sequence: List of ids (integers) 228 | max_len: integer 229 | start_id: integer 230 | stop_id: integer 231 | Returns: 232 | inp: sequence length <=max_len starting with start_id 233 | target: sequence same length as input, ending with stop_id only if there was no truncation 234 | """ 235 | inp = [start_id] + sequence[:] 236 | target = sequence[:] 237 | if len(inp) > max_len: # truncate 238 | inp = inp[:max_len] 239 | target = target[:max_len] # no end_token 240 | else: # no truncation 241 | target.append(stop_id) # end token 242 | assert len(inp) == len(target) 243 | return inp, target 244 | 245 | 246 | def pad_decoder_inp_targ(self, max_len, pad_id): 247 | """Pad decoder input and target sequences with pad_id up to max_len.""" 248 | while len(self.dec_input) < max_len: 249 | self.dec_input.append(pad_id) 250 | while len(self.target) < max_len: 251 | self.target.append(pad_id) 252 | 253 | 254 | def pad_encoder_input(self, max_len, pad_id): 255 | """Pad the encoder input sequence with pad_id up to max_len.""" 256 | while len(self.enc_input) < max_len: 257 | self.enc_input.append(pad_id) 258 | if self.hpm['pointer_gen']: 259 | while len(self.enc_input_extend_vocab) < max_len: 260 | self.enc_input_extend_vocab.append(pad_id) 261 | 262 | 263 | 264 | 265 | class Batch(object): 266 | """Class representing a minibatch of train/val/test examples for text summarization.""" 267 | 268 | def __init__(self, example_list, hpm, vocab): 269 | """Turns the example_list into a Batch object. 270 | Args: 271 | example_list: List of Example objects 272 | hpm: hyperparameters 273 | vocab: Vocabulary object 274 | """ 275 | self.pad_id = vocab.word_to_id(PAD_TOKEN) # id of the PAD token used to pad sequences 276 | self.init_encoder_seq(example_list, hpm) # initialize the input to the encoder 277 | self.init_decoder_seq(example_list, hpm) # initialize the input and targets for the decoder 278 | self.store_orig_strings(example_list) # store the original strings 279 | 280 | def init_encoder_seq(self, example_list, hpm): 281 | """Initializes the following: 282 | self.enc_batch: 283 | numpy array of shape (batch_size, <=max_enc_steps) containing integer ids (all OOVs represented by UNK id), padded to length of longest sequence in the batch 284 | self.enc_lens: 285 | numpy array of shape (batch_size) containing integers. The (truncated) length of each encoder input sequence (pre-padding). 286 | self.enc_padding_mask: 287 | numpy array of shape (batch_size, <=max_enc_steps), containing 1s and 0s. 1s correspond to real tokens in enc_batch and target_batch; 0s correspond to padding. 288 | If hps.pointer_gen, additionally initializes the following: 289 | self.max_art_oovs: 290 | maximum number of in-article OOVs in the batch 291 | self.art_oovs: 292 | list of list of in-article OOVs (strings), for each example in the batch 293 | self.enc_batch_extend_vocab: 294 | Same as self.enc_batch, but in-article OOVs are represented by their temporary article OOV number. 295 | """ 296 | # Determine the maximum length of the encoder input sequence in this batch 297 | max_enc_seq_len = max([ex.enc_len for ex in example_list]) 298 | 299 | # Pad the encoder input sequences up to the length of the longest sequence 300 | for ex in example_list: 301 | ex.pad_encoder_input(max_enc_seq_len, self.pad_id) 302 | 303 | # Initialize the numpy arrays 304 | # Note: our enc_batch can have different length (second dimension) for each batch because we use dynamic_rnn for the encoder. 
305 | self.enc_batch = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.int32) 306 | self.enc_lens = np.zeros((hpm['batch_size']), dtype=np.int32) 307 | self.enc_padding_mask = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.float32) 308 | 309 | # Fill in the numpy arrays 310 | for i, ex in enumerate(example_list): 311 | self.enc_batch[i, :] = ex.enc_input[:] 312 | self.enc_lens[i] = ex.enc_len 313 | for j in range(ex.enc_len): 314 | self.enc_padding_mask[i][j] = 1 315 | 316 | # For pointer-generator mode, need to store some extra info 317 | if hpm['pointer_gen']: 318 | # Determine the max number of in-article OOVs in this batch 319 | self.max_art_oovs = max([len(ex.article_oovs) for ex in example_list]) 320 | # Store the in-article OOVs themselves 321 | self.art_oovs = [ex.article_oovs for ex in example_list] 322 | # Store the version of the enc_batch that uses the article OOV ids 323 | self.enc_batch_extend_vocab = np.zeros((hpm['batch_size'], max_enc_seq_len), dtype=np.int32) 324 | for i, ex in enumerate(example_list): 325 | self.enc_batch_extend_vocab[i, :] = ex.enc_input_extend_vocab[:] 326 | 327 | def init_decoder_seq(self, example_list, hpm): 328 | """Initializes the following: 329 | self.dec_batch: 330 | numpy array of shape (batch_size, max_dec_steps), containing integer ids as input for the decoder, padded to max_dec_steps length. 331 | self.target_batch: 332 | numpy array of shape (batch_size, max_dec_steps), containing integer ids for the target sequence, padded to max_dec_steps length. 333 | self.dec_padding_mask: 334 | numpy array of shape (batch_size, max_dec_steps), containing 1s and 0s. 1s correspond to real tokens in dec_batch and target_batch; 0s correspond to padding. 335 | """ 336 | # Pad the inputs and targets 337 | for ex in example_list: 338 | ex.pad_decoder_inp_targ(hpm['max_dec_len'], self.pad_id) 339 | 340 | # Initialize the numpy arrays. 341 | # Note: our decoder inputs and targets must be the same length for each batch (second dimension = max_dec_steps) because we do not use a dynamic_rnn for decoding. However I believe this is possible, or will soon be possible, with Tensorflow 1.0, in which case it may be best to upgrade to that. 
342 | self.dec_batch = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.int32) 343 | self.target_batch = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.int32) 344 | self.dec_padding_mask = np.zeros((hpm['batch_size'], hpm['max_dec_len']), dtype=np.float32) 345 | 346 | # Fill in the numpy arrays 347 | for i, ex in enumerate(example_list): 348 | self.dec_batch[i, :] = ex.dec_input[:] 349 | self.target_batch[i, :] = ex.target[:] 350 | for j in range(ex.dec_len): 351 | self.dec_padding_mask[i][j] = 1 352 | 353 | def store_orig_strings(self, example_list): 354 | """Store the original article and abstract strings in the Batch object""" 355 | self.original_articles = [ex.original_article for ex in example_list] # list of lists 356 | self.original_abstracts = [ex.original_abstract for ex in example_list] # list of lists 357 | self.original_abstracts_sents = [ex.original_abstract_sents for ex in example_list] # list of list of lists 358 | 359 | 360 | 361 | 362 | class Batcher(): 363 | 364 | def __init__(self,data_path, hpm, vocab): 365 | self.hpm = hpm 366 | self.vocab = vocab 367 | self.max_examples_buffer_len = hpm['examples_max_buffer_len'] 368 | self.max_batch_buffer_len = hpm['batch_max_buffer_len'] 369 | self.max_batch_bucket_len = hpm['max_batch_bucket_len'] 370 | self.gen = self.thread_safe_generator(self.generator(example_generator(data_path, hpm))) 371 | self.num_fill_examples_threads = 4 372 | self.num_fill_batches_threads = 4 373 | self.elements_queue = Queue(self.max_examples_buffer_len) 374 | self.batch_queue = Queue(self.max_batch_buffer_len) 375 | self.launch_watch_threads() 376 | 377 | 378 | class thread_safe_generator(object): 379 | def __init__(self, gen): 380 | self.gen = gen 381 | self.lock = threading.Lock() 382 | 383 | def __next__(self): 384 | with self.lock: 385 | return next(self.gen) 386 | 387 | 388 | def generator(self, example_gen): 389 | while True : 390 | e = next(example_gen) 391 | try: 392 | article_text = e.features.feature['article'].bytes_list.value[0].decode() 393 | abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode() 394 | except ValueError: 395 | tf.logging.error('Failed to get article or abstract from example') 396 | continue 397 | if len(article_text) == 0 : 398 | tf.logging.warning('Found an example with empty article text. 
Skipping it.') 399 | 400 | else: 401 | yield (article_text, abstract_text) 402 | 403 | 404 | 405 | def fill_examples_queue(self): 406 | while True: 407 | try: 408 | article, abstract = next(self.gen) 409 | abst = [sent.strip() for sent in abstract_to_sents(abstract)] 410 | ex = Example(article, abst,self.vocab, self.hpm) 411 | self.elements_queue.put(ex) 412 | except : 413 | break 414 | 415 | 416 | 417 | def fill_batch_queue(self): 418 | while True: 419 | try: 420 | if not self.hpm['decode']: 421 | batch = [] 422 | for _ in range(self.hpm['batch_size']*self.hpm['max_batch_bucket_len']): 423 | batch.append(self.elements_queue.get()) 424 | 425 | batch = sorted(batch, key=lambda x : x.enc_len) 426 | batches= [] 427 | i = 0 428 | while i+self.hpm['batch_size'] <= len(batch): 429 | batches.append(batch[i:i+self.hpm['batch_size']]) 430 | i = i + self.hpm['batch_size'] 431 | 432 | if i < len(batch): 433 | batches.append(batch[i:len(batch)]) 434 | 435 | if not self.hpm['singlepass']: 436 | random.shuffle(batches) 437 | 438 | for b in batches: 439 | # here again we crete batch object before doing pushing it to the batch queue 440 | self.batch_queue.put(Batch(b, self.hpm, self.vocab)) 441 | else: 442 | ex = self.elements_queue.get() 443 | b = [ex for _ in range(self.hpm['batch_size'])] 444 | self.batch_queue.put(Batch(b, self.hpm, self.vocab)) 445 | 446 | except : 447 | break 448 | 449 | def launch_watch_threads(self): 450 | 451 | self.elements_queue_threads = [] 452 | for i in range(self.num_fill_examples_threads): 453 | self.elements_queue_threads.append(Thread(target=self.fill_examples_queue)) 454 | self.elements_queue_threads[-1].setDaemon(True) 455 | self.elements_queue_threads[-1].start() 456 | 457 | 458 | self.batch_queue_threads = [] 459 | for j in range(self.num_fill_batches_threads): 460 | self.batch_queue_threads.append(Thread(target = self.fill_batch_queue)) 461 | self.batch_queue_threads[-1].setDaemon(True) 462 | self.batch_queue_threads[-1].start() 463 | 464 | 465 | def watch(): 466 | while True: 467 | time.sleep(60) 468 | for id, t in enumerate(self.elements_queue_threads): 469 | if not t.is_alive() : 470 | print("thread dead") 471 | new_t = Thread(target = self.fill_batch_queue) 472 | self.elements_queue_threads[id] = new_t 473 | new_t.daemon = True 474 | new_t.start() 475 | 476 | for id, t in enumerate(self.batch_queue_threads): 477 | if not t.is_alive() : 478 | print("batch thread dead") 479 | new_t = Thread(target=self.fill_batch_queue) 480 | self.batch_queue_threads[id] = new_t 481 | new_t.setDaemon(True) 482 | new_t.start() 483 | 484 | if not self.hpm['singlepass'] : 485 | self.watcher = Thread(target = watch) 486 | self.watcher.setDaemon(True) 487 | self.watcher.start() 488 | 489 | 490 | 491 | 492 | def next_batch(self): 493 | 494 | if self.batch_queue.qsize() ==0: 495 | tf.logging.warning('Bucket input queue is empty when calling next_batch. Bucket queue size: %i, Input queue size: %i', self.batch_queue.qsize(), self.elements_queue.qsize()) 496 | if self.hpm['singlepass'] or self.hpm['finished']: 497 | tf.logging.info("Finished reading dataset in single_pass mode.") 498 | return None 499 | return self.batch_queue.get() 500 | 501 | -------------------------------------------------------------------------------- /V2/main_gen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_main.ipynb 3 | 4 | Automatically generated by Colaboratory. 
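
To make the temporary OOV numbering used by article_to_ids / abstract_to_ids above concrete, here is a toy run with an invented six-word vocabulary; in the real pipeline vocab.size() is around 50000, so the first in-article OOV gets id 50000, the second 50001, and so on.

# Toy illustration of the in-article OOV id scheme; vocabulary and words are made up.
vocab = {'[UNK]': 0, '[PAD]': 1, '[START]': 2, '[STOP]': 3, 'the': 4, 'cat': 5}
vocab_size = len(vocab)

def article_to_ids_demo(words):
    ids, oovs = [], []
    for w in words:
        if w in vocab:
            ids.append(vocab[w])
        else:
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab_size + oovs.index(w))   # temporary id, unique per article OOV
    return ids, oovs

ids, oovs = article_to_ids_demo(['the', 'zorp', 'cat', 'zorp', 'glim'])
print(ids)    # [4, 6, 5, 6, 7]  ('zorp' -> 6, 'glim' -> 7)
print(oovs)   # ['zorp', 'glim']
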
5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | import os 13 | import glob 14 | 15 | from data_preprocess import Vocab 16 | from data_preprocess import Batcher 17 | from data_preprocess import output_to_words 18 | 19 | from model import SummarizationModel 20 | 21 | from train_test_eval import get_config 22 | from train_test_eval import run_training 23 | from train_test_eval import restore_model 24 | from train_test_eval import total_num_params 25 | 26 | hpm={"hidden_size": 256 , 27 | "emb_size": 128, 28 | "attn_hidden_size":512, 29 | 30 | "batch_size":16 , 31 | 'beam_size':4, 32 | 33 | "max_enc_len": 400, 34 | 'max_dec_len':100, 35 | 'min_dec_steps':35, 36 | 'max_dec_steps':100, 37 | 38 | 39 | "pointer_gen":True, 40 | "coverage":True, 41 | "add_coverage":False, 42 | 43 | "training":True, 44 | 'decode':False, 45 | 'eval' : False, 46 | 47 | 48 | 'vocab_size':50000, 49 | 50 | 'examples_max_buffer_len' : 40, 51 | 'batch_max_buffer_len': 10, 52 | 'max_batch_bucket_len':5 , 53 | 54 | 'finished':False, 55 | 'singlepass':False, 56 | 57 | 'max_grad_norm':0.8, 58 | 'adagrad_init_acc':0.1, 59 | 'learning_rate':0.15, 60 | 'rand_unif_init_mag':0.02, 61 | 'trunc_norm_init_std':1e-4, 62 | 'cov_loss_weight':1.0, 63 | 64 | 'teacher_forcing' : True 65 | } 66 | 67 | 68 | vocab_path = "/content/gdrive/My Drive/cnn_stories/vocab" 69 | data_path = "/content/gdrive/My Drive/cnn_stories/train2/*" 70 | checkpoint_dir = "/content/gdrive/My Drive/pointer_gen/checkpoints/" 71 | model_path = "/content/gdrive/My Drive/pointer_gen/checkpoints/model.ckpt-33001" 72 | logdir = "/content/gdrive/My Drive/pointer_gen/logdir" 73 | GAN_gen_checkpoint = "/content/gdrive/My Drive/pointer_gen/GAN_gen_checkpoint/GAN_gen_checkpoint.ckpt" 74 | training_steps = 230000 75 | 76 | 77 | 78 | def build_graph(): 79 | tf.reset_default_graph() 80 | tf.logging.info('Building the model.') 81 | if hpm['decode'] : 82 | hpm['max_dec_len'] = 1 83 | mod = SummarizationModel(hpm) 84 | tf.logging.info('Building the graph.') 85 | mod.add_placeholder() 86 | 87 | device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0" 88 | with tf.device(device): 89 | mod.build_graph() 90 | if hpm['training'] or hpm['eval']: 91 | tf.logging.info('Adding training ops.') 92 | mod.add_loss() 93 | mod.add_train_op(device) 94 | if hpm['decode']: 95 | assert mod.hpm['batch_size'] == mod.hpm['beam_size'] 96 | mod.add_top_k_likely_outputs() 97 | 98 | if not hpm['teacher_forcing']: 99 | mod.add_loss() 100 | #mod.add_top_k_likely_outputs() 101 | #mod.add_prob_logits_samples() 102 | return mod 103 | 104 | 105 | 106 | def main(): 107 | 108 | mod = build_graph() 109 | 110 | if hpm['eval']: 111 | pass 112 | 113 | if hpm['decode']: 114 | s = tf.Session(config=get_config()) 115 | init = tf.global_variables_initializer() 116 | s.run(init) 117 | restore_model(s, hpm, model_path=model_path, check_path = checkpoint_dir) 118 | return s, mod 119 | # and then we can call the beam_decode of the model to decode th summary (will be implemented later) 120 | 121 | if hpm['training']: 122 | tf.logging.info('Vocab and Batcher creation') 123 | vocab = Vocab(vocab_path, hpm['vocab_size']) 124 | batcher = Batcher(data_path, hpm, vocab) 125 | tf.logging.info('Starting training.') 126 | try: 127 | run_training(mod, batcher, hpm, training_steps, checkpoint_dir, logdir) 128 | except KeyboardInterrupt: 129 | tf.logging.info('stop training.') 130 | 131 | if not hpm['teacher_forcing']: 132 | tf.logging.info('Creating the generator 
for the GAN') 133 | with tf.Session(config=get_config()) as s: 134 | init = tf.global_variables_initializer() 135 | s.run(init) 136 | restore_model(s,hpm, model_path=model_path, check_path=checkpoint_dir) 137 | saver = tf.train.Saver() 138 | saver_path = saver.save(s, GAN_gen_checkpoint) 139 | tf.logging.info(saver_path) 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | 145 | -------------------------------------------------------------------------------- /V2/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_model.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/11cNRDFW5_4tCVGjX-5L1OTuS7Bi9lq5R 8 | """ 9 | 10 | import numpy as np 11 | import random 12 | import tensorflow as tf 13 | import tensorflow.nn as nn 14 | 15 | from modules import Encoder 16 | from modules import Attention_decoder 17 | from utils import _mask_and_avg 18 | 19 | 20 | class SummarizationModel(): 21 | """ 22 | The pointer generator model 23 | Args: 24 | hpm : hyperparameters 25 | """ 26 | 27 | def __init__(self, hpm): 28 | self.hpm = hpm 29 | # encoder and attentional decoder objects 30 | self.encoder = Encoder(self.hpm) 31 | self.decoder = Attention_decoder(self.hpm) 32 | 33 | # a global step counter for the training 34 | self.step = tf.train.get_or_create_global_step() 35 | 36 | 37 | 38 | 39 | def add_placeholder(self): 40 | """ Adding placeholders to the model """ 41 | 42 | with tf.variable_scope("placeholder"): 43 | self.enc_batch = tf.placeholder(tf.int32, [self.hpm['batch_size'], None], name='enc_batch') # encoder input sequences (the 2nd dimension -max_enc_len- 44 | # of the shape is None because it varies with the batch) 45 | self.enc_mask = tf.placeholder(tf.float32, [self.hpm['batch_size'], None], name='enc_mask') # encoder input sequences masks 46 | self.enc_lens = tf.placeholder(tf.int32, [self.hpm['batch_size']], 'enc_lens') # lengths of the input sequences 47 | 48 | if self.hpm['pointer_gen']: 49 | self.enc_extend_vocab = tf.placeholder(tf.int32, [self.hpm['batch_size'], None], 'enc_extend_vocab') # encoder input sequences with oovs ids 50 | self.max_art_oovs = tf.placeholder(tf.int32, [], 'max_art_oovs') # maximum number of oovs for the current batch 51 | 52 | self.dec_batch = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='dec_batch') # decoder input sequences (max_dec_len = 1 in decode mode) 53 | self.dec_target = tf.placeholder(tf.int32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='target_batch') 54 | self.dec_mask = tf.placeholder(tf.float32, [self.hpm['batch_size'], self.hpm['max_dec_len']], name='dec_mask') # decoder input masks tensors 55 | 56 | 57 | 58 | 59 | 60 | def build_graph(self): 61 | """ Graph building method""" 62 | with tf.variable_scope("embedding"): 63 | 64 | self.inp_embed = tf.keras.layers.Embedding( self.hpm['vocab_size'], self.hpm['emb_size'], name='inp_embed') # encoder input embeddings 65 | self.dec_embed = tf.keras.layers.Embedding( self.hpm['vocab_size'], self.hpm['emb_size'], name='dec_embed') # decoder input embeddings 66 | 67 | # we lookup the encoder input in the embedding matrix 68 | inps = self.inp_embed(self.enc_batch) # shape : [batch_size, , embed_size] 69 | # we lookup the decoder input in the embedding matrix 70 | dec = tf.transpose(self.dec_batch, perm=[1,0]) 71 | dec_inps = self.dec_embed( dec) # shape : [max_dec_len, batch_size, 
embed_size] 72 | # we add the encoder ops 73 | self.enc_outputs, self.dec_state_h, self.dec_state_c = self.encoder(inps) 74 | 75 | 76 | 77 | self.cov_vec = tf.zeros(shape=[self.hpm['batch_size'],tf.shape(self.enc_outputs)[1] ] , dtype=tf.float32, name="cov_vec") 78 | # we add the decoder ops 79 | self.enc_outputs = tf.identity(self.enc_outputs, "enc_outputs") 80 | self.dec_state_h = tf.identity(self.dec_state_h, "dec_state_h") 81 | self.dec_state_c = tf.identity(self.dec_state_c, "dec_state_c") 82 | 83 | self.returns = self.decoder(self.enc_outputs, self.enc_mask,self.dec_state_h, self.dec_state_c, dec_inps, self.max_art_oovs , self.enc_extend_vocab, self.cov_vec) 84 | 85 | self.returns['last_context_vector'] = tf.identity(self.returns['last_context_vector'],name="last_context_vector") 86 | 87 | self.returns['attention_vec'] = tf.identity(self.returns['attention_vec'], name="attention_vec") 88 | 89 | #self.returns['coverage'] = tf.identity(self.returns['coverage'] , name="coverage") 90 | self.returns['p_gen'] = tf.identity(self.returns['p_gen'], name="p_gen") 91 | 92 | self.returns['coverage'] = tf.identity(self.returns['coverage'], "coverage") 93 | 94 | self.returns['dec_state_h'] = tf.identity(self.returns['dec_state_h'], 'new_dec_state_h') 95 | self.returns['dec_state_c'] = tf.identity(self.returns['dec_state_c'], 'new_dec_state_c') 96 | 97 | self.returns['output'] = tf.identity(self.returns['output'], "logits") 98 | 99 | if not self.hpm['teacher_forcing']: 100 | self.returns['argmax_seqs'] = tf.identity(self.returns['argmax_seqs'], "argmax_seqs") 101 | self.returns['argmax_log_probs'] = tf.identity(self.returns['argmax_log_probs'], "argmax_log_probs") 102 | self.returns['samples_seqs'] = tf.identity(self.returns['samples_seqs'], "samples_seqs") 103 | self.returns['samples_log_probs'] = tf.identity(self.returns['samples_log_probs'], "samples_log_probs") 104 | 105 | 106 | 107 | 108 | 109 | def make_feed_dict(self, batch): 110 | """ 111 | Args: 112 | batch : Batch Object 113 | Return: 114 | A dictionary to feed the model during training 115 | """ 116 | feed_dict = {} 117 | 118 | feed_dict[self.enc_batch] = batch.enc_batch 119 | feed_dict[self.enc_mask] = batch.enc_padding_mask 120 | feed_dict[self.enc_lens] = batch.enc_lens 121 | 122 | if self.hpm['pointer_gen']: 123 | feed_dict[self.enc_extend_vocab] = batch.enc_batch_extend_vocab 124 | feed_dict[self.max_art_oovs] = batch.max_art_oovs 125 | 126 | feed_dict[self.dec_batch] = batch.dec_batch 127 | feed_dict[self.dec_target] = batch.target_batch 128 | feed_dict[self.dec_mask] = batch.dec_padding_mask 129 | 130 | return feed_dict 131 | 132 | 133 | 134 | def add_loss(self): 135 | """ We add the loss computation op """ 136 | with tf.variable_scope('loss'): 137 | 138 | if self.hpm['pointer_gen']: #if pointer_gen we apply the cross_entropy function ourselves: 139 | # we compute the log of the predicted probability of the target target word (this is the probability we must maximize) 140 | loss_per_step = [] 141 | batch_nums = tf.range(0, limit=self.hpm['batch_size']) # shape (batch_size) 142 | for dec_step, dist in enumerate(tf.unstack(self.returns['output'])): 143 | targets = self.dec_target[:,dec_step] # The indices of the target words. shape (batch_size) 144 | indices = tf.stack( (batch_nums, targets), axis=1) # shape (batch_size, 2) 145 | gold_probs = tf.gather_nd(dist, indices) # shape (batch_size). 
prob of correct words on this step 146 | losses = -tf.log(gold_probs) 147 | loss_per_step.append(losses) 148 | 149 | self.loss = _mask_and_avg(loss_per_step, self.dec_mask) # we drop the loss of the pad tokens 150 | 151 | else: 152 | self.loss = tf.contrib.seq2seq.sequence_loss(tf.stack(self.returns['output'], axis=1), self.dec_batch, self.dec_mask) 153 | #if not pointer_gen, we compute the softmax, and the sequence to squence cross_entropy loss with this helper function 154 | 155 | tf.summary.scalar('loss', self.loss) 156 | self.total_loss = self.loss 157 | if self.hpm['coverage']: 158 | 159 | # nested function 160 | def coverage_loss(self): 161 | """ coverage loss computation""" 162 | covlosses = [] 163 | coverage = tf.zeros_like(tf.unstack(self.returns['attention_vec'][0])) 164 | for a in tf.unstack(self.returns['attention_vec']): # a in an attention vector at time step t 165 | covloss = tf.reduce_sum(tf.minimum(a, coverage ), 1) 166 | covlosses.append(covloss) 167 | coverage += a 168 | coverage_loss = _mask_and_avg(covlosses, self.enc_mask) # we drop the pad tokens loss and compute the avg loss 169 | return coverage_loss 170 | 171 | self.coverage_loss = coverage_loss(self) 172 | self.coverage_loss = tf.identity(self.coverage_loss, name="coverage_loss") 173 | if self.hpm['add_coverage']: 174 | tf.summary.scalar('coverage_loss', self.coverage_loss) 175 | if self.hpm['add_coverage']: 176 | self.total_loss += self.hpm['cov_loss_weight']* self.coverage_loss # we weight the coverage loss and add it to thhe total loss 177 | # the total loss = seq2seq_loss + coverage_loss (if coverage = True) 178 | tf.summary.scalar('total_loss', self.total_loss) 179 | 180 | self.loss = tf.identity(self.loss, name="loss") 181 | self.total_loss = tf.identity(self.total_loss, name="total_loss") 182 | 183 | 184 | 185 | def add_train_op(self, device): 186 | """We add the training op to the graph""" 187 | loss_to_minimize = self.total_loss 188 | variables = tf.trainable_variables() # we recover all the trainable parameters 189 | gradients = tf.gradients(loss_to_minimize, variables, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE ) # we compute the gradients of the loss with respect to all the parameters (backpropagation) 190 | 191 | with tf.device(device): 192 | grads, global_norm = tf.clip_by_global_norm(gradients, self.hpm['max_grad_norm']) # we clip the gradients 193 | 194 | optimizer = tf.train.AdagradOptimizer(self.hpm['learning_rate'], initial_accumulator_value=self.hpm['adagrad_init_acc'], ) # we create the optimizer object 195 | with tf.device(device): 196 | self.train_op = optimizer.apply_gradients(zip(grads, variables), name='train_step', global_step=self.step) # Gradient descent (we update the parameters) 197 | # this is the training op 198 | 199 | self.summaries = tf.summary.merge_all() 200 | 201 | 202 | 203 | def setSession(self, sess): 204 | """ we set a session for the training""" 205 | self.sess = sess 206 | 207 | 208 | 209 | def train(self, batch): 210 | """We run the train op""" 211 | feed_dict = self.make_feed_dict(batch) 212 | to_return = {'train_op':self.train_op, 213 | 'loss':self.loss, 214 | 'global_step':self.step, 215 | 'summaries' : self.summaries} 216 | if (self.hpm['coverage']): 217 | to_return['coverage_loss'] = self.coverage_loss 218 | 219 | return self.sess.run(to_return, feed_dict) 220 | 221 | 222 | 223 | 224 | def add_top_k_likely_outputs(self): 225 | """We add an op to the graph that computes the top k output probabilities and their ids, used during decoding""" 226 | 
assert len(tf.unstack(self.returns['output'])) == 1
227 |         top_k_probs, self.top_k_ids = tf.nn.top_k(self.returns['output'][0], self.hpm['beam_size']*2)
228 |         self.top_k_log_probs = tf.log(top_k_probs, name="top_k_log_probs")
229 |         self.top_k_ids = tf.identity(self.top_k_ids, name="top_k_ids")
230 |         # we compute the log of the probabilities (given the size of the vocabulary, the probabilities are generally very small, so it is better to work with their logs)
231 | 
232 | 
233 | 
234 |     def add_prob_logits_samples(self):
235 |         outputs = tf.unstack(self.returns['output'])
236 |         batch_nums = tf.range(0, limit=self.hpm['batch_size'], dtype=tf.int64)
237 |         argmax_seqs = []
238 |         argmax_seqs_log_probs = []
239 |         for i , x in enumerate(outputs):
240 |             max_ids = tf.argmax(x, axis=-1)
241 |             indices = tf.stack((batch_nums, max_ids), axis = -1)
242 |             log_probs = tf.gather_nd(x, indices)
243 |             argmax_seqs.append(max_ids)
244 |             argmax_seqs_log_probs.append(log_probs)
245 | 
246 | 
247 |         self.outputs = self.returns['output']
248 |         if not self.hpm['pointer_gen']:
249 |             self.outputs = tf.nn.softmax(self.outputs)
250 | 
251 |         self.argmax_seqs = tf.stack(argmax_seqs, name='argmax_seqs')
252 |         self.argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs, name='argmax_seqs_log_probs')
253 | 
254 |         sampler = tf.distributions.Categorical(logits=outputs)
255 |         self.samples = sampler.sample(name='samples')
256 |         self.samples = tf.identity(self.samples, name='samples')
257 |         self.samples_log_probs = sampler.log_prob(self.samples, name="samples_log_probs")
258 |         self.samples_log_probs = tf.identity(self.samples_log_probs, name="samples_log_probs")
259 | 
260 | 
261 | 
262 |     def decode_onestep(self, sess, batch, enc_outputs, dec_state, dec_input, cov_vec):
263 |         """
264 |            Method to decode the output step by step (used for beamSearch decoding)
265 |            Args:
266 |                 sess : tf.Session object
267 |                 batch : current batch, shape = [beam_size, 1, vocab_size( + max_oov_len if pointer_gen)] (for the beam search decoding, batch_size = beam_size)
268 |                 enc_outputs : hidden outputs computed by the encoder LSTM
269 |                 dec_state : beam_size-many list of previous decoder states, LSTMStateTuple objects, shape = [beam_size, 2, hidden_size]
270 |                 dec_input : decoder input, the previously decoded batch_size-many words, shape = [beam_size, embed_size]
271 |                 cov_vec : beam_size-many list of previous coverage vectors
272 |            Returns: A dictionary of the results of all the ops computations (see below for more details)
273 |         """
274 | 
275 |         # dec_state is a batch_size-many list of LSTMStateTuple objects
276 |         # we have to transform it into one LSTMStateTuple object where c and h have shape : [beam_size, hidden_size]
277 | 
278 |         new_h = np.array([state[0] for state in dec_state ])
279 |         new_c = np.array([state[1] for state in dec_state ])
280 | 
281 |         # dictionary of all the ops that will be computed
282 |         to_return = {'last_context_vector' : self.returns['last_context_vector'],  # list of the previous context vectors, shape : [beam_size, 2 x hidden_size]
283 |                      'dec_state_h' : self.returns['dec_state_h'],  # beam_size-many list of LSTMStateTuple cells, where c and h have shape : [hidden_size]
284 |                      'dec_state_c' : self.returns['dec_state_c'],
285 |                      'top_k_ids' : self.top_k_ids, # top (2 x beam_size) ids of the most likely words to appear at the current time step
286 |                      'top_k_log_probs' : self.top_k_log_probs, # top (2 x beam_size) log probabilities of the most likely words to appear at the current time step
287 |                      'attention_vec':self.returns['attention_vec']} # beam_size-many list
of attention vectors, shape : [1, beam_size, max_enc_len] 288 | 289 | if self.hpm['coverage']: 290 | to_return['coverage'] = self.returns['coverage'] # beam_size-many list of coverage vectors , shape : [batch_size, max_enc_len] 291 | if self.hpm['pointer_gen']: 292 | to_return['p_gen'] = self.returns['p_gen'] # shape : [beam_size, 1] 293 | 294 | to_feed = {self.enc_outputs : enc_outputs, 295 | self.enc_mask : batch.enc_padding_mask, 296 | self.dec_batch : np.transpose(np.array([dec_input])), #shape : [beam_size, 1] 297 | self.dec_state_h : new_h, 298 | self.dec_state_c : new_c} 299 | 300 | if self.hpm['pointer_gen']: 301 | to_feed[self.enc_extend_vocab] = batch.enc_batch_extend_vocab 302 | to_feed[self.max_art_oovs] = batch.max_art_oovs 303 | 304 | if self.hpm['coverage']: 305 | to_feed[self.cov_vec] = cov_vec 306 | 307 | results = sess.run(to_return, to_feed) 308 | 309 | return results 310 | 311 | 312 | def beam_decode(self, sess, batch, vocab): 313 | 314 | # nested class 315 | class Hypothesis: 316 | """ Class designed to hold hypothesises throughout the beamSearch decoding """ 317 | def __init__(self, tokens, log_probs, state, attn_dists, p_gens, coverage): 318 | self.tokens = tokens # list of all the tokens from time 0 to the current time step t 319 | self.log_probs = log_probs # list of the log probabilities of the tokens of the tokens 320 | self.state = state # decoder state after the last token decoding 321 | self.attn_dists = attn_dists # attention dists of all the tokens 322 | self.p_gens = p_gens # generation probability of all the tokens 323 | self.coverage = coverage # coverage at the current time step t 324 | 325 | def extend(self, token, log_prob, state, attn_dist, p_gen, coverage): 326 | """Method to extend the current hypothesis by adding the next decoded toekn and all the informations associated with it""" 327 | return Hypothesis(tokens = self.tokens + [token], # we add the decoded token 328 | log_probs = self.log_probs + [log_prob], # we add the log prob of the decoded token 329 | state = state, # we update the state 330 | attn_dists = self.attn_dists + [attn_dist], # we add the attention dist of the decoded token 331 | p_gens = self.p_gens + [p_gen], # we add the p_gen 332 | coverage = coverage) # we update the coverage 333 | 334 | @property 335 | def latest_token(self): 336 | return self.tokens[-1] 337 | 338 | @property 339 | def tot_log_prob(self): 340 | return sum(self.log_probs) 341 | 342 | @property 343 | def avg_log_prob(self): 344 | return self.tot_log_prob/len(self.tokens) 345 | 346 | # end of the nested class 347 | 348 | # We run the encoder once and then we use the results to decode each time step token 349 | enc_outputs, dec_in_state_h, dec_in_state_c = sess.run([self.enc_outputs, self.dec_state_h, self.dec_state_c], {self.enc_batch : batch.enc_batch, 350 | self.enc_mask : batch.enc_padding_mask, 351 | self.enc_lens : batch.enc_lens}) 352 | # Initial Hypothesises (beam_size many list) 353 | hyps = [Hypothesis(tokens=[vocab.word_to_id('[START]')], # we initalize all the beam_size hypothesises with the token start 354 | log_probs = [0.0], # Initial log prob = 0 355 | state = [dec_in_state_h[0], dec_in_state_c[0]], #initial dec_state (we will use only the first dec_state because they're initially the same) 356 | attn_dists=[], 357 | p_gens = [], 358 | coverage=np.zeros([enc_outputs.shape[1]]) # we init the coverage vector to zero 359 | ) for _ in range(self.hpm['batch_size'])] # batch_size == beam_size 360 | 361 | results = [] # list to hold the top beam_size 
hypothesises 362 | steps=0 # initial step 363 | 364 | while steps < self.hpm['max_dec_steps'] and len(results) < self.hpm['beam_size'] : 365 | latest_tokens = [h.latest_token for h in hyps] # latest token for each hypothesis , shape : [beam_size] 366 | latest_tokens = [t if t in range(self.hpm['vocab_size']) else vocab.word_to_id('[UNK]') for t in latest_tokens] # we replace all the oov is by the unknown token 367 | states = [h.state for h in hyps] # we collect the last states for each hypothesis 368 | 369 | if self.hpm['coverage']: 370 | prev_coverage = [h.coverage for h in hyps] 371 | else: 372 | prev_coverage = None 373 | 374 | # we decode the top likely 2 x beam_size tokens tokens at time step t for each hypothesis 375 | returns = self.decode_onestep(sess, batch, enc_outputs, states, latest_tokens, prev_coverage) 376 | topk_ids, topk_log_probs, new_states_h, new_states_c, attn_dists = returns['top_k_ids'], returns['top_k_log_probs'], returns['dec_state_h'], returns['dec_state_c'], returns['attention_vec'] 377 | if self.hpm['pointer_gen']: 378 | p_gens = returns['p_gen'] 379 | if self.hpm['coverage']: 380 | new_coverage = returns['coverage'] 381 | 382 | attn_dists = np.squeeze(attn_dists) # shape : [beam_size, max_enc_len] 383 | if self.hpm['pointer_gen']: 384 | p_gens = np.squeeze(p_gens) # shape : [beam_size] 385 | 386 | all_hyps = [] 387 | num_orig_hyps = 1 if steps ==0 else len(hyps) 388 | for i in range(num_orig_hyps): 389 | h, new_state_h, new_state_c, attn_dist, p_gen, new_coverage_i = hyps[i], new_states_h[i], new_states_c[i], attn_dists[i], p_gens[i], new_coverage[i] 390 | 391 | for j in range(self.hpm['beam_size']*2): 392 | # we extend each hypothesis with each of the top k tokens (this gives 2 x beam_size new hypothesises for each of the beam_size old hypothesises) 393 | new_hyp = h.extend(token=topk_ids[i,j], 394 | log_prob=topk_log_probs[i,j], 395 | state = [new_state_h, new_state_c], 396 | attn_dist=attn_dist, 397 | p_gen=p_gen, 398 | coverage=new_coverage_i) 399 | all_hyps.append(new_hyp) 400 | 401 | # in the following lines, we sort all the hypothesises, and select only the beam_size most likely hypothesises 402 | hyps = [] 403 | sorted_hyps = sorted(all_hyps, key=lambda h: h.avg_log_prob, reverse=True) 404 | for h in sorted_hyps: 405 | if h.latest_token == vocab.word_to_id('[STOP]'): 406 | if steps >= self.hpm['min_dec_steps']: 407 | results.append(h) 408 | else: 409 | hyps.append(h) 410 | if len(hyps) == self.hpm['beam_size'] or len(results) == self.hpm['beam_size']: 411 | break 412 | 413 | steps += 1 414 | 415 | if len(results)==0: 416 | results=hyps 417 | 418 | # At the end of the loop we return the most likely hypothesis, which holds the most likely ouput sequence, given the input fed to the model 419 | hyps_sorted = sorted(results, key=lambda h: h.avg_log_prob, reverse=True) 420 | return hyps_sorted[0] -------------------------------------------------------------------------------- /V2/modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_modules.ipynb 3 | 4 | Automatically generated by Colaboratory. 
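Defines the building blocks of the pointer-generator summarizer: Encoder (a bidirectional CuDNN LSTM whose concatenated
final states are reduced back to hidden_size), Decoder (a unidirectional CuDNN LSTM), and Attention_decoder, which wraps
the decoder with additive (Bahdanau-style) attention plus the optional pointer/copy mechanism and coverage vector.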
5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | 13 | from utils import Linear 14 | from utils import apply_mask_normalize 15 | from utils import _mask_and_avg 16 | from utils import _calc_final_dist 17 | 18 | class Encoder(): 19 | """ A simple encoder class to encode the input via Bidirectional LSTM 20 | Args: 21 | hpm : hyperparameters 22 | rand_unif_init : Initializer Object (random uniform) to initialize LSTMs parameters 23 | rand_norm_init : Initializer object (truncate normal) to initialize weights and biases for linear transf. 24 | """ 25 | def __init__(self, hpm): 26 | self.hpm= hpm 27 | 28 | with tf.variable_scope('encoder'): 29 | unif_init = tf.keras.initializers.RandomUniform(minval=-self.hpm['rand_unif_init_mag'],maxval=self.hpm['rand_unif_init_mag'], seed=123) 30 | norm_init = tf.keras.initializers.RandomNormal(stddev=self.hpm['trunc_norm_init_std'], seed=123) 31 | 32 | self.lstm_cell = tf.keras.layers.CuDNNLSTM(self.hpm["hidden_size"],return_sequences=True, return_state=True, kernel_initializer=unif_init, 33 | recurrent_initializer=unif_init, 34 | bias_initializer=unif_init ) # forward lstm cell 35 | self.bidirectional = tf.keras.layers.Bidirectional(self.lstm_cell) 36 | 37 | self.w_c = Linear(self.hpm['hidden_size'], True, "reduce_c", norm_init) # Parameters for the concatenated state linear transf. 38 | self.w_h = Linear(self.hpm['hidden_size'], True, 'reduce_h', norm_init) # Parameters for the concatenated hidden output linear transf. 39 | 40 | 41 | 42 | def __call__(self, encoder_inputs): 43 | """ Call method for the encoding feedforward 44 | Args: 45 | encoder_inpputs : 3D tensor, shape : [batch_size, max_enc_len, embed_size] 46 | seq_lens : 1D tensor, lengths of the sequences (without padding) in the batch, shape : [batch_size] 47 | 48 | Returns: 49 | encoder_outputs : 3D tensor, output of the bidirectional dynamic rnn, shape : [batch_size, None, 2*hidden_size] (None because the max seq len vary with the batch) 50 | new state : tuple object made of two tensors : c => state, h=> last hidden output, shape : [2,batch_size, hidden_size] 51 | """ 52 | with tf.variable_scope('encoder', reuse = tf.AUTO_REUSE): 53 | encoder_outputs, fw_st_h, fw_st_c, bw_st_h, bw_st_c = self.bidirectional(encoder_inputs) 54 | 55 | encoder_outputs=tf.concat(encoder_outputs, axis= 2) 56 | 57 | old_c= tf.concat(values=[fw_st_c,bw_st_c], axis= 1) # we concatenate the forward and backward state, shape: [batch_size, 2*hidden_size] 58 | old_h= tf.concat(values=[fw_st_h,bw_st_h], axis= 1) # we concatenate the forwarrd and backward last hidden output, shape : [batch_size, 2*hidden_size] 59 | new_c= tf.nn.relu(self.w_c(old_c)) # linear transformation + relu activation, shape : [batch_size, hidden_size] 60 | new_h= tf.nn.relu(self.w_h(old_h)) # same as above 61 | 62 | return encoder_outputs, new_h, new_c 63 | 64 | 65 | class Decoder(): 66 | """ 67 | A simple decoder class made of a unidirectional LSTM cell which decodes the next word given a previous one, a context vector and a previous state 68 | Args : 69 | hpm : hyperparameters 70 | rand_unif_init : Initializer Object (random uniform) to initialize LSTM parameters 71 | """ 72 | def __init__(self,hpm): 73 | self.hpm= hpm 74 | 75 | with tf.variable_scope('decoder'): 76 | unif_init = tf.keras.initializers.RandomUniform(minval=-self.hpm['rand_unif_init_mag'],maxval=self.hpm['rand_unif_init_mag'], seed=123) 77 | self.lstm_cell = 
tf.keras.layers.CuDNNLSTM(self.hpm["hidden_size"],return_sequences=True, return_state=True, kernel_initializer=unif_init, 78 | recurrent_initializer=unif_init, 79 | bias_initializer=unif_init ) # unidirectional lstm cell 80 | 81 | 82 | def __call__(self, dec_inputs, prev_state_h, prev_state_c): 83 | """ Feedforward method for the simple decoder 84 | 85 | Args: 86 | dec_inputs : 2D tensor, list of words time step t for each sequence in the batch, shape = [batch_size, embed_size] 87 | prev_state : tuple object made of two vectors : c => state, h => last hidden output, shape : [2, batch_size, hidden_size] 88 | 89 | Returns: 90 | decoder_outputs : 2D tensor, shape = [batch_size, hidden_size] 91 | curr_st : current state of the decoder, shape : [2, batch_size, hidden_size] 92 | """ 93 | with tf.variable_scope('decoder', reuse = tf.AUTO_REUSE): 94 | decoder_outputs, curr_st_h, curr_st_c= self.lstm_cell(dec_inputs,initial_state=[prev_state_h, prev_state_c]) 95 | return decoder_outputs, curr_st_h, curr_st_c 96 | 97 | 98 | 99 | class Attention_decoder(): 100 | """ 101 | An attentional based encoder-decoder model (bhadanau attention, additive style) 102 | Args: 103 | hpm : hyperparameters 104 | rand_unif_init : Initializer Object (random uniform) to initialize LSTMs parameters 105 | rand_norm_init : Initializer object (truncate normal) to initialize weights and biases for linear transf. 106 | 107 | """ 108 | def __init__(self,hpm ): 109 | self.hpm=hpm 110 | 111 | with tf.variable_scope('attention_decoder', reuse = tf.AUTO_REUSE): 112 | self.decoder= Decoder(self.hpm) # simple decoder object (unidirecitional lstm) 113 | 114 | # Almost all the parameters (weights and biases) for the linear transformations (see below in the call method) 115 | 116 | self.w_h = Linear(self.hpm['attn_hidden_size'], True, "h") 117 | self.w_s = Linear(self.hpm['attn_hidden_size'], True, "s" ) 118 | self.v = Linear(1, False, 'V') 119 | 120 | self.w_dec = Linear(self.hpm['emb_size'],True, "dec_inp") 121 | self.w_out = Linear(self.hpm['vocab_size'], True, 'out') 122 | 123 | if self.hpm['pointer_gen']: 124 | self.w_c_reduce = Linear(1, True, 'c_reduce') 125 | self.w_s_reduce = Linear(1, True, 's_reduce') 126 | self.w_i_reduce = Linear(1, True, 'i_reduce') 127 | 128 | 129 | 130 | def __call__(self, enc_outputs, enc_mask, enc_state_h, enc_state_c, decoder_inputs,batch_max_oov_len = None, encoder_input_with_oov = None, cov_vec=None): 131 | """ 132 | Attentional feedforward graph . 133 | We call this method once during training for each batch, and max_dec_len times for decode mode. 134 | 135 | Args: 136 | enc_outputs : 3D tensor, encoder outputs, shape : [batch_size, batch_max_enc_len, 2*hidden_size] 137 | enc_mask : 2D tensor, encoder sequence mask, shape : [batch_size, batch_max_enc_len] 138 | decoder_inputs: 3D tensor, decoder inputs, shape : [batch_size, max_dec_len, embed_size] 139 | batch_max_oov_len : Integer, Maximum number of oov for the current batch, (None if pointer_gen = False) 140 | encoder_input_with_oov : 2D tensor, encoder input with oovs ids, shape : [batch_size, batch_max_enc_len] 141 | 142 | !!! 
NB : batch_max_enc_len is None when we build graph, and vary during the feedforward with the current batch treated, 143 | it is the maximum length of sequences of the current batch 144 | 145 | Returns : A dictionary 146 | output : list max_dec_en of 2D tensors of shape [batch_size, vocab_size + batch_max_oov_len (if pointer_gen)] 147 | last_context_vector : 2D tensor, shape : [batch_size, 2*hidden_size], this will be useful in the decode mode 148 | dec_state : 2D tensor, decoder last state, shape : [2, batch_size, hidden_size] 149 | p_gen : max_dec_len-many list of 1D tensors of length[batch_size] (only if pointer_gen is true) 150 | attention_vec : max_dec_len-many list of 2D tensors of shape [batch_size, batch_max_enc_len] (only if coverage is true) 151 | """ 152 | 153 | if(self.hpm["pointer_gen"]): 154 | p_gens=[] # if pointer gen, we add an array to store the probability of each word in the sequences to be generated or pointed on 155 | 156 | attn_dists = [] # array to store the attention distributions over the enc seq 157 | dec_state_h = enc_state_h # we init the decoder state with the encoder last state 158 | dec_state_c = enc_state_c 159 | outputs=[] # array to store the final probability distributions (decoded sequence) 160 | dec_inp = tf.unstack(decoder_inputs) # we unstack the decoder input to be able to enumerate over this tensor 161 | 162 | if not self.hpm['teacher_forcing']: 163 | argmax_arr = [] 164 | samples_arr = [] 165 | argmax_logprob_arr = [] 166 | samples_logprob_arr = [] 167 | 168 | # nested function 169 | def attention(dec_state_c, cov_vec=None): 170 | """ 171 | Attention mechanism 172 | 173 | Args: 174 | dec_state : previous state of the decoder. shape : [2, batch_size, hidden_size]. For the first step, it corresponds to the encoder last state 175 | cov_vec : only if coverage is True (default None). shape : [batch_size, ]. The previous coverage vector. 176 | 177 | Returns: 178 | attn_vec : 2D tensor, the attention vector at time step t. shape : [batch_size, ] 179 | context_vector : 2D tensor, shape: [batch_size, 2*hidden_size] 180 | cov_vec : 2D tensor, shape : [batch_size, ], the current coverage vector 181 | """ 182 | if(self.hpm["coverage"]): 183 | with tf.variable_scope('coverage', reuse = tf.AUTO_REUSE ): 184 | w_c = tf.get_variable("w_c", [1,1,1,self.hpm['attn_hidden_size']]) # we add additional parameters for the coverage vector linear transf. 185 | 186 | cov_features = tf.expand_dims(tf.expand_dims(cov_vec, axis=2),axis=2) # given that the encoder max length is unknown and variable, we cannot just apply a 187 | cov_features = tf.nn.conv2d(cov_features, w_c, [1,1,1,1], "SAME") # linear transformation as above. To avoid this issue, we can apply a convolution layer 188 | # which will transform the cov vector as a simple linear transf. would. 
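                    # Concretely: cov_vec has shape [batch_size, enc_len]; after the two expand_dims it is [batch_size, enc_len, 1, 1],
                    # and convolving it with the [1, 1, 1, attn_hidden_size] kernel multiplies each scalar coverage weight by the same
                    # learned vector w_c. After the squeeze below, cov_features has shape [batch_size, enc_len, attn_hidden_size], so it
                    # plays the role of the w_c * c_i coverage term of See et al. (2017), whatever the dynamic encoder length is.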
189 | 190 | # e = V*tanh(w_h*h + w_s*s + w_c*c ) (the last term, only is coverage = True) 191 | # attention weights all over the encoder input sequence 192 | # shape : [batch_size, , 1] 193 | e=tf.nn.tanh(self.w_h(enc_outputs) + 194 | tf.expand_dims(self.w_s(dec_state_c), axis=1) + 195 | tf.squeeze(cov_features, [2])) 196 | else: 197 | e=tf.nn.tanh(self.w_h(enc_outputs) + 198 | tf.expand_dims(self.w_s(dec_state_c), axis=1)) 199 | e = self.v(e) 200 | 201 | # we take off the last dimension which equals 1 202 | e = tf.reshape(e, [ e.get_shape().as_list()[0], -1]) # shape : [batch_size, ] 203 | 204 | 205 | attn_vec = tf.nn.softmax(e, axis=-1) # we apply a softmax on the attention weights to normalize them and obtain the attention vector. 206 | attn_vec = apply_mask_normalize(attn_vec, enc_mask) # Given that the input is padded with token, the attentions weights over those tokens 207 | # are not relevant, we apply the encoder input masks on the attention vectors to drop those 'irrelevant' attention weights 208 | # and finally we re-normalize the attention weights to obtain probability distributions 209 | 210 | # context vector computation 211 | # we multiply the encoder outputs by the attention vector weigths (a weight for each output vector, when we consider only one sequence for the example) 212 | weighted_enc_outputs = tf.multiply(enc_outputs, tf.expand_dims(attn_vec, axis=-1)) # context vector at time step t, shape : [batch_size, ] 213 | context_vec = tf.reduce_sum(weighted_enc_outputs, axis=1) 214 | 215 | if self.hpm['coverage']: 216 | cov_vec = cov_vec + attn_vec # we update the coverage 217 | 218 | return attn_vec, context_vec, cov_vec 219 | # end of nested function 220 | 221 | with tf.variable_scope('attention_decoder', reuse = tf.AUTO_REUSE): 222 | # we compute the initial context vector 223 | _ , context_vec, _ = attention( dec_state_c, cov_vec) 224 | timesteps = self.hpm['max_dec_len'] 225 | decoder_input = dec_inp[0] 226 | 227 | for i in range (timesteps): 228 | # for each item in the decoder inputs (this loops only once for decode mode) 229 | 230 | #teacher forcing mode 231 | if self.hpm['teacher_forcing']: 232 | decoder_input = dec_inp[i] 233 | 234 | # concatenation of input (previous word) and context vector at timestep t 235 | new_dec_inp = tf.concat([decoder_input, context_vec], axis = -1) # shape : [batch_size, embed_size+2*hidden_size] 236 | new_dec_inp = self.w_dec(new_dec_inp) #shape : [batch_size, embed_size] 237 | 238 | # We apply the LSTM decoder on the new input 239 | dec_output, dec_state_h, dec_state_c = self.decoder(tf.expand_dims(new_dec_inp, axis=1), dec_state_h, dec_state_c) # dec_output shape : [batch_size,1, hidden_size] 240 | # dec_state shape : [2, batch_size, hidden_size] (2 for the state c and the last hidden output h) 241 | # attention vector of the current step, context vector for the next step 242 | # we update the coverage vector 243 | attn_vec, context_vec, cov_vec = attention( dec_state_c, cov_vec) 244 | attn_dists.append(attn_vec) 245 | 246 | dec_output = tf.reshape(dec_output, [-1, dec_output.get_shape().as_list()[-1]]) # shape : [batch_size, hidden_size] 247 | dec_output = self.w_out(dec_output) # shape : [batch_size, vocab_size] 248 | vocab_dist = dec_output 249 | 250 | if not self.hpm['pointer_gen']: 251 | outputs.append(vocab_dist) # we do not apply yet the softmax function because this function is integrated in some futures ops like the loss function 252 | else: 253 | # if pointer_gen=True, we need to compute the softmax function because of the 
scatter op with the attention distribution 254 | outputs.append(tf.nn.softmax(dec_output, axis=-1)) 255 | state = tf.concat([dec_state_c, dec_state_h], axis=1) 256 | 257 | #p_gen computation with the current concatenated state, context vector and the decoder input 258 | p_gen = tf.nn.sigmoid(self.w_c_reduce(context_vec)+ 259 | self.w_s_reduce(state )+ 260 | self.w_i_reduce(new_dec_inp)) # shape : [batch_size, 1] 261 | p_gens.append(p_gen) 262 | 263 | 264 | if not self.hpm['teacher_forcing']: 265 | 266 | batch_nums = tf.range(0, limit=self.hpm['batch_size'], dtype=tf.int64) 267 | argmax_seqs = [] 268 | argmax_seqs_log_probs = [] 269 | for i , x in enumerate(outputs): 270 | max_ids = tf.argmax(x, axis=-1) 271 | indices = tf.stack((batch_nums, max_ids), axis = -1) 272 | log_probs = tf.gather_nd(x, indices) 273 | argmax_seqs.append(max_ids) 274 | argmax_seqs_log_probs.append(log_probs) 275 | 276 | 277 | soft_outputs = tf.stack(outputs) 278 | if not self.hpm['pointer_gen']: 279 | soft_outputs = tf.softmax(soft_outputs) 280 | 281 | argmax_seqs = tf.stack(argmax_seqs) 282 | argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs) 283 | 284 | sampler = tf.distributions.Categorical(logits=soft_outputs) 285 | samples = sampler.sample() 286 | samples_log_probs = sampler.log_prob(samples) 287 | samples_log_probs = tf.identity(samples_log_probs) 288 | 289 | argmax_arr.append(argmax_seqs) 290 | argmax_logprob_arr.append(argmax_seqs_log_probs) 291 | samples_arr.append(samples) 292 | samples_logprob_arr.append(samples_log_probs) 293 | 294 | decoder_input = samples 295 | 296 | 297 | 298 | if self.hpm['pointer_gen']: 299 | # we apply the scatter op between the output distibutions (over the vocabulary) with the attention distributions 300 | outputs = _calc_final_dist(encoder_input_with_oov, outputs, attn_dists, p_gens, batch_max_oov_len, self.hpm) 301 | 302 | 303 | 304 | 305 | if not self.hpm['teacher_forcing']: 306 | argmax_arr = tf.stack(argmax_arr) 307 | argmax_logprob_arr = tf.stack(argmax_logprob_arr) 308 | samples_arr = tf.stack(samples_arr) 309 | samples_logprob_arr = tf.stack(samples_logprob_arr) 310 | 311 | dic = { 'output':outputs, 'last_context_vector':context_vec, 'dec_state_h':dec_state_h, 'dec_state_c' : dec_state_c, 'attention_vec':attn_dists} 312 | if(self.hpm['pointer_gen']): 313 | dic['p_gen'] = p_gens 314 | if(self.hpm['coverage']): 315 | dic['coverage'] = cov_vec 316 | 317 | if not self.hpm['teacher_forcing']: 318 | dic.update({ 319 | "argmax_seqs" : argmax_arr, 320 | "argmax_log_probs" : argmax_logprob_arr, 321 | "samples_seqs" : samples_arr, 322 | "samples_log_probs" : samples_logprob_arr 323 | }) 324 | 325 | return dic -------------------------------------------------------------------------------- /V2/train_test_eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """pointer_gen_train_test_eval.ipynb 3 | 4 | Automatically generated by Colaboratory. 
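Training and evaluation helpers for the model: get_config() builds the tf.ConfigProto, run_training() drives the
MonitoredTrainingSession training loop (checkpointing, summaries, per-step logging), restore_model() reloads a saved
checkpoint, and total_num_params() counts the trainable parameters.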
5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | import time 13 | 14 | tf.logging.set_verbosity(tf.logging.INFO) 15 | 16 | def get_config(): 17 | """Returns config for tf.session""" 18 | config = tf.ConfigProto(allow_soft_placement=True) 19 | config.gpu_options.allow_growth = True 20 | return config 21 | 22 | 23 | def run_training(model, batcher, hpm, training_steps,check_dir, logdir): 24 | 25 | with tf.train.MonitoredTrainingSession(checkpoint_dir = check_dir, 26 | hooks = [tf.train.StopAtStepHook(last_step=training_steps)], 27 | save_summaries_steps = None, save_summaries_secs= None, 28 | save_checkpoint_steps=23000, scaffold=tf.train.Scaffold(saver=tf.train.Saver(max_to_keep=11)), 29 | config = get_config()) as sess: 30 | 31 | #sess = tf.Session(config=get_config()) 32 | #restore_model(sess, hpm, model_path=check_dir+"model.ckpt-2200") 33 | writer = tf.summary.FileWriter(logdir, sess.graph) 34 | model.setSession(sess) 35 | try: 36 | while not sess.should_stop(): 37 | #while True: 38 | t0=time.time() 39 | batch = batcher.next_batch() 40 | results = model.train(batch) 41 | t1=time.time() 42 | 43 | if hpm['add_coverage']: 44 | coverage_loss= results['coverage_loss'] 45 | tf.logging.info('step : %d, seconds : %.3f, loss : %f, coverage loss: %f', results['global_step'], t1-t0, results['loss'], coverage_loss) 46 | else: 47 | tf.logging.info('step : %d, seconds : %.3f, loss : %f', results['global_step'], t1-t0, results['loss']) 48 | 49 | 50 | 51 | if not np.isfinite(results['loss']): 52 | raise Exception('loss is not finite. Stopping!') 53 | summaries = results['summaries'] 54 | writer.add_summary(summary=summaries, global_step=results['global_step']) 55 | if results['global_step'] %50==0: 56 | writer.flush() 57 | except KeyboardInterrupt: 58 | writer.flush() 59 | tf.logging.info('stop training.') 60 | 61 | 62 | 63 | 64 | 65 | def restore_model(sess, hpm, model_path=None, check_path=None): 66 | assert ( model_path or check_path) 67 | saver = tf.train.Saver() 68 | try: 69 | if model_path: 70 | saver.restore(sess, model_path) 71 | return True 72 | else: 73 | saver.restore(sess, tf.train.latest_checkpoint(check_path)) 74 | return True 75 | except Exception as e: 76 | tf.logging.error(e) 77 | tf.logging.warning("Cannot restore model !!!") 78 | return False 79 | 80 | def total_num_params(): 81 | total_parameters = 0 82 | for variable in tf.trainable_variables(): 83 | # shape is an array of tf.Dimension 84 | shape = variable.get_shape() 85 | print(variable) 86 | print("shape :", shape) 87 | variable_parameters = 1 88 | for dim in shape: 89 | variable_parameters *= dim.value 90 | print("parameters : ",variable_parameters) 91 | total_parameters += variable_parameters 92 | return total_parameters -------------------------------------------------------------------------------- /V2/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """utils.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | """ 7 | 8 | import numpy as np 9 | import random 10 | import tensorflow as tf 11 | import tensorflow.nn as nn 12 | 13 | 14 | class Linear(): 15 | '''Class of object that apply a linear transformation of a 3D or 2D tensor''' 16 | ''' 17 | Args : 18 | output_size : Integer. The final size of the last dimension of the tensor after linear transformation 19 | bias : Boolean. If true, we add a bias vector to the tensor after linear transformation 20 | name : String. 
Name of the parameters 21 | init : Initializer object for the weight parameters 22 | ''' 23 | def __init__(self, output_size, bias, name, init=None): 24 | self.output_size = output_size 25 | self.bias = bias 26 | self.name = name 27 | self.init = init 28 | 29 | '''The call method to apply a linear tranformation when we call the Linear object (see the method linear below)''' 30 | def __call__(self, inp): 31 | return self.linear(inp) 32 | 33 | ''' Method for the linear transformation ''' 34 | def linear(self, inp): 35 | ''' 36 | Args: 37 | inp : 2D or 3D tensor 38 | Returns: 39 | a tensor with the same shape as the input, except the last dimension which equals output_size 40 | ''' 41 | inp_shape = inp.get_shape().as_list() # list of the dimensions of the input tensor 42 | 43 | weights= tf.get_variable(name = "w_"+self.name, shape =[inp_shape[-1], self.output_size], initializer=self.init) # weight w : shape = [, output_size] 44 | if self.bias: 45 | biais = tf.get_variable(name="b_"+self.name, shape = self.output_size, initializer=self.init) # bias : shape = [output_size] 46 | else: 47 | biais = 0 48 | 49 | if len(inp_shape) == 2: 50 | return tf.matmul(inp, weights)+biais 51 | elif len(inp_shape) == 3: 52 | inp2 = tf.reshape(inp, [-1, inp_shape[-1]]) 53 | out = tf.matmul(inp2, weights)+biais 54 | return tf.reshape(out, [inp_shape[0], -1, self.output_size]) 55 | else: 56 | raise Exception("3D or 2D tensors waited !!!") # we raise an exception if the the tensor is not a 2D or 3D tensor 57 | 58 | 59 | 60 | def apply_mask_normalize( vec, mask): 61 | """ Applies mask to values and normalize them 62 | Args: 63 | vec : a list length max_dec_steps containing arrays shape : [batch_size, ] 64 | """ 65 | v = tf.multiply(vec, tf.cast(mask, tf.float32)) 66 | return tf.divide(v, tf.reduce_sum(v,axis=1, keepdims=True)) 67 | 68 | 69 | 70 | def _mask_and_avg( values, padding_mask): 71 | """Applies mask to values then returns overall average (a scalar) 72 | Args: 73 | values: a list length max_dec_steps containing arrays shape (batch_size). 74 | padding_mask: tensor shape (batch_size, max_dec_steps) containing 1s and 0s. 75 | 76 | Returns: 77 | a scalar 78 | """ 79 | dec_lens = tf.reduce_sum(padding_mask, axis=1) # shape batch_size. float32 80 | values_per_step = [v * padding_mask[:,dec_step] for dec_step,v in enumerate(values)] 81 | values_per_ex = sum(values_per_step)/dec_lens # shape (batch_size); normalized value for each batch member 82 | return tf.reduce_mean(values_per_ex) # overall average 83 | 84 | 85 | 86 | 87 | def _calc_final_dist( _enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, hpm): 88 | """Calculate the final distribution, for the pointer-generator model 89 | 90 | Args: 91 | vocab_dists: The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file. 92 | attn_dists: The attention distributions. List length max_dec_steps of (batch_size, attn_len) arrays 93 | 94 | Returns: 95 | final_dists: The final distributions. List length max_dec_steps of (batch_size, extended_vsize) arrays. 
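        For intuition (illustrative numbers, not values from the code): with p_gen = 0.75 and a raw attention weight of 0.2
        on an encoder position whose extended-vocabulary id is 500, the copy term contributes (1 - 0.75) * 0.2 = 0.05 to
        entry 500 of that step's final distribution, on top of the 0.75 * P_vocab contribution it receives if the word is
        also in the fixed vocabulary.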
96 | """ 97 | with tf.variable_scope('final_distribution'): 98 | # Multiply vocab dists by p_gen and attention dists by (1-p_gen) 99 | vocab_dists = [p_gen * dist for (p_gen,dist) in zip(p_gens, vocab_dists)] 100 | attn_dists = [(1-p_gen) * dist for (p_gen,dist) in zip(p_gens, attn_dists)] 101 | 102 | # Concatenate some zeros to each vocabulary dist, to hold the probabilities for in-article OOV words 103 | extended_vsize = hpm['vocab_size'] + batch_oov_len # the maximum (over the batch) size of the extended vocabulary 104 | extra_zeros = tf.zeros((hpm['batch_size'], batch_oov_len )) 105 | vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in vocab_dists] # list length max_dec_steps of shape (batch_size, extended_vsize) 106 | 107 | # Project the values in the attention distributions onto the appropriate entries in the final distributions 108 | # This means that if a_i = 0.1 and the ith encoder word is w, and w has index 500 in the vocabulary, then we add 0.1 onto the 500th entry of the final distribution 109 | # This is done for each decoder timestep. 110 | # This is fiddly; we use tf.scatter_nd to do the projection 111 | batch_nums = tf.range(0, limit=hpm['batch_size']) # shape (batch_size) 112 | batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1) 113 | attn_len = tf.shape(_enc_batch_extend_vocab)[1] # number of states we attend over 114 | batch_nums = tf.tile(batch_nums, [1, attn_len]) # shape (batch_size, attn_len) 115 | indices = tf.stack( (batch_nums, _enc_batch_extend_vocab), axis=2) # shape (batch_size, enc_t, 2) 116 | shape = [hpm['batch_size'], extended_vsize] 117 | attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in attn_dists] # list length max_dec_steps (batch_size, extended_vsize) 118 | 119 | # Add the vocab distributions and the copy distributions together to get the final distributions 120 | # final_dists is a list length max_dec_steps; each entry is a tensor shape (batch_size, extended_vsize) giving the final distribution for that decoder timestep 121 | # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore. 122 | final_dists = [vocab_dist + copy_dist for (vocab_dist,copy_dist) in zip(vocab_dists_extended, attn_dists_projected)] 123 | 124 | return final_dists -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine --------------------------------------------------------------------------------
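A standalone sanity check of the pointer-generator mixture computed by _calc_final_dist: the NumPy sketch below reproduces the same computation for a single decoder step with toy sizes. All sizes, variable names and numeric values here are illustrative assumptions, not values taken from the repository.

import numpy as np

batch_size, vocab_size, enc_len, batch_oov_len = 2, 5, 3, 1
extended_vsize = vocab_size + batch_oov_len                         # in-vocab ids 0..4, in-article OOV id 5

vocab_dist = np.full((batch_size, vocab_size), 1.0 / vocab_size)    # decoder softmax over the fixed vocabulary
attn_dist = np.array([[0.2, 0.5, 0.3],
                      [0.6, 0.1, 0.3]])                             # attention over the encoder positions
p_gen = np.array([[0.75], [0.40]])                                  # generation probability per example
enc_extend_vocab = np.array([[4, 5, 2],
                             [1, 1, 5]])                            # extended-vocab id of each source token

vocab_part = p_gen * vocab_dist                                     # weight the generation distribution
copy_part = (1.0 - p_gen) * attn_dist                               # weight the copy (attention) distribution

# pad the vocabulary distribution with zero columns for the in-article OOV ids
final = np.concatenate([vocab_part, np.zeros((batch_size, batch_oov_len))], axis=1)

# scatter-add the copy probabilities onto the ids of the source tokens
# (duplicate ids accumulate, just as tf.scatter_nd sums repeated indices)
for b in range(batch_size):
    for t in range(enc_len):
        final[b, enc_extend_vocab[b, t]] += copy_part[b, t]

assert np.allclose(final.sum(axis=1), 1.0)                          # each row is still a probability distribution
print(final)

The graph version in V2/utils.py reaches the same result without Python loops over the batch: it pads each vocabulary distribution with batch_oov_len zero columns and uses tf.scatter_nd, indexed by _enc_batch_extend_vocab, to add the (1 - p_gen)-weighted attention mass at the extended-vocabulary ids.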