├── .gitignore ├── LICENSE.txt ├── README.md ├── __init__.py ├── data_utils.py ├── my_seq2seq.py ├── neural_conversation_model.py ├── scripts ├── predict.sh └── train.sh └── seq2seq_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | ubuntu 2 | *.swo 3 | *.swp 4 | __pycache__ 5 | dada 6 | data 7 | sftp-config.json 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural_Conversation_Models 2 | ================================= 3 | This implementation contains an extension of seq2seq tutorial for conversation models in Tensorflow: 4 | 5 | 1. Option to use Beam Search and Beam Size for decoding 6 | 7 | 2. Currently, it supports 8 | - Simple seq2seq models 9 | - Attention based seq2seq models 10 | 11 | 3. 
To get better results, use beam search during decoding / inference
12 | 
13 | Examples of the basic model can be found in this paper:
14 | 
15 | https://arxiv.org/abs/1702.05512
16 | 
17 | 
18 | Prerequisites
19 | -------------
20 | 
21 | - Python 3.3+
22 | - [NLTK](http://www.nltk.org/)
23 | - [TensorFlow](https://www.tensorflow.org/) 0.12.1
24 | 
25 | Installation
26 | -----
27 | 
28 | * Mac
29 | ```
30 | virtualenv --no-site-packages -p /usr/local/bin/python3.6 ~/venv-py3
31 | source ~/venv-py3/bin/activate
32 | pip3 install --upgrade \
33 | https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.1-py3-none-any.whl # for CPU usage
34 | ```
35 | 
36 | * Linux with GPU Driver
37 | ```
38 | virtualenv --no-site-packages -p /usr/local/bin/python3.6 ~/venv-py3
39 | source ~/venv-py3/bin/activate
40 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp34-cp34m-linux_x86_64.whl
41 | ```
42 | 
43 | Data
44 | -----
45 | Input data is expected in TSV format, where the first field is the context and the second is the reply.
46 | 
47 | Ubuntu Dialogue data in TSV format can be found [here](https://drive.google.com/file/d/0BwPa9lrosQKdSTZxZ0tydUFGWE0/view) or in this [Git repo](http://git.oschina.net/ubiware/neural_conversation_models_ubuntu_corpus).
48 | 
49 | Example:
50 | 1. What are you doing ? \t Writing seq2seq model .
51 | 
52 | Usage
53 | -----
54 | 
55 | To train a model on the Ubuntu dataset:
56 | 
57 | $ python neural_conversation_model.py --train_dir ubuntu/ --en_vocab_size 60000 --size 512 --data_path ubuntu/train.tsv --dev_data ubuntu/valid.tsv --vocab_path ubuntu/60k_vocan.en --attention
58 | 
59 | To test an existing model:
60 | 
61 | $ python neural_conversation_model.py --train_dir ubuntu/ --en_vocab_size 60000 --size 512 --data_path ubuntu/train.tsv --dev_data ubuntu/valid.tsv --vocab_path ubuntu/60k_vocan.en --attention --decode --beam_search --beam_size 25
62 | 
63 | Todo
64 | -----
65 | 1. Add other state-of-the-art neural models.
66 | 2. Add layer normalization (in progress):
67 | 
68 | https://github.com/pbhatia243/tf-layer-norm
69 | 
70 | ## Contact
71 | Parminder Bhatia, parminder.bhatia243@gmail.com
72 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | from tensorflow.models.rnn.translate import data_utils
6 | from tensorflow.models.rnn.translate import seq2seq_model
7 | 
8 | 
9 | 
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import gzip 21 | import os 22 | import re 23 | import tarfile 24 | 25 | from six.moves import urllib 26 | 27 | from tensorflow.python.platform import gfile 28 | 29 | # Special vocabulary symbols - we always put them at the start. 30 | _PAD = "_PAD" 31 | _GO = "_GO" 32 | _EOS = "_EOS" 33 | _UNK = "_UNK" 34 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK] 35 | 36 | PAD_ID = 0 37 | GO_ID = 1 38 | EOS_ID = 2 39 | UNK_ID = 3 40 | 41 | # Regular expressions used to tokenize. 42 | # _WORD_SPLIT = re.compile("([.,!?\"':;)(])") 43 | _WORD_SPLIT = re.compile("([.,!/?\":;)(])") 44 | _DIGIT_RE = re.compile(r"\d") 45 | 46 | 47 | 48 | def gunzip_file(gz_path, new_path): 49 | """Unzips from gz_path into new_path.""" 50 | print("Unpacking %s to %s" % (gz_path, new_path)) 51 | with gzip.open(gz_path, "rb") as gz_file: 52 | with open(new_path, "w") as new_file: 53 | for line in gz_file: 54 | new_file.write(line) 55 | 56 | 57 | def basic_tokenizer(sentence): 58 | """Very basic tokenizer: split the sentence into a list of tokens.""" 59 | words = [] 60 | for space_separated_fragment in sentence.strip().split(): 61 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) 62 | return [w for w in words if w] 63 | 64 | 65 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 66 | tokenizer=None, normalize_digits=True): 67 | """Create vocabulary file (if it does not exist yet) from data file. 68 | 69 | Data file is assumed to contain one sentence per line. Each sentence is 70 | tokenized and digits are normalized (if normalize_digits is set). 71 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 72 | We write it to vocabulary_path in a one-token-per-line format, so that later 73 | token in the first line gets id=0, second line gets id=1, and so on. 74 | 75 | Args: 76 | vocabulary_path: path where the vocabulary will be created. 77 | data_path: data file that will be used to create vocabulary. 78 | max_vocabulary_size: limit on the size of the created vocabulary. 79 | tokenizer: a function to use to tokenize each data sentence; 80 | if None, basic_tokenizer will be used. 81 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 
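    Example (an illustrative sketch; the file paths below are hypothetical):
      create_vocabulary("ubuntu/vocab60k.en", "ubuntu/train.tsv", 60000)
    reads tab-separated context/reply pairs from train.tsv and writes at most
    60000 tokens (the special symbols _PAD, _GO, _EOS and _UNK always occupy
    ids 0-3) to vocab60k.en, one token per line, ordered by frequency.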
82 | """ 83 | if not gfile.Exists(vocabulary_path): 84 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 85 | vocab = {} 86 | with gfile.GFile(data_path, mode="r") as f: 87 | counter = 0 88 | for line in f: 89 | counter += 1 90 | if counter % 100000 == 0: 91 | print(" processing line %d" % counter) 92 | text_conversation =line.strip().lower().split("\t") 93 | if len(text_conversation) == 2: 94 | txt = text_conversation[0] + " " + text_conversation[1] 95 | tokens = tokenizer(txt) if tokenizer else basic_tokenizer(txt) 96 | for w in tokens: 97 | # word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w 98 | word = w 99 | if word in vocab: 100 | vocab[word] += 1 101 | else: 102 | vocab[word] = 1 103 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 104 | print(len(vocab_list)) 105 | if len(vocab_list) > max_vocabulary_size: 106 | vocab_list = vocab_list[:max_vocabulary_size] 107 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 108 | for w in vocab_list: 109 | vocab_file.write(w + "\n") 110 | 111 | def initialize_vocabulary(vocabulary_path): 112 | """Initialize vocabulary from file. 113 | 114 | We assume the vocabulary is stored one-item-per-line, so a file: 115 | dog 116 | cat 117 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 118 | also return the reversed-vocabulary ["dog", "cat"]. 119 | 120 | Args: 121 | vocabulary_path: path to the file containing the vocabulary. 122 | 123 | Returns: 124 | a pair: the vocabulary (a dictionary mapping string to integers), and 125 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 126 | 127 | Raises: 128 | ValueError: if the provided vocabulary_path does not exist. 129 | """ 130 | if gfile.Exists(vocabulary_path): 131 | rev_vocab = [] 132 | with gfile.GFile(vocabulary_path, mode="r") as f: 133 | rev_vocab.extend(f.readlines()) 134 | rev_vocab = [line.strip() for line in rev_vocab] 135 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 136 | return vocab, rev_vocab 137 | else: 138 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 139 | 140 | 141 | def sentence_to_token_ids(sentence, vocabulary, 142 | tokenizer=None, normalize_digits=True): 143 | """Convert a string to list of integers representing token-ids. 144 | 145 | For example, a sentence "I have a dog" may become tokenized into 146 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 147 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 148 | 149 | Args: 150 | sentence: a string, the sentence to convert to token-ids. 151 | vocabulary: a dictionary mapping tokens to integers. 152 | tokenizer: a function to use to tokenize each sentence; 153 | if None, basic_tokenizer will be used. 154 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 155 | 156 | Returns: 157 | a list of integers, the token-ids for the sentence. 158 | """ 159 | if tokenizer: 160 | words = tokenizer(sentence) 161 | else: 162 | words = basic_tokenizer(sentence) 163 | # if not normalize_digits: 164 | return [vocabulary.get(w, UNK_ID) for w in words] 165 | # Normalize digits by 0 before looking words up in the vocabulary. 166 | # return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] 167 | 168 | 169 | def data_to_token_ids(data_path, target_path, vocabulary_path, 170 | tokenizer=None, normalize_digits=True): 171 | """Tokenize data file and turn into token-ids using given vocabulary file. 
172 | 173 | This function loads data line-by-line from data_path, calls the above 174 | sentence_to_token_ids, and saves the result to target_path. See comment 175 | for sentence_to_token_ids on the details of token-ids format. 176 | 177 | Args: 178 | data_path: path to the data file in one-sentence-per-line format. 179 | target_path: path where the file with token-ids will be created. 180 | vocabulary_path: path to the vocabulary file. 181 | tokenizer: a function to use to tokenize each sentence; 182 | if None, basic_tokenizer will be used. 183 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 184 | """ 185 | if not gfile.Exists(target_path): 186 | print("Tokenizing data in %s" % data_path) 187 | vocab, _ = initialize_vocabulary(vocabulary_path) 188 | with gfile.GFile(data_path, mode="r") as data_file: 189 | with gfile.GFile(target_path, mode="w") as tokens_file: 190 | counter = 0 191 | for line in data_file: 192 | counter += 1 193 | if counter % 100000 == 0: 194 | print(" tokenizing line %d" % counter) 195 | token_ids = sentence_to_token_ids(line, vocab, tokenizer, 196 | normalize_digits) 197 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 198 | 199 | -------------------------------------------------------------------------------- /my_seq2seq.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """Library for creating sequence-to-sequence models in TensorFlow. 4 | 5 | Sequence-to-sequence recurrent neural networks can learn complex functions 6 | that map input sequences to output sequences. These models yield very good 7 | results on a number of tasks, such as speech recognition, parsing, machine 8 | translation, or even constructing automated replies to emails. 9 | 10 | 11 | * Full sequence-to-sequence models. 12 | 13 | - embedding_rnn_seq2seq: The basic model with input embedding. 14 | - embedding_attention_seq2seq: Advanced model with input embedding and 15 | the neural attention mechanism; recommended for complex tasks. 16 | 17 | 18 | * Decoders 19 | - rnn_decoder: The basic decoder based on a pure RNN. 20 | - attention_decoder: A decoder that uses the attention mechanism. 21 | 22 | * Losses. 23 | - sequence_loss: Loss for a sequence model returning average log-perplexity. 24 | - sequence_loss_by_example: As above, but not averaging over all examples. 25 | 26 | * model_with_buckets: A convenience function to create models with bucketing 27 | (see the tutorial above for an explanation of why and how to use it). 
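
Minimal usage sketch (illustrative only -- the cell size, vocabulary sizes and
sequence lengths below are made up; the call follows the TF 0.12-era API used
throughout this file):

  cell = tf.nn.rnn_cell.GRUCell(512)
  encoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  decoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(11)]
  outputs, state = embedding_rnn_seq2seq(
      encoder_inputs, decoder_inputs, cell,
      num_encoder_symbols=60000, num_decoder_symbols=60000,
      embedding_size=512, feed_previous=False, beam_search=False)

With beam_search=True (the default) the decoders require an output_projection
and return (outputs, state, beam_path, beam_symbols), where outputs hold the
decoded token ids rather than logits.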
28 | """ 29 | from __future__ import absolute_import 30 | from __future__ import division 31 | from __future__ import print_function 32 | 33 | from six.moves import xrange # pylint: disable=redefined-builtin 34 | from six.moves import zip # pylint: disable=redefined-builtin 35 | 36 | from tensorflow.python.framework import dtypes 37 | from tensorflow.python.framework import ops 38 | from tensorflow.python.ops import array_ops 39 | from tensorflow.python.ops import control_flow_ops 40 | from tensorflow.python.ops import embedding_ops 41 | from tensorflow.python.ops import math_ops 42 | from tensorflow.python.ops import nn_ops 43 | from tensorflow.python.ops import rnn 44 | from tensorflow.python.ops import rnn_cell 45 | from tensorflow.python.ops import variable_scope 46 | import tensorflow as tf 47 | 48 | try: 49 | linear = tf.nn.rnn_cell.linear 50 | except: 51 | from tensorflow.python.ops.rnn_cell import _linear as linear 52 | 53 | 54 | def _extract_argmax_and_embed(embedding, output_projection=None, 55 | update_embedding=True): 56 | """Get a loop_function that extracts the previous symbol and embeds it. 57 | Args: 58 | embedding: embedding tensor for symbols. 59 | output_projection: None or a pair (W, B). If provided, each fed previous 60 | output will first be multiplied by W and added B. 61 | update_embedding: Boolean; if False, the gradients will not propagate 62 | through the embeddings. 63 | Returns: 64 | A loop function. 65 | """ 66 | def loop_function(prev, _): 67 | if output_projection is not None: 68 | prev = nn_ops.xw_plus_b( 69 | prev, output_projection[0], output_projection[1]) 70 | prev_symbol = math_ops.argmax(prev, 1) 71 | # Note that gradients will not propagate through the second parameter of 72 | # embedding_lookup. 73 | emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) 74 | if not update_embedding: 75 | emb_prev = array_ops.stop_gradient(emb_prev) 76 | return emb_prev 77 | return loop_function 78 | 79 | def _extract_beam_search(embedding, beam_size, num_symbols, embedding_size, output_projection=None, 80 | update_embedding=True): 81 | """Get a loop_function that extracts the previous symbol and embeds it. 82 | 83 | Args: 84 | embedding: embedding tensor for symbols. 85 | output_projection: None or a pair (W, B). If provided, each fed previous 86 | output will first be multiplied by W and added B. 87 | update_embedding: Boolean; if False, the gradients will not propagate 88 | through the embeddings. 89 | 90 | Returns: 91 | A loop function. 92 | """ 93 | def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols): 94 | if output_projection is not None: 95 | prev = nn_ops.xw_plus_b( 96 | prev, output_projection[0], output_projection[1]) 97 | # prev= prev.get_shape().with_rank(2)[1] 98 | 99 | probs = tf.log(tf.nn.softmax(prev)) 100 | 101 | if i > 1: 102 | 103 | probs = tf.reshape(probs + log_beam_probs[-1], 104 | [-1, beam_size * num_symbols]) 105 | 106 | best_probs, indices = tf.nn.top_k(probs, beam_size) 107 | indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1]))) 108 | best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1])) 109 | 110 | symbols = indices % num_symbols # Which word in vocabulary. 111 | beam_parent = indices // num_symbols # Which hypothesis it came from. 112 | 113 | 114 | beam_symbols.append(symbols) 115 | beam_path.append(beam_parent) 116 | log_beam_probs.append(best_probs) 117 | 118 | # Note that gradients will not propagate through the second parameter of 119 | # embedding_lookup. 
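      # Illustrative example (made-up numbers): with beam_size=3 and
      # num_symbols=5, a flattened top_k index of 7 selects word 7 % 5 = 2,
      # continued from parent hypothesis 7 // 5 = 1. The chosen symbols are
      # embedded below and reshaped to [beam_size, embedding_size] so that each
      # surviving hypothesis feeds its own input to the next decoder step.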
120 | 121 | emb_prev = embedding_ops.embedding_lookup(embedding, symbols) 122 | emb_prev = tf.reshape(emb_prev,[beam_size,embedding_size]) 123 | # emb_prev = embedding_ops.embedding_lookup(embedding, symbols) 124 | if not update_embedding: 125 | emb_prev = array_ops.stop_gradient(emb_prev) 126 | return emb_prev 127 | return loop_function 128 | 129 | 130 | def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, 131 | scope=None): 132 | """RNN decoder for the sequence-to-sequence model. 133 | 134 | Args: 135 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 136 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. 137 | cell: rnn_cell.RNNCell defining the cell function and size. 138 | loop_function: If not None, this function will be applied to the i-th output 139 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 140 | except for the first element ("GO" symbol). This can be used for decoding, 141 | but also for training to emulate http://arxiv.org/abs/1506.03099. 142 | Signature -- loop_function(prev, i) = next 143 | * prev is a 2D Tensor of shape [batch_size x output_size], 144 | * i is an integer, the step number (when advanced control is needed), 145 | * next is a 2D Tensor of shape [batch_size x input_size]. 146 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 147 | 148 | Returns: 149 | A tuple of the form (outputs, state), where: 150 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 151 | shape [batch_size x output_size] containing generated outputs. 152 | state: The state of each cell at the final time-step. 153 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 154 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 155 | states can be the same. They are different for LSTM cells though.) 156 | """ 157 | with variable_scope.variable_scope(scope or "rnn_decoder"): 158 | state = initial_state 159 | outputs = [] 160 | prev = None 161 | for i, inp in enumerate(decoder_inputs): 162 | if loop_function is not None and prev is not None: 163 | with variable_scope.variable_scope("loop_function", reuse=True): 164 | inp = loop_function(prev, i) 165 | if i > 0: 166 | variable_scope.get_variable_scope().reuse_variables() 167 | output, state = cell(inp, state) 168 | 169 | outputs.append(output) 170 | if loop_function is not None: 171 | prev = output 172 | return outputs, state 173 | 174 | def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, 175 | scope=None,output_projection=None, beam_size=10): 176 | """RNN decoder for the sequence-to-sequence model. 177 | 178 | Args: 179 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 180 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. 181 | cell: rnn_cell.RNNCell defining the cell function and size. 182 | loop_function: If not None, this function will be applied to the i-th output 183 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 184 | except for the first element ("GO" symbol). This can be used for decoding, 185 | but also for training to emulate http://arxiv.org/abs/1506.03099. 186 | Signature -- loop_function(prev, i) = next 187 | * prev is a 2D Tensor of shape [batch_size x output_size], 188 | * i is an integer, the step number (when advanced control is needed), 189 | * next is a 2D Tensor of shape [batch_size x input_size]. 190 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 
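    output_projection: Pair (W, B); each cell output is projected with W and B
      to vocabulary logits, and the argmax id of that projection is what gets
      appended to outputs at every step.
    beam_size: Integer, number of hypotheses kept alive at each decoding step;
      the decoder state is tiled beam_size times after the first step.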
191 | 192 | Returns: 193 | A tuple of the form (outputs, state), where: 194 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 195 | shape [batch_size x output_size] containing generated outputs. 196 | state: The state of each cell at the final time-step. 197 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 198 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 199 | states can be the same. They are different for LSTM cells though.) 200 | """ 201 | with variable_scope.variable_scope(scope or "rnn_decoder"): 202 | state = initial_state 203 | outputs = [] 204 | prev = None 205 | log_beam_probs, beam_path, beam_symbols = [],[],[] 206 | state_size = int(initial_state.get_shape().with_rank(2)[1]) 207 | 208 | for i, inp in enumerate(decoder_inputs): 209 | if loop_function is not None and prev is not None: 210 | with variable_scope.variable_scope("loop_function", reuse=True): 211 | inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) 212 | if i > 0: 213 | variable_scope.get_variable_scope().reuse_variables() 214 | 215 | input_size = inp.get_shape().with_rank(2)[1] 216 | print(input_size) 217 | x = inp 218 | output, state = cell(x, state) 219 | 220 | if loop_function is not None: 221 | prev = output 222 | if i ==0: 223 | states =[] 224 | for kk in range(beam_size): 225 | states.append(state) 226 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 227 | 228 | outputs.append(tf.argmax(nn_ops.xw_plus_b( 229 | output, output_projection[0], output_projection[1]), dimension=1)) 230 | return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size]) 231 | 232 | 233 | def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, 234 | embedding_size, output_projection=None, 235 | feed_previous=False, 236 | update_embedding_for_previous=True, scope=None, beam_search=True, beam_size=10 ): 237 | """RNN decoder with embedding and a pure-decoding option. 238 | 239 | Args: 240 | decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 241 | initial_state: 2D Tensor [batch_size x cell.state_size]. 242 | cell: rnn_cell.RNNCell defining the cell function. 243 | num_symbols: Integer, how many symbols come into the embedding. 244 | embedding_size: Integer, the length of the embedding vector for each symbol. 245 | output_projection: None or a pair (W, B) of output projection weights and 246 | biases; W has shape [output_size x num_symbols] and B has 247 | shape [num_symbols]; if provided and feed_previous=True, each fed 248 | previous output will first be multiplied by W and added B. 249 | feed_previous: Boolean; if True, only the first of decoder_inputs will be 250 | used (the "GO" symbol), and all other decoder inputs will be generated by: 251 | next = embedding_lookup(embedding, argmax(previous_output)), 252 | In effect, this implements a greedy decoder. It can also be used 253 | during training to emulate http://arxiv.org/abs/1506.03099. 254 | If False, decoder_inputs are used as given (the standard decoder case). 255 | update_embedding_for_previous: Boolean; if False and feed_previous=True, 256 | only the embedding for the first symbol of decoder_inputs (the "GO" 257 | symbol) will be updated by back propagation. Embeddings for the symbols 258 | generated from the decoder itself remain unchanged. This parameter has 259 | no effect if feed_previous=False. 
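    beam_search: Boolean; if True, decode with beam search instead of greedy
      argmax decoding and return (outputs, state, beam_path, beam_symbols),
      where outputs are token ids produced through output_projection.
    beam_size: Integer, number of beam hypotheses to keep (only used when
      beam_search is True).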
260 | scope: VariableScope for the created subgraph; defaults to 261 | "embedding_rnn_decoder". 262 | 263 | Returns: 264 | A tuple of the form (outputs, state), where: 265 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 266 | shape [batch_size x output_size] containing the generated outputs. 267 | state: The state of each decoder cell in each time-step. This is a list 268 | with length len(decoder_inputs) -- one item for each time-step. 269 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 270 | 271 | Raises: 272 | ValueError: When output_projection has the wrong shape. 273 | """ 274 | if output_projection is not None: 275 | proj_weights = ops.convert_to_tensor(output_projection[0], 276 | dtype=dtypes.float32) 277 | proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) 278 | proj_biases = ops.convert_to_tensor( 279 | output_projection[1], dtype=dtypes.float32) 280 | proj_biases.get_shape().assert_is_compatible_with([num_symbols]) 281 | 282 | with variable_scope.variable_scope(scope or "embedding_rnn_decoder"): 283 | with ops.device("/cpu:0"): 284 | embedding = variable_scope.get_variable("embedding", 285 | [num_symbols, embedding_size]) 286 | 287 | if beam_search: 288 | loop_function = _extract_beam_search( 289 | embedding, beam_size,num_symbols,embedding_size, output_projection, 290 | update_embedding_for_previous) 291 | else: 292 | loop_function = _extract_argmax_and_embed( 293 | embedding, output_projection, 294 | update_embedding_for_previous) if feed_previous else None 295 | 296 | emb_inp = [ 297 | embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] 298 | 299 | 300 | if beam_search: 301 | return beam_rnn_decoder(emb_inp, initial_state, cell, 302 | loop_function=loop_function,output_projection=output_projection, beam_size=beam_size) 303 | 304 | else: 305 | return rnn_decoder(emb_inp, initial_state, cell, 306 | loop_function=loop_function) 307 | 308 | 309 | 310 | def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, 311 | num_encoder_symbols, num_decoder_symbols, 312 | embedding_size, output_projection=None, 313 | feed_previous=False, dtype=dtypes.float32, 314 | scope=None, beam_search=True, beam_size=10): 315 | """Embedding RNN sequence-to-sequence model. 316 | 317 | This model first embeds encoder_inputs by a newly created embedding (of shape 318 | [num_encoder_symbols x input_size]). Then it runs an RNN to encode 319 | embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs 320 | by another newly created embedding (of shape [num_decoder_symbols x 321 | input_size]). Then it runs RNN decoder, initialized with the last 322 | encoder state, on embedded decoder_inputs. 323 | 324 | Args: 325 | encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 326 | decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 327 | cell: rnn_cell.RNNCell defining the cell function and size. 328 | num_encoder_symbols: Integer; number of symbols on the encoder side. 329 | num_decoder_symbols: Integer; number of symbols on the decoder side. 330 | embedding_size: Integer, the length of the embedding vector for each symbol. 331 | output_projection: None or a pair (W, B) of output projection weights and 332 | biases; W has shape [output_size x num_decoder_symbols] and B has 333 | shape [num_decoder_symbols]; if provided and feed_previous=True, each 334 | fed previous output will first be multiplied by W and added B. 
335 | feed_previous: Boolean or scalar Boolean Tensor; if True, only the first 336 | of decoder_inputs will be used (the "GO" symbol), and all other decoder 337 | inputs will be taken from previous outputs (as in embedding_rnn_decoder). 338 | If False, decoder_inputs are used as given (the standard decoder case). 339 | dtype: The dtype of the initial state for both the encoder and encoder 340 | rnn cells (default: tf.float32). 341 | scope: VariableScope for the created subgraph; defaults to 342 | "embedding_rnn_seq2seq" 343 | 344 | Returns: 345 | A tuple of the form (outputs, state), where: 346 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 347 | shape [batch_size x num_decoder_symbols] containing the generated 348 | outputs. 349 | state: The state of each decoder cell in each time-step. This is a list 350 | with length len(decoder_inputs) -- one item for each time-step. 351 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 352 | """ 353 | with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"): 354 | # Encoder. 355 | encoder_cell = rnn_cell.EmbeddingWrapper( 356 | cell, embedding_classes=num_encoder_symbols, 357 | embedding_size=embedding_size) 358 | _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) 359 | 360 | # Decoder. 361 | if output_projection is None: 362 | cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) 363 | 364 | 365 | return embedding_rnn_decoder( 366 | decoder_inputs, encoder_state, cell, num_decoder_symbols, 367 | embedding_size, output_projection=output_projection, 368 | feed_previous=feed_previous, beam_search=beam_search, beam_size=beam_size) 369 | 370 | 371 | 372 | 373 | 374 | def attention_decoder(decoder_inputs, initial_state, attention_states, cell, 375 | output_size=None, num_heads=1, loop_function=None, 376 | dtype=dtypes.float32, scope=None, 377 | initial_state_attention=False): 378 | """RNN decoder with attention for the sequence-to-sequence model. 379 | 380 | In this context "attention" means that, during decoding, the RNN can look up 381 | information in the additional tensor attention_states, and it does this by 382 | focusing on a few entries from the tensor. This model has proven to yield 383 | especially good results in a number of sequence-to-sequence tasks. This 384 | implementation is based on http://arxiv.org/abs/1412.7449 (see below for 385 | details). It is recommended for complex sequence-to-sequence tasks. 386 | 387 | Args: 388 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 389 | initial_state: 2D Tensor [batch_size x cell.state_size]. 390 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 391 | cell: rnn_cell.RNNCell defining the cell function and size. 392 | output_size: Size of the output vectors; if None, we use cell.output_size. 393 | num_heads: Number of attention heads that read from attention_states. 394 | loop_function: If not None, this function will be applied to i-th output 395 | in order to generate i+1-th input, and decoder_inputs will be ignored, 396 | except for the first element ("GO" symbol). This can be used for decoding, 397 | but also for training to emulate http://arxiv.org/abs/1506.03099. 398 | Signature -- loop_function(prev, i) = next 399 | * prev is a 2D Tensor of shape [batch_size x output_size], 400 | * i is an integer, the step number (when advanced control is needed), 401 | * next is a 2D Tensor of shape [batch_size x input_size]. 
402 | dtype: The dtype to use for the RNN initial state (default: tf.float32). 403 | scope: VariableScope for the created subgraph; default: "attention_decoder". 404 | initial_state_attention: If False (default), initial attentions are zero. 405 | If True, initialize the attentions from the initial state and attention 406 | states -- useful when we wish to resume decoding from a previously 407 | stored decoder state and attention states. 408 | 409 | Returns: 410 | A tuple of the form (outputs, state), where: 411 | outputs: A list of the same length as decoder_inputs of 2D Tensors of 412 | shape [batch_size x output_size]. These represent the generated outputs. 413 | Output i is computed from input i (which is either the i-th element 414 | of decoder_inputs or loop_function(output {i-1}, i)) as follows. 415 | First, we run the cell on a combination of the input and previous 416 | attention masks: 417 | cell_output, new_state = cell(linear(input, prev_attn), prev_state). 418 | Then, we calculate new attention masks: 419 | new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) 420 | and then we calculate the output: 421 | output = linear(cell_output, new_attn). 422 | state: The state of each decoder cell the final time-step. 423 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 424 | 425 | Raises: 426 | ValueError: when num_heads is not positive, there are no inputs, shapes 427 | of attention_states are not set, or input size cannot be inferred 428 | from the input. 429 | """ 430 | if not decoder_inputs: 431 | raise ValueError("Must provide at least 1 input to attention decoder.") 432 | if num_heads < 1: 433 | raise ValueError("With less than 1 heads, use a non-attention decoder.") 434 | if not attention_states.get_shape()[1:2].is_fully_defined(): 435 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 436 | % attention_states.get_shape()) 437 | if output_size is None: 438 | output_size = cell.output_size 439 | 440 | with variable_scope.variable_scope(scope or "attention_decoder"): 441 | batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 442 | attn_length = attention_states.get_shape()[1].value 443 | attn_size = attention_states.get_shape()[2].value 444 | 445 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. 446 | hidden = array_ops.reshape( 447 | attention_states, [-1, attn_length, 1, attn_size]) 448 | hidden_features = [] 449 | v = [] 450 | attention_vec_size = attn_size # Size of query vectors for attention. 451 | for a in xrange(num_heads): 452 | k = variable_scope.get_variable("AttnW_%d" % a, 453 | [1, 1, attn_size, attention_vec_size]) 454 | hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 455 | v.append(variable_scope.get_variable("AttnV_%d" % a, 456 | [attention_vec_size])) 457 | 458 | state = initial_state 459 | def attention(query): 460 | """Put attention masks on hidden using hidden_features and query.""" 461 | ds = [] # Results of attention reads will be stored here. 462 | for a in xrange(num_heads): 463 | with variable_scope.variable_scope("Attention_%d" % a): 464 | y = linear(query, attention_vec_size, True) 465 | y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) 466 | # Attention mask is a softmax of v^T * tanh(...). 467 | s = math_ops.reduce_sum( 468 | v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) 469 | a = nn_ops.softmax(s) 470 | # Now calculate the attention-weighted vector d. 
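          # For each head: a_j = softmax_j(v^T tanh(W h_j + U query)) over the
          # attn_length encoder positions, and d = sum_j a_j * h_j is the
          # [batch_size, attn_size] context vector that is fed back into the
          # cell input and output projections below.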
471 | d = math_ops.reduce_sum( 472 | array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, 473 | [1, 2]) 474 | ds.append(array_ops.reshape(d, [-1, attn_size])) 475 | return ds 476 | 477 | outputs = [] 478 | prev = None 479 | batch_attn_size = array_ops.pack([batch_size, attn_size]) 480 | attns = [array_ops.zeros(batch_attn_size, dtype=dtype) 481 | for _ in xrange(num_heads)] 482 | for a in attns: # Ensure the second shape of attention vectors is set. 483 | a.set_shape([None, attn_size]) 484 | if initial_state_attention: 485 | attns = attention(initial_state) 486 | for i, inp in enumerate(decoder_inputs): 487 | if i > 0: 488 | variable_scope.get_variable_scope().reuse_variables() 489 | # If loop_function is set, we use it instead of decoder_inputs. 490 | if loop_function is not None : 491 | with variable_scope.variable_scope("loop_function", reuse=True): 492 | if prev is not None: 493 | inp = loop_function(prev, i) 494 | 495 | input_size = inp.get_shape().with_rank(2)[1] 496 | 497 | x = linear([inp] + attns, input_size, True) 498 | # Run the RNN. 499 | cell_output, state = cell(x, state) 500 | # Run the attention mechanism. 501 | if i == 0 and initial_state_attention: 502 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 503 | reuse=True): 504 | attns = attention(state) 505 | else: 506 | attns = attention(state) 507 | 508 | with variable_scope.variable_scope("AttnOutputProjection"): 509 | output = linear([cell_output] + attns, output_size, True) 510 | if loop_function is not None: 511 | prev = output 512 | outputs.append(output) 513 | 514 | return outputs, state 515 | 516 | 517 | def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, 518 | output_size=None, num_heads=1, loop_function=None, 519 | dtype=dtypes.float32, scope=None, 520 | initial_state_attention=False, output_projection=None, beam_size=10): 521 | """RNN decoder with attention for the sequence-to-sequence model. 522 | 523 | In this context "attention" means that, during decoding, the RNN can look up 524 | information in the additional tensor attention_states, and it does this by 525 | focusing on a few entries from the tensor. This model has proven to yield 526 | especially good results in a number of sequence-to-sequence tasks. This 527 | implementation is based on http://arxiv.org/abs/1412.7449 (see below for 528 | details). It is recommended for complex sequence-to-sequence tasks. 529 | 530 | Args: 531 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 532 | initial_state: 2D Tensor [batch_size x cell.state_size]. 533 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 534 | cell: rnn_cell.RNNCell defining the cell function and size. 535 | output_size: Size of the output vectors; if None, we use cell.output_size. 536 | num_heads: Number of attention heads that read from attention_states. 537 | loop_function: If not None, this function will be applied to i-th output 538 | in order to generate i+1-th input, and decoder_inputs will be ignored, 539 | except for the first element ("GO" symbol). This can be used for decoding, 540 | but also for training to emulate http://arxiv.org/abs/1506.03099. 541 | Signature -- loop_function(prev, i) = next 542 | * prev is a 2D Tensor of shape [batch_size x output_size], 543 | * i is an integer, the step number (when advanced control is needed), 544 | * next is a 2D Tensor of shape [batch_size x input_size]. 545 | dtype: The dtype to use for the RNN initial state (default: tf.float32). 
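    output_projection: Pair (W, B) used to project each attention-combined cell
      output to vocabulary logits; the per-step outputs are the argmax ids of
      that projection.
    beam_size: Integer, number of hypotheses tracked by the beam; the decoder
      state is replicated beam_size times after the first step and attention is
      recomputed over it.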
546 | scope: VariableScope for the created subgraph; default: "attention_decoder". 547 | initial_state_attention: If False (default), initial attentions are zero. 548 | If True, initialize the attentions from the initial state and attention 549 | states -- useful when we wish to resume decoding from a previously 550 | stored decoder state and attention states. 551 | 552 | Returns: 553 | A tuple of the form (outputs, state), where: 554 | outputs: A list of the same length as decoder_inputs of 2D Tensors of 555 | shape [batch_size x output_size]. These represent the generated outputs. 556 | Output i is computed from input i (which is either the i-th element 557 | of decoder_inputs or loop_function(output {i-1}, i)) as follows. 558 | First, we run the cell on a combination of the input and previous 559 | attention masks: 560 | cell_output, new_state = cell(linear(input, prev_attn), prev_state). 561 | Then, we calculate new attention masks: 562 | new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) 563 | and then we calculate the output: 564 | output = linear(cell_output, new_attn). 565 | state: The state of each decoder cell the final time-step. 566 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 567 | 568 | Raises: 569 | ValueError: when num_heads is not positive, there are no inputs, shapes 570 | of attention_states are not set, or input size cannot be inferred 571 | from the input. 572 | """ 573 | if not decoder_inputs: 574 | raise ValueError("Must provide at least 1 input to attention decoder.") 575 | if num_heads < 1: 576 | raise ValueError("With less than 1 heads, use a non-attention decoder.") 577 | if not attention_states.get_shape()[1:2].is_fully_defined(): 578 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 579 | % attention_states.get_shape()) 580 | if output_size is None: 581 | output_size = cell.output_size 582 | 583 | with variable_scope.variable_scope(scope or "attention_decoder"): 584 | batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 585 | attn_length = attention_states.get_shape()[1].value 586 | attn_size = attention_states.get_shape()[2].value 587 | 588 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. 589 | hidden = array_ops.reshape( 590 | attention_states, [-1, attn_length, 1, attn_size]) 591 | hidden_features = [] 592 | v = [] 593 | attention_vec_size = attn_size # Size of query vectors for attention. 594 | for a in xrange(num_heads): 595 | k = variable_scope.get_variable("AttnW_%d" % a, 596 | [1, 1, attn_size, attention_vec_size]) 597 | hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 598 | v.append(variable_scope.get_variable("AttnV_%d" % a, 599 | [attention_vec_size])) 600 | 601 | print("Initial_state") 602 | 603 | state_size = int(initial_state.get_shape().with_rank(2)[1]) 604 | states =[] 605 | for kk in range(1): 606 | states.append(initial_state) 607 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 608 | def attention(query): 609 | """Put attention masks on hidden using hidden_features and query.""" 610 | ds = [] # Results of attention reads will be stored here. 611 | for a in xrange(num_heads): 612 | with variable_scope.variable_scope("Attention_%d" % a): 613 | y = linear(query, attention_vec_size, True) 614 | y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) 615 | # Attention mask is a softmax of v^T * tanh(...). 
616 | s = math_ops.reduce_sum( 617 | v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) 618 | a = nn_ops.softmax(s) 619 | # Now calculate the attention-weighted vector d. 620 | d = math_ops.reduce_sum( 621 | array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, 622 | [1, 2]) 623 | # for c in range(ct): 624 | ds.append(array_ops.reshape(d, [-1, attn_size])) 625 | return ds 626 | 627 | outputs = [] 628 | prev = None 629 | batch_attn_size = array_ops.pack([batch_size, attn_size]) 630 | attns = [array_ops.zeros(batch_attn_size, dtype=dtype) 631 | for _ in xrange(num_heads)] 632 | for a in attns: # Ensure the second shape of attention vectors is set. 633 | a.set_shape([None, attn_size]) 634 | 635 | if initial_state_attention: 636 | attns = [] 637 | attns.append(attention(initial_state)) 638 | tmp = tf.reshape(tf.concat(0, attns), [-1, attn_size]) 639 | attns = [] 640 | attns.append(tmp) 641 | 642 | log_beam_probs, beam_path, beam_symbols = [],[],[] 643 | for i, inp in enumerate(decoder_inputs): 644 | 645 | if i > 0: 646 | variable_scope.get_variable_scope().reuse_variables() 647 | # If loop_function is set, we use it instead of decoder_inputs. 648 | if loop_function is not None : 649 | with variable_scope.variable_scope("loop_function", reuse=True): 650 | if prev is not None: 651 | inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) 652 | 653 | input_size = inp.get_shape().with_rank(2)[1] 654 | x = linear([inp] + attns, input_size, True) 655 | cell_output, state = cell(x, state) 656 | 657 | # Run the attention mechanism. 658 | if i == 0 and initial_state_attention: 659 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 660 | reuse=True): 661 | attns = attention(state) 662 | else: 663 | attns = attention(state) 664 | 665 | with variable_scope.variable_scope("AttnOutputProjection"): 666 | output = linear([cell_output] + attns, output_size, True) 667 | if loop_function is not None: 668 | prev = output 669 | if i ==0: 670 | states =[] 671 | for kk in range(beam_size): 672 | states.append(state) 673 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 674 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): 675 | attns = attention(state) 676 | 677 | outputs.append(tf.argmax(nn_ops.xw_plus_b( 678 | output, output_projection[0], output_projection[1]), dimension=1)) 679 | 680 | return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size]) 681 | 682 | def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, 683 | cell, num_symbols, embedding_size, num_heads=1, 684 | output_size=None, output_projection=None, 685 | feed_previous=False, 686 | update_embedding_for_previous=True, 687 | dtype=dtypes.float32, scope=None, 688 | initial_state_attention=False, beam_search=True, beam_size=10): 689 | """RNN decoder with embedding and attention and a pure-decoding option. 690 | 691 | Args: 692 | decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 693 | initial_state: 2D Tensor [batch_size x cell.state_size]. 694 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 695 | cell: rnn_cell.RNNCell defining the cell function. 696 | num_symbols: Integer, how many symbols come into the embedding. 697 | embedding_size: Integer, the length of the embedding vector for each symbol. 698 | num_heads: Number of attention heads that read from attention_states. 
699 | output_size: Size of the output vectors; if None, use output_size. 700 | output_projection: None or a pair (W, B) of output projection weights and 701 | biases; W has shape [output_size x num_symbols] and B has shape 702 | [num_symbols]; if provided and feed_previous=True, each fed previous 703 | output will first be multiplied by W and added B. 704 | feed_previous: Boolean; if True, only the first of decoder_inputs will be 705 | used (the "GO" symbol), and all other decoder inputs will be generated by: 706 | next = embedding_lookup(embedding, argmax(previous_output)), 707 | In effect, this implements a greedy decoder. It can also be used 708 | during training to emulate http://arxiv.org/abs/1506.03099. 709 | If False, decoder_inputs are used as given (the standard decoder case). 710 | update_embedding_for_previous: Boolean; if False and feed_previous=True, 711 | only the embedding for the first symbol of decoder_inputs (the "GO" 712 | symbol) will be updated by back propagation. Embeddings for the symbols 713 | generated from the decoder itself remain unchanged. This parameter has 714 | no effect if feed_previous=False. 715 | dtype: The dtype to use for the RNN initial states (default: tf.float32). 716 | scope: VariableScope for the created subgraph; defaults to 717 | "embedding_attention_decoder". 718 | initial_state_attention: If False (default), initial attentions are zero. 719 | If True, initialize the attentions from the initial state and attention 720 | states -- useful when we wish to resume decoding from a previously 721 | stored decoder state and attention states. 722 | 723 | Returns: 724 | A tuple of the form (outputs, state), where: 725 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 726 | shape [batch_size x output_size] containing the generated outputs. 727 | state: The state of each decoder cell at the final time-step. 728 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 729 | 730 | Raises: 731 | ValueError: When output_projection has the wrong shape. 
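    Note: when beam_search is True (the default here), the decoder instead
    returns the 4-tuple (outputs, state, beam_path, beam_symbols) produced by
    beam_attention_decoder, with outputs holding token ids rather than logits.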
732 | """ 733 | if output_size is None: 734 | output_size = cell.output_size 735 | if output_projection is not None: 736 | proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) 737 | proj_biases.get_shape().assert_is_compatible_with([num_symbols]) 738 | 739 | with variable_scope.variable_scope(scope or "embedding_attention_decoder"): 740 | with ops.device("/cpu:0"): 741 | embedding = variable_scope.get_variable("embedding", 742 | [num_symbols, embedding_size]) 743 | print("Check number of symbols") 744 | print(num_symbols) 745 | if beam_search: 746 | loop_function = _extract_beam_search( 747 | embedding, beam_size,num_symbols, embedding_size, output_projection, 748 | update_embedding_for_previous) 749 | else: 750 | loop_function = _extract_argmax_and_embed( 751 | embedding, output_projection, 752 | update_embedding_for_previous) if feed_previous else None 753 | emb_inp = [ 754 | embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] 755 | if beam_search: 756 | return beam_attention_decoder( 757 | emb_inp, initial_state, attention_states, cell, output_size=output_size, 758 | num_heads=num_heads, loop_function=loop_function, 759 | initial_state_attention=initial_state_attention, output_projection=output_projection, beam_size=beam_size) 760 | else: 761 | 762 | return attention_decoder( 763 | emb_inp, initial_state, attention_states, cell, output_size=output_size, 764 | num_heads=num_heads, loop_function=loop_function, 765 | initial_state_attention=initial_state_attention) 766 | 767 | 768 | def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, 769 | num_encoder_symbols, num_decoder_symbols, 770 | embedding_size, 771 | num_heads=1, output_projection=None, 772 | feed_previous=False, dtype=dtypes.float32, 773 | scope=None, initial_state_attention=False, beam_search =True, beam_size = 10 ): 774 | """Embedding sequence-to-sequence model with attention. 775 | 776 | This model first embeds encoder_inputs by a newly created embedding (of shape 777 | [num_encoder_symbols x input_size]). Then it runs an RNN to encode 778 | embedded encoder_inputs into a state vector. It keeps the outputs of this 779 | RNN at every step to use for attention later. Next, it embeds decoder_inputs 780 | by another newly created embedding (of shape [num_decoder_symbols x 781 | input_size]). Then it runs attention decoder, initialized with the last 782 | encoder state, on embedded decoder_inputs and attending to encoder outputs. 783 | 784 | Args: 785 | encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 786 | decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 787 | cell: rnn_cell.RNNCell defining the cell function and size. 788 | num_encoder_symbols: Integer; number of symbols on the encoder side. 789 | num_decoder_symbols: Integer; number of symbols on the decoder side. 790 | embedding_size: Integer, the length of the embedding vector for each symbol. 791 | num_heads: Number of attention heads that read from attention_states. 792 | output_projection: None or a pair (W, B) of output projection weights and 793 | biases; W has shape [output_size x num_decoder_symbols] and B has 794 | shape [num_decoder_symbols]; if provided and feed_previous=True, each 795 | fed previous output will first be multiplied by W and added B. 796 | feed_previous: Boolean or scalar Boolean Tensor; if True, only the first 797 | of decoder_inputs will be used (the "GO" symbol), and all other decoder 798 | inputs will be taken from previous outputs (as in embedding_rnn_decoder). 
799 | If False, decoder_inputs are used as given (the standard decoder case). 800 | dtype: The dtype of the initial RNN state (default: tf.float32). 801 | scope: VariableScope for the created subgraph; defaults to 802 | "embedding_attention_seq2seq". 803 | initial_state_attention: If False (default), initial attentions are zero. 804 | If True, initialize the attentions from the initial state and attention 805 | states. 806 | 807 | Returns: 808 | A tuple of the form (outputs, state), where: 809 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 810 | shape [batch_size x num_decoder_symbols] containing the generated 811 | outputs. 812 | state: The state of each decoder cell at the final time-step. 813 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 814 | """ 815 | with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): 816 | # Encoder. 817 | encoder_cell = rnn_cell.EmbeddingWrapper( 818 | cell, embedding_classes=num_encoder_symbols, 819 | embedding_size=embedding_size) 820 | encoder_outputs, encoder_state = rnn.rnn( 821 | encoder_cell, encoder_inputs, dtype=dtype) 822 | print("Symbols") 823 | print(num_encoder_symbols) 824 | print(num_decoder_symbols) 825 | # First calculate a concatenation of encoder outputs to put attention on. 826 | top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) 827 | for e in encoder_outputs] 828 | attention_states = array_ops.concat(1, top_states) 829 | print(attention_states) 830 | # Decoder. 831 | output_size = None 832 | if output_projection is None: 833 | cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) 834 | output_size = num_decoder_symbols 835 | 836 | 837 | return embedding_attention_decoder( 838 | decoder_inputs, encoder_state, attention_states, cell, 839 | num_decoder_symbols, embedding_size, num_heads=num_heads, 840 | output_size=output_size, output_projection=output_projection, 841 | feed_previous=feed_previous, 842 | initial_state_attention=initial_state_attention, beam_search=beam_search, beam_size=beam_size) 843 | 844 | 845 | 846 | 847 | def sequence_loss_by_example(logits, targets, weights, 848 | average_across_timesteps=True, 849 | softmax_loss_function=None, name=None): 850 | """Weighted cross-entropy loss for a sequence of logits (per example). 851 | 852 | Args: 853 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 854 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 855 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 856 | average_across_timesteps: If set, divide the returned cost by the total 857 | label weight. 858 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 859 | to be used instead of the standard softmax (the default if this is None). 860 | name: Optional name for this operation, default: "sequence_loss_by_example". 861 | 862 | Returns: 863 | 1D batch-sized float Tensor: The log-perplexity for each sequence. 864 | 865 | Raises: 866 | ValueError: If len(logits) is different from len(targets) or len(weights). 867 | """ 868 | if len(targets) != len(logits) or len(weights) != len(logits): 869 | raise ValueError("Lengths of logits, weights, and targets must be the same " 870 | "%d, %d, %d." 
% (len(logits), len(weights), len(targets))) 871 | with ops.op_scope(logits + targets + weights, name, 872 | "sequence_loss_by_example"): 873 | log_perp_list = [] 874 | for logit, target, weight in zip(logits, targets, weights): 875 | if softmax_loss_function is None: 876 | target = array_ops.reshape(target, [-1]) 877 | crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( 878 | logit, target) 879 | else: 880 | crossent = softmax_loss_function(logit, target) 881 | log_perp_list.append(crossent * weight) 882 | log_perps = math_ops.add_n(log_perp_list) 883 | if average_across_timesteps: 884 | total_size = math_ops.add_n(weights) 885 | total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. 886 | log_perps /= total_size 887 | return log_perps 888 | 889 | 890 | def sequence_loss(logits, targets, weights, 891 | average_across_timesteps=True, average_across_batch=True, 892 | softmax_loss_function=None, name=None): 893 | """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. 894 | 895 | Args: 896 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 897 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 898 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 899 | average_across_timesteps: If set, divide the returned cost by the total 900 | label weight. 901 | average_across_batch: If set, divide the returned cost by the batch size. 902 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 903 | to be used instead of the standard softmax (the default if this is None). 904 | name: Optional name for this operation, defaults to "sequence_loss". 905 | 906 | Returns: 907 | A scalar float Tensor: The average log-perplexity per symbol (weighted). 908 | 909 | Raises: 910 | ValueError: If len(logits) is different from len(targets) or len(weights). 911 | """ 912 | with ops.op_scope(logits + targets + weights, name, "sequence_loss"): 913 | cost = math_ops.reduce_sum(sequence_loss_by_example( 914 | logits, targets, weights, 915 | average_across_timesteps=average_across_timesteps, 916 | softmax_loss_function=softmax_loss_function)) 917 | if average_across_batch: 918 | batch_size = array_ops.shape(targets[0])[0] 919 | return cost / math_ops.cast(batch_size, dtypes.float32) 920 | else: 921 | return cost 922 | 923 | 924 | def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, 925 | buckets, seq2seq, softmax_loss_function=None, 926 | per_example_loss=False, name=None): 927 | """Create a sequence-to-sequence model with support for bucketing. 928 | 929 | The seq2seq argument is a function that defines a sequence-to-sequence model, 930 | e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) 931 | 932 | Args: 933 | encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. 934 | decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. 935 | targets: A list of 1D batch-sized int32 Tensors (desired output sequence). 936 | weights: List of 1D batch-sized float-Tensors to weight the targets. 937 | buckets: A list of pairs of (input size, output size) for each bucket. 938 | seq2seq: A sequence-to-sequence model function; it takes 2 input that 939 | agree with encoder_inputs and decoder_inputs, and returns a pair 940 | consisting of outputs and states (as, e.g., basic_rnn_seq2seq). 
941 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 942 | to be used instead of the standard softmax (the default if this is None). 943 | per_example_loss: Boolean. If set, the returned loss will be a batch-sized 944 | tensor of losses for each sequence in the batch. If unset, it will be 945 | a scalar with the averaged loss from all examples. 946 | name: Optional name for this operation, defaults to "model_with_buckets". 947 | 948 | Returns: 949 | A tuple of the form (outputs, losses), where: 950 | outputs: The outputs for each bucket. Its j'th element consists of a list 951 | of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs). 952 | losses: List of scalar Tensors, representing losses for each bucket, or, 953 | if per_example_loss is set, a list of 1D batch-sized float Tensors. 954 | 955 | Raises: 956 | ValueError: If length of encoder_inputs, targets, or weights is smaller 957 | than the largest (last) bucket. 958 | """ 959 | if len(encoder_inputs) < buckets[-1][0]: 960 | raise ValueError("Length of encoder_inputs (%d) must be at least that of la" 961 | "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) 962 | if len(targets) < buckets[-1][1]: 963 | raise ValueError("Length of targets (%d) must be at least that of last" 964 | " bucket (%d)." % (len(targets), buckets[-1][1])) 965 | if len(weights) < buckets[-1][1]: 966 | raise ValueError("Length of weights (%d) must be at least that of last" 967 | " bucket (%d)." % (len(weights), buckets[-1][1])) 968 | 969 | all_inputs = encoder_inputs + decoder_inputs + targets + weights 970 | losses = [] 971 | outputs = [] 972 | with ops.op_scope(all_inputs, name, "model_with_buckets"): 973 | for j, bucket in enumerate(buckets): 974 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 975 | reuse=True if j > 0 else None): 976 | 977 | bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]], 978 | decoder_inputs[:bucket[1]]) 979 | 980 | outputs.append(bucket_outputs) 981 | if per_example_loss: 982 | losses.append(sequence_loss_by_example( 983 | outputs[-1], targets[:bucket[1]], weights[:bucket[1]], 984 | softmax_loss_function=softmax_loss_function)) 985 | else: 986 | losses.append(sequence_loss( 987 | outputs[-1], targets[:bucket[1]], weights[:bucket[1]], 988 | softmax_loss_function=softmax_loss_function)) 989 | 990 | return outputs, losses 991 | 992 | def decode_model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, 993 | buckets, seq2seq, softmax_loss_function=None, 994 | per_example_loss=False, name=None): 995 | """Create a sequence-to-sequence model with support for bucketing, used for beam-search decoding. 996 | 997 | The seq2seq argument is a function that defines a sequence-to-sequence model, 998 | e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) 999 | 1000 | Args: 1001 | encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. 1002 | decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. 1003 | targets: A list of 1D batch-sized int32 Tensors (desired output sequence). 1004 | weights: List of 1D batch-sized float-Tensors to weight the targets. 1005 | buckets: A list of pairs of (input size, output size) for each bucket. 1006 | seq2seq: A sequence-to-sequence model function; it takes two inputs that 1007 | agree with encoder_inputs and decoder_inputs, and returns a quadruple 1008 | of outputs, states, beam paths, and beam symbols (see beam_attention_decoder).
1009 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 1010 | to be used instead of the standard softmax (the default if this is None). 1011 | per_example_loss: Boolean. If set, the returned loss will be a batch-sized 1012 | tensor of losses for each sequence in the batch. If unset, it will be 1013 | a scalar with the averaged loss from all examples. (Both loss options are ignored here; this decoding variant computes no losses.) 1014 | name: Optional name for this operation, defaults to "model_with_buckets". 1015 | 1016 | Returns: 1017 | A tuple of the form (outputs, beam_paths, beam_symbols), where: 1018 | outputs: The outputs for each bucket, as returned by the seq2seq function 1019 | (for the beam-search decoder, the per-step argmax symbol Tensors). 1020 | beam_paths: List, one entry per bucket, of Tensors holding the beam-search backpointers. 1021 | beam_symbols: List, one entry per bucket, of Tensors holding the beam-search candidate symbols. 1022 | 1023 | Raises: 1024 | ValueError: If length of encoder_inputs, targets, or weights is smaller 1025 | than the largest (last) bucket. 1026 | """ 1027 | if len(encoder_inputs) < buckets[-1][0]: 1028 | raise ValueError("Length of encoder_inputs (%d) must be at least that of la" 1029 | "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) 1030 | if len(targets) < buckets[-1][1]: 1031 | raise ValueError("Length of targets (%d) must be at least that of last" 1032 | " bucket (%d)." % (len(targets), buckets[-1][1])) 1033 | if len(weights) < buckets[-1][1]: 1034 | raise ValueError("Length of weights (%d) must be at least that of last" 1035 | " bucket (%d)." % (len(weights), buckets[-1][1])) 1036 | 1037 | all_inputs = encoder_inputs + decoder_inputs + targets + weights 1038 | losses = [] 1039 | outputs = [] 1040 | beam_paths = [] 1041 | beam_symbols = [] 1042 | with ops.op_scope(all_inputs, name, "model_with_buckets"): 1043 | for j, bucket in enumerate(buckets): 1044 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 1045 | reuse=True if j > 0 else None): 1046 | bucket_outputs, _, beam_path, beam_symbol = seq2seq(encoder_inputs[:bucket[0]], 1047 | decoder_inputs[:bucket[1]]) 1048 | outputs.append(bucket_outputs) 1049 | beam_paths.append(beam_path) 1050 | beam_symbols.append(beam_symbol) 1051 | print("End**********") 1052 | 1053 | return outputs, beam_paths, beam_symbols -------------------------------------------------------------------------------- /neural_conversation_model.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """Most of the code comes from the TensorFlow seq2seq tutorial. Binary for training conversation models and decoding from them. 4 | 5 | Running this program without --decode will tokenize the training data in a very basic way, 6 | and then start training a model, saving checkpoints to --train_dir. 7 | 8 | Running with --decode starts an interactive loop so you can see how 9 | the current checkpoint performs. 10 | 11 | See the following papers for more information on neural translation models.
12 | * http://arxiv.org/abs/1409.3215 13 | * http://arxiv.org/abs/1409.0473 14 | * http://arxiv.org/abs/1412.2007 15 | """ 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import os 23 | import random 24 | import sys 25 | import time 26 | 27 | import numpy as np 28 | from six.moves import xrange # pylint: disable=redefined-builtin 29 | import tensorflow as tf 30 | 31 | from data_utils import * 32 | from seq2seq_model import * 33 | import codecs 34 | 35 | tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.") 36 | tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99, 37 | "Learning rate decays by this much.") 38 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, 39 | "Clip gradients to this norm.") 40 | tf.app.flags.DEFINE_integer("batch_size", 64, 41 | "Batch size to use during training.") 42 | tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.") 43 | tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.") 44 | tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.") 45 | tf.app.flags.DEFINE_string("train_dir", "./tmp/", "Training directory.") 46 | tf.app.flags.DEFINE_string("vocab_path", "./tmp/", "Path to the vocabulary file.") 47 | tf.app.flags.DEFINE_string("data_path", "./tmp/", "Path to the training data file.") 48 | tf.app.flags.DEFINE_string("dev_data", "./tmp/", "Path to the development (validation) data file.") 49 | tf.app.flags.DEFINE_integer("max_train_data_size", 0, 50 | "Limit on the size of training data (0: no limit).") 51 | tf.app.flags.DEFINE_integer("steps_per_checkpoint", 400, 52 | "How many training steps to do per checkpoint.") 53 | tf.app.flags.DEFINE_integer("beam_size", 100, 54 | "Beam width to use during beam-search decoding.") 55 | tf.app.flags.DEFINE_boolean("beam_search", False, 56 | "Set to True for beam_search.") 57 | tf.app.flags.DEFINE_boolean("decode", False, 58 | "Set to True for interactive decoding.") 59 | tf.app.flags.DEFINE_boolean("attention", False, 60 | "Set to True to use the attention-based model.") 61 | tf.app.flags.DEFINE_boolean("self_test", False, 62 | "Run a self-test if this is set to True.") 63 | 64 | FLAGS = tf.app.flags.FLAGS 65 | 66 | # We use a number of buckets and pad to the closest one for efficiency. 67 | # See seq2seq_model.Seq2SeqModel for details of how they work.
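# As an illustration (hypothetical example lengths, not taken from the data): a pair
# with a 4-token question and an 8-token answer (9 ids once EOS is appended) fits the
# smallest bucket below, (5, 10), and will later be padded to lengths 5 and 10 when
# batches are built; a pair that does not fit even the last bucket, (40, 50), is
# simply dropped by read_chat_data.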
68 | _buckets = [(5, 10), (10, 15), (20, 25), (40, 50)] 69 | 70 | 71 | def read_chat_data(data_path,vocabulary_path, max_size=None): 72 | counter = 0 73 | vocab, _ = initialize_vocabulary(vocabulary_path) 74 | print(len(vocab)) 75 | print(max_size) 76 | data_set = [[] for _ in _buckets] 77 | # http://stackoverflow.com/questions/33054527/python-3-5-typeerror-a-bytes-like-object-is-required-not-str-when-writing-t 78 | with codecs.open(data_path, "rb") as fi: 79 | for line in fi.readlines(): 80 | line = line.decode('utf8').strip() 81 | counter += 1 82 | if max_size!=0 and counter > max_size: 83 | break 84 | if counter % 10000 == 0: 85 | print(" reading data line %d" % counter) 86 | sys.stdout.flush() 87 | entities = line.lower().split("\t") 88 | # print entities 89 | if len(entities) == 2: 90 | source = entities[0] 91 | target = entities[1] 92 | source_ids = [int(x) for x in sentence_to_token_ids(source,vocab)] 93 | target_ids = [int(x) for x in sentence_to_token_ids(target,vocab)] 94 | target_ids.append(EOS_ID) 95 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 96 | if len(source_ids) < source_size and len(target_ids) < target_size: 97 | data_set[bucket_id].append([source_ids, target_ids]) 98 | break 99 | return data_set 100 | 101 | def create_model(session, forward_only, beam_search, beam_size = 10, attention = True): 102 | """Create translation model and initialize or load parameters in session.""" 103 | model = Seq2SeqModel( 104 | FLAGS.en_vocab_size, FLAGS.en_vocab_size, _buckets, 105 | FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 106 | FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, 107 | forward_only=forward_only, beam_search=beam_search, beam_size=beam_size, attention=attention) 108 | print(FLAGS.train_dir) 109 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 110 | 111 | # ckpt.model_checkpoint_path ="./big_models/chat_bot.ckpt-183600" 112 | # print ckpt.model_checkpoint_path 113 | if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): 114 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path) 115 | model.saver.restore(session, ckpt.model_checkpoint_path) 116 | else: 117 | print("Created model with fresh parameters.") 118 | session.run(tf.initialize_all_variables()) 119 | return model 120 | 121 | def create_models(path, en_vocab_size, session, forward_only, beam_search, beam_size = 10, attention = True): 122 | """Create translation model and initialize or load parameters in session.""" 123 | model = Seq2SeqModel( 124 | en_vocab_size, en_vocab_size, _buckets, 125 | FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 126 | FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, 127 | forward_only=forward_only, beam_search=beam_search, beam_size=beam_size, attention=attention) 128 | print(FLAGS.train_dir) 129 | ckpt = tf.train.get_checkpoint_state(path) 130 | 131 | # ckpt.model_checkpoint_path ="./big_models/chat_bot.ckpt-183600" 132 | # print ckpt.model_checkpoint_path 133 | if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): 134 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path) 135 | model.saver.restore(session, ckpt.model_checkpoint_path) 136 | else: 137 | print("Created model with fresh parameters.") 138 | session.run(tf.initialize_all_variables()) 139 | return model 140 | 141 | def train(): 142 | 143 | data_path =FLAGS.data_path 144 | dev_data = FLAGS.dev_data 145 | vocab_path =FLAGS.vocab_path 146 | # Beam search is false during training operation and 
used only at inference time. 147 | beam_search = False 148 | beam_size = 10 149 | attention = FLAGS.attention 150 | 151 | normalize_digits=True 152 | create_vocabulary(vocab_path, data_path, FLAGS.en_vocab_size ) 153 | 154 | 155 | with tf.Session() as sess: 156 | # Create model. 157 | print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) 158 | model = create_model(sess, False, beam_search=beam_search, beam_size=beam_size, attention=attention) 159 | 160 | # Read data into buckets and compute their sizes. 161 | print("Reading development and training data (limit: %d)." 162 | % FLAGS.max_train_data_size) 163 | train_set = read_chat_data(data_path, vocab_path, FLAGS.max_train_data_size) 164 | dev_set = read_chat_data(dev_data, vocab_path, FLAGS.max_train_data_size) 165 | 166 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] 167 | train_total_size = float(sum(train_bucket_sizes)) 168 | 169 | # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use 170 | # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to 171 | # the size of the i-th training bucket, as used later. 172 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size 173 | for i in xrange(len(train_bucket_sizes))] 174 | 175 | # This is the training loop. 176 | step_time, loss = 0.0, 0.0 177 | current_step = 0 178 | previous_losses = [] 179 | while True: 180 | # Choose a bucket according to data distribution. We pick a random number 181 | # in [0, 1] and use the corresponding interval in train_buckets_scale. 182 | # print "Started" 183 | random_number_01 = np.random.random_sample() 184 | bucket_id = min([i for i in xrange(len(train_buckets_scale)) 185 | if train_buckets_scale[i] > random_number_01]) 186 | 187 | # Get a batch and make a step. 188 | start_time = time.time() 189 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 190 | train_set, bucket_id) 191 | 192 | _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, 193 | target_weights, bucket_id, False, beam_search) 194 | step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint 195 | loss += step_loss / FLAGS.steps_per_checkpoint 196 | current_step += 1 197 | 198 | # Once in a while, we save checkpoint, print statistics, and run evals. 199 | if current_step % FLAGS.steps_per_checkpoint == 0: 200 | # Print statistics for the previous checkpoint period. 201 | print("Running epochs") 202 | perplexity = math.exp(loss) if loss < 300 else float('inf') 203 | print ("global step %d learning rate %.4f step-time %.2f perplexity " 204 | "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), 205 | step_time, perplexity)) 206 | # # Decrease learning rate if no improvement was seen over last 3 times. 207 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 208 | sess.run(model.learning_rate_decay_op) 209 | previous_losses.append(loss) 210 | # # Save checkpoint and zero timer and loss. 211 | checkpoint_path = os.path.join(FLAGS.train_dir, "chat_bot.ckpt") 212 | model.saver.save(sess, checkpoint_path, global_step=model.global_step) 213 | step_time, loss = 0.0, 0.0 214 | # # Run evals on development set and print their perplexity.
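# # (Perplexity here is exp(average cross-entropy loss), as computed below; for
# # example, an eval loss of about 4.6 corresponds to a perplexity of roughly
# # exp(4.6) ~ 100.)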
215 | for bucket_id in xrange(len(_buckets)): 216 | if len(dev_set[bucket_id]) == 0: 217 | print(" eval: empty bucket %d" % (bucket_id)) 218 | continue 219 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 220 | dev_set, bucket_id) 221 | _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, 222 | target_weights, bucket_id, True, beam_search) 223 | eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf') 224 | print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) 225 | sys.stdout.flush() 226 | 227 | def decode(): 228 | with tf.Session() as sess: 229 | # Create model and load parameters. 230 | beam_size = FLAGS.beam_size 231 | beam_search = FLAGS.beam_search 232 | attention = FLAGS.attention 233 | model = create_model(sess, True, beam_search=beam_search, beam_size=beam_size, attention=attention) 234 | model.batch_size = 1 # We decode one sentence at a time. 235 | 236 | # Load vocabularies. 237 | vocab_path = FLAGS.vocab_path 238 | vocab, rev_vocab = initialize_vocabulary(vocab_path) 239 | 240 | # Decode from standard input. 241 | if beam_search: 242 | sys.stdout.write("> ") 243 | sys.stdout.flush() 244 | sentence = sys.stdin.readline() 245 | while sentence: 246 | # Get token-ids for the input sentence. 247 | token_ids = sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab) 248 | # Which bucket does it belong to? 249 | bucket_id = min([b for b in xrange(len(_buckets)) 250 | if _buckets[b][0] > len(token_ids)]) 251 | # Get a 1-element batch to feed the sentence to the model. 252 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 253 | {bucket_id: [(token_ids, [])]}, bucket_id) 254 | # Get output logits for the sentence. 255 | # print bucket_id 256 | path, symbol , output_logits = model.step(sess, encoder_inputs, decoder_inputs, 257 | target_weights, bucket_id, True,beam_search ) 258 | 259 | k = output_logits[0] 260 | paths = [] 261 | for kk in range(beam_size): 262 | paths.append([]) 263 | curr = range(beam_size) 264 | num_steps = len(path) 265 | for i in range(num_steps-1, -1, -1): 266 | for kk in range(beam_size): 267 | paths[kk].append(symbol[i][curr[kk]]) 268 | curr[kk] = path[i][curr[kk]] 269 | recos = set() 270 | print("Replies --------------------------------------->") 271 | for kk in range(beam_size): 272 | foutputs = [int(logit) for logit in paths[kk][::-1]] 273 | 274 | # If there is an EOS symbol in outputs, cut them at that point. 275 | if EOS_ID in foutputs: 276 | # # print outputs 277 | foutputs = foutputs[:foutputs.index(EOS_ID)] 278 | rec = " ".join([tf.compat.as_str(rev_vocab[output]) for output in foutputs]) 279 | if rec not in recos: 280 | recos.add(rec) 281 | print(rec) 282 | 283 | print("> ", "") 284 | sys.stdout.flush() 285 | sentence = sys.stdin.readline() 286 | else: 287 | sys.stdout.write("> ") 288 | sys.stdout.flush() 289 | sentence = sys.stdin.readline() 290 | 291 | while sentence: 292 | # Get token-ids for the input sentence. 293 | token_ids = sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab) 294 | # Which bucket does it belong to? 295 | bucket_id = min([b for b in xrange(len(_buckets)) 296 | if _buckets[b][0] > len(token_ids)]) 297 | # for loc in locs: 298 | # Get a 1-element batch to feed the sentence to the model. 
299 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 300 | {bucket_id: [(token_ids, [],)]}, bucket_id) 301 | 302 | _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, 303 | target_weights, bucket_id, True,beam_search) 304 | # This is a greedy decoder - outputs are just argmaxes of output_logits. 305 | 306 | outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] 307 | # If there is an EOS symbol in outputs, cut them at that point. 308 | if EOS_ID in outputs: 309 | # print outputs 310 | outputs = outputs[:outputs.index(EOS_ID)] 311 | 312 | print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs])) 313 | print("> ", "") 314 | sys.stdout.flush() 315 | sentence = sys.stdin.readline() 316 | 317 | 318 | 319 | def main(_): 320 | if FLAGS.decode: 321 | decode() 322 | else: 323 | train() 324 | 325 | if __name__ == "__main__": 326 | tf.app.run() 327 | -------------------------------------------------------------------------------- /scripts/predict.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | # functions 9 | 10 | # main 11 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 12 | # source /root/venv-py3/bin/activate 13 | # http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information 14 | export TF_CPP_MIN_LOG_LEVEL=3 15 | cd $baseDir/.. 16 | python neural_conversation_model.py \ 17 | --train_dir ubuntu/ \ 18 | --en_vocab_size 60000 \ 19 | --size 512 \ 20 | --data_path ubuntu/train.tsv \ 21 | --dev_data ubuntu/valid.tsv \ 22 | --vocab_path ubuntu/60_chat_vocab.en \ 23 | --attention \ 24 | --decode \ 25 | --beam_search \ 26 | --beam_size 25 \ 27 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | # functions 9 | 10 | # main 11 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 12 | # source /root/venv-py3/bin/activate 13 | # http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information 14 | export TF_CPP_MIN_LOG_LEVEL=3 15 | cd $baseDir/.. 16 | python neural_conversation_model.py \ 17 | --train_dir ubuntu/ \ 18 | --en_vocab_size 60000 \ 19 | --size 512 \ 20 | --data_path ubuntu/train.tsv \ 21 | --dev_data ubuntu/valid.tsv \ 22 | --vocab_path ubuntu/60_chat_vocab.en \ 23 | --attention \ 24 | -------------------------------------------------------------------------------- /seq2seq_model.py: -------------------------------------------------------------------------------- 1 | 2 | """Sequence-to-sequence model with an attention mechanism.""" 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | import random 8 | 9 | import numpy as np 10 | from six.moves import xrange # pylint: disable=redefined-builtin 11 | import tensorflow as tf 12 | 13 | from data_utils import * 14 | from my_seq2seq import * 15 | 16 | class Seq2SeqModel(object): 17 | """Sequence-to-sequence model with attention and for multiple buckets. 
18 | 19 | This class implements a multi-layer recurrent neural network as encoder, 20 | and an attention-based decoder. This is the same as the model described in 21 | this paper: http://arxiv.org/abs/1412.7449 - please look there for details, 22 | or into the seq2seq library for complete model implementation. 23 | This class also allows to use GRU cells in addition to LSTM cells, and 24 | sampled softmax to handle large output vocabulary size. A single-layer 25 | version of this model, but with bi-directional encoder, was presented in 26 | http://arxiv.org/abs/1409.0473 27 | and sampled softmax is described in Section 3 of the following paper. 28 | http://arxiv.org/abs/1412.2007 29 | """ 30 | 31 | def __init__(self, source_vocab_size, target_vocab_size, buckets, size, 32 | num_layers, max_gradient_norm, batch_size, learning_rate, 33 | learning_rate_decay_factor, use_lstm=False, 34 | num_samples=1024, forward_only=False, beam_search = True, beam_size=10, attention=True): 35 | """Create the model. 36 | 37 | Args: 38 | source_vocab_size: size of the source vocabulary. 39 | target_vocab_size: size of the target vocabulary. 40 | buckets: a list of pairs (I, O), where I specifies maximum input length 41 | that will be processed in that bucket, and O specifies maximum output 42 | length. Training instances that have inputs longer than I or outputs 43 | longer than O will be pushed to the next bucket and padded accordingly. 44 | We assume that the list is sorted, e.g., [(2, 4), (8, 16)]. 45 | size: number of units in each layer of the model. 46 | num_layers: number of layers in the model. 47 | max_gradient_norm: gradients will be clipped to maximally this norm. 48 | batch_size: the size of the batches used during training; 49 | the model construction is independent of batch_size, so it can be 50 | changed after initialization if this is convenient, e.g., for decoding. 51 | learning_rate: learning rate to start with. 52 | learning_rate_decay_factor: decay learning rate by this much when needed. 53 | use_lstm: if true, we use LSTM cells instead of GRU cells. 54 | num_samples: number of samples for sampled softmax. 55 | forward_only: if set, we do not construct the backward pass in the model. 56 | """ 57 | self.source_vocab_size = source_vocab_size 58 | self.target_vocab_size = target_vocab_size 59 | self.buckets = buckets 60 | self.batch_size = batch_size 61 | self.learning_rate = tf.Variable(float(learning_rate), trainable=False) 62 | self.learning_rate_decay_op = self.learning_rate.assign( 63 | self.learning_rate * learning_rate_decay_factor) 64 | self.global_step = tf.Variable(0, trainable=False) 65 | 66 | # If we use sampled softmax, we need an output projection. 67 | output_projection = None 68 | softmax_loss_function = None 69 | # Sampled softmax only makes sense if we sample less than vocabulary size. 70 | if num_samples > 0 and num_samples < self.target_vocab_size: 71 | with tf.device("/cpu:0"): 72 | w = tf.get_variable("proj_w", [size, self.target_vocab_size]) 73 | w_t = tf.transpose(w) 74 | b = tf.get_variable("proj_b", [self.target_vocab_size]) 75 | output_projection = (w, b) 76 | 77 | def sampled_loss(inputs, labels): 78 | with tf.device("/cpu:0"): 79 | labels = tf.reshape(labels, [-1, 1]) 80 | return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, 81 | self.target_vocab_size) 82 | softmax_loss_function = sampled_loss 83 | # Create the internal multi-layer cell for our RNN. 
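# (For example, with the default flags size=512 and num_layers=3, this stacks
# three 512-unit GRU cells; passing use_lstm=True swaps in BasicLSTMCell instead.)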
84 | single_cell = tf.nn.rnn_cell.GRUCell(size) 85 | if use_lstm: 86 | single_cell = tf.nn.rnn_cell.BasicLSTMCell(size) 87 | cell = single_cell 88 | if num_layers > 1: 89 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers, state_is_tuple=False) 90 | 91 | # The seq2seq function: we use embedding for the input and attention. 92 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): 93 | if attention: 94 | print("Attention Model") 95 | return embedding_attention_seq2seq( 96 | encoder_inputs, decoder_inputs, cell, 97 | num_encoder_symbols=source_vocab_size, 98 | num_decoder_symbols=target_vocab_size, 99 | embedding_size=size, 100 | output_projection=output_projection, 101 | feed_previous=do_decode, 102 | beam_search=beam_search, 103 | beam_size=beam_size ) 104 | else: 105 | print("Simple Model") 106 | return embedding_rnn_seq2seq( 107 | encoder_inputs, decoder_inputs, cell, 108 | num_encoder_symbols=source_vocab_size, 109 | num_decoder_symbols=target_vocab_size, 110 | embedding_size=size, 111 | output_projection=output_projection, 112 | feed_previous=do_decode, 113 | beam_search=beam_search, 114 | beam_size=beam_size ) 115 | 116 | 117 | # Feeds for inputs. 118 | self.encoder_inputs = [] 119 | self.decoder_inputs = [] 120 | self.target_weights = [] 121 | for i in xrange(buckets[-1][0]): # Last bucket is the biggest one. 122 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 123 | name="encoder{0}".format(i))) 124 | for i in xrange(buckets[-1][1] + 1): 125 | self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 126 | name="decoder{0}".format(i))) 127 | self.target_weights.append(tf.placeholder(tf.float32, shape=[None], 128 | name="weight{0}".format(i))) 129 | 130 | # Our targets are decoder inputs shifted by one. 131 | targets = [self.decoder_inputs[i + 1] 132 | for i in xrange(len(self.decoder_inputs) - 1)] 133 | 134 | # Training outputs and losses. 135 | if forward_only: 136 | if beam_search: 137 | self.outputs, self.beam_path, self.beam_symbol = decode_model_with_buckets( 138 | self.encoder_inputs, self.decoder_inputs, targets, 139 | self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), 140 | softmax_loss_function=softmax_loss_function) 141 | else: 142 | # print self.decoder_inputs 143 | self.outputs, self.losses = model_with_buckets( 144 | self.encoder_inputs, self.decoder_inputs, targets, 145 | self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), 146 | softmax_loss_function=softmax_loss_function) 147 | # If we use output projection, we need to project outputs for decoding. 148 | if output_projection is not None: 149 | for b in xrange(len(buckets)): 150 | self.outputs[b] = [ 151 | tf.matmul(output, output_projection[0]) + output_projection[1] 152 | for output in self.outputs[b] 153 | ] 154 | 155 | 156 | else: 157 | self.outputs, self.losses = model_with_buckets( 158 | self.encoder_inputs, self.decoder_inputs, targets, 159 | self.target_weights, buckets, 160 | lambda x, y: seq2seq_f(x, y, False), 161 | softmax_loss_function=softmax_loss_function) 162 | 163 | # Gradients and SGD update operation for training the model. 
164 | params = tf.trainable_variables() 165 | if not forward_only: 166 | self.gradient_norms = [] 167 | self.updates = [] 168 | opt = tf.train.GradientDescentOptimizer(self.learning_rate) 169 | for b in xrange(len(buckets)): 170 | gradients = tf.gradients(self.losses[b], params) 171 | clipped_gradients, norm = tf.clip_by_global_norm(gradients, 172 | max_gradient_norm) 173 | self.gradient_norms.append(norm) 174 | self.updates.append(opt.apply_gradients( 175 | zip(clipped_gradients, params), global_step=self.global_step)) 176 | 177 | self.saver = tf.train.Saver(tf.all_variables()) 178 | 179 | def step(self, session, encoder_inputs, decoder_inputs, target_weights, 180 | bucket_id, forward_only, beam_search): 181 | """Run a step of the model feeding the given inputs. 182 | 183 | Args: 184 | session: tensorflow session to use. 185 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 186 | decoder_inputs: list of numpy int vectors to feed as decoder inputs. 187 | target_weights: list of numpy float vectors to feed as target weights. 188 | bucket_id: which bucket of the model to use. 189 | forward_only: whether to do the backward step or only forward. 190 | 191 | Returns: 192 | A triple consisting of gradient norm (or None if we did not do backward), 193 | average perplexity, and the outputs. 194 | 195 | Raises: 196 | ValueError: if length of encoder_inputs, decoder_inputs, or 197 | target_weights disagrees with bucket size for the specified bucket_id. 198 | """ 199 | # Check if the sizes match. 200 | encoder_size, decoder_size = self.buckets[bucket_id] 201 | if len(encoder_inputs) != encoder_size: 202 | raise ValueError("Encoder length must be equal to the one in bucket," 203 | " %d != %d." % (len(encoder_inputs), encoder_size)) 204 | if len(decoder_inputs) != decoder_size: 205 | raise ValueError("Decoder length must be equal to the one in bucket," 206 | " %d != %d." % (len(decoder_inputs), decoder_size)) 207 | if len(target_weights) != decoder_size: 208 | raise ValueError("Weights length must be equal to the one in bucket," 209 | " %d != %d." % (len(target_weights), decoder_size)) 210 | 211 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 212 | input_feed = {} 213 | for l in xrange(encoder_size): 214 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 215 | for l in xrange(decoder_size): 216 | input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] 217 | input_feed[self.target_weights[l].name] = target_weights[l] 218 | 219 | # Since our targets are decoder inputs shifted by one, we need one more. 220 | last_target = self.decoder_inputs[decoder_size].name 221 | input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) 222 | 223 | # Output feed: depends on whether we do a backward step or not. 224 | if not forward_only: 225 | output_feed = [self.updates[bucket_id], # Update Op that does SGD. 226 | self.gradient_norms[bucket_id], # Gradient norm. 227 | self.losses[bucket_id]] # Loss for this batch. 228 | else: 229 | if beam_search: 230 | output_feed = [self.beam_path[bucket_id]] # Loss for this batch. 231 | output_feed.append(self.beam_symbol[bucket_id]) 232 | else: 233 | output_feed = [self.losses[bucket_id]] 234 | 235 | for l in xrange(decoder_size): # Output logits. 236 | output_feed.append(self.outputs[bucket_id][l]) 237 | # print bucket_id 238 | outputs = session.run(output_feed, input_feed) 239 | if not forward_only: 240 | return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. 
241 | else: 242 | if beam_search: 243 | return outputs[0], outputs[1], outputs[2:] # No gradient norm, loss, outputs. 244 | else: 245 | return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. 246 | 247 | def get_batch(self, data, bucket_id): 248 | """Get a random batch of data from the specified bucket, prepare for step. 249 | 250 | To feed data in step(..) it must be a list of batch-major vectors, while 251 | data here contains single length-major cases. So the main logic of this 252 | function is to re-index data cases to be in the proper format for feeding. 253 | 254 | Args: 255 | data: a tuple of size len(self.buckets) in which each element contains 256 | lists of pairs of input and output data that we use to create a batch. 257 | bucket_id: integer, which bucket to get the batch for. 258 | 259 | Returns: 260 | The triple (encoder_inputs, decoder_inputs, target_weights) for 261 | the constructed batch that has the proper format to call step(...) later. 262 | """ 263 | encoder_size, decoder_size = self.buckets[bucket_id] 264 | encoder_inputs, decoder_inputs = [], [] 265 | 266 | # Get a random batch of encoder and decoder inputs from data, 267 | # pad them if needed, reverse encoder inputs and add GO to decoder. 268 | for _ in xrange(self.batch_size): 269 | encoder_input, decoder_input = random.choice(data[bucket_id]) 270 | 271 | # Encoder inputs are padded and then reversed. 272 | encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input)) 273 | encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) 274 | 275 | # Decoder inputs get an extra "GO" symbol, and are padded then. 276 | decoder_pad_size = decoder_size - len(decoder_input) - 1 277 | decoder_inputs.append([GO_ID] + decoder_input + 278 | [PAD_ID] * decoder_pad_size) 279 | 280 | # Now we create batch-major vectors from the data selected above. 281 | batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], [] 282 | 283 | # Batch encoder inputs are just re-indexed encoder_inputs. 284 | for length_idx in xrange(encoder_size): 285 | batch_encoder_inputs.append( 286 | np.array([encoder_inputs[batch_idx][length_idx] 287 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 288 | 289 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights. 290 | for length_idx in xrange(decoder_size): 291 | batch_decoder_inputs.append( 292 | np.array([decoder_inputs[batch_idx][length_idx] 293 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 294 | 295 | # Create target_weights to be 0 for targets that are padding. 296 | batch_weight = np.ones(self.batch_size, dtype=np.float32) 297 | for batch_idx in xrange(self.batch_size): 298 | # We set weight to 0 if the corresponding target is a PAD symbol. 299 | # The corresponding target is decoder_input shifted by 1 forward. 300 | if length_idx < decoder_size - 1: 301 | target = decoder_inputs[batch_idx][length_idx + 1] 302 | if length_idx == decoder_size - 1 or target == PAD_ID: 303 | batch_weight[batch_idx] = 0.0 304 | batch_weights.append(batch_weight) 305 | return batch_encoder_inputs, batch_decoder_inputs, batch_weights 306 | --------------------------------------------------------------------------------