├── .gitignore ├── LICENSE.txt ├── README.md ├── __init__.py ├── data_utils.py ├── my_seq2seq.py ├── neural_conversation_model.py ├── scripts ├── predict.sh └── train.sh └── seq2seq_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | ubuntu 2 | *.swo 3 | *.swp 4 | __pycache__ 5 | dada 6 | data 7 | sftp-config.json 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural_Conversation_Models 2 | ================================= 3 | This implementation contains an extension of seq2seq tutorial for conversation models in Tensorflow: 4 | 5 | 1. Option to use Beam Search and Beam Size for decoding 6 | 7 | 2. Currently, it supports 8 | - Simple seq2seq models 9 | - Attention based seq2seq models 10 | 11 | 3. 
To get better results, use beam search during decoding / inference
12 | 
13 | Examples of the basic model can be found in this paper:
14 | 
15 | https://arxiv.org/abs/1702.05512
16 | 
17 | 
18 | Prerequisites
19 | -------------
20 | 
21 | - Python 3.3+
22 | - [NLTK](http://www.nltk.org/)
23 | - [TensorFlow](https://www.tensorflow.org/) 0.12.1
24 | 
25 | Installation
26 | -----
27 | 
28 | * Mac
29 | ```
30 | virtualenv --no-site-packages -p /usr/local/bin/python3.6 ~/venv-py3
31 | source ~/venv-py3/bin/activate
32 | pip3 install --upgrade \
33 | https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.1-py3-none-any.whl # for CPU usage
34 | ```
35 | 
36 | * Linux with GPU Driver
37 | ```
38 | virtualenv --no-site-packages -p /usr/local/bin/python3.6 ~/venv-py3
39 | source ~/venv-py3/bin/activate
40 | pip3 install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp34-cp34m-linux_x86_64.whl
41 | ```
42 | 
43 | Data
44 | -----
45 | Input data is expected in TSV format, where the first field is the context and the second is the reply.
46 | 
47 | Ubuntu Dialogue data in TSV format can be found [here](https://drive.google.com/file/d/0BwPa9lrosQKdSTZxZ0tydUFGWE0/view) or in this [Git repo](http://git.oschina.net/ubiware/neural_conversation_models_ubuntu_corpus).
48 | 
49 | Example:
50 | 1. What are you doing ? \t Writing seq2seq model .
51 | 
52 | Usage
53 | -----
54 | 
55 | To train a model on the Ubuntu dataset:
56 | 
57 | $ python neural_conversation_model.py --train_dir ubuntu/ --en_vocab_size 60000 --size 512 --data_path ubuntu/train.tsv --dev_data ubuntu/valid.tsv --vocab_path ubuntu/60k_vocan.en --attention
58 | 
59 | To test an existing model:
60 | 
61 | $ python neural_conversation_model.py --train_dir ubuntu/ --en_vocab_size 60000 --size 512 --data_path ubuntu/train.tsv --dev_data ubuntu/valid.tsv --vocab_path ubuntu/60k_vocan.en --attention --decode --beam_search --beam_size 25
62 | 
63 | Todo
64 | -----
65 | 1. Add other state-of-the-art neural models.
66 | 2. Add layer normalization (in progress):
67 | 
68 | https://github.com/pbhatia243/tf-layer-norm
69 | 
70 | ## Contact
71 | Parminder Bhatia, parminder.bhatia243@gmail.com
72 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | from tensorflow.models.rnn.translate import data_utils
6 | from tensorflow.models.rnn.translate import seq2seq_model
7 | 
8 | 
9 | 
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import gzip 21 | import os 22 | import re 23 | import tarfile 24 | 25 | from six.moves import urllib 26 | 27 | from tensorflow.python.platform import gfile 28 | 29 | # Special vocabulary symbols - we always put them at the start. 30 | _PAD = "_PAD" 31 | _GO = "_GO" 32 | _EOS = "_EOS" 33 | _UNK = "_UNK" 34 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK] 35 | 36 | PAD_ID = 0 37 | GO_ID = 1 38 | EOS_ID = 2 39 | UNK_ID = 3 40 | 41 | # Regular expressions used to tokenize. 42 | # _WORD_SPLIT = re.compile("([.,!?\"':;)(])") 43 | _WORD_SPLIT = re.compile("([.,!/?\":;)(])") 44 | _DIGIT_RE = re.compile(r"\d") 45 | 46 | 47 | 48 | def gunzip_file(gz_path, new_path): 49 | """Unzips from gz_path into new_path.""" 50 | print("Unpacking %s to %s" % (gz_path, new_path)) 51 | with gzip.open(gz_path, "rb") as gz_file: 52 | with open(new_path, "w") as new_file: 53 | for line in gz_file: 54 | new_file.write(line) 55 | 56 | 57 | def basic_tokenizer(sentence): 58 | """Very basic tokenizer: split the sentence into a list of tokens.""" 59 | words = [] 60 | for space_separated_fragment in sentence.strip().split(): 61 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) 62 | return [w for w in words if w] 63 | 64 | 65 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 66 | tokenizer=None, normalize_digits=True): 67 | """Create vocabulary file (if it does not exist yet) from data file. 68 | 69 | Data file is assumed to contain one sentence per line. Each sentence is 70 | tokenized and digits are normalized (if normalize_digits is set). 71 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 72 | We write it to vocabulary_path in a one-token-per-line format, so that later 73 | token in the first line gets id=0, second line gets id=1, and so on. 74 | 75 | Args: 76 | vocabulary_path: path where the vocabulary will be created. 77 | data_path: data file that will be used to create vocabulary. 78 | max_vocabulary_size: limit on the size of the created vocabulary. 79 | tokenizer: a function to use to tokenize each data sentence; 80 | if None, basic_tokenizer will be used. 81 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 
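    Example (an illustrative sketch; the file paths below are hypothetical):
      create_vocabulary("ubuntu/vocab60k.en", "ubuntu/train.tsv", 60000)
    reads tab-separated context/reply pairs from train.tsv and writes at most
    60000 tokens (the special symbols _PAD, _GO, _EOS and _UNK always occupy
    ids 0-3) to vocab60k.en, one token per line, ordered by frequency.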
82 | """ 83 | if not gfile.Exists(vocabulary_path): 84 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 85 | vocab = {} 86 | with gfile.GFile(data_path, mode="r") as f: 87 | counter = 0 88 | for line in f: 89 | counter += 1 90 | if counter % 100000 == 0: 91 | print(" processing line %d" % counter) 92 | text_conversation =line.strip().lower().split("\t") 93 | if len(text_conversation) == 2: 94 | txt = text_conversation[0] + " " + text_conversation[1] 95 | tokens = tokenizer(txt) if tokenizer else basic_tokenizer(txt) 96 | for w in tokens: 97 | # word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w 98 | word = w 99 | if word in vocab: 100 | vocab[word] += 1 101 | else: 102 | vocab[word] = 1 103 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 104 | print(len(vocab_list)) 105 | if len(vocab_list) > max_vocabulary_size: 106 | vocab_list = vocab_list[:max_vocabulary_size] 107 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 108 | for w in vocab_list: 109 | vocab_file.write(w + "\n") 110 | 111 | def initialize_vocabulary(vocabulary_path): 112 | """Initialize vocabulary from file. 113 | 114 | We assume the vocabulary is stored one-item-per-line, so a file: 115 | dog 116 | cat 117 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 118 | also return the reversed-vocabulary ["dog", "cat"]. 119 | 120 | Args: 121 | vocabulary_path: path to the file containing the vocabulary. 122 | 123 | Returns: 124 | a pair: the vocabulary (a dictionary mapping string to integers), and 125 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 126 | 127 | Raises: 128 | ValueError: if the provided vocabulary_path does not exist. 129 | """ 130 | if gfile.Exists(vocabulary_path): 131 | rev_vocab = [] 132 | with gfile.GFile(vocabulary_path, mode="r") as f: 133 | rev_vocab.extend(f.readlines()) 134 | rev_vocab = [line.strip() for line in rev_vocab] 135 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 136 | return vocab, rev_vocab 137 | else: 138 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 139 | 140 | 141 | def sentence_to_token_ids(sentence, vocabulary, 142 | tokenizer=None, normalize_digits=True): 143 | """Convert a string to list of integers representing token-ids. 144 | 145 | For example, a sentence "I have a dog" may become tokenized into 146 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 147 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 148 | 149 | Args: 150 | sentence: a string, the sentence to convert to token-ids. 151 | vocabulary: a dictionary mapping tokens to integers. 152 | tokenizer: a function to use to tokenize each sentence; 153 | if None, basic_tokenizer will be used. 154 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 155 | 156 | Returns: 157 | a list of integers, the token-ids for the sentence. 158 | """ 159 | if tokenizer: 160 | words = tokenizer(sentence) 161 | else: 162 | words = basic_tokenizer(sentence) 163 | # if not normalize_digits: 164 | return [vocabulary.get(w, UNK_ID) for w in words] 165 | # Normalize digits by 0 before looking words up in the vocabulary. 166 | # return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] 167 | 168 | 169 | def data_to_token_ids(data_path, target_path, vocabulary_path, 170 | tokenizer=None, normalize_digits=True): 171 | """Tokenize data file and turn into token-ids using given vocabulary file. 
172 | 173 | This function loads data line-by-line from data_path, calls the above 174 | sentence_to_token_ids, and saves the result to target_path. See comment 175 | for sentence_to_token_ids on the details of token-ids format. 176 | 177 | Args: 178 | data_path: path to the data file in one-sentence-per-line format. 179 | target_path: path where the file with token-ids will be created. 180 | vocabulary_path: path to the vocabulary file. 181 | tokenizer: a function to use to tokenize each sentence; 182 | if None, basic_tokenizer will be used. 183 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 184 | """ 185 | if not gfile.Exists(target_path): 186 | print("Tokenizing data in %s" % data_path) 187 | vocab, _ = initialize_vocabulary(vocabulary_path) 188 | with gfile.GFile(data_path, mode="r") as data_file: 189 | with gfile.GFile(target_path, mode="w") as tokens_file: 190 | counter = 0 191 | for line in data_file: 192 | counter += 1 193 | if counter % 100000 == 0: 194 | print(" tokenizing line %d" % counter) 195 | token_ids = sentence_to_token_ids(line, vocab, tokenizer, 196 | normalize_digits) 197 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 198 | 199 | -------------------------------------------------------------------------------- /my_seq2seq.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """Library for creating sequence-to-sequence models in TensorFlow. 4 | 5 | Sequence-to-sequence recurrent neural networks can learn complex functions 6 | that map input sequences to output sequences. These models yield very good 7 | results on a number of tasks, such as speech recognition, parsing, machine 8 | translation, or even constructing automated replies to emails. 9 | 10 | 11 | * Full sequence-to-sequence models. 12 | 13 | - embedding_rnn_seq2seq: The basic model with input embedding. 14 | - embedding_attention_seq2seq: Advanced model with input embedding and 15 | the neural attention mechanism; recommended for complex tasks. 16 | 17 | 18 | * Decoders 19 | - rnn_decoder: The basic decoder based on a pure RNN. 20 | - attention_decoder: A decoder that uses the attention mechanism. 21 | 22 | * Losses. 23 | - sequence_loss: Loss for a sequence model returning average log-perplexity. 24 | - sequence_loss_by_example: As above, but not averaging over all examples. 25 | 26 | * model_with_buckets: A convenience function to create models with bucketing 27 | (see the tutorial above for an explanation of why and how to use it). 
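
Minimal usage sketch (illustrative only -- the cell size, vocabulary sizes and
sequence lengths below are made up; the call follows the TF 0.12-era API used
throughout this file):

  cell = tf.nn.rnn_cell.GRUCell(512)
  encoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(10)]
  decoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(11)]
  outputs, state = embedding_rnn_seq2seq(
      encoder_inputs, decoder_inputs, cell,
      num_encoder_symbols=60000, num_decoder_symbols=60000,
      embedding_size=512, feed_previous=False, beam_search=False)

With beam_search=True (the default) the decoders require an output_projection
and return (outputs, state, beam_path, beam_symbols), where outputs hold the
decoded token ids rather than logits.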
28 | """ 29 | from __future__ import absolute_import 30 | from __future__ import division 31 | from __future__ import print_function 32 | 33 | from six.moves import xrange # pylint: disable=redefined-builtin 34 | from six.moves import zip # pylint: disable=redefined-builtin 35 | 36 | from tensorflow.python.framework import dtypes 37 | from tensorflow.python.framework import ops 38 | from tensorflow.python.ops import array_ops 39 | from tensorflow.python.ops import control_flow_ops 40 | from tensorflow.python.ops import embedding_ops 41 | from tensorflow.python.ops import math_ops 42 | from tensorflow.python.ops import nn_ops 43 | from tensorflow.python.ops import rnn 44 | from tensorflow.python.ops import rnn_cell 45 | from tensorflow.python.ops import variable_scope 46 | import tensorflow as tf 47 | 48 | try: 49 | linear = tf.nn.rnn_cell.linear 50 | except: 51 | from tensorflow.python.ops.rnn_cell import _linear as linear 52 | 53 | 54 | def _extract_argmax_and_embed(embedding, output_projection=None, 55 | update_embedding=True): 56 | """Get a loop_function that extracts the previous symbol and embeds it. 57 | Args: 58 | embedding: embedding tensor for symbols. 59 | output_projection: None or a pair (W, B). If provided, each fed previous 60 | output will first be multiplied by W and added B. 61 | update_embedding: Boolean; if False, the gradients will not propagate 62 | through the embeddings. 63 | Returns: 64 | A loop function. 65 | """ 66 | def loop_function(prev, _): 67 | if output_projection is not None: 68 | prev = nn_ops.xw_plus_b( 69 | prev, output_projection[0], output_projection[1]) 70 | prev_symbol = math_ops.argmax(prev, 1) 71 | # Note that gradients will not propagate through the second parameter of 72 | # embedding_lookup. 73 | emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) 74 | if not update_embedding: 75 | emb_prev = array_ops.stop_gradient(emb_prev) 76 | return emb_prev 77 | return loop_function 78 | 79 | def _extract_beam_search(embedding, beam_size, num_symbols, embedding_size, output_projection=None, 80 | update_embedding=True): 81 | """Get a loop_function that extracts the previous symbol and embeds it. 82 | 83 | Args: 84 | embedding: embedding tensor for symbols. 85 | output_projection: None or a pair (W, B). If provided, each fed previous 86 | output will first be multiplied by W and added B. 87 | update_embedding: Boolean; if False, the gradients will not propagate 88 | through the embeddings. 89 | 90 | Returns: 91 | A loop function. 92 | """ 93 | def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols): 94 | if output_projection is not None: 95 | prev = nn_ops.xw_plus_b( 96 | prev, output_projection[0], output_projection[1]) 97 | # prev= prev.get_shape().with_rank(2)[1] 98 | 99 | probs = tf.log(tf.nn.softmax(prev)) 100 | 101 | if i > 1: 102 | 103 | probs = tf.reshape(probs + log_beam_probs[-1], 104 | [-1, beam_size * num_symbols]) 105 | 106 | best_probs, indices = tf.nn.top_k(probs, beam_size) 107 | indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1]))) 108 | best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1])) 109 | 110 | symbols = indices % num_symbols # Which word in vocabulary. 111 | beam_parent = indices // num_symbols # Which hypothesis it came from. 112 | 113 | 114 | beam_symbols.append(symbols) 115 | beam_path.append(beam_parent) 116 | log_beam_probs.append(best_probs) 117 | 118 | # Note that gradients will not propagate through the second parameter of 119 | # embedding_lookup. 
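      # Illustrative example (made-up numbers): with beam_size=3 and
      # num_symbols=5, a flattened top_k index of 7 selects word 7 % 5 = 2,
      # continued from parent hypothesis 7 // 5 = 1. The chosen symbols are
      # embedded below and reshaped to [beam_size, embedding_size] so that each
      # surviving hypothesis feeds its own input to the next decoder step.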
120 | 121 | emb_prev = embedding_ops.embedding_lookup(embedding, symbols) 122 | emb_prev = tf.reshape(emb_prev,[beam_size,embedding_size]) 123 | # emb_prev = embedding_ops.embedding_lookup(embedding, symbols) 124 | if not update_embedding: 125 | emb_prev = array_ops.stop_gradient(emb_prev) 126 | return emb_prev 127 | return loop_function 128 | 129 | 130 | def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, 131 | scope=None): 132 | """RNN decoder for the sequence-to-sequence model. 133 | 134 | Args: 135 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 136 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. 137 | cell: rnn_cell.RNNCell defining the cell function and size. 138 | loop_function: If not None, this function will be applied to the i-th output 139 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 140 | except for the first element ("GO" symbol). This can be used for decoding, 141 | but also for training to emulate http://arxiv.org/abs/1506.03099. 142 | Signature -- loop_function(prev, i) = next 143 | * prev is a 2D Tensor of shape [batch_size x output_size], 144 | * i is an integer, the step number (when advanced control is needed), 145 | * next is a 2D Tensor of shape [batch_size x input_size]. 146 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 147 | 148 | Returns: 149 | A tuple of the form (outputs, state), where: 150 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 151 | shape [batch_size x output_size] containing generated outputs. 152 | state: The state of each cell at the final time-step. 153 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 154 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 155 | states can be the same. They are different for LSTM cells though.) 156 | """ 157 | with variable_scope.variable_scope(scope or "rnn_decoder"): 158 | state = initial_state 159 | outputs = [] 160 | prev = None 161 | for i, inp in enumerate(decoder_inputs): 162 | if loop_function is not None and prev is not None: 163 | with variable_scope.variable_scope("loop_function", reuse=True): 164 | inp = loop_function(prev, i) 165 | if i > 0: 166 | variable_scope.get_variable_scope().reuse_variables() 167 | output, state = cell(inp, state) 168 | 169 | outputs.append(output) 170 | if loop_function is not None: 171 | prev = output 172 | return outputs, state 173 | 174 | def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, 175 | scope=None,output_projection=None, beam_size=10): 176 | """RNN decoder for the sequence-to-sequence model. 177 | 178 | Args: 179 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 180 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. 181 | cell: rnn_cell.RNNCell defining the cell function and size. 182 | loop_function: If not None, this function will be applied to the i-th output 183 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 184 | except for the first element ("GO" symbol). This can be used for decoding, 185 | but also for training to emulate http://arxiv.org/abs/1506.03099. 186 | Signature -- loop_function(prev, i) = next 187 | * prev is a 2D Tensor of shape [batch_size x output_size], 188 | * i is an integer, the step number (when advanced control is needed), 189 | * next is a 2D Tensor of shape [batch_size x input_size]. 190 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 
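    output_projection: Pair (W, B); each cell output is projected with W and B
      to vocabulary logits, and the argmax id of that projection is what gets
      appended to outputs at every step.
    beam_size: Integer, number of hypotheses kept alive at each decoding step;
      the decoder state is tiled beam_size times after the first step.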
191 | 192 | Returns: 193 | A tuple of the form (outputs, state), where: 194 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 195 | shape [batch_size x output_size] containing generated outputs. 196 | state: The state of each cell at the final time-step. 197 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 198 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 199 | states can be the same. They are different for LSTM cells though.) 200 | """ 201 | with variable_scope.variable_scope(scope or "rnn_decoder"): 202 | state = initial_state 203 | outputs = [] 204 | prev = None 205 | log_beam_probs, beam_path, beam_symbols = [],[],[] 206 | state_size = int(initial_state.get_shape().with_rank(2)[1]) 207 | 208 | for i, inp in enumerate(decoder_inputs): 209 | if loop_function is not None and prev is not None: 210 | with variable_scope.variable_scope("loop_function", reuse=True): 211 | inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) 212 | if i > 0: 213 | variable_scope.get_variable_scope().reuse_variables() 214 | 215 | input_size = inp.get_shape().with_rank(2)[1] 216 | print(input_size) 217 | x = inp 218 | output, state = cell(x, state) 219 | 220 | if loop_function is not None: 221 | prev = output 222 | if i ==0: 223 | states =[] 224 | for kk in range(beam_size): 225 | states.append(state) 226 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 227 | 228 | outputs.append(tf.argmax(nn_ops.xw_plus_b( 229 | output, output_projection[0], output_projection[1]), dimension=1)) 230 | return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size]) 231 | 232 | 233 | def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, 234 | embedding_size, output_projection=None, 235 | feed_previous=False, 236 | update_embedding_for_previous=True, scope=None, beam_search=True, beam_size=10 ): 237 | """RNN decoder with embedding and a pure-decoding option. 238 | 239 | Args: 240 | decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 241 | initial_state: 2D Tensor [batch_size x cell.state_size]. 242 | cell: rnn_cell.RNNCell defining the cell function. 243 | num_symbols: Integer, how many symbols come into the embedding. 244 | embedding_size: Integer, the length of the embedding vector for each symbol. 245 | output_projection: None or a pair (W, B) of output projection weights and 246 | biases; W has shape [output_size x num_symbols] and B has 247 | shape [num_symbols]; if provided and feed_previous=True, each fed 248 | previous output will first be multiplied by W and added B. 249 | feed_previous: Boolean; if True, only the first of decoder_inputs will be 250 | used (the "GO" symbol), and all other decoder inputs will be generated by: 251 | next = embedding_lookup(embedding, argmax(previous_output)), 252 | In effect, this implements a greedy decoder. It can also be used 253 | during training to emulate http://arxiv.org/abs/1506.03099. 254 | If False, decoder_inputs are used as given (the standard decoder case). 255 | update_embedding_for_previous: Boolean; if False and feed_previous=True, 256 | only the embedding for the first symbol of decoder_inputs (the "GO" 257 | symbol) will be updated by back propagation. Embeddings for the symbols 258 | generated from the decoder itself remain unchanged. This parameter has 259 | no effect if feed_previous=False. 
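    beam_search: Boolean; if True, decode with beam search instead of greedy
      argmax decoding and return (outputs, state, beam_path, beam_symbols),
      where outputs are token ids produced through output_projection.
    beam_size: Integer, number of beam hypotheses to keep (only used when
      beam_search is True).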
260 | scope: VariableScope for the created subgraph; defaults to 261 | "embedding_rnn_decoder". 262 | 263 | Returns: 264 | A tuple of the form (outputs, state), where: 265 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 266 | shape [batch_size x output_size] containing the generated outputs. 267 | state: The state of each decoder cell in each time-step. This is a list 268 | with length len(decoder_inputs) -- one item for each time-step. 269 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 270 | 271 | Raises: 272 | ValueError: When output_projection has the wrong shape. 273 | """ 274 | if output_projection is not None: 275 | proj_weights = ops.convert_to_tensor(output_projection[0], 276 | dtype=dtypes.float32) 277 | proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) 278 | proj_biases = ops.convert_to_tensor( 279 | output_projection[1], dtype=dtypes.float32) 280 | proj_biases.get_shape().assert_is_compatible_with([num_symbols]) 281 | 282 | with variable_scope.variable_scope(scope or "embedding_rnn_decoder"): 283 | with ops.device("/cpu:0"): 284 | embedding = variable_scope.get_variable("embedding", 285 | [num_symbols, embedding_size]) 286 | 287 | if beam_search: 288 | loop_function = _extract_beam_search( 289 | embedding, beam_size,num_symbols,embedding_size, output_projection, 290 | update_embedding_for_previous) 291 | else: 292 | loop_function = _extract_argmax_and_embed( 293 | embedding, output_projection, 294 | update_embedding_for_previous) if feed_previous else None 295 | 296 | emb_inp = [ 297 | embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] 298 | 299 | 300 | if beam_search: 301 | return beam_rnn_decoder(emb_inp, initial_state, cell, 302 | loop_function=loop_function,output_projection=output_projection, beam_size=beam_size) 303 | 304 | else: 305 | return rnn_decoder(emb_inp, initial_state, cell, 306 | loop_function=loop_function) 307 | 308 | 309 | 310 | def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, 311 | num_encoder_symbols, num_decoder_symbols, 312 | embedding_size, output_projection=None, 313 | feed_previous=False, dtype=dtypes.float32, 314 | scope=None, beam_search=True, beam_size=10): 315 | """Embedding RNN sequence-to-sequence model. 316 | 317 | This model first embeds encoder_inputs by a newly created embedding (of shape 318 | [num_encoder_symbols x input_size]). Then it runs an RNN to encode 319 | embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs 320 | by another newly created embedding (of shape [num_decoder_symbols x 321 | input_size]). Then it runs RNN decoder, initialized with the last 322 | encoder state, on embedded decoder_inputs. 323 | 324 | Args: 325 | encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 326 | decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 327 | cell: rnn_cell.RNNCell defining the cell function and size. 328 | num_encoder_symbols: Integer; number of symbols on the encoder side. 329 | num_decoder_symbols: Integer; number of symbols on the decoder side. 330 | embedding_size: Integer, the length of the embedding vector for each symbol. 331 | output_projection: None or a pair (W, B) of output projection weights and 332 | biases; W has shape [output_size x num_decoder_symbols] and B has 333 | shape [num_decoder_symbols]; if provided and feed_previous=True, each 334 | fed previous output will first be multiplied by W and added B. 
335 | feed_previous: Boolean or scalar Boolean Tensor; if True, only the first 336 | of decoder_inputs will be used (the "GO" symbol), and all other decoder 337 | inputs will be taken from previous outputs (as in embedding_rnn_decoder). 338 | If False, decoder_inputs are used as given (the standard decoder case). 339 | dtype: The dtype of the initial state for both the encoder and encoder 340 | rnn cells (default: tf.float32). 341 | scope: VariableScope for the created subgraph; defaults to 342 | "embedding_rnn_seq2seq" 343 | 344 | Returns: 345 | A tuple of the form (outputs, state), where: 346 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 347 | shape [batch_size x num_decoder_symbols] containing the generated 348 | outputs. 349 | state: The state of each decoder cell in each time-step. This is a list 350 | with length len(decoder_inputs) -- one item for each time-step. 351 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 352 | """ 353 | with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"): 354 | # Encoder. 355 | encoder_cell = rnn_cell.EmbeddingWrapper( 356 | cell, embedding_classes=num_encoder_symbols, 357 | embedding_size=embedding_size) 358 | _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) 359 | 360 | # Decoder. 361 | if output_projection is None: 362 | cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) 363 | 364 | 365 | return embedding_rnn_decoder( 366 | decoder_inputs, encoder_state, cell, num_decoder_symbols, 367 | embedding_size, output_projection=output_projection, 368 | feed_previous=feed_previous, beam_search=beam_search, beam_size=beam_size) 369 | 370 | 371 | 372 | 373 | 374 | def attention_decoder(decoder_inputs, initial_state, attention_states, cell, 375 | output_size=None, num_heads=1, loop_function=None, 376 | dtype=dtypes.float32, scope=None, 377 | initial_state_attention=False): 378 | """RNN decoder with attention for the sequence-to-sequence model. 379 | 380 | In this context "attention" means that, during decoding, the RNN can look up 381 | information in the additional tensor attention_states, and it does this by 382 | focusing on a few entries from the tensor. This model has proven to yield 383 | especially good results in a number of sequence-to-sequence tasks. This 384 | implementation is based on http://arxiv.org/abs/1412.7449 (see below for 385 | details). It is recommended for complex sequence-to-sequence tasks. 386 | 387 | Args: 388 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 389 | initial_state: 2D Tensor [batch_size x cell.state_size]. 390 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 391 | cell: rnn_cell.RNNCell defining the cell function and size. 392 | output_size: Size of the output vectors; if None, we use cell.output_size. 393 | num_heads: Number of attention heads that read from attention_states. 394 | loop_function: If not None, this function will be applied to i-th output 395 | in order to generate i+1-th input, and decoder_inputs will be ignored, 396 | except for the first element ("GO" symbol). This can be used for decoding, 397 | but also for training to emulate http://arxiv.org/abs/1506.03099. 398 | Signature -- loop_function(prev, i) = next 399 | * prev is a 2D Tensor of shape [batch_size x output_size], 400 | * i is an integer, the step number (when advanced control is needed), 401 | * next is a 2D Tensor of shape [batch_size x input_size]. 
402 | dtype: The dtype to use for the RNN initial state (default: tf.float32). 403 | scope: VariableScope for the created subgraph; default: "attention_decoder". 404 | initial_state_attention: If False (default), initial attentions are zero. 405 | If True, initialize the attentions from the initial state and attention 406 | states -- useful when we wish to resume decoding from a previously 407 | stored decoder state and attention states. 408 | 409 | Returns: 410 | A tuple of the form (outputs, state), where: 411 | outputs: A list of the same length as decoder_inputs of 2D Tensors of 412 | shape [batch_size x output_size]. These represent the generated outputs. 413 | Output i is computed from input i (which is either the i-th element 414 | of decoder_inputs or loop_function(output {i-1}, i)) as follows. 415 | First, we run the cell on a combination of the input and previous 416 | attention masks: 417 | cell_output, new_state = cell(linear(input, prev_attn), prev_state). 418 | Then, we calculate new attention masks: 419 | new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) 420 | and then we calculate the output: 421 | output = linear(cell_output, new_attn). 422 | state: The state of each decoder cell the final time-step. 423 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 424 | 425 | Raises: 426 | ValueError: when num_heads is not positive, there are no inputs, shapes 427 | of attention_states are not set, or input size cannot be inferred 428 | from the input. 429 | """ 430 | if not decoder_inputs: 431 | raise ValueError("Must provide at least 1 input to attention decoder.") 432 | if num_heads < 1: 433 | raise ValueError("With less than 1 heads, use a non-attention decoder.") 434 | if not attention_states.get_shape()[1:2].is_fully_defined(): 435 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 436 | % attention_states.get_shape()) 437 | if output_size is None: 438 | output_size = cell.output_size 439 | 440 | with variable_scope.variable_scope(scope or "attention_decoder"): 441 | batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 442 | attn_length = attention_states.get_shape()[1].value 443 | attn_size = attention_states.get_shape()[2].value 444 | 445 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. 446 | hidden = array_ops.reshape( 447 | attention_states, [-1, attn_length, 1, attn_size]) 448 | hidden_features = [] 449 | v = [] 450 | attention_vec_size = attn_size # Size of query vectors for attention. 451 | for a in xrange(num_heads): 452 | k = variable_scope.get_variable("AttnW_%d" % a, 453 | [1, 1, attn_size, attention_vec_size]) 454 | hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 455 | v.append(variable_scope.get_variable("AttnV_%d" % a, 456 | [attention_vec_size])) 457 | 458 | state = initial_state 459 | def attention(query): 460 | """Put attention masks on hidden using hidden_features and query.""" 461 | ds = [] # Results of attention reads will be stored here. 462 | for a in xrange(num_heads): 463 | with variable_scope.variable_scope("Attention_%d" % a): 464 | y = linear(query, attention_vec_size, True) 465 | y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) 466 | # Attention mask is a softmax of v^T * tanh(...). 467 | s = math_ops.reduce_sum( 468 | v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) 469 | a = nn_ops.softmax(s) 470 | # Now calculate the attention-weighted vector d. 
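          # For each head: a_j = softmax_j(v^T tanh(W h_j + U query)) over the
          # attn_length encoder positions, and d = sum_j a_j * h_j is the
          # [batch_size, attn_size] context vector that is fed back into the
          # cell input and output projections below.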
471 | d = math_ops.reduce_sum( 472 | array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, 473 | [1, 2]) 474 | ds.append(array_ops.reshape(d, [-1, attn_size])) 475 | return ds 476 | 477 | outputs = [] 478 | prev = None 479 | batch_attn_size = array_ops.pack([batch_size, attn_size]) 480 | attns = [array_ops.zeros(batch_attn_size, dtype=dtype) 481 | for _ in xrange(num_heads)] 482 | for a in attns: # Ensure the second shape of attention vectors is set. 483 | a.set_shape([None, attn_size]) 484 | if initial_state_attention: 485 | attns = attention(initial_state) 486 | for i, inp in enumerate(decoder_inputs): 487 | if i > 0: 488 | variable_scope.get_variable_scope().reuse_variables() 489 | # If loop_function is set, we use it instead of decoder_inputs. 490 | if loop_function is not None : 491 | with variable_scope.variable_scope("loop_function", reuse=True): 492 | if prev is not None: 493 | inp = loop_function(prev, i) 494 | 495 | input_size = inp.get_shape().with_rank(2)[1] 496 | 497 | x = linear([inp] + attns, input_size, True) 498 | # Run the RNN. 499 | cell_output, state = cell(x, state) 500 | # Run the attention mechanism. 501 | if i == 0 and initial_state_attention: 502 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 503 | reuse=True): 504 | attns = attention(state) 505 | else: 506 | attns = attention(state) 507 | 508 | with variable_scope.variable_scope("AttnOutputProjection"): 509 | output = linear([cell_output] + attns, output_size, True) 510 | if loop_function is not None: 511 | prev = output 512 | outputs.append(output) 513 | 514 | return outputs, state 515 | 516 | 517 | def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell, 518 | output_size=None, num_heads=1, loop_function=None, 519 | dtype=dtypes.float32, scope=None, 520 | initial_state_attention=False, output_projection=None, beam_size=10): 521 | """RNN decoder with attention for the sequence-to-sequence model. 522 | 523 | In this context "attention" means that, during decoding, the RNN can look up 524 | information in the additional tensor attention_states, and it does this by 525 | focusing on a few entries from the tensor. This model has proven to yield 526 | especially good results in a number of sequence-to-sequence tasks. This 527 | implementation is based on http://arxiv.org/abs/1412.7449 (see below for 528 | details). It is recommended for complex sequence-to-sequence tasks. 529 | 530 | Args: 531 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. 532 | initial_state: 2D Tensor [batch_size x cell.state_size]. 533 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 534 | cell: rnn_cell.RNNCell defining the cell function and size. 535 | output_size: Size of the output vectors; if None, we use cell.output_size. 536 | num_heads: Number of attention heads that read from attention_states. 537 | loop_function: If not None, this function will be applied to i-th output 538 | in order to generate i+1-th input, and decoder_inputs will be ignored, 539 | except for the first element ("GO" symbol). This can be used for decoding, 540 | but also for training to emulate http://arxiv.org/abs/1506.03099. 541 | Signature -- loop_function(prev, i) = next 542 | * prev is a 2D Tensor of shape [batch_size x output_size], 543 | * i is an integer, the step number (when advanced control is needed), 544 | * next is a 2D Tensor of shape [batch_size x input_size]. 545 | dtype: The dtype to use for the RNN initial state (default: tf.float32). 
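    output_projection: Pair (W, B) used to project each attention-combined cell
      output to vocabulary logits; the per-step outputs are the argmax ids of
      that projection.
    beam_size: Integer, number of hypotheses tracked by the beam; the decoder
      state is replicated beam_size times after the first step and attention is
      recomputed over it.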
546 | scope: VariableScope for the created subgraph; default: "attention_decoder". 547 | initial_state_attention: If False (default), initial attentions are zero. 548 | If True, initialize the attentions from the initial state and attention 549 | states -- useful when we wish to resume decoding from a previously 550 | stored decoder state and attention states. 551 | 552 | Returns: 553 | A tuple of the form (outputs, state), where: 554 | outputs: A list of the same length as decoder_inputs of 2D Tensors of 555 | shape [batch_size x output_size]. These represent the generated outputs. 556 | Output i is computed from input i (which is either the i-th element 557 | of decoder_inputs or loop_function(output {i-1}, i)) as follows. 558 | First, we run the cell on a combination of the input and previous 559 | attention masks: 560 | cell_output, new_state = cell(linear(input, prev_attn), prev_state). 561 | Then, we calculate new attention masks: 562 | new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) 563 | and then we calculate the output: 564 | output = linear(cell_output, new_attn). 565 | state: The state of each decoder cell the final time-step. 566 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 567 | 568 | Raises: 569 | ValueError: when num_heads is not positive, there are no inputs, shapes 570 | of attention_states are not set, or input size cannot be inferred 571 | from the input. 572 | """ 573 | if not decoder_inputs: 574 | raise ValueError("Must provide at least 1 input to attention decoder.") 575 | if num_heads < 1: 576 | raise ValueError("With less than 1 heads, use a non-attention decoder.") 577 | if not attention_states.get_shape()[1:2].is_fully_defined(): 578 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 579 | % attention_states.get_shape()) 580 | if output_size is None: 581 | output_size = cell.output_size 582 | 583 | with variable_scope.variable_scope(scope or "attention_decoder"): 584 | batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 585 | attn_length = attention_states.get_shape()[1].value 586 | attn_size = attention_states.get_shape()[2].value 587 | 588 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. 589 | hidden = array_ops.reshape( 590 | attention_states, [-1, attn_length, 1, attn_size]) 591 | hidden_features = [] 592 | v = [] 593 | attention_vec_size = attn_size # Size of query vectors for attention. 594 | for a in xrange(num_heads): 595 | k = variable_scope.get_variable("AttnW_%d" % a, 596 | [1, 1, attn_size, attention_vec_size]) 597 | hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 598 | v.append(variable_scope.get_variable("AttnV_%d" % a, 599 | [attention_vec_size])) 600 | 601 | print("Initial_state") 602 | 603 | state_size = int(initial_state.get_shape().with_rank(2)[1]) 604 | states =[] 605 | for kk in range(1): 606 | states.append(initial_state) 607 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 608 | def attention(query): 609 | """Put attention masks on hidden using hidden_features and query.""" 610 | ds = [] # Results of attention reads will be stored here. 611 | for a in xrange(num_heads): 612 | with variable_scope.variable_scope("Attention_%d" % a): 613 | y = linear(query, attention_vec_size, True) 614 | y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) 615 | # Attention mask is a softmax of v^T * tanh(...). 
616 | s = math_ops.reduce_sum( 617 | v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) 618 | a = nn_ops.softmax(s) 619 | # Now calculate the attention-weighted vector d. 620 | d = math_ops.reduce_sum( 621 | array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, 622 | [1, 2]) 623 | # for c in range(ct): 624 | ds.append(array_ops.reshape(d, [-1, attn_size])) 625 | return ds 626 | 627 | outputs = [] 628 | prev = None 629 | batch_attn_size = array_ops.pack([batch_size, attn_size]) 630 | attns = [array_ops.zeros(batch_attn_size, dtype=dtype) 631 | for _ in xrange(num_heads)] 632 | for a in attns: # Ensure the second shape of attention vectors is set. 633 | a.set_shape([None, attn_size]) 634 | 635 | if initial_state_attention: 636 | attns = [] 637 | attns.append(attention(initial_state)) 638 | tmp = tf.reshape(tf.concat(0, attns), [-1, attn_size]) 639 | attns = [] 640 | attns.append(tmp) 641 | 642 | log_beam_probs, beam_path, beam_symbols = [],[],[] 643 | for i, inp in enumerate(decoder_inputs): 644 | 645 | if i > 0: 646 | variable_scope.get_variable_scope().reuse_variables() 647 | # If loop_function is set, we use it instead of decoder_inputs. 648 | if loop_function is not None : 649 | with variable_scope.variable_scope("loop_function", reuse=True): 650 | if prev is not None: 651 | inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) 652 | 653 | input_size = inp.get_shape().with_rank(2)[1] 654 | x = linear([inp] + attns, input_size, True) 655 | cell_output, state = cell(x, state) 656 | 657 | # Run the attention mechanism. 658 | if i == 0 and initial_state_attention: 659 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 660 | reuse=True): 661 | attns = attention(state) 662 | else: 663 | attns = attention(state) 664 | 665 | with variable_scope.variable_scope("AttnOutputProjection"): 666 | output = linear([cell_output] + attns, output_size, True) 667 | if loop_function is not None: 668 | prev = output 669 | if i ==0: 670 | states =[] 671 | for kk in range(beam_size): 672 | states.append(state) 673 | state = tf.reshape(tf.concat(0, states), [-1, state_size]) 674 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): 675 | attns = attention(state) 676 | 677 | outputs.append(tf.argmax(nn_ops.xw_plus_b( 678 | output, output_projection[0], output_projection[1]), dimension=1)) 679 | 680 | return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size]) 681 | 682 | def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, 683 | cell, num_symbols, embedding_size, num_heads=1, 684 | output_size=None, output_projection=None, 685 | feed_previous=False, 686 | update_embedding_for_previous=True, 687 | dtype=dtypes.float32, scope=None, 688 | initial_state_attention=False, beam_search=True, beam_size=10): 689 | """RNN decoder with embedding and attention and a pure-decoding option. 690 | 691 | Args: 692 | decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 693 | initial_state: 2D Tensor [batch_size x cell.state_size]. 694 | attention_states: 3D Tensor [batch_size x attn_length x attn_size]. 695 | cell: rnn_cell.RNNCell defining the cell function. 696 | num_symbols: Integer, how many symbols come into the embedding. 697 | embedding_size: Integer, the length of the embedding vector for each symbol. 698 | num_heads: Number of attention heads that read from attention_states. 
699 | output_size: Size of the output vectors; if None, use output_size. 700 | output_projection: None or a pair (W, B) of output projection weights and 701 | biases; W has shape [output_size x num_symbols] and B has shape 702 | [num_symbols]; if provided and feed_previous=True, each fed previous 703 | output will first be multiplied by W and added B. 704 | feed_previous: Boolean; if True, only the first of decoder_inputs will be 705 | used (the "GO" symbol), and all other decoder inputs will be generated by: 706 | next = embedding_lookup(embedding, argmax(previous_output)), 707 | In effect, this implements a greedy decoder. It can also be used 708 | during training to emulate http://arxiv.org/abs/1506.03099. 709 | If False, decoder_inputs are used as given (the standard decoder case). 710 | update_embedding_for_previous: Boolean; if False and feed_previous=True, 711 | only the embedding for the first symbol of decoder_inputs (the "GO" 712 | symbol) will be updated by back propagation. Embeddings for the symbols 713 | generated from the decoder itself remain unchanged. This parameter has 714 | no effect if feed_previous=False. 715 | dtype: The dtype to use for the RNN initial states (default: tf.float32). 716 | scope: VariableScope for the created subgraph; defaults to 717 | "embedding_attention_decoder". 718 | initial_state_attention: If False (default), initial attentions are zero. 719 | If True, initialize the attentions from the initial state and attention 720 | states -- useful when we wish to resume decoding from a previously 721 | stored decoder state and attention states. 722 | 723 | Returns: 724 | A tuple of the form (outputs, state), where: 725 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 726 | shape [batch_size x output_size] containing the generated outputs. 727 | state: The state of each decoder cell at the final time-step. 728 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 729 | 730 | Raises: 731 | ValueError: When output_projection has the wrong shape. 
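    Note: when beam_search is True (the default here), the decoder instead
    returns the 4-tuple (outputs, state, beam_path, beam_symbols) produced by
    beam_attention_decoder, with outputs holding token ids rather than logits.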
732 | """ 733 | if output_size is None: 734 | output_size = cell.output_size 735 | if output_projection is not None: 736 | proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) 737 | proj_biases.get_shape().assert_is_compatible_with([num_symbols]) 738 | 739 | with variable_scope.variable_scope(scope or "embedding_attention_decoder"): 740 | with ops.device("/cpu:0"): 741 | embedding = variable_scope.get_variable("embedding", 742 | [num_symbols, embedding_size]) 743 | print("Check number of symbols") 744 | print(num_symbols) 745 | if beam_search: 746 | loop_function = _extract_beam_search( 747 | embedding, beam_size,num_symbols, embedding_size, output_projection, 748 | update_embedding_for_previous) 749 | else: 750 | loop_function = _extract_argmax_and_embed( 751 | embedding, output_projection, 752 | update_embedding_for_previous) if feed_previous else None 753 | emb_inp = [ 754 | embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] 755 | if beam_search: 756 | return beam_attention_decoder( 757 | emb_inp, initial_state, attention_states, cell, output_size=output_size, 758 | num_heads=num_heads, loop_function=loop_function, 759 | initial_state_attention=initial_state_attention, output_projection=output_projection, beam_size=beam_size) 760 | else: 761 | 762 | return attention_decoder( 763 | emb_inp, initial_state, attention_states, cell, output_size=output_size, 764 | num_heads=num_heads, loop_function=loop_function, 765 | initial_state_attention=initial_state_attention) 766 | 767 | 768 | def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, 769 | num_encoder_symbols, num_decoder_symbols, 770 | embedding_size, 771 | num_heads=1, output_projection=None, 772 | feed_previous=False, dtype=dtypes.float32, 773 | scope=None, initial_state_attention=False, beam_search =True, beam_size = 10 ): 774 | """Embedding sequence-to-sequence model with attention. 775 | 776 | This model first embeds encoder_inputs by a newly created embedding (of shape 777 | [num_encoder_symbols x input_size]). Then it runs an RNN to encode 778 | embedded encoder_inputs into a state vector. It keeps the outputs of this 779 | RNN at every step to use for attention later. Next, it embeds decoder_inputs 780 | by another newly created embedding (of shape [num_decoder_symbols x 781 | input_size]). Then it runs attention decoder, initialized with the last 782 | encoder state, on embedded decoder_inputs and attending to encoder outputs. 783 | 784 | Args: 785 | encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 786 | decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. 787 | cell: rnn_cell.RNNCell defining the cell function and size. 788 | num_encoder_symbols: Integer; number of symbols on the encoder side. 789 | num_decoder_symbols: Integer; number of symbols on the decoder side. 790 | embedding_size: Integer, the length of the embedding vector for each symbol. 791 | num_heads: Number of attention heads that read from attention_states. 792 | output_projection: None or a pair (W, B) of output projection weights and 793 | biases; W has shape [output_size x num_decoder_symbols] and B has 794 | shape [num_decoder_symbols]; if provided and feed_previous=True, each 795 | fed previous output will first be multiplied by W and added B. 796 | feed_previous: Boolean or scalar Boolean Tensor; if True, only the first 797 | of decoder_inputs will be used (the "GO" symbol), and all other decoder 798 | inputs will be taken from previous outputs (as in embedding_rnn_decoder). 
799 | If False, decoder_inputs are used as given (the standard decoder case). 800 | dtype: The dtype of the initial RNN state (default: tf.float32). 801 | scope: VariableScope for the created subgraph; defaults to 802 | "embedding_attention_seq2seq". 803 | initial_state_attention: If False (default), initial attentions are zero. 804 | If True, initialize the attentions from the initial state and attention 805 | states. 806 | 807 | Returns: 808 | A tuple of the form (outputs, state), where: 809 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 810 | shape [batch_size x num_decoder_symbols] containing the generated 811 | outputs. 812 | state: The state of each decoder cell at the final time-step. 813 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 814 | """ 815 | with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): 816 | # Encoder. 817 | encoder_cell = rnn_cell.EmbeddingWrapper( 818 | cell, embedding_classes=num_encoder_symbols, 819 | embedding_size=embedding_size) 820 | encoder_outputs, encoder_state = rnn.rnn( 821 | encoder_cell, encoder_inputs, dtype=dtype) 822 | print("Symbols") 823 | print(num_encoder_symbols) 824 | print(num_decoder_symbols) 825 | # First calculate a concatenation of encoder outputs to put attention on. 826 | top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) 827 | for e in encoder_outputs] 828 | attention_states = array_ops.concat(1, top_states) 829 | print(attention_states) 830 | # Decoder. 831 | output_size = None 832 | if output_projection is None: 833 | cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) 834 | output_size = num_decoder_symbols 835 | 836 | 837 | return embedding_attention_decoder( 838 | decoder_inputs, encoder_state, attention_states, cell, 839 | num_decoder_symbols, embedding_size, num_heads=num_heads, 840 | output_size=output_size, output_projection=output_projection, 841 | feed_previous=feed_previous, 842 | initial_state_attention=initial_state_attention, beam_search=beam_search, beam_size=beam_size) 843 | 844 | 845 | 846 | 847 | def sequence_loss_by_example(logits, targets, weights, 848 | average_across_timesteps=True, 849 | softmax_loss_function=None, name=None): 850 | """Weighted cross-entropy loss for a sequence of logits (per example). 851 | 852 | Args: 853 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 854 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 855 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 856 | average_across_timesteps: If set, divide the returned cost by the total 857 | label weight. 858 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 859 | to be used instead of the standard softmax (the default if this is None). 860 | name: Optional name for this operation, default: "sequence_loss_by_example". 861 | 862 | Returns: 863 | 1D batch-sized float Tensor: The log-perplexity for each sequence. 864 | 865 | Raises: 866 | ValueError: If len(logits) is different from len(targets) or len(weights). 867 | """ 868 | if len(targets) != len(logits) or len(weights) != len(logits): 869 | raise ValueError("Lengths of logits, weights, and targets must be the same " 870 | "%d, %d, %d." 
% (len(logits), len(weights), len(targets))) 871 | with ops.op_scope(logits + targets + weights, name, 872 | "sequence_loss_by_example"): 873 | log_perp_list = [] 874 | for logit, target, weight in zip(logits, targets, weights): 875 | if softmax_loss_function is None: 876 | target = array_ops.reshape(target, [-1]) 877 | crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( 878 | logit, target) 879 | else: 880 | crossent = softmax_loss_function(logit, target) 881 | log_perp_list.append(crossent * weight) 882 | log_perps = math_ops.add_n(log_perp_list) 883 | if average_across_timesteps: 884 | total_size = math_ops.add_n(weights) 885 | total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. 886 | log_perps /= total_size 887 | return log_perps 888 | 889 | 890 | def sequence_loss(logits, targets, weights, 891 | average_across_timesteps=True, average_across_batch=True, 892 | softmax_loss_function=None, name=None): 893 | """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. 894 | 895 | Args: 896 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 897 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 898 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 899 | average_across_timesteps: If set, divide the returned cost by the total 900 | label weight. 901 | average_across_batch: If set, divide the returned cost by the batch size. 902 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 903 | to be used instead of the standard softmax (the default if this is None). 904 | name: Optional name for this operation, defaults to "sequence_loss". 905 | 906 | Returns: 907 | A scalar float Tensor: The average log-perplexity per symbol (weighted). 908 | 909 | Raises: 910 | ValueError: If len(logits) is different from len(targets) or len(weights). 911 | """ 912 | with ops.op_scope(logits + targets + weights, name, "sequence_loss"): 913 | cost = math_ops.reduce_sum(sequence_loss_by_example( 914 | logits, targets, weights, 915 | average_across_timesteps=average_across_timesteps, 916 | softmax_loss_function=softmax_loss_function)) 917 | if average_across_batch: 918 | batch_size = array_ops.shape(targets[0])[0] 919 | return cost / math_ops.cast(batch_size, dtypes.float32) 920 | else: 921 | return cost 922 | 923 | 924 | def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, 925 | buckets, seq2seq, softmax_loss_function=None, 926 | per_example_loss=False, name=None): 927 | """Create a sequence-to-sequence model with support for bucketing. 928 | 929 | The seq2seq argument is a function that defines a sequence-to-sequence model, 930 | e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) 931 | 932 | Args: 933 | encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. 934 | decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. 935 | targets: A list of 1D batch-sized int32 Tensors (desired output sequence). 936 | weights: List of 1D batch-sized float-Tensors to weight the targets. 937 | buckets: A list of pairs of (input size, output size) for each bucket. 938 | seq2seq: A sequence-to-sequence model function; it takes 2 input that 939 | agree with encoder_inputs and decoder_inputs, and returns a pair 940 | consisting of outputs and states (as, e.g., basic_rnn_seq2seq). 
941 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 942 | to be used instead of the standard softmax (the default if this is None). 943 | per_example_loss: Boolean. If set, the returned loss will be a batch-sized 944 | tensor of losses for each sequence in the batch. If unset, it will be 945 | a scalar with the averaged loss from all examples. 946 | name: Optional name for this operation, defaults to "model_with_buckets". 947 | 948 | Returns: 949 | A tuple of the form (outputs, losses), where: 950 | outputs: The outputs for each bucket. Its j'th element consists of a list 951 | of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs). 952 | losses: List of scalar Tensors, representing losses for each bucket, or, 953 | if per_example_loss is set, a list of 1D batch-sized float Tensors. 954 | 955 | Raises: 956 | ValueError: If length of encoder_inputs, targets, or weights is smaller 957 | than the largest (last) bucket. 958 | """ 959 | if len(encoder_inputs) < buckets[-1][0]: 960 | raise ValueError("Length of encoder_inputs (%d) must be at least that of la" 961 | "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) 962 | if len(targets) < buckets[-1][1]: 963 | raise ValueError("Length of targets (%d) must be at least that of last" 964 | " bucket (%d)." % (len(targets), buckets[-1][1])) 965 | if len(weights) < buckets[-1][1]: 966 | raise ValueError("Length of weights (%d) must be at least that of last" 967 | " bucket (%d)." % (len(weights), buckets[-1][1])) 968 | 969 | all_inputs = encoder_inputs + decoder_inputs + targets + weights 970 | losses = [] 971 | outputs = [] 972 | with ops.op_scope(all_inputs, name, "model_with_buckets"): 973 | for j, bucket in enumerate(buckets): 974 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 975 | reuse=True if j > 0 else None): 976 | 977 | bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]], 978 | decoder_inputs[:bucket[1]]) 979 | 980 | outputs.append(bucket_outputs) 981 | if per_example_loss: 982 | losses.append(sequence_loss_by_example( 983 | outputs[-1], targets[:bucket[1]], weights[:bucket[1]], 984 | softmax_loss_function=softmax_loss_function)) 985 | else: 986 | losses.append(sequence_loss( 987 | outputs[-1], targets[:bucket[1]], weights[:bucket[1]], 988 | softmax_loss_function=softmax_loss_function)) 989 | 990 | return outputs, losses 991 | 992 | def decode_model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, 993 | buckets, seq2seq, softmax_loss_function=None, 994 | per_example_loss=False, name=None): 995 | """Create a sequence-to-sequence model with support for bucketing, used for beam-search decoding. 996 | 997 | The seq2seq argument is a function that defines a sequence-to-sequence model, 998 | e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) 999 | 1000 | Args: 1001 | encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. 1002 | decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. 1003 | targets: A list of 1D batch-sized int32 Tensors (desired output sequence). 1004 | weights: List of 1D batch-sized float-Tensors to weight the targets. 1005 | buckets: A list of pairs of (input size, output size) for each bucket. 1006 | seq2seq: A sequence-to-sequence model function; it takes two inputs that 1007 | agree with encoder_inputs and decoder_inputs, and returns a quadruple 1008 | of outputs, states, beam paths, and beam symbols (see beam_attention_decoder).
1009 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 1010 | to be used instead of the standard softmax (the default if this is None). 1011 | per_example_loss: Boolean. If set, the returned loss will be a batch-sized 1012 | tensor of losses for each sequence in the batch. If unset, it will be 1013 | a scalar with the averaged loss from all examples. (Both loss options are ignored here; this decoding variant computes no losses.) 1014 | name: Optional name for this operation, defaults to "model_with_buckets". 1015 | 1016 | Returns: 1017 | A tuple of the form (outputs, beam_paths, beam_symbols), where: 1018 | outputs: The outputs for each bucket, as returned by the seq2seq function 1019 | (for the beam-search decoder, the per-step argmax symbol Tensors). 1020 | beam_paths: List, one entry per bucket, of Tensors holding the beam-search backpointers. 1021 | beam_symbols: List, one entry per bucket, of Tensors holding the beam-search candidate symbols. 1022 | 1023 | Raises: 1024 | ValueError: If length of encoder_inputs, targets, or weights is smaller 1025 | than the largest (last) bucket. 1026 | """ 1027 | if len(encoder_inputs) < buckets[-1][0]: 1028 | raise ValueError("Length of encoder_inputs (%d) must be at least that of la" 1029 | "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) 1030 | if len(targets) < buckets[-1][1]: 1031 | raise ValueError("Length of targets (%d) must be at least that of last" 1032 | " bucket (%d)." % (len(targets), buckets[-1][1])) 1033 | if len(weights) < buckets[-1][1]: 1034 | raise ValueError("Length of weights (%d) must be at least that of last" 1035 | " bucket (%d)." % (len(weights), buckets[-1][1])) 1036 | 1037 | all_inputs = encoder_inputs + decoder_inputs + targets + weights 1038 | losses = [] 1039 | outputs = [] 1040 | beam_paths = [] 1041 | beam_symbols = [] 1042 | with ops.op_scope(all_inputs, name, "model_with_buckets"): 1043 | for j, bucket in enumerate(buckets): 1044 | with variable_scope.variable_scope(variable_scope.get_variable_scope(), 1045 | reuse=True if j > 0 else None): 1046 | bucket_outputs, _, beam_path, beam_symbol = seq2seq(encoder_inputs[:bucket[0]], 1047 | decoder_inputs[:bucket[1]]) 1048 | outputs.append(bucket_outputs) 1049 | beam_paths.append(beam_path) 1050 | beam_symbols.append(beam_symbol) 1051 | print("End**********") 1052 | 1053 | return outputs, beam_paths, beam_symbols -------------------------------------------------------------------------------- /neural_conversation_model.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """Most of the code comes from the TensorFlow seq2seq tutorial. Binary for training conversation models and decoding from them. 4 | 5 | Running this program without --decode will tokenize the training data in a very basic way, 6 | and then start training a model, saving checkpoints to --train_dir. 7 | 8 | Running with --decode starts an interactive loop so you can see how 9 | the current checkpoint performs. 10 | 11 | See the following papers for more information on neural translation models.
12 | * http://arxiv.org/abs/1409.3215 13 | * http://arxiv.org/abs/1409.0473 14 | * http://arxiv.org/abs/1412.2007 15 | """ 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import os 23 | import random 24 | import sys 25 | import time 26 | 27 | import numpy as np 28 | from six.moves import xrange # pylint: disable=redefined-builtin 29 | import tensorflow as tf 30 | 31 | from data_utils import * 32 | from seq2seq_model import * 33 | import codecs 34 | 35 | tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.") 36 | tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99, 37 | "Learning rate decays by this much.") 38 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, 39 | "Clip gradients to this norm.") 40 | tf.app.flags.DEFINE_integer("batch_size", 64, 41 | "Batch size to use during training.") 42 | tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.") 43 | tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.") 44 | tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.") 45 | tf.app.flags.DEFINE_string("train_dir", "./tmp/", "Training directory.") 46 | tf.app.flags.DEFINE_string("vocab_path", "./tmp/", "Path to the vocabulary file.") 47 | tf.app.flags.DEFINE_string("data_path", "./tmp/", "Path to the training data file.") 48 | tf.app.flags.DEFINE_string("dev_data", "./tmp/", "Path to the development (validation) data file.") 49 | tf.app.flags.DEFINE_integer("max_train_data_size", 0, 50 | "Limit on the size of training data (0: no limit).") 51 | tf.app.flags.DEFINE_integer("steps_per_checkpoint", 400, 52 | "How many training steps to do per checkpoint.") 53 | tf.app.flags.DEFINE_integer("beam_size", 100, 54 | "Beam width to use during beam-search decoding.") 55 | tf.app.flags.DEFINE_boolean("beam_search", False, 56 | "Set to True for beam_search.") 57 | tf.app.flags.DEFINE_boolean("decode", False, 58 | "Set to True for interactive decoding.") 59 | tf.app.flags.DEFINE_boolean("attention", False, 60 | "Set to True to use the attention-based model.") 61 | tf.app.flags.DEFINE_boolean("self_test", False, 62 | "Run a self-test if this is set to True.") 63 | 64 | FLAGS = tf.app.flags.FLAGS 65 | 66 | # We use a number of buckets and pad to the closest one for efficiency. 67 | # See seq2seq_model.Seq2SeqModel for details of how they work.
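# As an illustration (hypothetical example lengths, not taken from the data): a pair
# with a 4-token question and an 8-token answer (9 ids once EOS is appended) fits the
# smallest bucket below, (5, 10), and will later be padded to lengths 5 and 10 when
# batches are built; a pair that does not fit even the last bucket, (40, 50), is
# simply dropped by read_chat_data.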
68 | _buckets = [(5, 10), (10, 15), (20, 25), (40, 50)] 69 | 70 | 71 | def read_chat_data(data_path,vocabulary_path, max_size=None): 72 | counter = 0 73 | vocab, _ = initialize_vocabulary(vocabulary_path) 74 | print(len(vocab)) 75 | print(max_size) 76 | data_set = [[] for _ in _buckets] 77 | # http://stackoverflow.com/questions/33054527/python-3-5-typeerror-a-bytes-like-object-is-required-not-str-when-writing-t 78 | with codecs.open(data_path, "rb") as fi: 79 | for line in fi.readlines(): 80 | line = line.decode('utf8').strip() 81 | counter += 1 82 | if max_size!=0 and counter > max_size: 83 | break 84 | if counter % 10000 == 0: 85 | print(" reading data line %d" % counter) 86 | sys.stdout.flush() 87 | entities = line.lower().split("\t") 88 | # print entities 89 | if len(entities) == 2: 90 | source = entities[0] 91 | target = entities[1] 92 | source_ids = [int(x) for x in sentence_to_token_ids(source,vocab)] 93 | target_ids = [int(x) for x in sentence_to_token_ids(target,vocab)] 94 | target_ids.append(EOS_ID) 95 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 96 | if len(source_ids) < source_size and len(target_ids) < target_size: 97 | data_set[bucket_id].append([source_ids, target_ids]) 98 | break 99 | return data_set 100 | 101 | def create_model(session, forward_only, beam_search, beam_size = 10, attention = True): 102 | """Create translation model and initialize or load parameters in session.""" 103 | model = Seq2SeqModel( 104 | FLAGS.en_vocab_size, FLAGS.en_vocab_size, _buckets, 105 | FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 106 | FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, 107 | forward_only=forward_only, beam_search=beam_search, beam_size=beam_size, attention=attention) 108 | print(FLAGS.train_dir) 109 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 110 | 111 | # ckpt.model_checkpoint_path ="./big_models/chat_bot.ckpt-183600" 112 | # print ckpt.model_checkpoint_path 113 | if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): 114 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path) 115 | model.saver.restore(session, ckpt.model_checkpoint_path) 116 | else: 117 | print("Created model with fresh parameters.") 118 | session.run(tf.initialize_all_variables()) 119 | return model 120 | 121 | def create_models(path, en_vocab_size, session, forward_only, beam_search, beam_size = 10, attention = True): 122 | """Create translation model and initialize or load parameters in session.""" 123 | model = Seq2SeqModel( 124 | en_vocab_size, en_vocab_size, _buckets, 125 | FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 126 | FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, 127 | forward_only=forward_only, beam_search=beam_search, beam_size=beam_size, attention=attention) 128 | print(FLAGS.train_dir) 129 | ckpt = tf.train.get_checkpoint_state(path) 130 | 131 | # ckpt.model_checkpoint_path ="./big_models/chat_bot.ckpt-183600" 132 | # print ckpt.model_checkpoint_path 133 | if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): 134 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path) 135 | model.saver.restore(session, ckpt.model_checkpoint_path) 136 | else: 137 | print("Created model with fresh parameters.") 138 | session.run(tf.initialize_all_variables()) 139 | return model 140 | 141 | def train(): 142 | 143 | data_path =FLAGS.data_path 144 | dev_data = FLAGS.dev_data 145 | vocab_path =FLAGS.vocab_path 146 | # Beam search is false during training operation and 
used only at inference time. 147 | beam_search = False 148 | beam_size = 10 149 | attention = FLAGS.attention 150 | 151 | normalize_digits=True 152 | create_vocabulary(vocab_path, data_path, FLAGS.en_vocab_size ) 153 | 154 | 155 | with tf.Session() as sess: 156 | # Create model. 157 | print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) 158 | model = create_model(sess, False, beam_search=beam_search, beam_size=beam_size, attention=attention) 159 | 160 | # Read data into buckets and compute their sizes. 161 | print("Reading development and training data (limit: %d)." 162 | % FLAGS.max_train_data_size) 163 | train_set = read_chat_data(data_path, vocab_path, FLAGS.max_train_data_size) 164 | dev_set = read_chat_data(dev_data, vocab_path, FLAGS.max_train_data_size) 165 | 166 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] 167 | train_total_size = float(sum(train_bucket_sizes)) 168 | 169 | # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use 170 | # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to 171 | # the size of the i-th training bucket, as used later. 172 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size 173 | for i in xrange(len(train_bucket_sizes))] 174 | 175 | # This is the training loop. 176 | step_time, loss = 0.0, 0.0 177 | current_step = 0 178 | previous_losses = [] 179 | while True: 180 | # Choose a bucket according to data distribution. We pick a random number 181 | # in [0, 1] and use the corresponding interval in train_buckets_scale. 182 | # print "Started" 183 | random_number_01 = np.random.random_sample() 184 | bucket_id = min([i for i in xrange(len(train_buckets_scale)) 185 | if train_buckets_scale[i] > random_number_01]) 186 | 187 | # Get a batch and make a step. 188 | start_time = time.time() 189 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 190 | train_set, bucket_id) 191 | 192 | _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, 193 | target_weights, bucket_id, False, beam_search) 194 | step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint 195 | loss += step_loss / FLAGS.steps_per_checkpoint 196 | current_step += 1 197 | 198 | # Once in a while, we save checkpoint, print statistics, and run evals. 199 | if current_step % FLAGS.steps_per_checkpoint == 0: 200 | # Print statistics for the previous checkpoint period. 201 | print("Running epochs") 202 | perplexity = math.exp(loss) if loss < 300 else float('inf') 203 | print ("global step %d learning rate %.4f step-time %.2f perplexity " 204 | "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), 205 | step_time, perplexity)) 206 | # # Decrease learning rate if no improvement was seen over last 3 times. 207 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 208 | sess.run(model.learning_rate_decay_op) 209 | previous_losses.append(loss) 210 | # # Save checkpoint and zero timer and loss. 211 | checkpoint_path = os.path.join(FLAGS.train_dir, "chat_bot.ckpt") 212 | model.saver.save(sess, checkpoint_path, global_step=model.global_step) 213 | step_time, loss = 0.0, 0.0 214 | # # Run evals on development set and print their perplexity.
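# # (Perplexity here is exp(average cross-entropy loss), as computed below; for
# # example, an eval loss of about 4.6 corresponds to a perplexity of roughly
# # exp(4.6) ~ 100.)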
215 | for bucket_id in xrange(len(_buckets)): 216 | if len(dev_set[bucket_id]) == 0: 217 | print(" eval: empty bucket %d" % (bucket_id)) 218 | continue 219 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 220 | dev_set, bucket_id) 221 | _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, 222 | target_weights, bucket_id, True, beam_search) 223 | eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf') 224 | print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) 225 | sys.stdout.flush() 226 | 227 | def decode(): 228 | with tf.Session() as sess: 229 | # Create model and load parameters. 230 | beam_size = FLAGS.beam_size 231 | beam_search = FLAGS.beam_search 232 | attention = FLAGS.attention 233 | model = create_model(sess, True, beam_search=beam_search, beam_size=beam_size, attention=attention) 234 | model.batch_size = 1 # We decode one sentence at a time. 235 | 236 | # Load vocabularies. 237 | vocab_path = FLAGS.vocab_path 238 | vocab, rev_vocab = initialize_vocabulary(vocab_path) 239 | 240 | # Decode from standard input. 241 | if beam_search: 242 | sys.stdout.write("> ") 243 | sys.stdout.flush() 244 | sentence = sys.stdin.readline() 245 | while sentence: 246 | # Get token-ids for the input sentence. 247 | token_ids = sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab) 248 | # Which bucket does it belong to? 249 | bucket_id = min([b for b in xrange(len(_buckets)) 250 | if _buckets[b][0] > len(token_ids)]) 251 | # Get a 1-element batch to feed the sentence to the model. 252 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 253 | {bucket_id: [(token_ids, [])]}, bucket_id) 254 | # Get output logits for the sentence. 255 | # print bucket_id 256 | path, symbol , output_logits = model.step(sess, encoder_inputs, decoder_inputs, 257 | target_weights, bucket_id, True,beam_search ) 258 | 259 | k = output_logits[0] 260 | paths = [] 261 | for kk in range(beam_size): 262 | paths.append([]) 263 | curr = range(beam_size) 264 | num_steps = len(path) 265 | for i in range(num_steps-1, -1, -1): 266 | for kk in range(beam_size): 267 | paths[kk].append(symbol[i][curr[kk]]) 268 | curr[kk] = path[i][curr[kk]] 269 | recos = set() 270 | print("Replies --------------------------------------->") 271 | for kk in range(beam_size): 272 | foutputs = [int(logit) for logit in paths[kk][::-1]] 273 | 274 | # If there is an EOS symbol in outputs, cut them at that point. 275 | if EOS_ID in foutputs: 276 | # # print outputs 277 | foutputs = foutputs[:foutputs.index(EOS_ID)] 278 | rec = " ".join([tf.compat.as_str(rev_vocab[output]) for output in foutputs]) 279 | if rec not in recos: 280 | recos.add(rec) 281 | print(rec) 282 | 283 | print("> ", "") 284 | sys.stdout.flush() 285 | sentence = sys.stdin.readline() 286 | else: 287 | sys.stdout.write("> ") 288 | sys.stdout.flush() 289 | sentence = sys.stdin.readline() 290 | 291 | while sentence: 292 | # Get token-ids for the input sentence. 293 | token_ids = sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab) 294 | # Which bucket does it belong to? 295 | bucket_id = min([b for b in xrange(len(_buckets)) 296 | if _buckets[b][0] > len(token_ids)]) 297 | # for loc in locs: 298 | # Get a 1-element batch to feed the sentence to the model. 
299 | encoder_inputs, decoder_inputs, target_weights = model.get_batch( 300 | {bucket_id: [(token_ids, [],)]}, bucket_id) 301 | 302 | _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, 303 | target_weights, bucket_id, True,beam_search) 304 | # This is a greedy decoder - outputs are just argmaxes of output_logits. 305 | 306 | outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] 307 | # If there is an EOS symbol in outputs, cut them at that point. 308 | if EOS_ID in outputs: 309 | # print outputs 310 | outputs = outputs[:outputs.index(EOS_ID)] 311 | 312 | print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs])) 313 | print("> ", "") 314 | sys.stdout.flush() 315 | sentence = sys.stdin.readline() 316 | 317 | 318 | 319 | def main(_): 320 | if FLAGS.decode: 321 | decode() 322 | else: 323 | train() 324 | 325 | if __name__ == "__main__": 326 | tf.app.run() 327 | -------------------------------------------------------------------------------- /scripts/predict.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | # functions 9 | 10 | # main 11 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 12 | # source /root/venv-py3/bin/activate 13 | # http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information 14 | export TF_CPP_MIN_LOG_LEVEL=3 15 | cd $baseDir/.. 16 | python neural_conversation_model.py \ 17 | --train_dir ubuntu/ \ 18 | --en_vocab_size 60000 \ 19 | --size 512 \ 20 | --data_path ubuntu/train.tsv \ 21 | --dev_data ubuntu/valid.tsv \ 22 | --vocab_path ubuntu/60_chat_vocab.en \ 23 | --attention \ 24 | --decode \ 25 | --beam_search \ 26 | --beam_size 25 \ 27 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | # functions 9 | 10 | # main 11 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 12 | # source /root/venv-py3/bin/activate 13 | # http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information 14 | export TF_CPP_MIN_LOG_LEVEL=3 15 | cd $baseDir/.. 16 | python neural_conversation_model.py \ 17 | --train_dir ubuntu/ \ 18 | --en_vocab_size 60000 \ 19 | --size 512 \ 20 | --data_path ubuntu/train.tsv \ 21 | --dev_data ubuntu/valid.tsv \ 22 | --vocab_path ubuntu/60_chat_vocab.en \ 23 | --attention \ 24 | -------------------------------------------------------------------------------- /seq2seq_model.py: -------------------------------------------------------------------------------- 1 | 2 | """Sequence-to-sequence model with an attention mechanism.""" 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | import random 8 | 9 | import numpy as np 10 | from six.moves import xrange # pylint: disable=redefined-builtin 11 | import tensorflow as tf 12 | 13 | from data_utils import * 14 | from my_seq2seq import * 15 | 16 | class Seq2SeqModel(object): 17 | """Sequence-to-sequence model with attention and for multiple buckets. 
18 | 19 | This class implements a multi-layer recurrent neural network as encoder, 20 | and an attention-based decoder. This is the same as the model described in 21 | this paper: http://arxiv.org/abs/1412.7449 - please look there for details, 22 | or into the seq2seq library for complete model implementation. 23 | This class also allows to use GRU cells in addition to LSTM cells, and 24 | sampled softmax to handle large output vocabulary size. A single-layer 25 | version of this model, but with bi-directional encoder, was presented in 26 | http://arxiv.org/abs/1409.0473 27 | and sampled softmax is described in Section 3 of the following paper. 28 | http://arxiv.org/abs/1412.2007 29 | """ 30 | 31 | def __init__(self, source_vocab_size, target_vocab_size, buckets, size, 32 | num_layers, max_gradient_norm, batch_size, learning_rate, 33 | learning_rate_decay_factor, use_lstm=False, 34 | num_samples=1024, forward_only=False, beam_search = True, beam_size=10, attention=True): 35 | """Create the model. 36 | 37 | Args: 38 | source_vocab_size: size of the source vocabulary. 39 | target_vocab_size: size of the target vocabulary. 40 | buckets: a list of pairs (I, O), where I specifies maximum input length 41 | that will be processed in that bucket, and O specifies maximum output 42 | length. Training instances that have inputs longer than I or outputs 43 | longer than O will be pushed to the next bucket and padded accordingly. 44 | We assume that the list is sorted, e.g., [(2, 4), (8, 16)]. 45 | size: number of units in each layer of the model. 46 | num_layers: number of layers in the model. 47 | max_gradient_norm: gradients will be clipped to maximally this norm. 48 | batch_size: the size of the batches used during training; 49 | the model construction is independent of batch_size, so it can be 50 | changed after initialization if this is convenient, e.g., for decoding. 51 | learning_rate: learning rate to start with. 52 | learning_rate_decay_factor: decay learning rate by this much when needed. 53 | use_lstm: if true, we use LSTM cells instead of GRU cells. 54 | num_samples: number of samples for sampled softmax. 55 | forward_only: if set, we do not construct the backward pass in the model. 56 | """ 57 | self.source_vocab_size = source_vocab_size 58 | self.target_vocab_size = target_vocab_size 59 | self.buckets = buckets 60 | self.batch_size = batch_size 61 | self.learning_rate = tf.Variable(float(learning_rate), trainable=False) 62 | self.learning_rate_decay_op = self.learning_rate.assign( 63 | self.learning_rate * learning_rate_decay_factor) 64 | self.global_step = tf.Variable(0, trainable=False) 65 | 66 | # If we use sampled softmax, we need an output projection. 67 | output_projection = None 68 | softmax_loss_function = None 69 | # Sampled softmax only makes sense if we sample less than vocabulary size. 70 | if num_samples > 0 and num_samples < self.target_vocab_size: 71 | with tf.device("/cpu:0"): 72 | w = tf.get_variable("proj_w", [size, self.target_vocab_size]) 73 | w_t = tf.transpose(w) 74 | b = tf.get_variable("proj_b", [self.target_vocab_size]) 75 | output_projection = (w, b) 76 | 77 | def sampled_loss(inputs, labels): 78 | with tf.device("/cpu:0"): 79 | labels = tf.reshape(labels, [-1, 1]) 80 | return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, 81 | self.target_vocab_size) 82 | softmax_loss_function = sampled_loss 83 | # Create the internal multi-layer cell for our RNN. 
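# (For example, with the default flags size=512 and num_layers=3, this stacks
# three 512-unit GRU cells; passing use_lstm=True swaps in BasicLSTMCell instead.)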
84 | single_cell = tf.nn.rnn_cell.GRUCell(size) 85 | if use_lstm: 86 | single_cell = tf.nn.rnn_cell.BasicLSTMCell(size) 87 | cell = single_cell 88 | if num_layers > 1: 89 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers, state_is_tuple=False) 90 | 91 | # The seq2seq function: we use embedding for the input and attention. 92 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): 93 | if attention: 94 | print("Attention Model") 95 | return embedding_attention_seq2seq( 96 | encoder_inputs, decoder_inputs, cell, 97 | num_encoder_symbols=source_vocab_size, 98 | num_decoder_symbols=target_vocab_size, 99 | embedding_size=size, 100 | output_projection=output_projection, 101 | feed_previous=do_decode, 102 | beam_search=beam_search, 103 | beam_size=beam_size ) 104 | else: 105 | print("Simple Model") 106 | return embedding_rnn_seq2seq( 107 | encoder_inputs, decoder_inputs, cell, 108 | num_encoder_symbols=source_vocab_size, 109 | num_decoder_symbols=target_vocab_size, 110 | embedding_size=size, 111 | output_projection=output_projection, 112 | feed_previous=do_decode, 113 | beam_search=beam_search, 114 | beam_size=beam_size ) 115 | 116 | 117 | # Feeds for inputs. 118 | self.encoder_inputs = [] 119 | self.decoder_inputs = [] 120 | self.target_weights = [] 121 | for i in xrange(buckets[-1][0]): # Last bucket is the biggest one. 122 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 123 | name="encoder{0}".format(i))) 124 | for i in xrange(buckets[-1][1] + 1): 125 | self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 126 | name="decoder{0}".format(i))) 127 | self.target_weights.append(tf.placeholder(tf.float32, shape=[None], 128 | name="weight{0}".format(i))) 129 | 130 | # Our targets are decoder inputs shifted by one. 131 | targets = [self.decoder_inputs[i + 1] 132 | for i in xrange(len(self.decoder_inputs) - 1)] 133 | 134 | # Training outputs and losses. 135 | if forward_only: 136 | if beam_search: 137 | self.outputs, self.beam_path, self.beam_symbol = decode_model_with_buckets( 138 | self.encoder_inputs, self.decoder_inputs, targets, 139 | self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), 140 | softmax_loss_function=softmax_loss_function) 141 | else: 142 | # print self.decoder_inputs 143 | self.outputs, self.losses = model_with_buckets( 144 | self.encoder_inputs, self.decoder_inputs, targets, 145 | self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), 146 | softmax_loss_function=softmax_loss_function) 147 | # If we use output projection, we need to project outputs for decoding. 148 | if output_projection is not None: 149 | for b in xrange(len(buckets)): 150 | self.outputs[b] = [ 151 | tf.matmul(output, output_projection[0]) + output_projection[1] 152 | for output in self.outputs[b] 153 | ] 154 | 155 | 156 | else: 157 | self.outputs, self.losses = model_with_buckets( 158 | self.encoder_inputs, self.decoder_inputs, targets, 159 | self.target_weights, buckets, 160 | lambda x, y: seq2seq_f(x, y, False), 161 | softmax_loss_function=softmax_loss_function) 162 | 163 | # Gradients and SGD update operation for training the model. 
164 | params = tf.trainable_variables() 165 | if not forward_only: 166 | self.gradient_norms = [] 167 | self.updates = [] 168 | opt = tf.train.GradientDescentOptimizer(self.learning_rate) 169 | for b in xrange(len(buckets)): 170 | gradients = tf.gradients(self.losses[b], params) 171 | clipped_gradients, norm = tf.clip_by_global_norm(gradients, 172 | max_gradient_norm) 173 | self.gradient_norms.append(norm) 174 | self.updates.append(opt.apply_gradients( 175 | zip(clipped_gradients, params), global_step=self.global_step)) 176 | 177 | self.saver = tf.train.Saver(tf.all_variables()) 178 | 179 | def step(self, session, encoder_inputs, decoder_inputs, target_weights, 180 | bucket_id, forward_only, beam_search): 181 | """Run a step of the model feeding the given inputs. 182 | 183 | Args: 184 | session: tensorflow session to use. 185 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 186 | decoder_inputs: list of numpy int vectors to feed as decoder inputs. 187 | target_weights: list of numpy float vectors to feed as target weights. 188 | bucket_id: which bucket of the model to use. 189 | forward_only: whether to do the backward step or only forward. 190 | 191 | Returns: 192 | A triple consisting of gradient norm (or None if we did not do backward), 193 | average perplexity, and the outputs. 194 | 195 | Raises: 196 | ValueError: if length of encoder_inputs, decoder_inputs, or 197 | target_weights disagrees with bucket size for the specified bucket_id. 198 | """ 199 | # Check if the sizes match. 200 | encoder_size, decoder_size = self.buckets[bucket_id] 201 | if len(encoder_inputs) != encoder_size: 202 | raise ValueError("Encoder length must be equal to the one in bucket," 203 | " %d != %d." % (len(encoder_inputs), encoder_size)) 204 | if len(decoder_inputs) != decoder_size: 205 | raise ValueError("Decoder length must be equal to the one in bucket," 206 | " %d != %d." % (len(decoder_inputs), decoder_size)) 207 | if len(target_weights) != decoder_size: 208 | raise ValueError("Weights length must be equal to the one in bucket," 209 | " %d != %d." % (len(target_weights), decoder_size)) 210 | 211 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 212 | input_feed = {} 213 | for l in xrange(encoder_size): 214 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 215 | for l in xrange(decoder_size): 216 | input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] 217 | input_feed[self.target_weights[l].name] = target_weights[l] 218 | 219 | # Since our targets are decoder inputs shifted by one, we need one more. 220 | last_target = self.decoder_inputs[decoder_size].name 221 | input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) 222 | 223 | # Output feed: depends on whether we do a backward step or not. 224 | if not forward_only: 225 | output_feed = [self.updates[bucket_id], # Update Op that does SGD. 226 | self.gradient_norms[bucket_id], # Gradient norm. 227 | self.losses[bucket_id]] # Loss for this batch. 228 | else: 229 | if beam_search: 230 | output_feed = [self.beam_path[bucket_id]] # Loss for this batch. 231 | output_feed.append(self.beam_symbol[bucket_id]) 232 | else: 233 | output_feed = [self.losses[bucket_id]] 234 | 235 | for l in xrange(decoder_size): # Output logits. 236 | output_feed.append(self.outputs[bucket_id][l]) 237 | # print bucket_id 238 | outputs = session.run(output_feed, input_feed) 239 | if not forward_only: 240 | return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. 
241 | else: 242 | if beam_search: 243 | return outputs[0], outputs[1], outputs[2:] # No gradient norm, loss, outputs. 244 | else: 245 | return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. 246 | 247 | def get_batch(self, data, bucket_id): 248 | """Get a random batch of data from the specified bucket, prepare for step. 249 | 250 | To feed data in step(..) it must be a list of batch-major vectors, while 251 | data here contains single length-major cases. So the main logic of this 252 | function is to re-index data cases to be in the proper format for feeding. 253 | 254 | Args: 255 | data: a tuple of size len(self.buckets) in which each element contains 256 | lists of pairs of input and output data that we use to create a batch. 257 | bucket_id: integer, which bucket to get the batch for. 258 | 259 | Returns: 260 | The triple (encoder_inputs, decoder_inputs, target_weights) for 261 | the constructed batch that has the proper format to call step(...) later. 262 | """ 263 | encoder_size, decoder_size = self.buckets[bucket_id] 264 | encoder_inputs, decoder_inputs = [], [] 265 | 266 | # Get a random batch of encoder and decoder inputs from data, 267 | # pad them if needed, reverse encoder inputs and add GO to decoder. 268 | for _ in xrange(self.batch_size): 269 | encoder_input, decoder_input = random.choice(data[bucket_id]) 270 | 271 | # Encoder inputs are padded and then reversed. 272 | encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input)) 273 | encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) 274 | 275 | # Decoder inputs get an extra "GO" symbol, and are padded then. 276 | decoder_pad_size = decoder_size - len(decoder_input) - 1 277 | decoder_inputs.append([GO_ID] + decoder_input + 278 | [PAD_ID] * decoder_pad_size) 279 | 280 | # Now we create batch-major vectors from the data selected above. 281 | batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], [] 282 | 283 | # Batch encoder inputs are just re-indexed encoder_inputs. 284 | for length_idx in xrange(encoder_size): 285 | batch_encoder_inputs.append( 286 | np.array([encoder_inputs[batch_idx][length_idx] 287 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 288 | 289 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights. 290 | for length_idx in xrange(decoder_size): 291 | batch_decoder_inputs.append( 292 | np.array([decoder_inputs[batch_idx][length_idx] 293 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 294 | 295 | # Create target_weights to be 0 for targets that are padding. 296 | batch_weight = np.ones(self.batch_size, dtype=np.float32) 297 | for batch_idx in xrange(self.batch_size): 298 | # We set weight to 0 if the corresponding target is a PAD symbol. 299 | # The corresponding target is decoder_input shifted by 1 forward. 300 | if length_idx < decoder_size - 1: 301 | target = decoder_inputs[batch_idx][length_idx + 1] 302 | if length_idx == decoder_size - 1 or target == PAD_ID: 303 | batch_weight[batch_idx] = 0.0 304 | batch_weights.append(batch_weight) 305 | return batch_encoder_inputs, batch_decoder_inputs, batch_weights 306 | --------------------------------------------------------------------------------