├── LICENSE ├── RDPG.py ├── README.md ├── bleu.py ├── data_iterator.py ├── insepection.py ├── layers.py ├── mteval.sh ├── nmt_uni.py ├── noisy_translator.py ├── noisytrans_training.py ├── optimizer.py ├── policy.py ├── pretrain_uni.py ├── reward.py ├── run_eval.sh ├── run_train.sh ├── translate.sh ├── translate_uni.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Jiatao Gu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /RDPG.py: -------------------------------------------------------------------------------- 1 | """ 2 | -- Recurrent Deterministic Policy Gradient 3 | """ 4 | 5 | from nmt_uni import * 6 | 7 | import os 8 | import time, datetime 9 | import cPickle as pkl 10 | 11 | 12 | class RDPG(object): 13 | 14 | def __init__(self, 15 | trng, options, policy, config, 16 | n_in=None, n_out=None, 17 | recurrent=False, id=None): 18 | 19 | self.trng = trng 20 | self.options = options 21 | self.policy = policy 22 | self.recurrent = recurrent 23 | 24 | self.n_hidden = 512 25 | self.n_in = n_in 26 | self.n_out = n_out 27 | 28 | self.rec = 'lngru' 29 | if not n_in: 30 | self.n_in = options['readout_dim'] 31 | 32 | # ------------------------------------------------------------------------------ 33 | print 'policy network initialization' 34 | 35 | params = OrderedDict() 36 | if not self.recurrent: 37 | print 'building a feed-forward controller' 38 | params = get_layer('ff')[0](options, params, prefix='policy_net_in', 39 | nin=self.n_in, nout=self.n_hidden, scale=0.001) 40 | else: 41 | print 'building a recurrent controller' 42 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in', 43 | nin=self.n_in, dim=self.n_hidden, scale=0.001) 44 | 45 | params = get_layer('ff')[0](options, params, prefix='policy_net_out', 46 | nin=self.n_hidden, 47 | nout=self.n_out, 48 | scale=0.001) 49 | 50 | # -------------------------------------------------------------------------------- 51 | print 'critic network initialization (RNN)' 52 | params_b = OrderedDict() 53 | params_b = get_layer(self.rec)[0](options, params_b, prefix='critic_net_in', 54 | nin=self.n_in + self.n_out, 55 | dim=self.n_hidden, scale=0.001) 56 | params_b = get_layer('ff')[0](options, params_b, prefix='critic_net_out', 57 | nin=self.n_hidden, 58 | nout=1, 59 | scale=0.001) 60 | if id is not None: 61 | print 
'reload the saved model: {}'.format(id) 62 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params) 63 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b) 64 | else: 65 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S') 66 | print 'start from a new model: {}'.format(id) 67 | 68 | with open('.config/conf.{}.txt'.format(id), 'w') as f: 69 | f.write('[config]\n') 70 | 71 | for c in config: 72 | f.write('{}: {}\n'.format(c, config[c])) 73 | f.write('\n') 74 | 75 | f.write('[policy]\n') 76 | 77 | for c in policy: 78 | f.write('{}: {}\n'.format(c, policy[c])) 79 | 80 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w')) 81 | print 'save the config file' 82 | 83 | self.id = id 84 | self.model = '.policy/{}-{}'.format(id, self.policy['base']) 85 | 86 | # theano shared params 87 | self.tparams = init_tparams(params) 88 | self.tparams_b = init_tparams(params_b) 89 | 90 | # build the policy network 91 | self.build_actor(options=options) 92 | self.build_discriminator(options=options) 93 | 94 | def build_actor(self, options): 95 | # ============================================================================= # 96 | # Actor from Policy Network 97 | # ============================================================================= # 98 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1) 99 | prev_hidden = tensor.matrix('p_hidden', dtype='float32') 100 | 101 | if not self.recurrent: 102 | hiddens = get_layer('ff')[1](self.tparams, observation, 103 | options, prefix='policy_net_in', 104 | activ='tanh') 105 | else: 106 | hiddens = get_layer(self.rec)[1](self.tparams, observation, 107 | options, prefix='policy_net_in', mask=None, 108 | one_step=True, _init_state=prev_hidden)[0] 109 | 110 | act_inps = [observation, prev_hidden] 111 | act_outs = get_layer('ff')[1](self.tparams, hiddens, options, 112 | prefix='policy_net_out', 113 | activ='tanh' 114 | ) 115 | print 'build action function [Deterministic]' 116 | self.f_action = theano.function(act_inps, act_outs, 117 | on_unused_input='ignore') # action/dist/hiddens 118 | print 'done.' 
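    # --------------------------------------------------------------------- #
    # Illustrative sketch (not part of the original code): with the
    # feed-forward controller, f_action above computes a deterministic
    # action as a = tanh(tanh(o . W_in + b_in) . W_out + b_out).  The toy
    # helper below mirrors that computation in plain numpy on dummy
    # weights; the name and the shapes are assumptions for illustration only.
    @staticmethod
    def _toy_deterministic_action(obs, W_in, b_in, W_out, b_out):
        import numpy
        hidden = numpy.tanh(numpy.dot(obs, W_in) + b_in)      # policy_net_in
        return numpy.tanh(numpy.dot(hidden, W_out) + b_out)   # policy_net_out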
119 | 120 | 121 | def build_discriminator(self, options): 122 | # ============================================================================= # 123 | # Build for End-t-End learning 124 | # ============================================================================= # 125 | observations = tensor.tensor3('observations', dtype='float32') 126 | mask = tensor.matrix('mask', dtype='float32') 127 | targets = tensor.vector('targets', dtype='float32') 128 | 129 | print 'build actor' 130 | if not self.recurrent: 131 | hiddens = get_layer('ff')[1](self.tparams, observations, 132 | options, prefix='policy_net_in', 133 | activ='tanh') 134 | else: 135 | hiddens = get_layer(self.rec)[1](self.tparams, observations, 136 | options, prefix='policy_net_in', mask=mask)[0] 137 | actions = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out', 138 | activ='tanh') # seq_steps x batch_size x n_out 139 | 140 | print 'build critic' 141 | state_action = concatenate([observations, actions], axis=-1) 142 | hiddens_b = get_layer(self.rec)[1](self.tparams_b, state_action, 143 | options, prefix='critic_net_in', mask=mask)[0] 144 | values = get_layer('ff')[1](self.tparams_b, hiddens_b, options, 145 | prefix='critic_net_out', 146 | activ='tanh')[-1, :, 0] # (batch_size, ) 147 | 148 | # =============================================================================== # 149 | # Build Deterministic Policy Gradient [Actor Parts] 150 | # =============================================================================== # 151 | inps_A = [observations, mask] 152 | loss_A = -tensor.mean(values) 153 | grad_A = tensor.grad(loss_A, wrt=itemlist(self.tparams)) 154 | grad_A = grad_clip(grad_A) 155 | outs_A = [loss_A, actions] 156 | 157 | # optimizer: Adam 158 | lr = tensor.scalar(name='lr') 159 | f_A, f_Aup = adam(lr, self.tparams, grad_A, inps_A, outs_A) 160 | 161 | # =============================================================================== # 162 | # Build Deterministic Policy Gradient [Critic Parts] 163 | # =============================================================================== # 164 | inps_B = [observations, mask, actions, targets] 165 | loss_B = tensor.mean((values - targets) ** 2) 166 | grad_B = tensor.grad(loss_B, wrt=itemlist(self.tparams_b)) 167 | grad_B = grad_clip(grad_B) 168 | outs_B = [loss_B] 169 | 170 | # optimizer: Adam 171 | lr = tensor.scalar(name='lr') 172 | f_B, f_Bup = adam(lr, self.tparams_b, grad_B, inps_B, outs_B) 173 | 174 | self.f_learner = [f_A, f_Aup, f_B, f_Bup] 175 | print 'done.' 
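    # --------------------------------------------------------------------- #
    # Usage sketch (an assumption, not part of the original code): each call
    # to adam() above returns a (gradient, update) function pair, so one
    # alternating RDPG step would look roughly like
    #
    #     f_A, f_Aup, f_B, f_Bup = agent.f_learner
    #     critic_loss = f_B(observations, mask, actions, returns)  # fit Q
    #     f_Bup(lrate)
    #     actor_loss, actions = f_A(observations, mask)            # -mean(Q)
    #     f_Aup(lrate)
    #
    # where `returns` are the episode returns used as critic regression
    # targets and `lrate` is the learning rate; the exact call signatures
    # depend on the Adam implementation in optimizer.py.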
176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NMT-RDPG 2 | Neural machine translation with Recurrent Deterministic Policy Gradient 3 | -------------------------------------------------------------------------------- /bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Natural Language Toolkit: BLEU Score 3 | # 4 | # Copyright (C) 2001-2016 NLTK Project 5 | # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim 6 | # Contributors: Dmitrijs Milajevs, Liling Tan 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """BLEU score implementation.""" 11 | from __future__ import division 12 | 13 | import math 14 | import fractions 15 | from collections import Counter 16 | 17 | from nltk.util import ngrams 18 | 19 | try: 20 | fractions.Fraction(0, 1000, _normalize=False) 21 | from fractions import Fraction 22 | except TypeError: 23 | from nltk.compat import Fraction 24 | 25 | 26 | def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), 27 | smoothing_function=None): 28 | """ 29 | Calculate BLEU score (Bilingual Evaluation Understudy) from 30 | Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. 31 | "BLEU: a method for automatic evaluation of machine translation." 32 | In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf 33 | 34 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 35 | ... 'ensures', 'that', 'the', 'military', 'always', 36 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 37 | 38 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 39 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook', 40 | ... 'that', 'party', 'direct'] 41 | 42 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 43 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever', 44 | ... 'heed', 'Party', 'commands'] 45 | 46 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 47 | ... 'guarantees', 'the', 'military', 'forces', 'always', 48 | ... 'being', 'under', 'the', 'command', 'of', 'the', 49 | ... 'Party'] 50 | 51 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 52 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 53 | ... 'of', 'the', 'party'] 54 | 55 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS 56 | 0.5045... 57 | 58 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS 59 | 0.3969... 60 | 61 | The default BLEU calculates a score for up to 4grams using uniform 62 | weights. To evaluate your translations with higher/lower order ngrams, 63 | use customized weights. E.g. when accounting for up to 6grams with uniform 64 | weights: 65 | 66 | >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666) 67 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) 68 | 0.45838627164939455 69 | 70 | :param references: reference sentences 71 | :type references: list(list(str)) 72 | :param hypothesis: a hypothesis sentence 73 | :type hypothesis: list(str) 74 | :param weights: weights for unigrams, bigrams, trigrams and so on 75 | :type weights: list(float) 76 | :return: The sentence-level BLEU score. 
77 | :rtype: float 78 | """ 79 | return corpus_bleu([references], [hypothesis], weights, smoothing_function) 80 | 81 | 82 | def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), 83 | smoothing_function=None): 84 | """ 85 | Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all 86 | the hypotheses and their respective references. 87 | 88 | Instead of averaging the sentence level BLEU scores (i.e. marco-average 89 | precision), the original BLEU metric (Papineni et al. 2002) accounts for 90 | the micro-average precision (i.e. summing the numerators and denominators 91 | for each hypothesis-reference(s) pairs before the division). 92 | 93 | >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 94 | ... 'ensures', 'that', 'the', 'military', 'always', 95 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 96 | >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 97 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever', 98 | ... 'heed', 'Party', 'commands'] 99 | >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', 100 | ... 'guarantees', 'the', 'military', 'forces', 'always', 101 | ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] 102 | >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 103 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 104 | ... 'of', 'the', 'party'] 105 | 106 | >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 107 | ... 'interested', 'in', 'world', 'history'] 108 | >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', 109 | ... 'because', 'he', 'read', 'the', 'book'] 110 | 111 | >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] 112 | >>> hypotheses = [hyp1, hyp2] 113 | >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS 114 | 0.5920... 115 | 116 | The example below show that corpus_bleu() is different from averaging 117 | sentence_bleu() for hypotheses 118 | 119 | >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) 120 | >>> score2 = sentence_bleu([ref2a], hyp2) 121 | >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 122 | 0.6223... 123 | 124 | :param references: a corpus of lists of reference sentences, w.r.t. hypotheses 125 | :type references: list(list(list(str))) 126 | :param hypotheses: a list of hypothesis sentences 127 | :type hypotheses: list(list(str)) 128 | :param weights: weights for unigrams, bigrams, trigrams and so on 129 | :type weights: list(float) 130 | :return: The corpus-level BLEU score. 131 | :rtype: float 132 | """ 133 | # Before proceeding to compute BLEU, perform sanity checks. 134 | 135 | p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. 136 | p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. 137 | hyp_lengths, ref_lengths = 0, 0 138 | 139 | assert len(list_of_references) == len( 140 | hypotheses), "The number of hypotheses and their reference(s) should be the same" 141 | 142 | # Iterate through each hypothesis and their corresponding references. 143 | for references, hypothesis in zip(list_of_references, hypotheses): 144 | # For each order of ngram, calculate the numerator and 145 | # denominator for the corpus-level modified precision. 146 | for i, _ in enumerate(weights, start=1): 147 | p_i = modified_precision(references, hypothesis, i) 148 | p_numerators[i] += p_i.numerator 149 | p_denominators[i] += p_i.denominator 150 | 151 | # Calculate the hypothesis length and the closest reference length. 
152 | # Adds them to the corpus-level hypothesis and reference counts. 153 | hyp_len = len(hypothesis) 154 | hyp_lengths += hyp_len 155 | ref_lengths += closest_ref_length(references, hyp_len) 156 | 157 | # Calculate corpus-level brevity penalty. 158 | bp = brevity_penalty(ref_lengths, hyp_lengths) 159 | 160 | # Collects the various precision values for the different ngram orders. 161 | p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False) 162 | for i, _ in enumerate(weights, start=1)] 163 | 164 | # Returns 0 if there's no matching n-grams 165 | # We only need to check for p_numerators[1] == 0, since if there's 166 | # no unigrams, there won't be any higher order ngrams. 167 | if p_numerators[1] == 0: 168 | return 0, 0 169 | 170 | # Smoothen the modified precision. 171 | # Note: smooth_precision() converts values into float. 172 | if not smoothing_function: 173 | smoothing_function = SmoothingFunction().method0 174 | p_n = smoothing_function(p_n, references=references, 175 | hypothesis=hypothesis, hyp_len=hyp_len) 176 | 177 | # Calculates the overall modified precision for all ngrams. 178 | # By sum of the product of the weights and the respective *p_n* 179 | s = (w * math.log(p_i) for w, p_i in zip(weights, p_n) 180 | if p_i.numerator != 0) 181 | 182 | # return bp * math.exp(math.fsum(s)) 183 | return math.exp(math.fsum(s)), bp * math.exp(math.fsum(s)) 184 | 185 | 186 | def modified_precision(references, hypothesis, n): 187 | """ 188 | Calculate modified ngram precision. 189 | 190 | The normal precision method may lead to some wrong translations with 191 | high-precision, e.g., the translation, in which a word of reference 192 | repeats several times, has very high precision. 193 | 194 | This function only returns the Fraction object that contains the numerator 195 | and denominator necessary to calculate the corpus-level precision. 196 | To calculate the modified precision for a single pair of hypothesis and 197 | references, cast the Fraction object into a float. 198 | 199 | The famous "the the the ... " example shows that you can get BLEU precision 200 | by duplicating high frequency words. 201 | 202 | >>> reference1 = 'the cat is on the mat'.split() 203 | >>> reference2 = 'there is a cat on the mat'.split() 204 | >>> hypothesis1 = 'the the the the the the the'.split() 205 | >>> references = [reference1, reference2] 206 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 207 | 0.2857... 208 | 209 | In the modified n-gram precision, a reference word will be considered 210 | exhausted after a matching hypothesis word is identified, e.g. 211 | 212 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 213 | ... 'ensures', 'that', 'the', 'military', 'will', 214 | ... 'forever', 'heed', 'Party', 'commands'] 215 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 216 | ... 'guarantees', 'the', 'military', 'forces', 'always', 217 | ... 'being', 'under', 'the', 'command', 'of', 'the', 218 | ... 'Party'] 219 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 220 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 221 | ... 
'of', 'the', 'party'] 222 | >>> hypothesis = 'of the'.split() 223 | >>> references = [reference1, reference2, reference3] 224 | >>> float(modified_precision(references, hypothesis, n=1)) 225 | 1.0 226 | >>> float(modified_precision(references, hypothesis, n=2)) 227 | 1.0 228 | 229 | An example of a normal machine translation hypothesis: 230 | 231 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 232 | ... 'ensures', 'that', 'the', 'military', 'always', 233 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 234 | 235 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 236 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook', 237 | ... 'that', 'party', 'direct'] 238 | 239 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 240 | ... 'ensures', 'that', 'the', 'military', 'will', 241 | ... 'forever', 'heed', 'Party', 'commands'] 242 | 243 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 244 | ... 'guarantees', 'the', 'military', 'forces', 'always', 245 | ... 'being', 'under', 'the', 'command', 'of', 'the', 246 | ... 'Party'] 247 | 248 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 249 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 250 | ... 'of', 'the', 'party'] 251 | >>> references = [reference1, reference2, reference3] 252 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 253 | 0.9444... 254 | >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS 255 | 0.5714... 256 | >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS 257 | 0.5882352941176471 258 | >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS 259 | 0.07692... 260 | 261 | 262 | :param references: A list of reference translations. 263 | :type references: list(list(str)) 264 | :param hypothesis: A hypothesis translation. 265 | :type hypothesis: list(str) 266 | :param n: The ngram order. 267 | :type n: int 268 | :return: BLEU's modified precision for the nth order ngram. 269 | :rtype: Fraction 270 | """ 271 | # Extracts all ngrams in hypothesis. 272 | counts = Counter(ngrams(hypothesis, n)) 273 | 274 | # Extract a union of references' counts. 275 | ## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) 276 | max_counts = {} 277 | for reference in references: 278 | reference_counts = Counter(ngrams(reference, n)) 279 | for ngram in counts: 280 | max_counts[ngram] = max(max_counts.get(ngram, 0), 281 | reference_counts[ngram]) 282 | 283 | # Assigns the intersection between hypothesis and references' counts. 284 | clipped_counts = {ngram: min(count, max_counts[ngram]) 285 | for ngram, count in counts.items()} 286 | 287 | numerator = sum(clipped_counts.values()) 288 | # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. 289 | # Usually this happens when the ngram order is > len(reference). 290 | denominator = max(1, sum(counts.values())) 291 | 292 | return Fraction(numerator, denominator, _normalize=False) 293 | 294 | 295 | def closest_ref_length(references, hyp_len): 296 | """ 297 | This function finds the reference that is the closest length to the 298 | hypothesis. The closest reference length is referred to as *r* variable 299 | from the brevity penalty formula in Papineni et. al. (2002) 300 | 301 | :param references: A list of reference translations. 302 | :type references: list(list(str)) 303 | :param hypothesis: The length of the hypothesis. 
304 | :type hypothesis: int 305 | :return: The length of the reference that's closest to the hypothesis. 306 | :rtype: int 307 | """ 308 | ref_lens = (len(reference) for reference in references) 309 | closest_ref_len = min(ref_lens, key=lambda ref_len: 310 | (abs(ref_len - hyp_len), ref_len)) 311 | return closest_ref_len 312 | 313 | 314 | def brevity_penalty(closest_ref_len, hyp_len): 315 | """ 316 | Calculate brevity penalty. 317 | 318 | As the modified n-gram precision still has the problem from the short 319 | length sentence, brevity penalty is used to modify the overall BLEU 320 | score according to length. 321 | 322 | An example from the paper. There are three references with length 12, 15 323 | and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. 324 | 325 | >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 326 | >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 327 | >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 328 | >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 329 | >>> references = [reference1, reference2, reference3] 330 | >>> hyp_len = len(hypothesis) 331 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 332 | >>> brevity_penalty(closest_ref_len, hyp_len) 333 | 1.0 334 | 335 | In case a hypothesis translation is shorter than the references, penalty is 336 | applied. 337 | 338 | >>> references = [['a'] * 28, ['a'] * 28] 339 | >>> hypothesis = ['a'] * 12 340 | >>> hyp_len = len(hypothesis) 341 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 342 | >>> brevity_penalty(closest_ref_len, hyp_len) 343 | 0.2635971381157267 344 | 345 | The length of the closest reference is used to compute the penalty. If the 346 | length of a hypothesis is 12, and the reference lengths are 13 and 2, the 347 | penalty is applied because the hypothesis length (12) is less then the 348 | closest reference length (13). 349 | 350 | >>> references = [['a'] * 13, ['a'] * 2] 351 | >>> hypothesis = ['a'] * 12 352 | >>> hyp_len = len(hypothesis) 353 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 354 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 355 | 0.9200... 356 | 357 | The brevity penalty doesn't depend on reference order. More importantly, 358 | when two reference sentences are at the same distance, the shortest 359 | reference sentence length is used. 360 | 361 | >>> references = [['a'] * 13, ['a'] * 11] 362 | >>> hypothesis = ['a'] * 12 363 | >>> hyp_len = len(hypothesis) 364 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 365 | >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) 366 | >>> hyp_len = len(hypothesis) 367 | >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) 368 | >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) 369 | >>> bp1 == bp2 == 1 370 | True 371 | 372 | A test example from mteval-v13a.pl (starting from the line 705): 373 | 374 | >>> references = [['a'] * 11, ['a'] * 8] 375 | >>> hypothesis = ['a'] * 7 376 | >>> hyp_len = len(hypothesis) 377 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 378 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 379 | 0.8668... 
380 | 381 | >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] 382 | >>> hypothesis = ['a'] * 7 383 | >>> hyp_len = len(hypothesis) 384 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 385 | >>> brevity_penalty(closest_ref_len, hyp_len) 386 | 1.0 387 | 388 | :param hyp_len: The length of the hypothesis for a single sentence OR the 389 | sum of all the hypotheses' lengths for a corpus 390 | :type hyp_len: int 391 | :param closest_ref_len: The length of the closest reference for a single 392 | hypothesis OR the sum of all the closest references for every hypotheses. 393 | :type closest_reference_len: int 394 | :return: BLEU's brevity penalty. 395 | :rtype: float 396 | """ 397 | if hyp_len > closest_ref_len: 398 | return 1 399 | # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 400 | elif hyp_len == 0: 401 | return 0 402 | else: 403 | return math.exp(1 - closest_ref_len / hyp_len) 404 | 405 | 406 | class SmoothingFunction: 407 | """ 408 | This is an implementation of the smoothing techniques 409 | for segment-level BLEU scores that was presented in 410 | Boxing Chen and Collin Cherry (2014) A Systematic Comparison of 411 | Smoothing Techniques for Sentence-Level BLEU. In WMT14. 412 | http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf 413 | """ 414 | 415 | def __init__(self, epsilon=0.1, alpha=5, k=5): 416 | """ 417 | This will initialize the parameters required for the various smoothing 418 | techniques, the default values are set to the numbers used in the 419 | experiments from Chen and Cherry (2014). 420 | 421 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 422 | ... 'that', 'the', 'military', 'always', 'obeys', 'the', 423 | ... 'commands', 'of', 'the', 'party'] 424 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 425 | ... 'that', 'the', 'military', 'will', 'forever', 'heed', 426 | ... 'Party', 'commands'] 427 | 428 | >>> chencherry = SmoothingFunction() 429 | >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS 430 | 0.4118... 431 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS 432 | 0.4118... 433 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS 434 | 0.4118... 435 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS 436 | 0.4489... 437 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS 438 | 0.4118... 439 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS 440 | 0.4118... 441 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS 442 | 0.4905... 443 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS 444 | 0.1801... 445 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS 446 | 0.4905... 
447 | 448 | :param epsilon: the epsilon value use in method 1 449 | :type epsilon: float 450 | :param alpha: the alpha value use in method 6 451 | :type alpha: int 452 | :param k: the k value use in method 4 453 | :type k: int 454 | """ 455 | self.epsilon = epsilon 456 | self.alpha = alpha 457 | self.k = k 458 | 459 | def method0(self, p_n, *args, **kwargs): 460 | """ No smoothing. """ 461 | return p_n 462 | 463 | def method1(self, p_n, *args, **kwargs): 464 | """ 465 | Smoothing method 1: Add *epsilon* counts to precision with 0 counts. 466 | """ 467 | return [(p_i.numerator + self.epsilon) / p_i.denominator 468 | if p_i.numerator == 0 else p_i for p_i in p_n] 469 | 470 | def method2(self, p_n, *args, **kwargs): 471 | """ 472 | Smoothing method 2: Add 1 to both numerator and denominator from 473 | Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of 474 | machine translation quality using longest common subsequence and 475 | skip-bigram statistics. In ACL04. 476 | """ 477 | return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n] 478 | 479 | def method3(self, p_n, *args, **kwargs): 480 | """ 481 | Smoothing method 3: NIST geometric sequence smoothing 482 | The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each 483 | precision score whose matching n-gram count is null. 484 | k is 1 for the first 'n' value for which the n-gram match count is null/ 485 | For example, if the text contains: 486 | - one 2-gram match 487 | - and (consequently) two 1-gram matches 488 | the n-gram count for each individual precision score would be: 489 | - n=1 => prec_count = 2 (two unigrams) 490 | - n=2 => prec_count = 1 (one bigram) 491 | - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) 492 | - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) 493 | """ 494 | incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. 495 | for i, p_i in enumerate(p_n): 496 | if p_i.numerator == 0: 497 | p_n[i] = 1 / (2 ** incvnt * p_i.denominator) 498 | incvnt += 1 499 | return p_n 500 | 501 | def method4(self, p_n, references, hypothesis, hyp_len): 502 | """ 503 | Smoothing method 4: 504 | Shorter translations may have inflated precision values due to having 505 | smaller denominators; therefore, we give them proportionally 506 | smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry 507 | suggests dividing by 1/ln(len(T)), where T is the length of the translation. 508 | """ 509 | incvnt = 1 510 | for i, p_i in enumerate(p_n): 511 | if p_i.numerator == 0 and hyp_len != 0: 512 | p_n[i] = incvnt * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST. 513 | incvnt += 1 514 | return p_n 515 | 516 | def method5(self, p_n, references, hypothesis, hyp_len): 517 | """ 518 | Smoothing method 5: 519 | The matched counts for similar values of n should be similar. To a 520 | calculate the n-gram matched count, it averages the n−1, n and n+1 gram 521 | matched counts. 522 | """ 523 | m = {} 524 | # Requires an precision value for an addition ngram order. 
525 | p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] 526 | m[-1] = p_n[0] + 1 527 | for i, p_i in enumerate(p_n): 528 | p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 529 | m[i] = p_n[i] 530 | return p_n 531 | 532 | def method6(self, p_n, references, hypothesis, hyp_len): 533 | """ 534 | Smoothing method 6: 535 | Interpolates the maximum likelihood estimate of the precision *p_n* with 536 | a prior estimate *pi0*. The prior is estimated by assuming that the ratio 537 | between pn and pn−1 will be the same as that between pn−1 and pn−2. 538 | """ 539 | for i, p_i in enumerate(p_n): 540 | if i in [1, 2]: # Skips the first 2 orders of ngrams. 541 | continue 542 | else: 543 | pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] 544 | # No. of ngrams in translation. 545 | l = sum(1 for _ in ngrams(hypothesis, i + 1)) 546 | p_n[i] = (p_i + self.alpha * pi0) / (l + self.alpha) 547 | return p_n 548 | 549 | def method7(self, p_n, references, hypothesis, hyp_len): 550 | """ 551 | Smoothing method 6: 552 | Interpolates the maximum likelihood estimate of the precision *p_n* with 553 | a prior estimate *pi0*. The prior is estimated by assuming that the ratio 554 | between pn and pn−1 will be the same as that between pn−1 and pn−2. 555 | """ 556 | p_n = self.method4(p_n, references, hypothesis, hyp_len) 557 | p_n = self.method5(p_n, references, hypothesis, hyp_len) 558 | return p_n -------------------------------------------------------------------------------- /data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1, 21 | cache=5): 22 | 23 | self.source = fopen(source, 'r') 24 | self.target = fopen(target, 'r') 25 | 26 | print 'scan the dataset.' 27 | for si, _ in enumerate(self.source): 28 | pass 29 | for ti, _ in enumerate(self.target): 30 | pass 31 | 32 | self.source.close() 33 | self.target.close() 34 | 35 | assert si == ti, 'the number of the source and target document must the same' 36 | print 'scanned {} lines'.format(si) 37 | 38 | self.source = fopen(source, 'r') 39 | self.target = fopen(target, 'r') 40 | 41 | with open(source_dict, 'rb') as f: 42 | self.source_dict = pkl.load(f) 43 | with open(target_dict, 'rb') as f: 44 | self.target_dict = pkl.load(f) 45 | 46 | self.num = si 47 | self.batch_size = batch_size 48 | self.maxlen = maxlen 49 | 50 | self.n_words_source = n_words_source 51 | self.n_words_target = n_words_target 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * cache 56 | 57 | self.end_of_data = False 58 | 59 | 60 | 61 | 62 | def __iter__(self): 63 | return self 64 | 65 | def reset(self): 66 | self.source.seek(0) 67 | self.target.seek(0) 68 | 69 | def next(self): 70 | if self.end_of_data: 71 | self.end_of_data = False 72 | self.reset() 73 | raise StopIteration 74 | 75 | source = [] 76 | target = [] 77 | 78 | # fill buffer, if it's empty 79 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
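        # Buffering strategy: when the buffer is empty, read up to
        # k = batch_size * cache sentence pairs from the two files, sort
        # the cache by target length so that each minibatch contains
        # sentences of similar length (minimising padding), then pop
        # pairs off the end of the buffer to assemble the next batch.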
80 | 81 | if len(self.source_buffer) == 0: 82 | for k_ in xrange(self.k): 83 | ss = self.source.readline() 84 | if ss == "": 85 | break 86 | tt = self.target.readline() 87 | if tt == "": 88 | break 89 | 90 | self.source_buffer.append(ss.strip().split()) 91 | self.target_buffer.append(tt.strip().split()) 92 | 93 | # sort by target buffer 94 | tlen = numpy.array([len(t) for t in self.target_buffer]) 95 | tidx = tlen.argsort() 96 | 97 | _sbuf = [self.source_buffer[i] for i in tidx] 98 | _tbuf = [self.target_buffer[i] for i in tidx] 99 | 100 | self.source_buffer = _sbuf 101 | self.target_buffer = _tbuf 102 | 103 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 104 | self.end_of_data = False 105 | self.reset() 106 | raise StopIteration 107 | 108 | try: 109 | 110 | # actual work here 111 | while True: 112 | 113 | # read from source file and map to word index 114 | try: 115 | ss = self.source_buffer.pop() 116 | except IndexError: 117 | break 118 | ss = [self.source_dict[w] if w in self.source_dict else 1 119 | for w in ss] 120 | if self.n_words_source > 0: 121 | ss = [w if w < self.n_words_source else 1 for w in ss] 122 | 123 | # read from source file and map to word index 124 | tt = self.target_buffer.pop() 125 | tt = [self.target_dict[w] if w in self.target_dict else 1 126 | for w in tt] 127 | if self.n_words_target > 0: 128 | tt = [w if w < self.n_words_target else 1 for w in tt] 129 | 130 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 131 | continue 132 | 133 | source.append(ss) 134 | target.append(tt) 135 | 136 | if len(source) >= self.batch_size or \ 137 | len(target) >= self.batch_size: 138 | break 139 | except IOError: 140 | self.end_of_data = True 141 | 142 | if len(source) <= 0 or len(target) <= 0: 143 | self.end_of_data = False 144 | self.reset() 145 | raise StopIteration 146 | 147 | return source, target 148 | -------------------------------------------------------------------------------- /insepection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import matplotlib 3 | # matplotlib.use('agg') 4 | import copy 5 | import numpy 6 | import os 7 | import seaborn as sns 8 | import pandas as pd 9 | sns.set(context="paper", font="monospace", style='whitegrid') 10 | from matplotlib import pyplot as plot 11 | from matplotlib import rc 12 | 13 | rc('font',**{'family':'Verdana', 'weight': 'normal'}) 14 | rc('font', size=8) 15 | rc('text', usetex=True) 16 | rc('text.latex',unicode=True) 17 | rc('text.latex',preamble='\usepackage[utf8]{inputenc}') 18 | rc('text.latex',preamble='\usepackage[russian]{babel}') 19 | rc('text.latex',preamble='\usepackage[german]{babel}') 20 | rc('text.latex',preamble='\usepackage[ngerman]{babel}') 21 | 22 | matplotlib.rcParams['ytick.labelsize'] = 11 23 | matplotlib.rcParams['xtick.labelsize'] = 11 24 | 25 | def heatmap(sources, refs, trans, actions, idx, atten=None, savefig=True, name='test', info=None, show=False): 26 | source = [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 27 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 28 | action = actions[idx] 29 | 30 | 31 | if atten: 32 | attention = numpy.array(atten[idx]) 33 | 34 | def track(acts, data, annote): 35 | x, y = 0, 0 36 | for a in acts: 37 | x += a 38 | y += 1 - a 39 | # print a, x, y, target[x].encode('utf8') 40 | data[y, x] = 1 41 | annote[y, x] = 'W' if a == 0 else 'C' 42 | 43 | return data, annote 44 | # print target 45 | 
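    # The grid below has one row per source token and one column per target
    # token.  `track` walks the action sequence: action 0 (annotated 'W')
    # moves down one source row (wait / read a source word), action 1
    # (annotated 'C') moves right one target column (commit a target word),
    # so the marked cells trace the READ/WRITE path of the simultaneous
    # decoder; the attention weights are overlaid afterwards when `atten`
    # is given.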
46 | data = numpy.zeros((len(source), len(target))) 47 | annote = numpy.chararray(data.shape, itemsize=8) 48 | annote[:] = '' 49 | data, annote = track(action, data, annote) 50 | data[0, 0] = 1 51 | annote[0, 0] = 'S' 52 | if atten: 53 | data[:-1, 1:] += attention.T 54 | 55 | d = pd.DataFrame(data=data, columns=target, index=source) 56 | # p = sns.diverging_palette(220, 10, as_cmap=True) 57 | f, ax = plot.subplots(figsize=(11, 11)) 58 | f.set_canvas(plot.gcf().canvas) 59 | g = sns.heatmap(d, ax=ax, annot=annote, fmt='s') 60 | g.xaxis.tick_top() 61 | 62 | plot.xticks(rotation=90) 63 | plot.yticks(rotation=0) 64 | # plot.show() 65 | if savefig: 66 | if not os.path.exists('.images/C_{}'.format(name)): 67 | os.mkdir('.images/C_{}'.format(name)) 68 | 69 | filename = 'Idx={}||'.format(info['index']) 70 | for w in info: 71 | if w is not 'index': 72 | filename += '.{}={:.2f}'.format(w, float(info[w])) 73 | 74 | print 'saving...' 75 | f.savefig('.images/C_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100) 76 | if show: 77 | plot.show() 78 | 79 | print 'plotting done.' 80 | plot.close() 81 | 82 | def heatmap2(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False): 83 | source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 84 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*'] 85 | action = actions[idx] 86 | 87 | flag = 0 88 | if atten: 89 | attention = numpy.array(atten[idx]) 90 | else: 91 | attention = None 92 | 93 | if full_atten: 94 | fullatten = numpy.array(full_atten[idx]) 95 | else: 96 | fullatten = None 97 | 98 | def track(acts, data, annote): 99 | x, y, z = 0, 0, 0 100 | for a in acts: 101 | x += (a == 1) 102 | y += (a == 0) 103 | z += (a == 2) 104 | 105 | # data[y + 1, x] = 1 106 | # data[z, x + 1] = 1 107 | # annote[y, x] = 'W' if a == 0 else 'C' 108 | 109 | return data, annote 110 | # print target 111 | 112 | data = numpy.zeros((len(source), len(target))) 113 | annote = numpy.chararray(data.shape, itemsize=8) 114 | annote[:] = '' 115 | data, annote = track(action, data, annote) 116 | data[1, 0] = 1 117 | 118 | def draw(data_t, ax, attention=None): 119 | 120 | data = copy.copy(data_t) 121 | data[1:-1, 1:-1] += attention.T 122 | d = pd.DataFrame(data=data, columns=target, index=source) 123 | # p = sns.diverging_palette(220, 10, as_cmap=True) 124 | g = sns.heatmap(d, mask=(data==0), square=True, cbar=False, linewidths=0.1, ax=ax, annot=annote, fmt='s') 125 | g.xaxis.tick_top() 126 | 127 | for tick in ax.get_xticklabels(): 128 | tick.set_rotation(90) 129 | for tick in ax.get_yticklabels(): 130 | tick.set_rotation(0) 131 | 132 | ax.grid(True) 133 | f, [ax1, ax2] = plot.subplots(1, 2, figsize=(22, 11)) 134 | f.set_canvas(plot.gcf().canvas) 135 | 136 | draw(data, ax1, attention) 137 | # plot.xticks(rotation=90) 138 | # plot.yticks(rotation=0) 139 | # plot.grid() 140 | 141 | draw(data, ax2, fullatten) 142 | # plot.xticks(rotation=90) 143 | # plot.yticks(rotation=0) 144 | # plot.grid() 145 | 146 | 147 | if savefig: 148 | if not os.path.exists('.images/M_{}'.format(name)): 149 | os.mkdir('.images/M_{}'.format(name)) 150 | 151 | filename = 'Idx={}||'.format(info['index']) 152 | for w in info: 153 | if w is not 'index': 154 | filename += '.{}={:.2f}'.format(w, float(info[w])) 155 | 156 | print 'saving...' 
157 | plot.savefig('.images/M_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100) 158 | 159 | if show: 160 | plot.show() 161 | 162 | print 'plotting done.' 163 | plot.close() 164 | 165 | 166 | 167 | 168 | 169 | 170 | def visualize(sources, refs, trans, aligns, idx, savefig=True, name='test', info=None): 171 | 172 | colors = ['b', 'g'] 173 | 174 | fig = plot.figure(figsize=(20, 2)) 175 | ax = plot.gca() 176 | 177 | # plot.hold('on') 178 | 179 | plot.xlim([0., 10.]) 180 | 181 | scolors = [] 182 | caidx = 0 183 | coloridx = 0 184 | for sidx in xrange(len([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + [''])): 185 | if caidx >= len(numpy.unique(aligns[idx])) or sidx >= numpy.unique(aligns[idx])[caidx]: 186 | caidx = caidx + 1 187 | coloridx = 1 - coloridx 188 | scolors.append(colors[coloridx]) 189 | 190 | tcolors = [] 191 | lastidx = -1 192 | coloridx = 1 193 | for tt in aligns[idx]: 194 | if tt != lastidx: 195 | lastidx = tt 196 | coloridx = 1 - coloridx 197 | tcolors.append(colors[coloridx]) 198 | 199 | x, y = 0., 1. 200 | s_pos = [(x, y)] 201 | for ii, ss in enumerate([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + ['']): 202 | 203 | ss.replace('%', '\%') 204 | xx = plot.text(x, y, ss) 205 | xx.set_bbox(dict(color=scolors[ii], alpha=0.1, edgecolor=scolors[ii])) 206 | xx._renderer = fig.canvas.get_renderer() 207 | wext = xx.get_window_extent() 208 | bbox = ax.transData.inverted().transform(wext) 209 | x = bbox[1, 0] + 0. 210 | s_pos.append((x, y)) 211 | s_pos.append((bbox[1, 0], y)) 212 | 213 | x, y = 0., .95 214 | t_pos = [] 215 | for ii, ss in enumerate([s_.decode('utf8').replace('@@', '--') for s_ in trans[idx].split()]): 216 | 217 | ss.replace('%', '\%') 218 | xx = plot.text(x, y, ss) 219 | xx._renderer = fig.canvas.get_renderer() 220 | wext = xx.get_window_extent() 221 | bbox = ax.transData.inverted().transform(wext) 222 | t_pos.append((bbox[0, 0], bbox[0, 1] + 0.03)) 223 | x = bbox[1, 0] + 0. 
224 | t_pos.append((bbox[1, 0], bbox[0, 1] + 0.03)) 225 | 226 | lasttidx = 0 227 | lastidx = -1 228 | for tidx, sidx in enumerate(aligns[idx]): 229 | if lastidx != sidx: 230 | lastidx = sidx 231 | lasttidx = tidx 232 | sidx = numpy.minimum(sidx, len(s_pos) - 1) 233 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 234 | t_pos[tidx][0] - s_pos[sidx][0], 235 | t_pos[tidx][1] - s_pos[sidx][1], 236 | head_width=0., head_length=0., 237 | fc=tcolors[tidx], ec=tcolors[tidx], 238 | linestyle='dotted', width=0.0001) 239 | for tt in xrange(tidx, len(aligns[idx])): 240 | if aligns[idx][tt] != sidx: 241 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 242 | t_pos[tt][0] - s_pos[sidx][0], 243 | t_pos[tt][1] - s_pos[sidx][1], 244 | head_width=0., head_length=0., 245 | fc=tcolors[tidx], ec=tcolors[tidx], 246 | linestyle='dotted', width=0.0001) 247 | plot.fill_between([t_pos[tidx][0], s_pos[sidx][0], t_pos[tt][0]], 248 | [t_pos[tidx][1], s_pos[sidx][1], t_pos[tt][1]], 249 | facecolor=tcolors[tidx], alpha=0.1) 250 | break 251 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 252 | t_pos[-1][0] - s_pos[sidx][0], 253 | t_pos[-1][1] - s_pos[sidx][1], 254 | head_width=0., head_length=0., 255 | fc=tcolors[-1], ec=tcolors[-1], 256 | linestyle='dotted', width=0.0001) 257 | plot.fill_between([t_pos[lasttidx][0], s_pos[sidx][0], t_pos[-1][0]], 258 | [t_pos[lasttidx][1], s_pos[sidx][1], t_pos[-1][1]], 259 | facecolor=tcolors[tidx], alpha=0.1) 260 | 261 | # plot.hold('off') 262 | 263 | plot.axis('off') 264 | plot.ylim([0.95, 1.01]) 265 | plot.tight_layout() 266 | 267 | if savefig: 268 | if not os.path.exists('.images/{}'.format(name)): 269 | os.mkdir('.images/{}'.format(name)) 270 | 271 | filename = 'Idx={}||'.format(info['index']) 272 | for w in info: 273 | if w is not 'index': 274 | filename += '.{}={:.2f}'.format(w, float(info[w])) 275 | 276 | plot.savefig('.images/{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=300) 277 | 278 | print 'plotting done.' 279 | plot.close() 280 | # plot.show() 281 | 282 | 283 | if __name__ == "__main__": 284 | 285 | sources = ['I cannot understand .'] 286 | targets = ['Ich verstehe nicht .'] 287 | actions = [[0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 1]] 288 | heatmap2(sources, targets, targets, actions, 0, savefig=False, show=True) 289 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Build the basic layers for neural machine translation 3 | """ 4 | import warnings 5 | import os 6 | import theano 7 | import theano.tensor as tensor 8 | import numpy 9 | 10 | from collections import OrderedDict 11 | 12 | profile = False 13 | TINY = 1e-7 14 | 15 | # -------------------------------------------------------------------------# 16 | # Basic utils: 17 | # push parameters to Theano shared variables 18 | def zipp(params, tparams): 19 | for kk, vv in params.iteritems(): 20 | tparams[kk].set_value(vv) 21 | 22 | 23 | # pull parameters from Theano shared variables 24 | def unzip(zipped, new_params=None): 25 | if new_params is None: 26 | new_params = OrderedDict() 27 | 28 | for kk, vv in zipped.iteritems(): 29 | new_params[kk] = vv.get_value() 30 | return new_params 31 | 32 | 33 | # flatten-grad 34 | def flatcat(arrays): 35 | ''' 36 | Flattens arrays and concatenates them in order. 
37 | ''' 38 | return tensor.concatenate([a.flatten() for a in arrays]) 39 | 40 | def flatgrad(loss, vars_): 41 | return flatcat(tensor.grad(loss, wrt=itemlist(vars_))) 42 | 43 | def zipsame(*seqs): 44 | L = len(seqs[0]) 45 | assert all(len(seq) == L for seq in seqs[1:]) 46 | return zip(*seqs) 47 | 48 | 49 | 50 | # ------------------------------------------------------------------------# 51 | # get the list of parameters: Note that tparams must be OrderedDict 52 | def itemlist(tparams, exception=None): 53 | if not exception: 54 | return [vv for kk, vv in tparams.iteritems()] 55 | 56 | return [vv for kk, vv in tparams.iteritems() if kk not in exception] 57 | 58 | # make prefix-appended name 59 | def _p(pp, name): 60 | return '%s_%s' % (pp, name) 61 | 62 | # initialize Theano shared variables according to the initial parameters 63 | def init_tparams(params): 64 | tparams = OrderedDict() 65 | for kk, pp in params.iteritems(): 66 | tparams[kk] = theano.shared(params[kk], name=kk) 67 | return tparams 68 | 69 | 70 | # load parameters 71 | def load_params(path, params): 72 | pp = numpy.load(path) 73 | for kk, vv in params.iteritems(): 74 | if kk not in pp: 75 | warnings.warn('%s is not in the archive' % kk) 76 | continue 77 | print 'loading {}: {}'.format(kk, pp[kk].shape) 78 | params[kk] = pp[kk] 79 | 80 | return params 81 | 82 | # lateral normalization 83 | def ln(x, b, s): 84 | _eps = 1e-5 85 | output = (x - x.mean(1)[:,None]) / tensor.sqrt((x.var(1)[:,None] + _eps)) 86 | output = s[None, :] * output + b[None,:] 87 | return output 88 | 89 | 90 | # -------------------------------------------------------------------------# 91 | # Layers: 92 | # 'layer-name': ('parameter initializer', 'computational graph') -- registeration 93 | layers = dict() 94 | layers['ff'] = ('param_init_fflayer', 'fflayer') 95 | layers['gru'] = ('param_init_gru', 'gru_layer') 96 | layers['gru_cond'] = ('param_init_gru_cond', 'gru_cond_layer', 'gru_cond_context', 'gru_cond_update') 97 | layers['lngru'] = ('param_init_lngru', 'lngru_layer') 98 | 99 | def get_layer(name): 100 | fns = layers[name] 101 | return (eval(fns[0]), eval(fns[1])) 102 | 103 | # some utilities 104 | def ortho_weight(ndim): 105 | W = numpy.random.randn(ndim, ndim) 106 | u, s, v = numpy.linalg.svd(W) 107 | return u.astype('float32') 108 | 109 | # norm initialization 110 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 111 | if nout is None: 112 | nout = nin 113 | if nout == nin and ortho: 114 | W = ortho_weight(nin) 115 | else: 116 | W = scale * numpy.random.randn(nin, nout) 117 | 118 | return W.astype('float32') 119 | 120 | 121 | def tanh(x): 122 | return tensor.tanh(x) 123 | 124 | def linear(x): 125 | return x 126 | 127 | def sigmoid(x): 128 | return tensor.nnet.sigmoid(x) 129 | 130 | def relu(x): 131 | return tensor.nnet.relu(x) 132 | 133 | def softmax(x): 134 | return tensor.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 135 | 136 | def concatenate(tensor_list, axis=0): 137 | """ 138 | Alternative implementation of `theano.tensor.concatenate`. 139 | This function does exactly the same thing, but contrary to Theano's own 140 | implementation, the gradient is implemented on the GPU. 141 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 142 | because the inverse operation (splitting) needs to be done on the CPU. 143 | This implementation does not have that problem. 
144 | :usage: 145 | >>> x, y = theano.tensor.matrices('x', 'y') 146 | >>> c = concatenate([x, y], axis=1) 147 | :parameters: 148 | - tensor_list : list 149 | list of Theano tensor expressions that should be concatenated. 150 | - axis : int 151 | the tensors will be joined along this axis. 152 | :returns: 153 | - out : tensor 154 | the concatenated tensor expression. 155 | """ 156 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 157 | 158 | output_shape = () 159 | for k in range(axis): 160 | output_shape += (tensor_list[0].shape[k],) 161 | output_shape += (concat_size,) 162 | for k in range(axis + 1, tensor_list[0].ndim): 163 | output_shape += (tensor_list[0].shape[k],) 164 | 165 | out = tensor.zeros(output_shape) 166 | offset = 0 167 | for tt in tensor_list: 168 | indices = () 169 | for k in range(axis): 170 | indices += (slice(None),) 171 | indices += (slice(offset, offset + tt.shape[axis]),) 172 | for k in range(axis + 1, tensor_list[0].ndim): 173 | indices += (slice(None),) 174 | 175 | out = tensor.set_subtensor(out[indices], tt) 176 | offset += tt.shape[axis] 177 | 178 | return out 179 | 180 | #-------------------------------------------------------------------------# 181 | # Dropout: 182 | 183 | def dropout_layer(state_before, use_noise, trng): 184 | proj = tensor.switch( 185 | use_noise, 186 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 187 | dtype=state_before.dtype), 188 | state_before * 0.5) 189 | return proj 190 | 191 | 192 | # -------------------------------------------------------------------------# 193 | # Feedforward: 194 | # affine transformation + point-wise nonlinearity 195 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 196 | ortho=True, negative=0, scale=0.01): 197 | if nin is None: 198 | nin = options['dim_proj'] 199 | if nout is None: 200 | nout = options['dim_proj'] 201 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=scale, ortho=ortho) 202 | if negative == 0: 203 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 204 | else: 205 | params[_p(prefix, 'b')] = numpy.ones((nout,)).astype('float32') * negative 206 | 207 | return params 208 | 209 | 210 | def fflayer(tparams, state_below, options, prefix='rconv', 211 | activ='lambda x: tensor.tanh(x)', **kwargs): 212 | return eval(activ)( 213 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 214 | tparams[_p(prefix, 'b')]) 215 | 216 | 217 | # -------------------------------------------------------------------------# 218 | # Gated Recurrent Unit: 219 | # 220 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None, scale=0.01): 221 | if nin is None: 222 | nin = options['dim_proj'] 223 | if dim is None: 224 | dim = options['dim_proj'] 225 | 226 | # embedding to gates transformation weights, biases 227 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale), 228 | norm_weight(nin, dim, scale=scale)], axis=1) 229 | params[_p(prefix, 'W')] = W 230 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 231 | 232 | # recurrent transformation weights for gates 233 | U = numpy.concatenate([ortho_weight(dim), 234 | ortho_weight(dim)], axis=1) 235 | params[_p(prefix, 'U')] = U 236 | 237 | # embedding to hidden state proposal weights, biases 238 | Wx = norm_weight(nin, dim, scale=scale) 239 | params[_p(prefix, 'Wx')] = Wx 240 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 241 | 242 | # recurrent transformation weights for hidden state proposal 243 | Ux = ortho_weight(dim) 244 | params[_p(prefix, 'Ux')] 
= Ux 245 | 246 | return params 247 | 248 | 249 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, 250 | one_step=False, _init_state=None, **kwargs): 251 | if one_step: 252 | assert _init_state, 'previous state must be provided' 253 | 254 | nsteps = state_below.shape[0] 255 | if state_below.ndim == 3: 256 | n_samples = state_below.shape[1] 257 | else: 258 | n_samples = 1 259 | 260 | dim = tparams[_p(prefix, 'Ux')].shape[1] 261 | 262 | if mask is None: 263 | mask = tensor.alloc(1., state_below.shape[0], 1) 264 | 265 | # utility function to slice a tensor 266 | def _slice(_x, n, dim): 267 | if _x.ndim == 3: 268 | return _x[:, :, n*dim:(n+1)*dim] 269 | return _x[:, n*dim:(n+1)*dim] 270 | 271 | # state_below is the input word embeddings 272 | # input to the gates, concatenated 273 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 274 | tparams[_p(prefix, 'b')] 275 | # input to compute the hidden state proposal 276 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 277 | tparams[_p(prefix, 'bx')] 278 | 279 | # step function to be used by scan 280 | # arguments | sequences |outputs-info| non-seqs 281 | def _step_slice(m_, x_, xx_, h_, U, Ux): 282 | preact = tensor.dot(h_, U) 283 | preact += x_ 284 | 285 | # reset and update gates 286 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 287 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 288 | 289 | # compute the hidden state proposal 290 | preactx = tensor.dot(h_, Ux) 291 | preactx = preactx * r 292 | preactx = preactx + xx_ 293 | 294 | # hidden state proposal 295 | h = tensor.tanh(preactx) 296 | 297 | # leaky integrate and obtain next hidden state 298 | h = u * h_ + (1. - u) * h 299 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 300 | 301 | return h 302 | 303 | # prepare scan arguments 304 | seqs = [mask, state_below_, state_belowx] 305 | init_states = [tensor.alloc(0., n_samples, dim)] 306 | _step = _step_slice 307 | shared_vars = [tparams[_p(prefix, 'U')], 308 | tparams[_p(prefix, 'Ux')]] 309 | 310 | if one_step: 311 | rval = _step(*(seqs + [_init_state] + shared_vars)) 312 | else: 313 | rval, updates = theano.scan(_step, 314 | sequences=seqs, 315 | outputs_info=init_states, 316 | non_sequences=shared_vars, 317 | name=_p(prefix, '_layers'), 318 | n_steps=nsteps, 319 | profile=profile, 320 | strict=True) 321 | rval = [rval] 322 | return rval 323 | 324 | # -------------------------------------------------------------------------# 325 | # Conditional Gated Recurrent Unit with Attention (GRU_cond) 326 | # 327 | def param_init_gru_cond(options, params, prefix='gru_cond', 328 | nin=None, dim=None, dimctx=None, 329 | nin_nonlin=None, dim_nonlin=None, scale=0.01): 330 | if nin is None: 331 | nin = options['dim'] 332 | if dim is None: 333 | dim = options['dim'] 334 | if dimctx is None: 335 | dimctx = options['dim'] 336 | if nin_nonlin is None: 337 | nin_nonlin = nin 338 | if dim_nonlin is None: 339 | dim_nonlin = dim 340 | 341 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale), 342 | norm_weight(nin, dim, scale=scale)], axis=1) 343 | params[_p(prefix, 'W')] = W 344 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 345 | U = numpy.concatenate([ortho_weight(dim_nonlin), 346 | ortho_weight(dim_nonlin)], axis=1) 347 | params[_p(prefix, 'U')] = U 348 | 349 | Wx = norm_weight(nin_nonlin, dim_nonlin, scale=scale) 350 | params[_p(prefix, 'Wx')] = Wx 351 | Ux = ortho_weight(dim_nonlin) 352 | params[_p(prefix, 'Ux')] = Ux 353 | params[_p(prefix, 'bx')] = 
numpy.zeros((dim_nonlin,)).astype('float32') 354 | 355 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 356 | ortho_weight(dim_nonlin)], axis=1) 357 | params[_p(prefix, 'U_nl')] = U_nl 358 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 359 | 360 | Ux_nl = ortho_weight(dim_nonlin) 361 | params[_p(prefix, 'Ux_nl')] = Ux_nl 362 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 363 | 364 | # context to LSTM 365 | Wc = norm_weight(dimctx, dim*2, scale=scale) 366 | params[_p(prefix, 'Wc')] = Wc 367 | 368 | Wcx = norm_weight(dimctx, dim, scale=scale) 369 | params[_p(prefix, 'Wcx')] = Wcx 370 | 371 | # attention: combined -> hidden 372 | W_comb_att = norm_weight(dim, dimctx, scale=scale) 373 | params[_p(prefix, 'W_comb_att')] = W_comb_att 374 | 375 | # attention: context -> hidden 376 | Wc_att = norm_weight(dimctx, scale=scale) 377 | params[_p(prefix, 'Wc_att')] = Wc_att 378 | 379 | # attention: hidden bias 380 | b_att = numpy.zeros((dimctx,)).astype('float32') 381 | params[_p(prefix, 'b_att')] = b_att 382 | 383 | # attention: 384 | U_att = norm_weight(dimctx, 1, scale=scale) 385 | params[_p(prefix, 'U_att')] = U_att 386 | c_att = numpy.zeros((1,)).astype('float32') 387 | params[_p(prefix, 'c_tt')] = c_att 388 | 389 | return params 390 | 391 | 392 | def gru_cond_layer(tparams, state_below, options, prefix='gru', 393 | mask=None, context=None, one_step=False, 394 | init_memory=None, init_state=None, 395 | context_mask=None, 396 | **kwargs): 397 | 398 | assert context, 'Context must be provided' 399 | 400 | if one_step: 401 | assert init_state, 'previous state must be provided' 402 | 403 | nsteps = state_below.shape[0] 404 | if state_below.ndim == 3: 405 | n_samples = state_below.shape[1] 406 | else: 407 | n_samples = 1 408 | 409 | # mask 410 | if mask is None: 411 | mask = tensor.alloc(1., state_below.shape[0], 1) 412 | 413 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 414 | 415 | # initial/previous state 416 | if init_state is None: 417 | init_state = tensor.alloc(0., n_samples, dim) 418 | 419 | # projected context 420 | assert context.ndim == 3, \ 421 | 'Context must be 3-d: #annotation x #sample x dim' 422 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 423 | tparams[_p(prefix, 'b_att')] 424 | 425 | def _slice(_x, n, dim): 426 | if _x.ndim == 3: 427 | return _x[:, :, n*dim:(n+1)*dim] 428 | return _x[:, n*dim:(n+1)*dim] 429 | 430 | # projected x 431 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 432 | tparams[_p(prefix, 'bx')] 433 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 434 | tparams[_p(prefix, 'b')] 435 | 436 | def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, 437 | U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, 438 | U_nl, Ux_nl, b_nl, bx_nl): 439 | preact1 = tensor.dot(h_, U) 440 | preact1 += x_ 441 | preact1 = tensor.nnet.sigmoid(preact1) 442 | 443 | r1 = _slice(preact1, 0, dim) 444 | u1 = _slice(preact1, 1, dim) 445 | 446 | preactx1 = tensor.dot(h_, Ux) 447 | preactx1 *= r1 448 | preactx1 += xx_ 449 | 450 | h1 = tensor.tanh(preactx1) 451 | 452 | h1 = u1 * h_ + (1. - u1) * h1 453 | h1 = m_[:, None] * h1 + (1. 
- m_)[:, None] * h_ 454 | 455 | # attention 456 | pstate_ = tensor.dot(h1, W_comb_att) 457 | pctx__ = pctx_ + pstate_[None, :, :] 458 | #pctx__ += xc_ 459 | pctx__ = tensor.tanh(pctx__) 460 | alpha = tensor.dot(pctx__, U_att)+c_tt 461 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 462 | alpha = tensor.exp(alpha) 463 | 464 | if context_mask: 465 | alpha = alpha * context_mask 466 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY) 467 | 468 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 469 | 470 | preact2 = tensor.dot(h1, U_nl)+b_nl 471 | preact2 += tensor.dot(ctx_, Wc) 472 | preact2 = tensor.nnet.sigmoid(preact2) 473 | 474 | r2 = _slice(preact2, 0, dim) 475 | u2 = _slice(preact2, 1, dim) 476 | 477 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 478 | preactx2 *= r2 479 | preactx2 += tensor.dot(ctx_, Wcx) 480 | 481 | h2 = tensor.tanh(preactx2) 482 | 483 | h2 = u2 * h1 + (1. - u2) * h2 484 | h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 485 | 486 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 487 | 488 | seqs = [mask, state_below_, state_belowx] 489 | #seqs = [mask, state_below_, state_belowx, state_belowc] 490 | _step = _step_slice 491 | 492 | shared_vars = [tparams[_p(prefix, 'U')], 493 | tparams[_p(prefix, 'Wc')], 494 | tparams[_p(prefix, 'W_comb_att')], 495 | tparams[_p(prefix, 'U_att')], 496 | tparams[_p(prefix, 'c_tt')], 497 | tparams[_p(prefix, 'Ux')], 498 | tparams[_p(prefix, 'Wcx')], 499 | tparams[_p(prefix, 'U_nl')], 500 | tparams[_p(prefix, 'Ux_nl')], 501 | tparams[_p(prefix, 'b_nl')], 502 | tparams[_p(prefix, 'bx_nl')]] 503 | 504 | if one_step: 505 | rval = _step(*(seqs + [init_state, None, None, pctx_, context] + 506 | shared_vars)) 507 | else: 508 | rval, updates = theano.scan(_step, 509 | sequences=seqs, 510 | outputs_info=[init_state, 511 | tensor.alloc(0., n_samples, 512 | context.shape[2]), 513 | tensor.alloc(0., n_samples, 514 | context.shape[0])], 515 | non_sequences=[pctx_, context]+shared_vars, 516 | name=_p(prefix, '_layers'), 517 | n_steps=nsteps, 518 | profile=profile, 519 | strict=True) 520 | return rval 521 | 522 | # ================================================================================== # 523 | # Conditional GRU: depart the network 524 | 525 | def gru_cond_context(tparams, state_below, options, prefix='gru', 526 | mask=None, context=None, 527 | init_memory=None, init_state=None, 528 | context_mask=None, 529 | **kwargs): 530 | 531 | assert context, 'Context must be provided' 532 | assert init_state, 'previous state must be provided' 533 | 534 | if state_below.ndim == 3: 535 | n_samples = state_below.shape[1] 536 | else: 537 | n_samples = 1 538 | 539 | # mask 540 | if mask is None: 541 | mask = tensor.alloc(1., state_below.shape[0], 1) 542 | 543 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 544 | 545 | # initial/previous state 546 | if init_state is None: 547 | init_state = tensor.alloc(0., n_samples, dim) 548 | 549 | # projected context 550 | assert context.ndim == 3, \ 551 | 'Context must be 3-d: #annotation x #sample x dim' 552 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 553 | tparams[_p(prefix, 'b_att')] 554 | 555 | def _slice(_x, n, dim): 556 | if _x.ndim == 3: 557 | return _x[:, :, n*dim:(n+1)*dim] 558 | return _x[:, n*dim:(n+1)*dim] 559 | 560 | # projected x 561 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 562 | tparams[_p(prefix, 'bx')] 563 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 564 | tparams[_p(prefix, 'b')] 565 | 566 | def 
_step_slice(m_, x_, xx_, h_, pctx_, cc_, 567 | U, W_comb_att, U_att, c_tt, Ux): 568 | preact1 = tensor.dot(h_, U) 569 | preact1 += x_ 570 | preact1 = tensor.nnet.sigmoid(preact1) 571 | 572 | r1 = _slice(preact1, 0, dim) 573 | u1 = _slice(preact1, 1, dim) 574 | 575 | preactx1 = tensor.dot(h_, Ux) 576 | preactx1 *= r1 577 | preactx1 += xx_ 578 | 579 | h1 = tensor.tanh(preactx1) 580 | 581 | h1 = u1 * h_ + (1. - u1) * h1 582 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ 583 | 584 | # attention 585 | pstate_ = tensor.dot(h1, W_comb_att) 586 | 587 | pctx__ = pctx_ + pstate_[None, :, :] 588 | pctx__ = tensor.tanh(pctx__) 589 | 590 | alpha = tensor.dot(pctx__, U_att)+c_tt 591 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 592 | alpha = tensor.exp(alpha) 593 | 594 | if context_mask: 595 | alpha = alpha * context_mask 596 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY) 597 | 598 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 599 | return h1, ctx_, alpha.T # pstate_, preact, preactx, r, u 600 | 601 | seqs = [mask, state_below_, state_belowx] 602 | _step = _step_slice 603 | 604 | shared_vars = [tparams[_p(prefix, 'U')], 605 | tparams[_p(prefix, 'W_comb_att')], 606 | tparams[_p(prefix, 'U_att')], 607 | tparams[_p(prefix, 'c_tt')], 608 | tparams[_p(prefix, 'Ux')]] 609 | 610 | rval = _step(*(seqs + [init_state, pctx_, context] + shared_vars)) 611 | return rval 612 | 613 | 614 | def gru_cond_update(tparams, options, prefix='gru', 615 | mask=None, cxt=None, h1=None, 616 | **kwargs): 617 | 618 | assert cxt, 'Context vector must be provided' 619 | assert h1, 'Temperal state vector must be provided' 620 | 621 | # mask 622 | if mask is None: 623 | mask = tensor.alloc(1., h1.shape[0], 1) 624 | 625 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 626 | 627 | 628 | def _slice(_x, n, dim): 629 | if _x.ndim == 3: 630 | return _x[:, :, n*dim:(n+1)*dim] 631 | return _x[:, n*dim:(n+1)*dim] 632 | 633 | 634 | def _step_slice(m_, ctx_, h1, 635 | Wc, Wcx, 636 | U_nl, Ux_nl, 637 | b_nl, bx_nl): 638 | 639 | preact2 = tensor.dot(h1, U_nl)+b_nl 640 | preact2 += tensor.dot(ctx_, Wc) 641 | preact2 = tensor.nnet.sigmoid(preact2) 642 | 643 | r2 = _slice(preact2, 0, dim) 644 | u2 = _slice(preact2, 1, dim) 645 | 646 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 647 | preactx2 *= r2 648 | preactx2 += tensor.dot(ctx_, Wcx) 649 | 650 | h2 = tensor.tanh(preactx2) 651 | h2 = u2 * h1 + (1. - u2) * h2 652 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 653 | 654 | return h2 655 | 656 | seqs = [mask, cxt, h1] 657 | _step = _step_slice 658 | 659 | shared_vars = [tparams[_p(prefix, 'Wc')], 660 | tparams[_p(prefix, 'Wcx')], 661 | tparams[_p(prefix, 'U_nl')], 662 | tparams[_p(prefix, 'Ux_nl')], 663 | tparams[_p(prefix, 'b_nl')], 664 | tparams[_p(prefix, 'bx_nl')]] 665 | 666 | rval = _step(*(seqs + shared_vars)) 667 | return rval 668 | 669 | # ================================================================================== # 670 | 671 | 672 | 673 | 674 | # LN-GRU layer 675 | def param_init_lngru(options, params, prefix='lngru', nin=None, dim=None, scale=0.01): 676 | """ 677 | Gated Recurrent Unit (GRU) with LN 678 | """ 679 | if nin == None: 680 | nin = options['dim_proj'] 681 | if dim == None: 682 | dim = options['dim_proj'] 683 | W = numpy.concatenate([norm_weight(nin,dim, scale=scale), 684 | norm_weight(nin,dim, scale=scale)], axis=1) 685 | params[_p(prefix,'W')] = W 686 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 687 | U = numpy.concatenate([ortho_weight(dim), 688 | ortho_weight(dim)], axis=1) 689 | params[_p(prefix,'U')] = U 690 | 691 | Wx = norm_weight(nin, dim, scale=scale) 692 | params[_p(prefix,'Wx')] = Wx 693 | Ux = ortho_weight(dim) 694 | params[_p(prefix,'Ux')] = Ux 695 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 696 | 697 | # LN parameters 698 | scale_add = 0.0 699 | scale_mul = 1.0 700 | params[_p(prefix,'b1')] = scale_add * numpy.ones((2*dim)).astype('float32') 701 | params[_p(prefix,'b2')] = scale_add * numpy.ones((1*dim)).astype('float32') 702 | params[_p(prefix,'b3')] = scale_add * numpy.ones((2*dim)).astype('float32') 703 | params[_p(prefix,'b4')] = scale_add * numpy.ones((1*dim)).astype('float32') 704 | params[_p(prefix,'s1')] = scale_mul * numpy.ones((2*dim)).astype('float32') 705 | params[_p(prefix,'s2')] = scale_mul * numpy.ones((1*dim)).astype('float32') 706 | params[_p(prefix,'s3')] = scale_mul * numpy.ones((2*dim)).astype('float32') 707 | params[_p(prefix,'s4')] = scale_mul * numpy.ones((1*dim)).astype('float32') 708 | 709 | return params 710 | 711 | def lngru_layer(tparams, state_below, options, prefix='lngru', mask=None, one_step=False, _init_state=None, **kwargs): 712 | """ 713 | Feedforward pass through GRU with LN 714 | """ 715 | nsteps = state_below.shape[0] 716 | if state_below.ndim == 3: 717 | n_samples = state_below.shape[1] 718 | else: 719 | n_samples = 1 720 | 721 | dim = tparams[_p(prefix,'Ux')].shape[1] 722 | 723 | if _init_state == None: 724 | _init_state = tensor.alloc(0., n_samples, dim) 725 | 726 | if mask == None: 727 | mask = tensor.alloc(1., state_below.shape[0], 1) 728 | 729 | def _slice(_x, n, dim): 730 | if _x.ndim == 3: 731 | return _x[:, :, n*dim:(n+1)*dim] 732 | return _x[:, n*dim:(n+1)*dim] 733 | 734 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 735 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 736 | U = tparams[_p(prefix, 'U')] 737 | Ux = tparams[_p(prefix, 'Ux')] 738 | 739 | def _step_slice(m_, x_, xx_, h_, U, Ux, b1, b2, b3, b4, s1, s2, s3, s4): 740 | 741 | x_ = ln(x_, b1, s1) 742 | xx_ = ln(xx_, b2, s2) 743 | 744 | preact = tensor.dot(h_, U) 745 | preact = ln(preact, b3, s3) 746 | preact += x_ 747 | 748 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 749 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 750 | 751 | preactx = tensor.dot(h_, Ux) 752 | preactx = ln(preactx, b4, s4) 753 | preactx = preactx * r 754 | preactx = 
preactx + xx_ 755 | 756 | h = tensor.tanh(preactx) 757 | 758 | h = u * h_ + (1. - u) * h 759 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 760 | 761 | return h 762 | 763 | seqs = [mask, state_below_, state_belowx] 764 | _step = _step_slice 765 | 766 | non_seqs = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]] 767 | non_seqs += [tparams[_p(prefix, 'b1')], tparams[_p(prefix, 'b2')], tparams[_p(prefix, 'b3')], tparams[_p(prefix, 'b4')]] 768 | non_seqs += [tparams[_p(prefix, 's1')], tparams[_p(prefix, 's2')], tparams[_p(prefix, 's3')], tparams[_p(prefix, 's4')]] 769 | 770 | if one_step: 771 | rval = _step(*(seqs+[_init_state]+non_seqs)) 772 | else: 773 | rval, updates = theano.scan(_step, 774 | sequences=seqs, 775 | outputs_info = [_init_state], 776 | non_sequences = non_seqs, 777 | name=_p(prefix, '_layers'), 778 | n_steps=nsteps, 779 | profile=False, 780 | strict=True) 781 | rval = [rval] 782 | return rval 783 | 784 | 785 | -------------------------------------------------------------------------------- /mteval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref=" /misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok" 4 | sed -i 's/@@ //g' $1 5 | ./data/multi-bleu.perl ref < $1 6 | -------------------------------------------------------------------------------- /nmt_uni.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a neural machine translation model with soft attention 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | #import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | from data_iterator import TextIterator 20 | from layers import * 21 | from optimizer import * 22 | 23 | profile = False 24 | TINY = 1e-7 25 | 26 | # -----------------------------------------------------------------------------# 27 | # Build the Attention-based Neural Machine Translation 28 | 29 | # initialize all parameters 30 | def init_params(options): 31 | params = OrderedDict() 32 | 33 | # embedding 34 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 35 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 36 | 37 | # encoder: uni-directional RNN 38 | params = get_layer(options['encoder'])[0](options, params, 39 | prefix='encoder', 40 | nin=options['dim_word'], 41 | dim=options['dim']) 42 | 43 | if options.get('birnn', False): 44 | params = get_layer(options['encoder'])[0](options, params, 45 | prefix='encoder_r', 46 | nin=options['dim_word'], 47 | dim=options['dim']) 48 | 49 | 50 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 51 | 52 | # init_state, init_cell 53 | params = get_layer('ff')[0](options, params, prefix='ff_state', 54 | nin=ctxdim, nout=options['dim']) 55 | # decoder 56 | params = get_layer(options['decoder'])[0](options, params, 57 | prefix='decoder', 58 | nin=options['dim_word'], 59 | dim=options['dim'], 60 | dimctx=ctxdim) 61 | # readout 62 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 63 | nin=options['dim'], nout=options['dim_word'], 64 | ortho=False) 65 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 66 | nin=options['dim_word'], 67 | nout=options['dim_word'], ortho=False) 68 | params = 
get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 69 | nin=ctxdim, nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | return params 75 | 76 | 77 | def build_model(tparams, options): 78 | opt_ret = dict() 79 | 80 | trng = RandomStreams(1234) 81 | use_noise = theano.shared(numpy.float32(0.)) 82 | 83 | # description string: #words x #samples 84 | x = tensor.matrix('x', dtype='int64') 85 | x_mask = tensor.matrix('x_mask', dtype='float32') 86 | y = tensor.matrix('y', dtype='int64') 87 | y_mask = tensor.matrix('y_mask', dtype='float32') 88 | 89 | # time_steps 90 | n_timesteps = x_mask.shape[0] 91 | n_timesteps_trg = y_mask.shape[0] 92 | n_samples = x_mask.shape[1] 93 | 94 | # word embedding for forward rnn (source) 95 | emb = tparams['Wemb'][x.flatten()] 96 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 97 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 98 | prefix='encoder', 99 | mask=x_mask) 100 | 101 | # for reverse RNN: bi-directional RNN encoder 102 | if options.get('birnn', False): 103 | xr = x[::-1] 104 | xr_mask = x_mask[::-1] 105 | 106 | embr = tparams['Wemb'][xr.flatten()] 107 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 108 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 109 | prefix='encoder_r', 110 | mask=xr_mask) 111 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 112 | 113 | else: 114 | ctx = proj[0] # context vectors 115 | 116 | # mean of the context (across time) will be used to initialize decoder rnn 117 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 118 | 119 | 120 | # initial decoder state 121 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 122 | prefix='ff_state', activ='tanh') 123 | 124 | # word embedding (target), we will shift the target sequence one time step 125 | # to the right. This is done because of the bi-gram connections in the 126 | # readout and decoder rnn. The first target will be all zeros and we will 127 | # not condition on the last output. 
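    # (a small NumPy sketch of the same shift, assuming emb has shape
    #  (n_timesteps_trg, n_samples, dim_word):
    #      emb_shifted = numpy.zeros_like(emb)
    #      emb_shifted[1:] = emb[:-1]     # step t reads the embedding of word t-1,
    #                                     # step 0 reads the all-zero <bos> vector
    #  the symbolic set_subtensor below does exactly this)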
128 | emb = tparams['Wemb_dec'][y.flatten()] 129 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 130 | emb_shifted = tensor.zeros_like(emb) 131 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 132 | emb = emb_shifted 133 | 134 | # decoder - pass through the decoder conditional gru with attention 135 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 136 | prefix='decoder', 137 | mask=y_mask, context=ctx, 138 | context_mask=x_mask, 139 | one_step=False, 140 | init_state=init_state) 141 | # hidden states of the decoder gru 142 | proj_h = proj[0] 143 | 144 | # weighted averages of context, generated by attention module 145 | ctxs = proj[1] 146 | 147 | # weights (alignment matrix) 148 | opt_ret['dec_alphas'] = proj[2] # --> to show the attenion weights 149 | 150 | # compute word probabilities 151 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 152 | prefix='ff_logit_lstm', activ='linear') 153 | logit_prev = get_layer('ff')[1](tparams, emb, options, 154 | prefix='ff_logit_prev', activ='linear') 155 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 156 | prefix='ff_logit_ctx', activ='linear') 157 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 158 | 159 | # dropout (noise) 160 | if options['use_dropout']: 161 | logit = dropout_layer(logit, use_noise, trng) 162 | logit = get_layer('ff')[1](tparams, logit, options, 163 | prefix='ff_logit', activ='linear') 164 | logit_shp = logit.shape 165 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 166 | 167 | # compute the cost (negative loglikelihood) 168 | y_flat = y.flatten() 169 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 170 | 171 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 172 | cost = cost.reshape([y.shape[0], y.shape[1]]) 173 | cost = (cost * y_mask).sum(0) 174 | 175 | # we will build an additional function for computing costs 176 | f_cost = theano.function([ctx, x_mask, y, y_mask], cost) 177 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, f_cost 178 | 179 | 180 | # build a fine-tuner 181 | def build_fine(tparams, options, fullmodel=True): 182 | 183 | # ------------------- ENCODER ------------------------------------------ # 184 | 185 | opt_ret = dict() 186 | 187 | trng = RandomStreams(1234) 188 | use_noise = theano.shared(numpy.float32(0.)) 189 | 190 | # description string: #words x #samples 191 | x = tensor.matrix('x', dtype='int64') 192 | x_mask = tensor.matrix('x_mask', dtype='float32') 193 | y = tensor.matrix('y', dtype='int64') 194 | y_mask = tensor.matrix('y_mask', dtype='float32') 195 | 196 | # time_steps 197 | n_timesteps = x_mask.shape[0] 198 | n_timesteps_trg = y_mask.shape[0] 199 | n_samples = x_mask.shape[1] 200 | 201 | # word embedding for forward rnn (source) 202 | emb = tparams['Wemb'][x.flatten()] 203 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 204 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 205 | prefix='encoder', 206 | mask=x_mask) 207 | 208 | # for reverse RNN: bi-directional RNN encoder 209 | if options.get('birnn', False): 210 | xr = x[::-1] 211 | xr_mask = x_mask[::-1] 212 | 213 | embr = tparams['Wemb'][xr.flatten()] 214 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 215 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 216 | prefix='encoder_r', 217 | mask=xr_mask) 218 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 219 | 220 | else: 221 | ctx = proj[0] # 
context vectors 222 | 223 | # mean of the context (across time) will be used to initialize decoder rnn 224 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 225 | 226 | # or you can use the last state of forward + backward encoder rnns 227 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 228 | 229 | # initial decoder state 230 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 231 | prefix='ff_state', activ='tanh') 232 | 233 | print 'compile the initializer' 234 | f_init = theano.function([x, x_mask], [ctx, init_state]) 235 | print 'encoder done.' 236 | # ------------------- ENCODER ------------------------------------------ # 237 | 238 | 239 | c_mask = tensor.tensor3('c_mask', dtype='float32') # seq_t x seq_s x batches 240 | 241 | emb = tparams['Wemb_dec'][y.flatten()] 242 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 243 | emb_shifted = tensor.zeros_like(emb) 244 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 245 | emb = emb_shifted 246 | 247 | # decoder - pass through the decoder conditional gru with attention 248 | def _step(_emb, _y_mask, _c_mask, _init_state, _ctx): 249 | return get_layer(options['decoder'])[1](tparams, _emb, options, 250 | prefix='decoder', 251 | mask=_y_mask, context=_ctx, 252 | context_mask=_c_mask, 253 | one_step=True, 254 | init_state=_init_state) 255 | 256 | proj, _ = theano.scan(_step, 257 | sequences=[emb, y_mask, c_mask], 258 | outputs_info=[init_state, None, None], 259 | non_sequences=[ctx]) 260 | 261 | 262 | # hidden states of the decoder gru 263 | proj_h = proj[0] 264 | 265 | # weighted averages of context, generated by attention module 266 | ctxs = proj[1] 267 | 268 | # weights (alignment matrix) 269 | opt_ret['dec_alphas'] = proj[2] # --> to show the attenion weights 270 | 271 | # compute word probabilities 272 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 273 | prefix='ff_logit_lstm', activ='linear') 274 | logit_prev = get_layer('ff')[1](tparams, emb, options, 275 | prefix='ff_logit_prev', activ='linear') 276 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 277 | prefix='ff_logit_ctx', activ='linear') 278 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 279 | 280 | # dropout (noise) 281 | if options['use_dropout']: 282 | logit = dropout_layer(logit, use_noise, trng) 283 | logit = get_layer('ff')[1](tparams, logit, options, 284 | prefix='ff_logit', activ='linear') 285 | logit_shp = logit.shape 286 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 287 | 288 | # compute the cost (negative loglikelihood) 289 | y_flat = y.flatten() 290 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 291 | 292 | cost = -tensor.log(probs.flatten()[y_flat_idx] + TINY) 293 | cost = cost.reshape([y.shape[0], y.shape[1]]) 294 | a_cost = tensor.mean((cost * y_mask).sum(0)) 295 | 296 | # gradient clipping 297 | def _clip(grad): 298 | clip_c = 1. 299 | if clip_c > 0.: 300 | g2 = 0. 
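                # g2 accumulates the squared global norm ||g||^2 over all gradient
                # tensors; whenever ||g|| > clip_c each gradient is rescaled by
                # clip_c / ||g||, keeping its direction but shrinking the global norm
                # to clip_c (e.g. clip_c = 1 and ||g|| = 5 scales every gradient by 0.2)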
301 | for g in grad: 302 | g2 += (g ** 2).sum() 303 | new_grads = [] 304 | for g in grad: 305 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 306 | grad = new_grads 307 | return grad 308 | 309 | 310 | lr = tensor.scalar(name='lr') 311 | if fullmodel: 312 | print 'build MLE optimizer for the whole NMT model:' 313 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams))) 314 | inps = [x, x_mask, y, y_mask, c_mask] 315 | outps = [a_cost, cost] 316 | f_cost, f_update = adam(lr, tparams, a_grad, inps, outps) 317 | else: 318 | print 'build MLE only for decoder' 319 | tparams_d = OrderedDict() 320 | for w in tparams: 321 | if ('ff_state' not in w) and ('encoder' not in w) and (w != 'Wemb'): 322 | print w, 'updated.' 323 | tparams_d[w] = tparams[w] 324 | 325 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams_d))) 326 | inps = [x, x_mask, y, y_mask, c_mask] 327 | outps = [a_cost, cost] 328 | f_cost, f_update = adam(lr, tparams_d, a_grad, inps, outps) 329 | 330 | print 'done.' 331 | return f_init, f_cost, f_update 332 | 333 | 334 | # build a sampler for NMT 335 | def build_sampler(tparams, options, trng): 336 | 337 | x = tensor.matrix('x', dtype='int64') 338 | 339 | n_timesteps = x.shape[0] 340 | n_samples = x.shape[1] 341 | 342 | # word embedding (source), forward and backward 343 | emb = tparams['Wemb'][x.flatten()] 344 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 345 | 346 | # encoder 347 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 348 | prefix='encoder') 349 | 350 | # bi-rnn 351 | if options.get('birnn', False): 352 | xr = x[::-1] 353 | 354 | embr = tparams['Wemb'][xr.flatten()] 355 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 356 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 357 | prefix='encoder_r') 358 | 359 | ## concatenate forward and backward rnn hidden states 360 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 361 | 362 | else: 363 | ctx = proj[0] 364 | 365 | # get the input for decoder rnn initializer mlp 366 | ctx_mean = ctx.mean(0) 367 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 368 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 369 | prefix='ff_state', activ='tanh') 370 | 371 | print 'Building f_init...', 372 | outs = [init_state, ctx] 373 | f_init = theano.function([x], outs, name='f_init', profile=profile) 374 | print 'Done.' 375 | 376 | # .......................................................................... 
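    # f_init is called once per source sentence and returns [init_state, ctx];
    # f_next is then called once per target step with the previous word, the
    # context and the previous decoder state (cf. gen_sample below).  An
    # illustrative sketch for sampling one translation of a single sentence,
    # where x is an int64 array of shape (src_len, 1) and maxlen is a cut-off:
    #
    #     next_state, ctx = f_init(x)
    #     next_w = -1 * numpy.ones((1,), dtype='int64')      # -1 marks <bos>
    #     for _ in range(maxlen):
    #         probs, next_w, next_state = f_next(next_w, ctx, next_state)
    #         if next_w[0] == 0:                             # 0 is <eos>
    #             break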
377 | # x: 1 x 1 378 | y = tensor.vector('y_sampler', dtype='int64') 379 | init_state = tensor.matrix('init_state', dtype='float32') 380 | use_noise = theano.shared(numpy.float32(0.)) 381 | 382 | 383 | # if it's the first word, emb should be all zero and it is indicated by -1 384 | emb = tensor.switch(y[:, None] < 0, 385 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 386 | tparams['Wemb_dec'][y]) 387 | 388 | # apply one step of conditional gru with attention 389 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 390 | prefix='decoder', 391 | mask=None, context=ctx, 392 | one_step=True, 393 | init_state=init_state) 394 | # get the next hidden state 395 | next_state = proj[0] 396 | 397 | # get the weighted averages of context for this target word y 398 | ctxs = proj[1] 399 | 400 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 401 | prefix='ff_logit_lstm', activ='linear') 402 | logit_prev = get_layer('ff')[1](tparams, emb, options, 403 | prefix='ff_logit_prev', activ='linear') 404 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 405 | prefix='ff_logit_ctx', activ='linear') 406 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 407 | 408 | if options['use_dropout']: 409 | logit = dropout_layer(logit, use_noise, trng) 410 | logit = get_layer('ff')[1](tparams, logit, options, 411 | prefix='ff_logit', activ='linear') 412 | 413 | # compute the softmax probability 414 | next_probs = tensor.nnet.softmax(logit) 415 | 416 | # sample from softmax distribution to get the sample 417 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 418 | 419 | # compile a function to do the whole thing above, next word probability, 420 | # sampled word for the next target, next hidden state to be used 421 | print 'Building f_next..', 422 | inps = [y, ctx, init_state] 423 | outs = [next_probs, next_sample, next_state] 424 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 425 | print 'Done.' 
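    # Note: trng.multinomial(pvals=next_probs) draws a one-hot row per sample and
    # argmax(1) turns it back into a word index, i.e. next_sample holds one word
    # drawn from each softmax row.  A rough NumPy equivalent (illustration only,
    # where next_p is the probability matrix returned by f_next):
    #
    #     nw = numpy.array([numpy.random.choice(p.shape[0], p=p) for p in next_p])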
426 | 427 | return f_init, f_next 428 | 429 | def build_partial(tparams, options, trng): 430 | 431 | assert options.get('birnn', False), 'must used in uni-directional mode' 432 | 433 | x = tensor.matrix('x', dtype='int64') 434 | prev_state = tensor.matrix('prev_state', dtype='float32') 435 | n_timesteps = x.shape[0] 436 | n_samples = x.shape[1] 437 | 438 | # word embedding (source), forward and backward 439 | emb = tparams['Wemb'][x.flatten()] 440 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 441 | 442 | # encoder 443 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 444 | one_step=True, 445 | _init_state=prev_state, 446 | prefix='encoder') 447 | next_state = proj[0] 448 | 449 | 450 | print 'Building f_partial...', 451 | outs = [next_state] 452 | f_partial = theano.function([x, prev_state], outs, name='f_partial', profile=profile) 453 | print 'Done' 454 | 455 | return f_partial 456 | 457 | 458 | def build_simultaneous_sampler(tparams, options, trng): 459 | x = tensor.matrix('x', dtype='int64') 460 | 461 | n_timesteps = x.shape[0] 462 | n_samples = x.shape[1] 463 | 464 | # word embedding (source), forward and backward 465 | emb = tparams['Wemb'][x.flatten()] 466 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 467 | 468 | # encoder 469 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') 470 | 471 | # bi-rnn 472 | if options.get('birnn', False): 473 | xr = x[::-1] 474 | 475 | embr = tparams['Wemb'][xr.flatten()] 476 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 477 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 478 | prefix='encoder_r') 479 | 480 | ## concatenate forward and backward rnn hidden states 481 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 482 | 483 | else: 484 | ctx = proj[0] 485 | 486 | # get the input for decoder rnn initializer mlp 487 | ctx_mean = ctx.mean(0) 488 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 489 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 490 | prefix='ff_state', activ='tanh') 491 | 492 | print 'Building f_ctx/init...', 493 | 494 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx') 495 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile) 496 | 497 | print 'Done.' 
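    # In the simultaneous setting the encoder is queried on a growing source
    # prefix: f_sim_ctx returns the full sequence of encoder states and
    # f_sim_init maps any prefix of them to an initial decoder state.  An
    # illustrative sketch, with sidx the number of source words read so far
    # (this is how noisy_translator.py uses the pair):
    #
    #     ctx_all = f_sim_ctx(x)                  # (src_len, n_samples, ctxdim)
    #     z0 = f_sim_init(ctx_all[:sidx, :])      # re-initialise on the prefix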
498 | 499 | # -------------------------------------------------------------------------------- # 500 | y = tensor.vector('y_sampler', dtype='int64') 501 | ctx = tensor.tensor3('context_vectors', dtype='float32') 502 | mask = tensor.matrix('context_mask', dtype='float32') 503 | init_state = tensor.matrix('init_state', dtype='float32') 504 | use_noise = theano.shared(numpy.float32(0.)) 505 | 506 | # if it's the first word, emb should be all zero and it is indicated by -1 507 | emb = tensor.switch(y[:, None] < 0, 508 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 509 | tparams['Wemb_dec'][y]) 510 | 511 | # apply one step of conditional gru with attention 512 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 513 | prefix='decoder', 514 | mask=None, context=ctx, 515 | one_step=True, 516 | init_state=init_state, 517 | context_mask=mask) 518 | 519 | # get the next hidden state 520 | next_state = proj[0] 521 | 522 | # get the weighted averages of context for this target word y 523 | ctxs = proj[1] 524 | attention = proj[2] 525 | 526 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 527 | prefix='ff_logit_lstm', activ='linear') 528 | logit_prev = get_layer('ff')[1](tparams, emb, options, 529 | prefix='ff_logit_prev', activ='linear') 530 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 531 | prefix='ff_logit_ctx', activ='linear') 532 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) 533 | 534 | if options['use_dropout']: 535 | logit = dropout_layer(logit, use_noise, trng) 536 | 537 | logit = get_layer('ff')[1](tparams, logit, options, 538 | prefix='ff_logit', activ='linear') 539 | 540 | # compute the softmax probability 541 | next_probs = tensor.nnet.softmax(logit) 542 | 543 | # sample from softmax distribution to get the sample 544 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 545 | 546 | # ***== special care: use additional inforamtion ====*** # 547 | # compile a function to do the whole thing above, next word probability, 548 | # sampled word for the next target, next hidden state to be used 549 | print 'Building f_sim_next..', 550 | inps = [y, ctx, mask, init_state] 551 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 552 | 553 | if 'pre' in options and options['pre']: 554 | assert not options.get('birnn', False), 'should not use birnn for SimulTrans' 555 | 556 | read_head = tensor.ivector('read_head') 557 | forget_head = tensor.ivector('forget_head') 558 | inps += [read_head, forget_head] 559 | 560 | def _grab(contexts, index): 561 | assert contexts.ndim == 3 562 | 563 | batch_size = contexts.shape[1] 564 | return contexts[index, tensor.arange(batch_size), :] 565 | 566 | last_ctx = _grab(ctx, read_head) 567 | first_ctx = _grab(ctx, forget_head) 568 | next_max_w = tparams['Wemb_dec'][next_probs.argmax(1)] 569 | 570 | readout = tensor.concatenate([next_state, ctxs, last_ctx, first_ctx, next_max_w], axis=-1) 571 | options['readout_dim'] = options['dim_word'] + ctxdim * 3 + options['dim'] 572 | 573 | else: 574 | print 'with normal input' 575 | readout = tensor.concatenate([next_state, ctxs, emb], axis=-1) # the obersavtion for each step. 576 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim 577 | 578 | outs = [next_probs, next_sample, next_state, readout, attention] 579 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile) 580 | print 'Done.' 
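    # The 'readout' output is the per-step observation handed to the policy /
    # controller network; options['readout_dim'] records its size.  In the default
    # branch it is the concatenation [next_state; ctxs; emb], so with the purely
    # illustrative setting dim = 1000, dim_word = 500 and a uni-directional
    # encoder (ctxdim = dim) one gets readout_dim = 1000 + 1000 + 500 = 2500.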
581 | 582 | return f_sim_ctx, f_sim_init, f_sim_next 583 | 584 | # ---------------------------------------------------------------------------- # 585 | # What we need are this part = v = # 586 | # # 587 | # ---------> for reinforcement noisy decoding # 588 | # ---------------------------------------------------------------------------- # 589 | 590 | def build_noisy_sampler(tparams, options, trng): 591 | x = tensor.matrix('x', dtype='int64') 592 | 593 | n_timesteps = x.shape[0] 594 | n_samples = x.shape[1] 595 | 596 | # word embedding (source), forward and backward 597 | emb = tparams['Wemb'][x.flatten()] 598 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 599 | 600 | # encoder 601 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') 602 | if options.get('birnn', False): 603 | xr = x[::-1] 604 | embr = tparams['Wemb'][xr.flatten()] 605 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 606 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 607 | prefix='encoder_r') 608 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 609 | 610 | else: 611 | ctx = proj[0] 612 | 613 | # get the input for decoder rnn initializer mlp 614 | ctx_mean = ctx.mean(0) 615 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 616 | prefix='ff_state', activ='tanh') 617 | 618 | print 'Building Encoder: f_ctx/init...', 619 | 620 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx') 621 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile) 622 | 623 | print 'Done.' 624 | 625 | # -------------------------------------------------------------------------------- # 626 | y = tensor.vector('y_sampler', dtype='int64') 627 | ctx = tensor.tensor3('context_vectors', dtype='float32') 628 | mask = tensor.matrix('context_mask', dtype='float32') 629 | prev_state = tensor.matrix('prev_state', dtype='float32') 630 | use_noise = theano.shared(numpy.float32(0.)) 631 | 632 | injd_noise = tensor.matrix('injected_noise', dtype='float32') 633 | 634 | # if it's the first word, emb should be all zero and it is indicated by -1 635 | emb = tensor.switch(y[:, None] < 0, 636 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 637 | tparams['Wemb_dec'][y]) 638 | 639 | # inject noise 640 | init_state = prev_state + injd_noise # apply the injected noise 641 | 642 | # apply one step of conditional gru with attention 643 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 644 | prefix='decoder', 645 | mask=None, context=ctx, 646 | one_step=True, 647 | init_state=init_state, 648 | context_mask=mask) 649 | 650 | # get the next hidden state 651 | next_state = proj[0] 652 | 653 | # get the weighted averages of context for this target word y 654 | ctxs = proj[1] 655 | attention = proj[2] 656 | 657 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 658 | prefix='ff_logit_lstm', activ='linear') 659 | logit_prev = get_layer('ff')[1](tparams, emb, options, 660 | prefix='ff_logit_prev', activ='linear') 661 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 662 | prefix='ff_logit_ctx', activ='linear') 663 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) 664 | 665 | if options['use_dropout']: 666 | logit = dropout_layer(logit, use_noise, trng) 667 | 668 | logit = get_layer('ff')[1](tparams, logit, options, 669 | prefix='ff_logit', activ='linear') 670 | 671 | # compute the softmax probability 672 | next_probs = tensor.nnet.softmax(logit) 673 | 674 | # sample from softmax 
distribution to get the sample 675 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 676 | 677 | # compile the function read-out and samples 678 | print 'Building f_sim_next..', 679 | 680 | inps = [y, ctx, mask, prev_state, injd_noise] 681 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 682 | readout = tensor.concatenate([next_state, ctxs, emb], axis=-1) # the obersavtion for each step. 683 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim 684 | 685 | outs = [next_probs, next_sample, next_state, readout, attention] 686 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile) 687 | 688 | print 'Done.' 689 | return f_sim_ctx, f_sim_init, f_sim_next 690 | 691 | 692 | # generate sample, either with stochastic sampling or beam search. Note that, 693 | # this function iteratively calls f_init and f_next functions. 694 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, k=1, maxlen=30, 695 | stochastic=True, argmax=False, sigma=-1.): 696 | 697 | # k is the beam size we have 698 | if k > 1: 699 | assert not stochastic, \ 700 | 'Beam search does not support stochastic sampling' 701 | 702 | sample = [] 703 | sample_score = [] 704 | if stochastic: 705 | sample_score = 0 706 | 707 | live_k = 1 708 | dead_k = 0 709 | 710 | hyp_samples = [[]] * live_k 711 | hyp_scores = numpy.zeros(live_k).astype('float32') 712 | hyp_states = [] 713 | 714 | # get initial state of decoder rnn and encoder context 715 | ret = f_init(x) 716 | next_state, ctx0 = ret[0], ret[1] 717 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 718 | 719 | for ii in xrange(maxlen): 720 | ctx = numpy.tile(ctx0, [live_k, 1]) 721 | 722 | if sigma > 0.: 723 | next_state_inp = next_state + numpy.float32((sigma/(ii+1)) * numpy.random.randn(*next_state.shape)) 724 | else: 725 | next_state_inp = next_state 726 | 727 | inps = [next_w, ctx, next_state_inp] 728 | ret = f_next(*inps) 729 | next_p, next_w, next_state = ret[0], ret[1], ret[2] 730 | 731 | if stochastic: 732 | if argmax: 733 | nw = next_p[0].argmax() 734 | else: 735 | nw = next_w[0] 736 | sample.append(nw) 737 | sample_score += next_p[0, nw] 738 | if nw == 0: 739 | break 740 | else: 741 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 742 | cand_flat = cand_scores.flatten() 743 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 744 | 745 | voc_size = next_p.shape[1] 746 | trans_indices = ranks_flat / voc_size 747 | word_indices = ranks_flat % voc_size 748 | costs = cand_flat[ranks_flat] 749 | 750 | new_hyp_samples = [] 751 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 752 | new_hyp_states = [] 753 | 754 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 755 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 756 | new_hyp_scores[idx] = copy.copy(costs[idx]) 757 | new_hyp_states.append(copy.copy(next_state[ti])) 758 | 759 | # check the finished samples 760 | new_live_k = 0 761 | hyp_samples = [] 762 | hyp_scores = [] 763 | hyp_states = [] 764 | 765 | for idx in xrange(len(new_hyp_samples)): 766 | if new_hyp_samples[idx][-1] == 0: 767 | sample.append(new_hyp_samples[idx]) 768 | sample_score.append(new_hyp_scores[idx]) 769 | dead_k += 1 770 | else: 771 | new_live_k += 1 772 | hyp_samples.append(new_hyp_samples[idx]) 773 | hyp_scores.append(new_hyp_scores[idx]) 774 | hyp_states.append(new_hyp_states[idx]) 775 | hyp_scores = numpy.array(hyp_scores) 776 | live_k = new_live_k 777 | 778 | if new_live_k < 1: 779 | break 780 | if dead_k >= k: 781 | 
break 782 | 783 | next_w = numpy.array([w[-1] for w in hyp_samples]) 784 | next_state = numpy.array(hyp_states) 785 | 786 | if not stochastic: 787 | # dump every remaining one 788 | if live_k > 0: 789 | for idx in xrange(live_k): 790 | sample.append(hyp_samples[idx]) 791 | sample_score.append(hyp_scores[idx]) 792 | 793 | return sample, sample_score 794 | 795 | 796 | # calculate the log probablities on a given corpus using translation model 797 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True): 798 | probs = [] 799 | 800 | n_done = 0 801 | 802 | for x, y in iterator: 803 | n_done += len(x) 804 | 805 | x, x_mask, y, y_mask = prepare_data(x, y, 806 | n_words_src=options['n_words_src'], 807 | n_words=options['n_words']) 808 | 809 | pprobs = f_log_probs(x, x_mask, y, y_mask) 810 | for pp in pprobs: 811 | probs.append(pp) 812 | 813 | #if numpy.isnan(numpy.mean(probs)): 814 | # ipdb.set_trace() 815 | 816 | if verbose: 817 | print >>sys.stderr, '%d samples computed' % (n_done) 818 | 819 | return numpy.array(probs) 820 | 821 | #-----------------------------------------------------------------------------# 822 | # Batch preparation 823 | 824 | def prepare_data(seqs_x, 825 | seqs_y, 826 | maxlen=None, 827 | n_words_src=30000, 828 | n_words=30000): 829 | 830 | # x: a list of sentences 831 | lengths_x = [len(s) for s in seqs_x] 832 | lengths_y = [len(s) for s in seqs_y] 833 | 834 | if maxlen is not None: 835 | new_seqs_x = [] 836 | new_seqs_y = [] 837 | new_lengths_x = [] 838 | new_lengths_y = [] 839 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 840 | if l_x < maxlen and l_y < maxlen: 841 | new_seqs_x.append(s_x) 842 | new_lengths_x.append(l_x) 843 | new_seqs_y.append(s_y) 844 | new_lengths_y.append(l_y) 845 | lengths_x = new_lengths_x 846 | seqs_x = new_seqs_x 847 | lengths_y = new_lengths_y 848 | seqs_y = new_seqs_y 849 | 850 | if len(lengths_x) < 1 or len(lengths_y) < 1: 851 | return None, None, None, None 852 | 853 | n_samples = len(seqs_x) 854 | maxlen_x = numpy.max(lengths_x) + 1 855 | maxlen_y = numpy.max(lengths_y) + 1 856 | 857 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 858 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 859 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 860 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 861 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 862 | x[:lengths_x[idx], idx] = s_x 863 | x_mask[:lengths_x[idx]+1, idx] = 1. 864 | y[:lengths_y[idx], idx] = s_y 865 | y_mask[:lengths_y[idx]+1, idx] = 1. 
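    # Worked example (illustrative token ids): seqs_y = [[5, 6, 7], [3, 4, 5, 6, 7]]
    # gives lengths_y = [3, 5] and maxlen_y = 6 (one extra row for the 0 that marks
    # the end of sentence), hence, time-major:
    #     y = [[5, 3],          y_mask = [[1, 1],
    #          [6, 4],                    [1, 1],
    #          [7, 5],                    [1, 1],
    #          [0, 6],                    [1, 1],
    #          [0, 7],                    [0, 1],
    #          [0, 0]]                    [0, 1]]
    # i.e. the mask covers every real token plus the closing 0, and pure padding
    # is masked out.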
866 | 867 | return x, x_mask, y, y_mask 868 | 869 | 870 | #-----------------------------------------------------------------------------# 871 | # Training Function: 872 | 873 | def train(dim_word = 100, # word vector dimensionality 874 | dim = 1000, # the number of RNN units 875 | encoder = 'gru', 876 | decoder = 'gru_cond', 877 | patience = 10, # early stopping patience 878 | max_epochs = 5000, 879 | finish_after = 10000000, # finish after this many updates 880 | dispFreq = 100, 881 | decay_c = 0., # L2 regularization penalty 882 | alpha_c = 0., # alignment regularization 883 | clip_c = -1., # gradient clipping threshold 884 | lrate = 0.01, # learning rate 885 | n_words_src = 100000, # source vocabulary size 886 | n_words = 100000, # target vocabulary size 887 | maxlen = 100, # maximum length of the description 888 | optimizer = 'rmsprop', 889 | batch_size = 16, 890 | valid_batch_size = 16, 891 | saveto = 'model.npz', 892 | validFreq = 1000, 893 | saveFreq = 1000, # save the parameters after every saveFreq updates 894 | sampleFreq = 100, # generate some samples after every sampleFreq 895 | datasets =[ 896 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 897 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], 898 | 899 | valid_datasets=['../data/dev/newstest2011.en.tok', 900 | '../data/dev/newstest2011.fr.tok'], 901 | 902 | dictionaries=[ 903 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 904 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 905 | 906 | use_dropout = False, 907 | reload_ = False, 908 | overwrite = False): 909 | 910 | # Model options 911 | model_options = locals().copy() 912 | 913 | # load dictionaries and invert them 914 | worddicts = [None] * len(dictionaries) 915 | worddicts_r = [None] * len(dictionaries) 916 | for ii, dd in enumerate(dictionaries): 917 | with open(dd, 'rb') as f: 918 | worddicts[ii] = pkl.load(f) 919 | worddicts_r[ii] = dict() 920 | for kk, vv in worddicts[ii].iteritems(): 921 | worddicts_r[ii][vv] = kk 922 | 923 | # reload options 924 | if reload_ and os.path.exists(saveto): 925 | print 'Reloading model options' 926 | with open('%s.pkl' % saveto, 'rb') as f: 927 | model_options = pkl.load(f) 928 | 929 | print 'Loading data' 930 | train = TextIterator(datasets[0], datasets[1], 931 | dictionaries[0], dictionaries[1], 932 | n_words_source=n_words_src, n_words_target=n_words, 933 | batch_size=batch_size, 934 | maxlen=maxlen) 935 | valid = TextIterator(valid_datasets[0], valid_datasets[1], 936 | dictionaries[0], dictionaries[1], 937 | n_words_source=n_words_src, n_words_target=n_words, 938 | batch_size=valid_batch_size, 939 | maxlen=maxlen) 940 | 941 | print 'Building model' 942 | params = init_params(model_options) 943 | # reload parameters 944 | if reload_ and os.path.exists(saveto): 945 | print 'Reloading model parameters' 946 | params = load_params(saveto, params) 947 | 948 | tparams = init_tparams(params) 949 | 950 | trng, use_noise, \ 951 | x, x_mask, y, y_mask, \ 952 | opt_ret, \ 953 | cost, f_cost = \ 954 | build_model(tparams, model_options) 955 | inps = [x, x_mask, y, y_mask] 956 | 957 | print 'Building sampler' 958 | f_init, f_next = build_sampler(tparams, model_options, trng) 959 | 960 | # before any regularizer 961 | print 'Building f_log_probs...', 962 | f_log_probs = theano.function(inps, cost, profile=profile) 963 | print 'Done' 964 | 965 | cost = cost.mean() 966 | 967 | # apply L2 regularization on weights 968 | if decay_c > 0.: 969 | decay_c = theano.shared(numpy.float32(decay_c), 
name='decay_c') 970 | weight_decay = 0. 971 | for kk, vv in tparams.iteritems(): 972 | weight_decay += (vv ** 2).sum() 973 | weight_decay *= decay_c 974 | cost += weight_decay 975 | 976 | # regularize the alpha weights 977 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 978 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 979 | alpha_reg = alpha_c * ( 980 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 981 | opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() 982 | cost += alpha_reg 983 | 984 | # after all regularizers - compile the computational graph for cost 985 | print 'Building f_cost...', 986 | f_cost = theano.function(inps, cost, profile=profile) 987 | print 'Done' 988 | 989 | print 'Computing gradient...', 990 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 991 | print 'Done' 992 | 993 | # apply gradient clipping here 994 | if clip_c > 0.: 995 | g2 = 0. 996 | for g in grads: 997 | g2 += (g**2).sum() 998 | new_grads = [] 999 | for g in grads: 1000 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1001 | g / tensor.sqrt(g2) * clip_c, 1002 | g)) 1003 | grads = new_grads 1004 | 1005 | # compile the optimizer, the actual computational graph is compiled here 1006 | lr = tensor.scalar(name='lr') 1007 | print 'Building optimizers...', 1008 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1009 | print 'Done' 1010 | 1011 | print 'Optimization' 1012 | 1013 | best_p = None 1014 | bad_counter = 0 1015 | uidx = 0 1016 | estop = False 1017 | history_errs = [] 1018 | # reload history 1019 | if reload_ and os.path.exists(saveto): 1020 | rmodel = numpy.load(saveto) 1021 | history_errs = list(rmodel['history_errs']) 1022 | if 'uidx' in rmodel: 1023 | uidx = rmodel['uidx'] 1024 | 1025 | if validFreq == -1: 1026 | validFreq = len(train[0])/batch_size 1027 | if saveFreq == -1: 1028 | saveFreq = len(train[0])/batch_size 1029 | if sampleFreq == -1: 1030 | sampleFreq = len(train[0])/batch_size 1031 | 1032 | for eidx in xrange(max_epochs): 1033 | n_samples = 0 1034 | 1035 | for x, y in train: 1036 | n_samples += len(x) 1037 | uidx += 1 1038 | use_noise.set_value(1.) 1039 | 1040 | x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, 1041 | n_words_src=n_words_src, 1042 | n_words=n_words) 1043 | 1044 | if x is None: 1045 | print 'Minibatch with zero sample under length ', maxlen 1046 | uidx -= 1 1047 | continue 1048 | 1049 | ud_start = time.time() 1050 | 1051 | # compute cost, grads and copy grads to shared variables 1052 | cost = f_grad_shared(x, x_mask, y, y_mask) 1053 | 1054 | # do the update on parameters 1055 | f_update(lrate) 1056 | 1057 | ud = time.time() - ud_start 1058 | 1059 | # check for bad numbers, usually we remove non-finite elements 1060 | # and continue training - but not done here 1061 | if numpy.isnan(cost) or numpy.isinf(cost): 1062 | print 'NaN detected' 1063 | return 1., 1., 1. 
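            # (the update above follows the two-function optimizer protocol:
            #  f_grad_shared evaluates the cost and stores the gradients in
            #  shared variables, f_update then applies the parameter update
            #  for the given learning rate lrate)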
1064 | 1065 | # verbose 1066 | if numpy.mod(uidx, dispFreq) == 0: 1067 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1068 | 1069 | # save the best model so far, in addition, save the latest model 1070 | # into a separate file with the iteration number for external eval 1071 | if numpy.mod(uidx, saveFreq) == 0: 1072 | print 'Saving the best model...', 1073 | if best_p is not None: 1074 | params = best_p 1075 | else: 1076 | params = unzip(tparams) 1077 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1078 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1079 | params = unzip(tparams) 1080 | numpy.savez('%s.current'%(saveto), history_errs=history_errs, **params) 1081 | pkl.dump(model_options, open('%s.current.pkl' % saveto, 'wb')) 1082 | print 'Done' 1083 | 1084 | # save with uidx 1085 | if not overwrite: 1086 | print 'Saving the model at iteration {}...'.format(uidx), 1087 | saveto_uidx = '{}.iter{}.npz'.format( 1088 | os.path.splitext(saveto)[0], uidx) 1089 | numpy.savez(saveto_uidx, history_errs=history_errs, 1090 | uidx=uidx, **unzip(tparams)) 1091 | print 'Done' 1092 | 1093 | 1094 | # generate some samples with the model and display them 1095 | if numpy.mod(uidx, sampleFreq) == 0: 1096 | # FIXME: random selection? 1097 | for jj in xrange(numpy.minimum(5, x.shape[1])): 1098 | stochastic = False 1099 | sample, score = gen_sample(tparams, f_init, f_next, 1100 | x[:, jj][:, None], 1101 | model_options, trng=trng, k=1, 1102 | maxlen=30, 1103 | stochastic=stochastic, 1104 | argmax=True) 1105 | print 'Source ', jj, ': ', 1106 | ss = [] 1107 | for vv in x[:, jj]: 1108 | if vv == 0: 1109 | break 1110 | if vv in worddicts_r[0]: 1111 | ss.append(worddicts_r[0][vv]) 1112 | else: 1113 | ss.append('UNK') 1114 | print ' '.join(ss).replace('@@ ', '') 1115 | print 'Truth ', jj, ' : ', 1116 | ss = [] 1117 | for vv in y[:, jj]: 1118 | if vv == 0: 1119 | break 1120 | if vv in worddicts_r[1]: 1121 | ss.append(worddicts_r[1][vv]) 1122 | else: 1123 | ss.append('UNK') 1124 | print ' '.join(ss).replace('@@ ', '') 1125 | print 'Sample ', jj, ': ', 1126 | tt = [] 1127 | score = score / numpy.array([len(s) for s in sample]) 1128 | ss = sample[score.argmin()] 1129 | for vv in ss: 1130 | if vv == 0: 1131 | break 1132 | if vv in worddicts_r[1]: 1133 | tt.append(worddicts_r[1][vv]) 1134 | else: 1135 | tt.append('UNK') 1136 | print ' '.join(tt).replace('@@ ', '') 1137 | 1138 | # validate model on validation set and early stop if necessary 1139 | if numpy.mod(uidx, validFreq) == 0: 1140 | use_noise.set_value(0.) 1141 | valid_errs = pred_probs(f_log_probs, prepare_data, 1142 | model_options, valid) 1143 | valid_err = valid_errs.mean() 1144 | history_errs.append(valid_err) 1145 | 1146 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 1147 | best_p = unzip(tparams) 1148 | bad_counter = 0 1149 | if len(history_errs) > patience and valid_err >= \ 1150 | numpy.array(history_errs)[:-patience].min(): 1151 | bad_counter += 1 1152 | if bad_counter > patience: 1153 | print 'Early Stop!' 1154 | estop = True 1155 | break 1156 | 1157 | #if numpy.isnan(valid_err): 1158 | # ipdb.set_trace() 1159 | 1160 | print 'Valid ', valid_err 1161 | 1162 | # finish after this many updates 1163 | if uidx >= finish_after: 1164 | print 'Finishing after %d iterations!' 
% uidx 1165 | estop = True 1166 | break 1167 | 1168 | print 'Seen %d samples' % n_samples 1169 | 1170 | if estop: 1171 | break 1172 | 1173 | if best_p is not None: 1174 | zipp(best_p, tparams) 1175 | 1176 | use_noise.set_value(0.) 1177 | valid_err = pred_probs(f_log_probs, prepare_data, 1178 | model_options, valid).mean() 1179 | 1180 | print 'Valid ', valid_err 1181 | 1182 | params = copy.copy(best_p) 1183 | numpy.savez(saveto, zipped_params=best_p, 1184 | history_errs=history_errs, 1185 | uidx=uidx, 1186 | **params) 1187 | 1188 | return valid_err 1189 | 1190 | def grad_clip(dJ, clip_c=1): 1191 | clip_c = clip_c. 1192 | if clip_c > 0.: 1193 | g2 = 0. 1194 | for g in dJ: 1195 | g2 += (g ** 2).sum() 1196 | new_grads = [] 1197 | for g in dJ: 1198 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 1199 | dJ = new_grads 1200 | return dJ 1201 | 1202 | if __name__ == '__main__': 1203 | pass 1204 | -------------------------------------------------------------------------------- /noisy_translator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Machine Translation with Reinforcement Bias 3 | """ 4 | 5 | from nmt_uni import * 6 | from reward import translation_cost 7 | import time 8 | 9 | time = time.time 10 | 11 | # utility functions 12 | def _seqs2words(caps, idict): 13 | capsw = [] 14 | for cc in caps: 15 | ww = [] 16 | for w in cc: 17 | if w == 0: 18 | break 19 | ww.append(idict[w]) 20 | capsw.append(' '.join(ww)) 21 | return capsw 22 | 23 | def _bpe2words(capsw): 24 | capw = [] 25 | for cc in capsw: 26 | capw += [cc.replace('@@ ', '')] 27 | return capw 28 | 29 | def _action2delay(src, actions): 30 | delays = [] 31 | X = len(src) 32 | for act in actions: 33 | A = numpy.array(act, dtype='float32') 34 | Y = numpy.sum(act) 35 | S = numpy.sum(numpy.cumsum(1 - A) * A) 36 | 37 | assert (X > 0) and (Y > 0), 'avoid NAN {}, {}'.format(X, Y) 38 | 39 | tau = S / (Y * X) 40 | delays.append([tau, X, Y, S]) 41 | 42 | return delays 43 | 44 | 45 | # padding for computing policy gradient 46 | def _padding(arrays, shape, dtype='float32', return_mask=False, sidx=0): 47 | B = numpy.zeros(shape, dtype=dtype) 48 | 49 | if return_mask: 50 | M = numpy.zeros((shape[0], shape[1]), dtype='float32') 51 | 52 | for it, arr in enumerate(arrays): 53 | arr = numpy.asarray(arr, dtype=dtype) 54 | # print arr.shape 55 | 56 | steps = arr.shape[0] 57 | 58 | if arr.ndim < 2: 59 | B[sidx: steps + sidx, it] = arr 60 | else: 61 | steps2 = arr.shape[1] 62 | B[sidx: steps + sidx, it, : steps2] = arr 63 | 64 | if return_mask: 65 | M[sidx: steps + sidx, it] = 1. 
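            # Illustration (made-up values): _padding([[1, 0, 1], [1, 1]],
            # shape=(4, 2), return_mask=True) returns
            #     B = [[1, 1],        M = [[1, 1],
            #          [0, 1],             [1, 1],
            #          [1, 0],             [1, 0],
            #          [0, 0]]             [0, 0]]
            # and a positive sidx simply shifts every sequence down by sidx rows.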
66 | 67 | if return_mask: 68 | return B, M 69 | return B 70 | 71 | 72 | class PIPE(object): 73 | def __init__(self, keys=None): 74 | self.messages = OrderedDict() 75 | self.hyp_messages = OrderedDict() 76 | self.new_hyp_messages = OrderedDict() 77 | for key in keys: 78 | self.messages[key] = [] 79 | 80 | def reset(self): 81 | for key in self.messages: 82 | self.messages[key] = [] 83 | 84 | self.hyp_messages = OrderedDict() 85 | self.new_hyp_messages = OrderedDict() 86 | 87 | def clean_hyp(self): 88 | self.hyp_messages = OrderedDict() 89 | 90 | def clean_new_hyp(self): 91 | self.new_hyp_messages = OrderedDict() 92 | 93 | def init_hyp(self, key, live_k=None): 94 | if live_k is not None: 95 | self.hyp_messages[key] = [[] for _ in xrange(live_k)] 96 | else: 97 | self.hyp_messages[key] = [] 98 | 99 | def init_new_hyp(self, key, use_copy=False): 100 | if use_copy: 101 | self.new_hyp_messages[key] = copy.copy(self.hyp_messages[key]) 102 | else: 103 | self.new_hyp_messages[key] = [] 104 | 105 | def append(self, key, new, idx=None, use_hyp=False): 106 | if not use_hyp: 107 | self.new_hyp_messages[key].append(new) 108 | else: 109 | self.new_hyp_messages[key].append(self.hyp_messages[key][idx] + [new]) 110 | 111 | def append_new(self, key, idx, hyper=True): 112 | if hyper: 113 | self.hyp_messages[key].append(self.new_hyp_messages[key][idx]) 114 | else: 115 | # print self.messages['sample'] 116 | self.messages[key].append(self.new_hyp_messages[key][idx]) 117 | 118 | def add(self, key, new, idx): 119 | self.new_hyp_messages[key][idx] += new 120 | 121 | def asarray(self, key, replace=False): 122 | if replace: 123 | self.hyp_messages[key] = numpy.array(self.hyp_messages[key]) 124 | else: 125 | return numpy.array(self.hyp_messages[key], dtype='float32') 126 | 127 | def split(self): 128 | truth = OrderedDict() 129 | sample = OrderedDict() 130 | 131 | 132 | for key in self.messages: 133 | if key == 'source': 134 | continue 135 | 136 | truth[key] = [] 137 | sample[key] = [] 138 | 139 | if key == 'mask': 140 | for idx in xrange(len(self.messages['source'])): 141 | if self.messages['source'][idx] < 0: 142 | sample[key].append(self.messages[key][:, idx]) 143 | else: 144 | truth[key].append(self.messages[key][:, idx]) 145 | else: 146 | for idx in xrange(len(self.messages['source'])): 147 | if self.messages['source'][idx] < 0: 148 | sample[key].append(self.messages[key][idx]) 149 | else: 150 | truth[key].append(self.messages[key][idx]) 151 | 152 | self.messages = sample 153 | return truth 154 | 155 | 156 | 157 | # ============================================================================ # 158 | # Noisy Decoding in Batch-Mode 159 | # ============================================================================ # 160 | def noisy_decoding(f_sim_ctx, 161 | f_sim_init, 162 | f_sim_next, 163 | f_cost, 164 | srcs, # source sentences 165 | trgs, # taeget sentences 166 | t_idict=None, 167 | _policy=None, 168 | n_samples=10, 169 | maxlen=200, 170 | reward_config=None, 171 | train=False): 172 | """ 173 | :param f_init: initializer using the first "sidx" words. 
174 | :param f_sim_next: 175 | :param f_partial: 176 | :param src: the original input needed to be translated (just for the speed) 177 | :param step: step_size for each wait 178 | :param peek: 179 | hidden0 = _policy.init_hidden() 180 | :param sidx: pre-read sidx words from the source 181 | :return: 182 | """ 183 | Statistcs = OrderedDict() 184 | n_sentences = len(srcs) 185 | max_steps = -1 186 | 187 | # ======================================================================== # 188 | # Generating Trajectories based on Current Policy 189 | # ======================================================================== # 190 | 191 | live_k = n_samples * n_sentences 192 | live_all = live_k 193 | 194 | x, ctx0, z0, secs0 = [], [], [], [] 195 | # data initialization 196 | for id, (src, trg) in enumerate(zip(srcs, trgs)): 197 | 198 | _x = numpy.array(src, dtype='int64')[:, None] 199 | _ctx0 = f_sim_ctx(_x) 200 | _z0 = f_sim_init(_ctx0[:sidx, :]) 201 | 202 | x.append(_x[:, 0]) 203 | ctx0.append(_ctx0[:, 0, :]) 204 | z0.append(_z0.flatten()) 205 | secs0.append([id, len(src), 0]) # word id / source length / correctness 206 | 207 | # pad the results 208 | x, x_mask = _padding(x, (src_max, n_sentences), dtype='int64', return_mask=True) 209 | ctx = _padding(ctx0, (src_max, n_sentences, ctx0[0].shape[-1])) 210 | z0 = numpy.asarray(z0) 211 | mask = x_mask 212 | 213 | # initial actions and hidden states 214 | action0, _, _, hidden0 = _policy.init_action(n_samples=n_samples) 215 | 216 | x_mask = numpy.ones_like(x, dtype='float32') 217 | mask0 = x_mask 218 | 219 | # if we have multiple samples for one input sentence 220 | mask = numpy.tile(mask0, [1, n_samples]) 221 | z0 = numpy.tile(z0, [n_samples, 1]) 222 | ctx = numpy.tile(ctx, [1, n_samples, 1]) 223 | 224 | hidden0 = numpy.tile(hidden0, [live_k, 1]) 225 | action0 = numpy.tile(action0, [live_k, 1]) 226 | 227 | secs = [] 228 | for _ in xrange(live_k / n_sentences): 229 | secs += copy.deepcopy(secs0) 230 | 231 | # PIPE for message passing 232 | pipe = PIPE(['sample', 'score', 'action', 'obs', 'attentions','secs']) 233 | 234 | # Build for the temporal results: hyp-message 235 | for key in ['sample', 'obs', 'attentions', 'hidden', 'action']: 236 | pipe.init_hyp(key, live_k) 237 | 238 | # special care 239 | pipe.hyp_messages['score'] = numpy.zeros(live_k).astype('float32') 240 | pipe.hyp_messages['secs'] = secs 241 | pipe.hyp_messages['states'] = z0 242 | pipe.hyp_messages['mask'] = mask 243 | pipe.hyp_messages['ctx'] = ctx 244 | 245 | # these are inputs that needs to be updated 246 | prev_w = -1 * numpy.ones((live_k, )).astype('int64') 247 | prev_z = z0 248 | prev_hid = hidden0 249 | prev_noise = action0 250 | step = 0 251 | 252 | # ROLLOUT: Iteration until all the samples over. 253 | # Action space: 254 | # ======================================================================= 255 | while live_k > 0: 256 | 257 | step += 1 258 | 259 | # compute one step 260 | inps = [prev_w, ctx, mask, prev_z, prev_noise] 261 | next_p, _, next_z, next_o, next_a = f_sim_next(*inps) 262 | 263 | # obtain the candidate and the accumulated score. 
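# next_p holds one probability row per live hypothesis; the greedy candidate is
# the argmax of each row, and its probability is read back by indexing each row
# at the chosen word (next_p[range(live_k), _cand]) to extend the running score.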
264 | _cand = next_p.argmax(axis=-1) # live_k 265 | _score = next_p[range(live_k), _cand] 266 | 267 | # new place-holders for temporal results: new-hyp-message 268 | pipe.clean_new_hyp() 269 | 270 | for key in ['sample', 'score', 'attentions', 'secs', 'mask', 'ctx', 'states']: 271 | pipe.init_new_hyp(key, use_copy=True) 272 | 273 | for key in ['action', 'obs', 'hidden']: 274 | pipe.init_new_hyp(key, use_copy=False) 275 | 276 | 277 | # Rollout the action. 278 | _actions, _mean, _logstd, _hidden = _policy.action(next_o, prev_hid) # input the current observation 279 | 280 | 281 | # check each candidate 282 | for idx, wi in enumerate(_cand): 283 | 284 | # collect the action 285 | a = _actions[idx] # 1024-D Gaussian Vector 286 | 287 | # message appending 288 | pipe.append('obs', next_o[idx], idx=idx, use_hyp=True) 289 | pipe.append('action', a, idx=idx, use_hyp=True) # collect action. 290 | pipe.append('hidden', _hidden[idx]) 291 | 292 | # for commit: 293 | # update new_hyp_message 294 | pipe.add('sample', [wi], idx) 295 | pipe.add('score', _score[idx], idx) 296 | pipe.add('attentions', [next_a[idx]], idx) 297 | 298 | # *** special care 299 | pipe.new_hyp_messages['states'][idx] = next_z[idx] 300 | 301 | 302 | # kill the completed samples, so I need to build new hyp-messages 303 | pipe.clean_hyp() 304 | 305 | for key in ['sample', 'score', 'states', 306 | 'action', 'obs', 'attentions', 'hidden', 307 | 'ctx', 'secs', 'mask']: 308 | pipe.init_hyp(key) 309 | 310 | 311 | # print new_hyp_sample 312 | for idx in xrange(len(pipe.new_hyp_messages['sample'])): 313 | # check if reachs the end 314 | 315 | if (len(pipe.new_hyp_messages['sample'][idx]) >= maxlen) or \ 316 | (pipe.new_hyp_messages['sample'][idx][-1] == 0): 317 | 318 | for key in ['sample', 'score', 'action', 'obs', 'attentions']: 319 | pipe.append_new(key, idx, hyper=False) 320 | 321 | live_k -= 1 322 | 323 | else: 324 | 325 | for key in ['sample', 'score', 'states', 'action', 326 | 'obs', 'attentions', 'hidden']: 327 | pipe.append_new(key, idx, hyper=True) 328 | 329 | # *** special care *** 330 | pipe.hyp_messages['secs'].append(pipe.new_hyp_messages['secs'][idx]) 331 | pipe.hyp_messages['mask'].append(pipe.new_hyp_messages['mask'][:, idx]) 332 | pipe.hyp_messages['ctx'].append(pipe.new_hyp_messages['ctx'][:, idx]) 333 | 334 | 335 | 336 | # make it numpy array 337 | for key in ['score', 'mask', 'ctx', 'states', 'hidden']: 338 | pipe.asarray(key, True) 339 | 340 | pipe.hyp_messages['mask'] = pipe.hyp_messages['mask'].T 341 | if pipe.hyp_messages['ctx'].ndim == 3: 342 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'].transpose(1, 0, 2) 343 | elif pipe.hyp_messages['ctx'].ndim == 2: 344 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'][:, None, :] 345 | 346 | prev_z = pipe.hyp_messages['states'] 347 | prev_hid = pipe.hyp_messages['hidden'] 348 | mask = pipe.hyp_messages['mask'] 349 | ctx = pipe.hyp_messages['ctx'] 350 | 351 | prev_w = numpy.array([w[-1] if len(w) > 0 352 | else -1 for w in pipe.hyp_messages['sample']], 353 | dtype='int64') 354 | 355 | mask = numpy.tile(mask0, [1, live_k]) 356 | 357 | prev_noise = numpy.array([a[-1] for a in pipe.hyp_messages['action']], dtype='float32') 358 | # prev_noise = numpy.concatenate(pipe.hyp_messages['action'], axis=0) 359 | 360 | 361 | # ======================================================================= 362 | # Collecting Rewards. 
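# each finished hypothesis is converted back to words and scored against the
# reference via translation_cost (reward.py), which returns a per-step reward
# vector that is zero everywhere except the last step, where the smoothed
# sentence-BLEU is placed.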
363 | # ======================================================================= 364 | # print 'collect reward' 365 | R = [] 366 | track = [] 367 | reference = [_bpe2words(_seqs2words([trg], t_idict))[0].split()] 368 | for k in xrange(n_samples): 369 | sp, sc, act = [pipe.messages[key][k] for key in ['sample', 'score', 'action']] 370 | y = numpy.asarray(sp, dtype='int64')[:, None] 371 | y_mask = numpy.ones_like(y, dtype='float32') 372 | steps = len(act) 373 | 374 | # turn back to sentence level 375 | words = _seqs2words([sp], t_idict)[0] 376 | decoded = _bpe2words([words])[0].split() 377 | 378 | # -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*- 379 | # reward configs 380 | keys = {"steps": steps, "y": y, 381 | "y_mask": y_mask, 382 | "x_mask": x_mask, 383 | "f_cost": f_cost, 384 | "sample": decoded, 385 | "reference": reference, 386 | "words": words} 387 | 388 | ret = translation_cost(**keys) 389 | Rk, bleu = ret 390 | 391 | R += [Rk] 392 | track += [bleu] 393 | 394 | pipe.messages['R'] = R 395 | pipe.messages['track'] = track 396 | 397 | # --------------------------------------------------- # 398 | # add to global lists. 399 | keywords = ['sample', 'action', 'obs', 'secs', 400 | 'attentions', 'score', 'track', 'R'] 401 | for k in keywords: 402 | if k not in Statistcs: 403 | Statistcs[k] = pipe.messages[k] 404 | else: 405 | Statistcs[k] += pipe.messages[k] 406 | 407 | 408 | # If not train, End here 409 | if not train: 410 | return Statistcs 411 | 412 | # ================================================================================================= # 413 | # Policy Gradient over Trajectories 414 | # ================================================================================================= # 415 | 416 | p_obs, p_mask \ 417 | = _padding(Observations, 418 | shape=(max_steps, n_samples * n_sentences, _policy.n_in), 419 | return_mask=True) 420 | p_r = _padding(Rewards, 421 | shape=(max_steps, n_samples * n_sentences)) 422 | p_act = _padding(Actions, 423 | shape=(max_steps, n_samples * n_sentences, _policy.n_out)) 424 | 425 | 426 | # print 'learning policy gradient' 427 | # learning 428 | info = _policy.get_learner()([p_obs, p_mask], p_act, p_r) 429 | 430 | # add the reward statistics 431 | q = Tracks 432 | info['Q'] = numpy.mean(q) 433 | info['A'] = numpy.mean(p_act) 434 | 435 | return Samples, Scores, Actions, Rewards, info 436 | 437 | 438 | -------------------------------------------------------------------------------- /noisytrans_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Machine Translation with Greedy Decoding 3 | """ 4 | import argparse 5 | import os 6 | import cPickle as pkl 7 | 8 | from nmt_uni import * 9 | from policy import Controller as Policy 10 | from utils import Progbar, Monitor 11 | from noisy_translator import noisy_decoding 12 | from simultrans_infinite2 import _seqs2words, _bpe2words, _action2delay 13 | 14 | import time 15 | 16 | 17 | numpy.random.seed(19920206) 18 | timer = time.time 19 | 20 | # check hidden folders 21 | def check_env(): 22 | paths = ['.policy', '.pretrained', '.log', 23 | '.config', '.images', '.translate'] 24 | for p in paths: 25 | if not os.path.exists(p): 26 | os.mkdir(p) 27 | 28 | 29 | # run training function:: >>> 30 | def run_simultrans(model, 31 | options_file=None, 32 | config=None, 33 | policy=None, 34 | id=None, 35 | remote=False): 36 | # check envoriments 37 | check_env() 38 | if id is not None: 39 | fcon = '.config/{}.conf'.format(id) 40 | if 
os.path.exists(fcon): 41 | print 'load config files' 42 | policy, config = pkl.load(open(fcon, 'r')) 43 | 44 | # ======================================================================= # 45 | # load model model_options 46 | # ======================================================================= # 47 | _model = model 48 | model = '.pretrained/{}'.format(model) 49 | 50 | if options_file is not None: 51 | with open(options_file, 'rb') as f: 52 | options = pkl.load(f) 53 | else: 54 | with open('%s.pkl' % model, 'rb') as f: 55 | options = pkl.load(f) 56 | options['birnn'] = True 57 | 58 | print 'load options...' 59 | for w, p in sorted(options.items(), key=lambda x:x[0]): 60 | print '{}: {}'.format(w, p) 61 | 62 | # load detail settings from option file: 63 | dictionary, dictionary_target = options['dictionaries'] 64 | 65 | def _iter(fname): 66 | with open(fname, 'r') as f: 67 | for line in f: 68 | words = line.strip().split() 69 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 70 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 71 | x += [0] 72 | yield x 73 | 74 | def _check_length(fname): 75 | f = open(fname, 'r') 76 | count = 0 77 | for _ in f: 78 | count += 1 79 | f.close() 80 | 81 | return count 82 | 83 | # load source dictionary and invert 84 | with open(dictionary, 'rb') as f: 85 | word_dict = pkl.load(f) 86 | word_idict = dict() 87 | for kk, vv in word_dict.iteritems(): 88 | word_idict[vv] = kk 89 | word_idict[0] = '' 90 | word_idict[1] = 'UNK' 91 | 92 | # load target dictionary and invert 93 | with open(dictionary_target, 'rb') as f: 94 | word_dict_trg = pkl.load(f) 95 | word_idict_trg = dict() 96 | for kk, vv in word_dict_trg.iteritems(): 97 | word_idict_trg[vv] = kk 98 | word_idict_trg[0] = '' 99 | word_idict_trg[1] = 'UNK' 100 | 101 | # ======================================================================== # 102 | # Build a Translator 103 | # ======================================================================== # 104 | 105 | # allocate model parameters 106 | params = init_params(options) 107 | params = load_params(model, params) 108 | tparams = init_tparams(params) 109 | 110 | # print 'build the model for computing cost (full source sentence).' 111 | trng, use_noise, \ 112 | _x, _x_mask, _y, _y_mask, \ 113 | opt_ret, \ 114 | cost, f_cost = build_model(tparams, options) 115 | print 'done.' 116 | 117 | # functions for sampler 118 | # f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng) 119 | f_sim_ctx, f_sim_init, f_sim_next = build_noisy_sampler(tparams, options, trng) 120 | print 'build sampler done.' 
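# roughly, the three sampler functions split decoding into stages (see noisy_decoding):
#   f_sim_ctx(x)                        -> encoder annotations for the source sentence
#   f_sim_init(ctx)                     -> initial decoder state from (a prefix of) the annotations
#   f_sim_next(w, ctx, mask, z, noise)  -> next-word distribution, new state, readout and attention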
121 | 122 | # check the ID: 123 | policy['base'] = _model 124 | _policy = Policy(trng, options, policy, config, 125 | n_out=options['dim'], 126 | recurrent=True, id=id) 127 | 128 | 129 | # DATASET 130 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1], 131 | options['dictionaries'][0], options['dictionaries'][1], 132 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 133 | batch_size=config['batchsize'], 134 | maxlen=options['maxlen']) 135 | 136 | train_num = trainIter.num 137 | 138 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1], 139 | options['dictionaries'][0], options['dictionaries'][1], 140 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 141 | batch_size=1, 142 | maxlen=options['maxlen']) 143 | 144 | valid_num = validIter.num 145 | 146 | valid_ = options['valid_datasets'][0] 147 | valid_num = _check_length(valid_) 148 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num) 149 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65)) 150 | 151 | # Translator model 152 | def _translate(src, trg, train=False, samples=80): 153 | ret = noisy_decoding( 154 | f_sim_ctx, f_sim_init, 155 | f_sim_next, f_cost, 156 | src, trg, word_idict_trg, n_samples=samples, 157 | train=train, 158 | _policy=_policy) 159 | 160 | if not train: 161 | sample, score, actions, R, tracks, attentions = ret 162 | return sample, score, actions, R, tracks 163 | else: 164 | sample, score, actions, R, info = ret 165 | return sample, score, actions, R, info 166 | 167 | 168 | # ======================================================================== # 169 | # Main Loop: Run 170 | # ======================================================================== # 171 | print 'Start Simultaneous Translator...' 172 | probar = Progbar(train_num / config['batchsize'], with_history=False) 173 | 174 | # freqs 175 | save_freq = 2000 176 | sample_freq = 10 177 | valid_freq = 1000 178 | valid_size = 200 179 | display_freq = 50 180 | 181 | history, last_it = _policy.load() 182 | time0 = timer() 183 | 184 | for it, (srcs, trgs) in enumerate(trainIter): # only one sentence each iteration 185 | if it < last_it: # go over the scanned lines. 186 | continue 187 | 188 | samples, scores, actions, rewards, info = _translate(srcs, trgs, train=True) 189 | if it % sample_freq == 0: 190 | 191 | print '\nModel has been trained for {} seconds'.format(timer() - time0) 192 | print 'source: ', _bpe2words(_seqs2words([srcs[0]], word_idict))[0] 193 | print 'target: ', _bpe2words(_seqs2words([trgs[0]], word_idict_trg))[0] 194 | 195 | # obtain the translation results 196 | samples = _bpe2words(_seqs2words(samples, word_idict_trg)) 197 | 198 | print '---' 199 | print 'sample: ', samples[40] 200 | print 'sample: ', samples[60] 201 | 202 | values = [(w, info[w]) for w in info] 203 | probar.update(it + 1, values=values) 204 | 205 | # NaN detector 206 | for w in info: 207 | if numpy.isnan(info[w]) or numpy.isinf(info[w]): 208 | raise RuntimeError, 'NaN/INF is detected!! 
{} : ID={}'.format(w, id) 209 | 210 | 211 | 212 | if __name__ == "__main__": 213 | parser = argparse.ArgumentParser() 214 | parser.add_argument('-m', '--model', 215 | default='model_wmt15_bpe2k_basic_cs-en.npz') 216 | parser.add_argument('--id', type=str, default=None) 217 | parser.add_argument('-o', type=str, default=None) 218 | 219 | args = parser.parse_args() 220 | print args 221 | 222 | policy = OrderedDict() 223 | policy['layernorm'] = True 224 | policy['upper'] = False 225 | policy['updater'] = 'REINFORCE' 226 | policy['type'] = 'gaussian' 227 | 228 | config = OrderedDict() 229 | config['batchsize'] = 1 230 | config['Rtype'] = 8 231 | 232 | run_simultrans(args.model, 233 | options_file=args.o, 234 | config=config, 235 | policy=policy, 236 | id=args.id, 237 | remote=False) 238 | 239 | 240 | -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as tensor 3 | import numpy 4 | 5 | from layers import * 6 | profile = False 7 | 8 | # optimizers 9 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 10 | 11 | """ 12 | First order optimizer 13 | """ 14 | def adam(lr, tparams, grads, inp, cost): 15 | gshared = [theano.shared(p.get_value() * 0., 16 | name='%s_grad' % k) 17 | for k, p in tparams.iteritems()] 18 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 19 | 20 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile, on_unused_input='ignore') 21 | 22 | lr0 = lr # 0.0002 23 | b1 = 0.1 24 | b2 = 0.001 25 | e = 1e-8 26 | 27 | updates = [] 28 | 29 | i = theano.shared(numpy.float32(0.)) 30 | i_t = i + 1. 31 | fix1 = 1. - b1**(i_t) 32 | fix2 = 1. - b2**(i_t) 33 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 34 | 35 | for p, g in zip(tparams.values(), gshared): 36 | m = theano.shared(p.get_value() * 0.) 37 | v = theano.shared(p.get_value() * 0.) 38 | m_t = (b1 * g) + ((1. - b1) * m) 39 | v_t = (b2 * tensor.sqr(g)) + ((1. 
- b2) * v) 40 | g_t = m_t / (tensor.sqrt(v_t) + e) 41 | p_t = p - (lr_t * g_t) 42 | updates.append((m, m_t)) 43 | updates.append((v, v_t)) 44 | updates.append((p, p_t)) 45 | updates.append((i, i_t)) 46 | 47 | print 'build optimizer with Adam' 48 | f_update = theano.function([lr], [], updates=updates, 49 | on_unused_input='ignore', profile=profile) 50 | 51 | return f_grad_shared, f_update 52 | 53 | 54 | def adadelta(lr, tparams, grads, inp, cost): 55 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 56 | name='%s_grad' % k) 57 | for k, p in tparams.iteritems()] 58 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 59 | name='%s_rup2' % k) 60 | for k, p in tparams.iteritems()] 61 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 62 | name='%s_rgrad2' % k) 63 | for k, p in tparams.iteritems()] 64 | 65 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 66 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 67 | for rg2, g in zip(running_grads2, grads)] 68 | 69 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 70 | profile=profile) 71 | 72 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 73 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 74 | running_grads2)] 75 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 76 | for ru2, ud in zip(running_up2, updir)] 77 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 78 | 79 | f_update = theano.function([lr], [], updates=ru2up+param_up, 80 | on_unused_input='ignore', profile=profile) 81 | 82 | print 'build optimizer with Adadelta' 83 | return f_grad_shared, f_update 84 | 85 | 86 | def rmsprop(lr, tparams, grads, inp, cost): 87 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 88 | name='%s_grad' % k) 89 | for k, p in tparams.iteritems()] 90 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 91 | name='%s_rgrad' % k) 92 | for k, p in tparams.iteritems()] 93 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 94 | name='%s_rgrad2' % k) 95 | for k, p in tparams.iteritems()] 96 | 97 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 98 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 99 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 100 | for rg2, g in zip(running_grads2, grads)] 101 | 102 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 103 | on_unused_input='ignore', profile=profile) 104 | 105 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 106 | name='%s_updir' % k) 107 | for k, p in tparams.iteritems()] 108 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 109 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 110 | running_grads2)] 111 | param_up = [(p, p + udn[1]) 112 | for p, udn in zip(itemlist(tparams), updir_new)] 113 | f_update = theano.function([lr], [], updates=updir_new+param_up, 114 | on_unused_input='ignore', profile=profile) 115 | 116 | print 'build optimizer with Rmsprop' 117 | return f_grad_shared, f_update 118 | 119 | 120 | def sgd(lr, tparams, grads, x, mask, y, cost): 121 | gshared = [theano.shared(p.get_value() * 0., 122 | name='%s_grad' % k) 123 | for k, p in tparams.iteritems()] 124 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 125 | 126 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 127 | profile=profile) 128 | 129 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 130 | f_update = theano.function([lr], [], updates=pup, 
profile=profile) 131 | 132 | print 'build optimizer with SGD' 133 | return f_grad_shared, f_update 134 | 135 | 136 | """ 137 | Beyond first-order optimizer 138 | """ 139 | def conjugate(lr, tparams, grads, inps, cost): 140 | """ 141 | Performs constrained optimization via line search. 142 | The search direction is computed using a conjugate gradient algorithm, 143 | which gives x = A^{-1}g, where A is a second order approximation of the constraint and g is the gradient 144 | of the loss function. 145 | """ 146 | pass 147 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | -- Policy Network for decision making [more general] 3 | """ 4 | from nmt_uni import * 5 | from layers import _p 6 | 7 | import os 8 | import time, datetime 9 | import cPickle as pkl 10 | 11 | # hyper params 12 | TINY = 1e-7 13 | PI = numpy.pi 14 | E = numpy.e 15 | A = 0.2 16 | B = 1 17 | 18 | class Controller(object): 19 | 20 | def __init__(self, trng, 21 | options, 22 | policy, 23 | config, 24 | n_in=None, n_out=None, 25 | recurrent=False, id=None): 26 | 27 | self.trng = trng 28 | self.options = options 29 | self.policy = policy 30 | self.recurrent = recurrent 31 | self.type = self.policy.get('type', 'categorical') 32 | 33 | self.n_hidden = 512 34 | self.n_in = n_in 35 | self.n_out = n_out 36 | 37 | if self.policy.get('layernorm', True): 38 | self.rec = 'lngru' 39 | else: 40 | self.rec = 'gru' 41 | 42 | if not n_in: 43 | self.n_in = options['readout_dim'] 44 | 45 | if not n_out: 46 | if self.type == 'categorical': 47 | self.n_out = 2 # initially it is a WAIT/COMMIT action. 48 | elif self.type == 'gaussian': 49 | self.n_out = 100 50 | else: 51 | raise NotImplementedError 52 | 53 | # build the policy network 54 | print 'parameter initialization' 55 | 56 | params = OrderedDict() 57 | 58 | if not self.recurrent: 59 | print 'building a feedforward controller' 60 | params = get_layer('ff')[0](options, params, prefix='policy_net_in', 61 | nin=self.n_in, nout=self.n_hidden, scale=0.001) 62 | else: 63 | print 'building a recurrent controller' 64 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in', 65 | nin=self.n_in, dim=self.n_hidden, scale=0.001) 66 | 67 | params = get_layer('ff')[0](options, params, prefix='policy_net_out', 68 | nin=self.n_hidden, 69 | nout=self.n_out if self.type == 'categorical' else self.n_out * 2, 70 | scale=0.001) 71 | 72 | # bias the forget probability 73 | # if self.n_out == 3: 74 | # params[_p('policy_net_out', 'b')][-1] = -2 75 | 76 | 77 | # for the baseline network. 
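# (a small MLP that predicts the expected return from each observation; it is only
#  used to form the advantage reward - baseline in build_advantages below, which
#  reduces the variance of the policy gradient.)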
78 | params_b = OrderedDict() 79 | 80 | # using a scalar baseline [**] 81 | # params_b['b0'] = numpy.array(numpy.random.rand() * 0.0, dtype='float32') 82 | 83 | # using a MLP as a baseline 84 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_in', 85 | nin=self.n_in, nout=128, scale=0.001) 86 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_out', 87 | nin=128, nout=1, scale=0.001) 88 | 89 | if id is not None: 90 | print 'reload the saved model: {}'.format(id) 91 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params) 92 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b) 93 | else: 94 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S') 95 | print 'start from a new model: {}'.format(id) 96 | 97 | with open('.config/conf.{}.txt'.format(id), 'w') as f: 98 | f.write('[config]\n') 99 | 100 | for c in config: 101 | f.write('{}: {}\n'.format(c, config[c])) 102 | f.write('\n') 103 | 104 | f.write('[policy]\n') 105 | 106 | for c in policy: 107 | f.write('{}: {}\n'.format(c, policy[c])) 108 | 109 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w')) 110 | print 'save the config file' 111 | 112 | self.id = id 113 | self.model = '.policy/{}-{}'.format(id, self.policy['base']) 114 | 115 | # theano shared params 116 | tparams = init_tparams(params) 117 | tparams_b = init_tparams(params_b) 118 | 119 | if ('bn' in policy) and policy['bn']: 120 | # params for input-batch normalization 121 | self.gamma = theano.shared(numpy.asarray(numpy.random.uniform( 122 | low=-1.0 / numpy.sqrt(self.n_in), 123 | high=1.0 / numpy.sqrt(self.n_in), 124 | size=(self.n_in)), dtype=theano.config.floatX), name='policy_gamma', borrow=True) 125 | self.beta = theano.shared(numpy.zeros( 126 | (self.n_in), dtype=theano.config.floatX), name='policy_beta', borrow=True) 127 | 128 | self.mean = theano.shared(numpy.zeros((self.n_in), dtype=theano.config.floatX), name='mean', borrow=True) 129 | self.var = theano.shared(numpy.ones((self.n_in), dtype=theano.config.floatX), name='var', borrow=True) 130 | tparams['gamma'] = self.gamma 131 | tparams['beta'] = self.beta 132 | 133 | self.tparams = tparams 134 | self.tparams_b = tparams_b 135 | 136 | # build the policy network 137 | self.build_sampler(options=options) 138 | self.build_discriminator(options=options) 139 | 140 | 141 | def build_batchnorm(self, observation, mask=None): 142 | raise NotImplementedError 143 | 144 | 145 | def build_sampler(self, options): 146 | 147 | # ==================================================================================== # 148 | # Build Action function: samplers 149 | # ==================================================================================== # 150 | 151 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1) 152 | prev_hidden = tensor.matrix('p_hidden', dtype='float32') 153 | 154 | if not self.recurrent: 155 | hiddens = get_layer('ff')[1](self.tparams, observation, 156 | options, prefix='policy_net_in', 157 | activ='tanh') 158 | else: 159 | hiddens = get_layer(self.rec)[1](self.tparams, observation, 160 | options, prefix='policy_net_in', mask=None, 161 | one_step=True, _init_state=prev_hidden)[0] 162 | 163 | act_inps = [observation, prev_hidden] 164 | if self.type == 'categorical': 165 | act_prob = get_layer('ff')[1](self.tparams, hiddens, options, 166 | prefix='policy_net_out', 167 | activ='softmax' 168 | ) # batch_size x n_out 169 
| 170 | # add action mask 171 | if self.policy.get('act_mask', False): 172 | act_mask = tensor.matrix('act_mask', dtype='float32') 173 | act_inps += [act_mask] 174 | act_prob *= act_mask 175 | act_prob /= (act_prob.sum(axis=-1, keepdims=True) + TINY) 176 | act_prob *= act_mask 177 | 178 | act_prob2 = tensor.clip(act_prob, TINY, 1 - TINY) 179 | 180 | # testing upper bound 181 | # if self.policy['upper']: 182 | # act_prob *= 0.0 183 | 184 | # compiling the sampling function for action 185 | # action = self.trng.binomial(size=act_prop.shape, p=act_prop) 186 | action = self.trng.multinomial(pvals=act_prob).argmax(1) # 0, 1, ... 187 | 188 | print 'build action sampling function [Discrete]' 189 | self.f_action = theano.function(act_inps, [action, act_prob, hiddens, act_prob2], 190 | on_unused_input='ignore') # action/dist/hiddens 191 | 192 | elif self.type == 'gaussian': 193 | _temp = get_layer('ff')[1](self.tparams, hiddens, options, 194 | prefix='policy_net_out', 195 | activ='linear' 196 | ) # batch_size x n_out 197 | mean, log_std = _temp[:, :self.n_out], _temp[:, self.n_out:] 198 | mean, log_std = -A * tanh(mean), -B-relu(log_std) 199 | 200 | action0 = self.trng.normal(size=mean.shape, dtype='float32') 201 | action = action0 * tensor.exp(log_std) + mean 202 | 203 | 204 | print 'build action sampling function [Gaussian]' 205 | self.f_action = theano.function(act_inps, [action, mean, log_std, hiddens], 206 | on_unused_input='ignore') # action/dist/hiddens 207 | else: 208 | raise NotImplementedError 209 | 210 | 211 | def build_discriminator(self, options): 212 | # ==================================================================================== # 213 | # Build Action Discriminator 214 | # ==================================================================================== # 215 | 216 | observations = tensor.tensor3('observations', dtype='float32') 217 | mask = tensor.matrix('mask', dtype='float32') 218 | if self.type == 'categorical': 219 | actions = tensor.matrix('actions', dtype='int64') 220 | elif self.type == 'gaussian': 221 | actions = tensor.tensor3('actions', dtype='float32') 222 | else: 223 | raise NotImplementedError 224 | 225 | 226 | if not self.recurrent: 227 | hiddens = get_layer('ff')[1](self.tparams, observations, 228 | options, prefix='policy_net_in', 229 | activ='tanh') 230 | else: 231 | hiddens = get_layer(self.rec)[1](self.tparams, observations, 232 | options, prefix='policy_net_in', mask=mask)[0] 233 | 234 | act_inputs = [observations, mask] 235 | if self.type == 'categorical': 236 | act_probs = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out', 237 | activ='softmax') # seq_steps x batch_size x n_out 238 | 239 | if 'act_mask' in self.policy and self.policy['act_mask']: 240 | act_masks = tensor.tensor3('act_masks', dtype='float32') 241 | act_inputs += [act_masks] 242 | act_probs *= act_masks 243 | act_probs /= (act_probs.sum(axis=-1, keepdims=True) + TINY) 244 | act_probs *= act_masks 245 | 246 | act_probs = tensor.clip(act_probs, TINY, 1 - TINY) 247 | 248 | print 'build action distribiution' 249 | self.f_probs = theano.function(act_inputs, act_probs, 250 | on_unused_input='ignore') # get the action probabilities 251 | elif self.type == 'gaussian': 252 | _temps = get_layer('ff')[1](self.tparams, hiddens, options, 253 | prefix='policy_net_out', 254 | activ='linear' 255 | ) # batch_size x n_out 256 | means, log_stds = _temps[:, :, :self.n_out], _temps[:, :, self.n_out:] 257 | means, log_stds = -A * tanh(means), -B-relu(log_stds) 258 | 259 | act_probs 
= [means, log_stds] 260 | 261 | print 'build Gaussian PDF' 262 | self.f_pdf = theano.function(act_inputs, [means, log_stds], 263 | on_unused_input='ignore') # get the action probabilities 264 | else: 265 | raise NotImplementedError 266 | 267 | 268 | # ==================================================================================== # 269 | # Build Baseline Network (Input-dependent Value Function) & Advantages 270 | # ==================================================================================== # 271 | 272 | print 'setup the advantages & baseline network' 273 | reward = tensor.matrix('reward') # seq_steps x batch_size :: rewards for each steps 274 | 275 | # baseline is estimated with a 2-layer neural network. 276 | hiddens_b = get_layer('ff')[1](self.tparams_b, observations, options, 277 | prefix='baseline_net_in', 278 | activ='tanh') 279 | baseline = get_layer('ff')[1](self.tparams_b, hiddens_b, options, 280 | prefix='baseline_net_out', 281 | activ='linear')[:, :, 0] # seq_steps x batch_size or batch_size 282 | advantages = self.build_advantages(act_inputs, reward, baseline, normalize=True) 283 | 284 | 285 | # ==================================================================================== # 286 | # Build Policy Gradient (here we provide two options) 287 | # ==================================================================================== # 288 | if self.policy['updater'] == 'REINFORCE': 289 | print 'build RENIFROCE.' 290 | self.build_reinforce(act_inputs, act_probs, actions, advantages) 291 | 292 | elif self.policy['updater'] == 'TRPO': 293 | print 'build TRPO' 294 | self.build_trpo(act_inputs, act_probs, actions, advantages) 295 | else: 296 | raise NotImplementedError 297 | 298 | # ==================================================================================== # 299 | # Controller Actions 300 | # ==================================================================================== # 301 | def random(self, states, p=0.5): 302 | live_k = states.shape[0] 303 | return (numpy.random.random(live_k) > p).astype('int64'), \ 304 | numpy.ones(live_k) * p 305 | 306 | def action(self, states, prevhidden, act_mask=None): 307 | if act_mask is None: 308 | return self.f_action(states, prevhidden) 309 | else: 310 | return self.f_action(states, prevhidden, act_mask) 311 | 312 | 313 | def init_hidden(self, n_samples=1): 314 | return numpy.zeros((n_samples, self.n_hidden), dtype='float32') 315 | 316 | def init_action(self, n_samples=1): 317 | states0 = numpy.zeros((n_samples, self.n_in), dtype='float32') 318 | return self.f_action(states0, self.init_hidden(n_samples)) 319 | 320 | 321 | def get_learner(self): 322 | if self.policy['updater'] == 'REINFORCE': 323 | return self.run_reinforce 324 | elif self.policy['updater'] == 'TRPO': 325 | return self.run_trpo 326 | else: 327 | raise NotImplementedError 328 | 329 | @staticmethod 330 | def kl(prob0, prob1): 331 | p1 = (prob0 + TINY) / (prob1 + TINY) 332 | # p2 = (1 - prob0 + TINY) / (1 - prob1 + TINY) 333 | return tensor.sum(prob0 * tensor.log(p1), axis=-1) 334 | 335 | 336 | @staticmethod 337 | def _grab_prob(probs, X): 338 | assert probs.ndim == 3 339 | 340 | batch_size = probs.shape[1] 341 | max_len = probs.shape[0] 342 | vocab_size = probs.shape[2] 343 | 344 | probs = probs.reshape((batch_size * max_len, vocab_size)) 345 | return probs[tensor.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing 346 | 347 | def cross(self, probs, actions): 348 | # return tensor.log(probs) * actions + tensor.log(1 - probs) * (1 
- actions) 349 | return self._grab_prob(tensor.log(probs), actions) 350 | 351 | def build_advantages(self, act_inputs, reward, baseline, normalize=True): 352 | # TODO: maybe we need a discount factor gamma for advantages. 353 | # TODO: we can also rewrite advantages with value functions (GAE) 354 | 355 | # Advantages and Normalization the return 356 | reward_adv = reward - baseline 357 | mask = act_inputs[1] 358 | 359 | if normalize: 360 | reward_mean = tensor.sum(mask * reward_adv) / (tensor.sum(mask) + TINY) 361 | reward_mean2 = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY) 362 | reward_std = tensor.sqrt(tensor.maximum(reward_mean2 - reward_mean ** 2, TINY)) 363 | # reward_std = tensor.maximum(reward_std, 1) 364 | reward_c = reward_adv - reward_mean # independent mean 365 | advantages = reward_c / (reward_std + TINY) 366 | else: 367 | advantages = reward_adv 368 | 369 | print 'build advantages and baseline gradient' 370 | L = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY) 371 | dL = tensor.grad(L, wrt=itemlist(self.tparams_b)) 372 | lr = tensor.scalar(name='lr') 373 | 374 | inps_b = act_inputs + [reward] 375 | oups_b = [L, advantages] 376 | f_adv, f_update_b = adam(lr, self.tparams_b, dL, inps_b, oups_b) 377 | # f_adv, f_update_b = rmsprop(lr, self.tparams_b, dL, inps_b, oups_b) 378 | 379 | self.f_adv = f_adv 380 | self.f_update_b = f_update_b 381 | 382 | return advantages 383 | 384 | 385 | # =================================================================== 386 | # Policy Grident: REINFORCE with Adam 387 | # =================================================================== 388 | def build_reinforce(self, act_inputs, act_probs, actions, advantages): 389 | 390 | mask = act_inputs[1] 391 | 392 | if self.type == 'categorical': 393 | if self.policy.get('act_mask', False): 394 | act_masks = act_inputs[2] 395 | negEntropy = tensor.sum(tensor.log(act_probs) * (act_probs * act_masks), axis=-1) 396 | else: 397 | negEntropy = tensor.sum(tensor.log(act_probs) * act_probs, axis=-1) 398 | 399 | logLikelihood = self.cross(act_probs, actions) 400 | 401 | elif self.type == 'gaussian': 402 | means, log_stds = act_probs 403 | negEntropy = -tensor.sum(log_stds + tensor.log(tensor.sqrt(2 * PI * E)), axis=-1) 404 | 405 | actions0 = (actions - means) / tensor.exp(log_stds) 406 | logLikelihood = -tensor.sum(log_stds, axis=-1) - \ 407 | 0.5 * tensor.sum(tensor.sqr(actions0), axis=-1) - \ 408 | 0.5 * means.shape[-1] * tensor.log(2 * PI) 409 | 410 | else: 411 | raise NotImplementedError 412 | 413 | # tensor.log(act_probs) * actions + tensor.log(1 - act_probs) * (1 - actions) 414 | 415 | H = tensor.sum(mask * negEntropy, axis=0).mean() * 0.01 # entropy penalty 416 | J = tensor.sum(mask * -logLikelihood * advantages, axis=0).mean() + H 417 | dJ = tensor.grad(J, wrt=itemlist(self.tparams)) 418 | 419 | # clip the policy gradient to 1 (to avoid gradient exploding) 420 | clip_c = 1. 421 | if clip_c > 0.: 422 | g2 = 0. 
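# global-norm clipping: g2 accumulates the squared L2 norm over all gradient
# tensors; if sqrt(g2) exceeds clip_c, every gradient is rescaled by
# clip_c / sqrt(g2), preserving the update direction while bounding its norm.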
423 | for g in dJ: 424 | g2 += (g ** 2).sum() 425 | new_grads = [] 426 | for g in dJ: 427 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 428 | dJ = new_grads 429 | 430 | print 'build REINFORCE optimizer' 431 | lr = tensor.scalar(name='lr') 432 | 433 | inps = act_inputs + [actions, advantages] 434 | outps = [J, H] 435 | if self.type == 'gaussian': 436 | outps += [actions0.mean(), actions.mean()] 437 | 438 | f_cost, f_update = adam(lr, self.tparams, dJ, inps, outps) 439 | # f_cost, f_update = rmsprop(lr, self.tparams, dJ, inps, outps) 440 | 441 | self.f_cost = f_cost 442 | self.f_update = f_update 443 | print 'done' 444 | 445 | 446 | def run_reinforce(self, act_inputs, actions, reward, update=True, lr=0.0001): 447 | 448 | # sub baseline 449 | inps_adv = act_inputs + [reward] 450 | L, advantages = self.f_adv(*inps_adv) 451 | 452 | inps_reinfoce = act_inputs + [actions, advantages] 453 | if self.type == 'gaussian': 454 | J, H, m, s = self.f_cost(*inps_reinfoce) 455 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean(), 'm': m, 's': s} 456 | else: 457 | J, H = self.f_cost(*inps_reinfoce) 458 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean()} 459 | 460 | 461 | if update: # update the parameters 462 | self.f_update_b(lr) 463 | self.f_update(lr) 464 | 465 | return info 466 | 467 | 468 | # ==================================================================================== # 469 | # Trust Region Policy Optimization 470 | # ==================================================================================== # 471 | def build_trpo(self, act_inputs, act_probs, actions, advantages): 472 | 473 | assert self.type == 'categorical', 'in this stage not support TRPO' 474 | 475 | # probability distribution 476 | mask = act_inputs[1] 477 | probs = act_probs 478 | probs_old = tensor.matrix(dtype='float32') 479 | 480 | logp = self.cross(probs, actions) 481 | logp_old = self.cross(probs_old, actions) 482 | 483 | # policy gradient 484 | J = tensor.sum(mask * -tensor.exp(logp - logp_old) * advantages, axis=0).mean() 485 | dJ = flatgrad(J, self.tparams) 486 | probs_fix = theano.gradient.disconnected_grad(probs) 487 | 488 | kl_fix = tensor.sum(mask * self.kl(probs_fix, probs), axis=0).mean() 489 | kl_grads = tensor.grad(kl_fix, wrt=itemlist(self.tparams)) 490 | ftangents = tensor.fvector(name='flat_tan') 491 | shapes = [self.tparams[var].get_value(borrow=True).shape for var in self.tparams] 492 | start = 0 493 | tangents = [] 494 | for shape in shapes: 495 | size = numpy.prod(shape) 496 | tangents.append(tensor.reshape(ftangents[start:start + size], shape)) 497 | start += size 498 | gvp = tensor.add(*[tensor.sum(g * t) for (g, t) in zipsame(kl_grads, tangents)]) 499 | 500 | # Fisher-vectror product 501 | fvp = flatgrad(gvp, self.tparams) 502 | entropy = tensor.sum(mask * -self.cross(probs, probs), axis=0).mean() 503 | kl = tensor.sum(mask * self.kl(probs_old, probs), axis=0).mean() 504 | 505 | print 'compile the functions' 506 | inps = act_inputs + [actions, advantages, probs_old] 507 | loss = [J, kl, entropy] 508 | self.f_pg = theano.function(inps, dJ) 509 | self.f_loss = theano.function(inps, loss) 510 | self.f_fisher = theano.function([ftangents] + inps, fvp, on_unused_input='ignore') 511 | 512 | # get/set flatten params 513 | print 'compling flat updater' 514 | self.get_flat = theano.function([], tensor.concatenate([self.tparams[v].flatten() for v in self.tparams])) 515 | theta = tensor.vector() 516 | start = 0 517 | updates = [] 518 | for 
v in self.tparams: 519 | p = self.tparams[v] 520 | shape = p.shape 521 | size = tensor.prod(shape) 522 | updates.append((p, theta[start:start + size].reshape(shape))) 523 | start += size 524 | self.set_flat = theano.function([theta], [], updates=updates) 525 | 526 | 527 | def run_trpo(self, act_inputs, actions, reward, 528 | update=True, cg_damping=1e-3, max_kl=1e-2, lr=0.0002): 529 | 530 | # sub baseline 531 | inps_adv = act_inputs + [reward] 532 | L, advantages = self.f_adv(*inps_adv) 533 | self.f_update_b(lr) 534 | 535 | # get current action distributions 536 | probs = self.f_probs(*act_inputs) 537 | inps = act_inputs + [actions, advantages, probs] 538 | thprev = self.get_flat() 539 | 540 | def fisher_vector_product(p): 541 | return self.f_fisher(p, *inps) + cg_damping * p 542 | 543 | g = self.f_pg(*inps) 544 | losses_before = self.f_loss(*inps) 545 | 546 | if numpy.allclose(g, 0): 547 | print 'zero gradient, not updating' 548 | else: 549 | stepdir = self.cg(fisher_vector_product, -g) 550 | shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) 551 | lm = numpy.sqrt(shs / max_kl) 552 | 553 | print "\nlagrange multiplier:", lm, "gnorm:", numpy.linalg.norm(g) 554 | fullstep = stepdir / lm 555 | neggdotstepdir = -g.dot(stepdir) 556 | 557 | def loss(th): 558 | self.set_flat(th) 559 | return self.f_loss(*inps)[0] 560 | 561 | print 'do line search' 562 | success, theta = self.linesearch(loss, thprev, fullstep, neggdotstepdir / lm) 563 | 564 | print "success", success 565 | self.set_flat(theta) 566 | 567 | losses_after = self.f_loss(*inps) 568 | 569 | info = OrderedDict() 570 | for (lname, lbefore, lafter) in zipsame(['J', 'KL', 'entropy'], losses_before, losses_after): 571 | info[lname + "_before"] = lbefore 572 | info[lname + "_after"] = lafter 573 | 574 | # add the baseline loss into full information 575 | info['B_loss'] = L 576 | return info 577 | 578 | 579 | @staticmethod 580 | def linesearch(f, x, fullstep, expected_improve_rate, max_backtracks=10, accept_ratio=.1): 581 | """ 582 | Backtracking linesearch, where expected_improve_rate is the slope dy/dx at the initial point 583 | """ 584 | fval = f(x) 585 | print "fval before", fval 586 | for (_n_backtracks, stepfrac) in enumerate(.5 ** numpy.arange(max_backtracks)): 587 | xnew = x + stepfrac * fullstep 588 | newfval = f(xnew) 589 | actual_improve = fval - newfval 590 | expected_improve = expected_improve_rate * stepfrac 591 | ratio = actual_improve / expected_improve 592 | print "a/e/r", actual_improve, expected_improve, ratio 593 | if ratio > accept_ratio and actual_improve > 0: 594 | print "fval after", newfval 595 | return True, xnew 596 | return False, x 597 | 598 | @staticmethod 599 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 600 | """ 601 | Conjuctate Gradient 602 | """ 603 | p = b.copy() 604 | r = b.copy() 605 | x = numpy.zeros_like(b) 606 | rdotr = r.dot(r) 607 | 608 | fmtstr = "%10i %10.3g %10.3g" 609 | titlestr = "%10s %10s %10s" 610 | if verbose: print titlestr % ("iter", "residual norm", "soln norm") 611 | 612 | for i in xrange(cg_iters): 613 | if callback is not None: 614 | callback(x) 615 | if verbose: print fmtstr % (i, rdotr, numpy.linalg.norm(x)) 616 | z = f_Ax(p) 617 | v = rdotr / p.dot(z) 618 | x += v * p 619 | r -= v * z 620 | newrdotr = r.dot(r) 621 | mu = newrdotr / rdotr 622 | p = r + mu * p 623 | 624 | rdotr = newrdotr 625 | if rdotr < residual_tol: 626 | break 627 | 628 | if callback is not None: 629 | callback(x) 630 | if verbose: print fmtstr % (i + 1, rdotr, 
numpy.linalg.norm(x)) 631 | return x 632 | 633 | 634 | # ====================================================================== # 635 | # Save & Load 636 | # ====================================================================== # 637 | 638 | def save(self, history, it): 639 | _params = OrderedDict() 640 | _params = unzip(self.tparams, _params) 641 | _params = unzip(self.tparams_b, _params) 642 | 643 | print 'save the policy network >> {}'.format(self.model) 644 | numpy.savez('%s.current' % (self.model), 645 | history=history, 646 | it=it, 647 | **_params) 648 | 649 | def load(self): 650 | if os.path.exists(self.model): 651 | print 'loading from the existing model (current)' 652 | 653 | rmodel = numpy.load(self.model) 654 | history = rmodel['history'] 655 | it = rmodel['it'] 656 | 657 | self.params = load_params(rmodel, self.params) 658 | self.params_b = load_params(rmodel, self.params_b) 659 | self.tparams = init_tparams(self.params) 660 | self.tparams_b = init_tparams(self.params_b) 661 | 662 | print 'the dataset need to go over {} lines'.format(it) 663 | return history, it 664 | else: 665 | return [], -1 666 | 667 | 668 | 669 | 670 | -------------------------------------------------------------------------------- /pretrain_uni.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt_uni import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | clip_c=params['clip-c'][0], 16 | lrate=params['learning-rate'][0], 17 | optimizer=params['optimizer'][0], 18 | patience=1000, 19 | maxlen=50, 20 | batch_size=64, 21 | valid_batch_size=64, 22 | validFreq=1000, 23 | dispFreq=50, 24 | saveFreq=1000, 25 | sampleFreq=99, 26 | datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe', 27 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe'], 28 | valid_datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe', 29 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok.bpe'], 30 | dictionaries=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl', 31 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl'], 32 | use_dropout=params['use-dropout'][0]) 33 | return validerr 34 | 35 | if __name__ == '__main__': 36 | main(0, { 37 | 'model': ['models/model_wmt15_bpe2k_uni_en-ru.npz'], 38 | 'dim_word': [512], 39 | 'dim': [1028], 40 | 'n-words': [20000], 41 | 'optimizer': ['adadelta'], 42 | 'decay-c': [0.], 43 | 'clip-c': [1.], 44 | 'use-dropout': [False], 45 | 'learning-rate': [0.0001], 46 | 'reload': [False]}) 47 | 48 | 49 | -------------------------------------------------------------------------------- /reward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Collection of reward functions for Simultaneous Machine Translation 5 | """ 6 | import numpy 7 | from bleu import * 8 | 9 | 10 | # computing the discounting matrix 11 | gamma = 0.9 12 | maxlen = 100 13 | 14 | 15 | def compute_discount(gamma, maxlen): 16 | c = numpy.ones((maxlen,)) * gamma 17 | c[0] = 1. 
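# after the cumulative product c = [1, gamma, gamma**2, ...]; the upper-triangular
# matrix then satisfies C[t, s] = gamma**(s - t) for s >= t, so that
# GAMMA[:L, :L].dot(q) maps instant rewards q to discounted returns
# R[t] = sum_{s >= t} gamma**(s - t) * q[s].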
18 | c = c.cumprod() 19 | 20 | C = numpy.triu(numpy.repeat(c[None, :], repeats=maxlen, axis=0)) 21 | C /= c[:, None] 22 | return C 23 | 24 | 25 | GAMMA = compute_discount(gamma, maxlen) # precomputed 26 | 27 | def translation_cost(**_k): 28 | 29 | def BLEU(): 30 | q = numpy.zeros((_k['steps'],)) 31 | s = _k['sample'] 32 | r = _k['reference'] 33 | chencherry = SmoothingFunction() 34 | b = sentence_bleu(r, s, smoothing_function=chencherry.method5) 35 | q[-1] = b[1] 36 | return q, b 37 | 38 | 39 | return BLEU() 40 | 41 | 42 | 43 | 44 | # The general function for rewards (for simultrans): 45 | def return_reward(**_k): 46 | 47 | # ----------------------------------------------------------------- # 48 | # reward for quality 49 | # use negative-loglikelihood as the reward (full sentence) 50 | # we can also use BLEU for quality, but let's try the simplest one' 51 | # 52 | @staticmethod 53 | def _bpe2words(capsw): 54 | capw = [] 55 | for cc in capsw: 56 | capw += [cc.replace('@@ ', '')] 57 | return capw 58 | 59 | 60 | def LogLikelihood(): 61 | q = numpy.zeros((_k['steps'],)) 62 | q[-1] = _k['f_cost']( 63 | _k['ctx0'], _k['x_mask'], _k['y'], _k['y_mask'] 64 | ) 65 | return q 66 | 67 | def StepLogLikelihood(): 68 | pass 69 | 70 | 71 | def NormLogLikelihood(): 72 | q = LogLikelihood() 73 | length = _k['y'].shape[0] 74 | return q / float(length) 75 | 76 | def BLEU(): 77 | q = numpy.zeros((_k['steps'],)) 78 | s = _k['sample'] 79 | r = _k['reference'] 80 | chencherry = SmoothingFunction() 81 | q[-1] = sentence_bleu(r, s, smoothing_function=chencherry.method5) 82 | return q 83 | 84 | def LatencyBLEUwithForget(beta=None, discount=1., return_quality=False): 85 | 86 | # init 87 | words = _k['words'].split() # end-of-sentence is treated as a word 88 | ref = _k['reference'] 89 | 90 | q0 = numpy.zeros((_k['steps'],)) 91 | 92 | # check 0, 1 93 | maps = [(it, a) for it, a in enumerate(_k['act']) if a < 2] 94 | kmap = len(maps) 95 | lb = numpy.zeros((kmap,)) 96 | ts = numpy.zeros((kmap,)) 97 | q = numpy.zeros((kmap,)) 98 | 99 | if not beta: 100 | beta = kmap 101 | 102 | beta = 1. 
/ float(beta) 103 | 104 | chencherry = SmoothingFunction() 105 | 106 | # compute BLEU for each Yt 107 | Y = [] 108 | bleus = [] 109 | truebleus = [] 110 | for t in xrange(len(words)): 111 | if len(Y) > 0: 112 | _temp = Y[-1] + ' ' + words[t] 113 | _temp = _temp.replace('@@ ', '') 114 | Y = Y[:-1] + _temp.split() 115 | else: 116 | Y = [words[t]] 117 | 118 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5) 119 | 120 | bleus.append(bb[0]) 121 | truebleus.append(bb[1]) 122 | 123 | bleus.reverse() 124 | truebleus.reverse() 125 | 126 | # compute the Latency-Bleu 127 | T = 0 128 | Prev = 0 129 | for i, (it, a) in enumerate(maps): 130 | # print 'Prev', Prev 131 | if a == 0: # WAIT 132 | T += 1 133 | if i == 0: 134 | lb[i] = 0 135 | else: 136 | lb[i] = lb[i - 1] + Prev 137 | elif a == 1: 138 | if i < kmap - 1: 139 | lb[i] = lb[i - 1] - Prev 140 | 141 | Prev = bleus.pop() 142 | lb[i] += Prev 143 | else: 144 | lb[i] = lb[i - 2] 145 | else: 146 | lb[i] = 0 147 | 148 | ts[i] = T 149 | 150 | # average the score 151 | # print 'Unnormalized BLEU', lb 152 | lbn = lb / ts 153 | 154 | # print 'Latency BLEU', lbn 155 | q[1:] = lbn[1:] - lbn[:-1] 156 | # print 'instant reward', q 157 | 158 | # add the whole sentence balance on it 159 | q[-1] = Prev # the last BLEU 160 | # print 'instant reward', q 161 | 162 | for i, (it, a) in enumerate(maps): 163 | q0[it] = q[i] 164 | 165 | return q0 166 | 167 | 168 | def LatencyBLEUex(beta=None, discount=1., return_quality=False): 169 | 170 | # init 171 | words = _k['words'].split() # end-of-sentence is treated as a word 172 | ref = _k['reference'] 173 | 174 | q = numpy.zeros((_k['steps'],)) 175 | lb = numpy.zeros((_k['steps'],)) 176 | ts = numpy.zeros((_k['steps'],)) 177 | 178 | if not beta: 179 | beta = _k['steps'] 180 | 181 | beta = 1. 
/ float(beta) 182 | 183 | chencherry = SmoothingFunction() 184 | 185 | # compute BLEU for each Yt 186 | Y = [] 187 | bleus = [] 188 | truebleus = [] 189 | for t in xrange(len(words)): 190 | if len(Y) > 0: 191 | _temp = Y[-1] + ' ' + words[t] 192 | _temp = _temp.replace('@@ ', '') 193 | Y = Y[:-1] + _temp.split() 194 | else: 195 | Y = [words[t]] 196 | 197 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5) 198 | 199 | bleus.append(bb[0]) 200 | truebleus.append(bb[1]) 201 | 202 | bleus.reverse() 203 | truebleus.reverse() 204 | # print bleus 205 | 206 | # compute the Latency-Bleu 207 | T = 0 208 | Prev = 0 209 | for i, a in enumerate(_k['act']): 210 | # print 'Prev', Prev 211 | if a == 0: # WAIT 212 | T += 1 213 | if i == 0: 214 | lb[i] = 0 215 | else: 216 | lb[i] = lb[i - 1] + Prev 217 | elif a == 1: 218 | if i < len(_k['act']) - 1: 219 | lb[i] = lb[i - 1] - Prev 220 | 221 | Prev = bleus.pop() 222 | lb[i] += Prev 223 | else: 224 | lb[i] = lb[i - 2] 225 | else: 226 | lb[i] = 0 227 | 228 | ts[i] = T 229 | 230 | # average the score 231 | # print 'Unnormalized BLEU', lb 232 | lbn = lb / ts 233 | 234 | # print 'Latency BLEU', lbn 235 | q[1:] = lbn[1:] - lbn[:-1] 236 | # print 'instant reward', q 237 | 238 | # add the whole sentence balance on it 239 | q[-1] = Prev # the last BLEU 240 | # print 'instant reward', q 241 | 242 | if return_quality: # instant reward sequence (Latency BLEU) 243 | return q 244 | 245 | 246 | # cumulitive futurereward (with discounting factor) 247 | 248 | if discount == 1: 249 | R = q[::-1].cumsum()[::-1] 250 | # print 'future reward', R 251 | 252 | else: 253 | L = _k['steps'] 254 | FLAG = False 255 | 256 | if not(gamma == discount): 257 | FLAG = True 258 | gamma = discount 259 | 260 | if L > maxlen: 261 | FLAG = True 262 | maxlen = L 263 | 264 | if FLAG: 265 | GAMMA = compute_discount(gamma, maxlen) 266 | FLAG = False 267 | 268 | R = numpy.dot(GAMMA[:L, :L], q[:, None]).flatten() 269 | # import sys 270 | # sys.exit(123) 271 | # print q # collect all instant reward 272 | 273 | d = NormalizedDelay() 274 | return R, q[-1], d[-1], lbn[-1] + q[-1] 275 | 276 | 277 | # ----------------------------------------------------------------- # 278 | # reward for delay 279 | # several options: 280 | # 1. the total delay, which is computed at the last step 281 | def NormalizedDelay(): 282 | d = numpy.zeros((_k['steps'],)) 283 | # print a 284 | _src = 0 285 | _trg = 0 286 | _sum = 0 287 | for it, a in enumerate(_k['act']): 288 | if a == 0: 289 | _src += 1 290 | elif a == 1: 291 | _trg += 1 292 | _sum += _src 293 | d[-1] = _sum / (_src * _trg + 1e-6) 294 | return d 295 | 296 | # do not use this 297 | def NormalizedDelaywithPenalty(): 298 | d = numpy.zeros((_k['steps'],)) 299 | a = numpy.array(_k['act'], dtype='float32') 300 | # print a 301 | d[-1] = numpy.sum(numpy.cumsum(1 - a) * a) / (_k['src_max'] * numpy.sum(a)) * numpy.exp(-3. / _k['src_max']) 302 | return d 303 | 304 | def ConsectiveWaiting(): 305 | d = numpy.zeros((_k['steps'],)) 306 | a = numpy.array(_k['act'], dtype='float32') 307 | 308 | 309 | def StepDeley(): 310 | d = numpy.array(_k['act'], dtype='float32') - 1. 311 | return d 312 | 313 | 314 | def SilceDelay(win=5): 315 | d0 = numpy.array(_k['act'], dtype='float32') - 1. 
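# d0 turns WAIT steps (action 0) into -1 and COMMIT steps (action 1) into 0;
# the shifted copies built below are meant to smear each waiting penalty over a
# window of `win` consecutive steps before averaging.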
316 | 317 | def slice(m): 318 | d = d0 319 | d[m:] = d0[:-m] 320 | return d 321 | 322 | dd = numpy.mean([d0] + [slice(w) for w in range(1, win)]) 323 | return dd 324 | 325 | # -reward of delay 326 | def MovingDelay(beta=0.1): 327 | d = numpy.zeros((_k['steps'],)) 328 | _max = 0 329 | _cur = 0 330 | 331 | for it, a in enumerate(_k['act']): 332 | if a == 0: 333 | _cur += 1 334 | if _cur > _max: 335 | _max += 1 336 | d[it] = -1 337 | else: 338 | _cur = 0 339 | 340 | return d * beta 341 | 342 | 343 | def MaximumDelay(_max=5, beta=0.1): 344 | d = numpy.zeros((_k['steps'],)) 345 | _cur = 0 346 | for it, a in enumerate(_k['act']): 347 | if a == 0: 348 | _cur += 1 349 | if _cur > _max: 350 | d[it] = -1 351 | pass 352 | elif a == 1: # only for new commit 353 | _cur = 0 354 | 355 | return d * beta 356 | 357 | # ----------------------------------------------------------------- # 358 | def MaximumSource(_max=7, beta=0.1): 359 | s = numpy.zeros((_k['steps'], )) 360 | _cur = 0 361 | _end = 0 362 | for it, a in enumerate(_k['act']): 363 | if a == 0: 364 | _cur += 1 365 | elif a == 2: 366 | _end += 1 367 | 368 | if (_cur - _end) > _max: 369 | s[it] = -1 370 | return s * beta 371 | 372 | def MovingSource(beta=0.1): 373 | s = numpy.zeros((_k['steps'],)) 374 | _max = 0 375 | _cur = 0 376 | _end = 0 377 | 378 | for it, a in enumerate(_k['act']): 379 | if a == 0: 380 | _cur += 1 381 | elif a == 2: 382 | _end += 1 383 | 384 | temp = _cur - _end 385 | if temp > _max: 386 | s[it] = -1 387 | _max = temp 388 | 389 | return s * beta 390 | 391 | def AwardForget(_max=5, beta=0.1): 392 | s = numpy.zeros((_k['steps'],)) 393 | _cur = 0 394 | _end = 0 395 | for it, a in enumerate(_k['act']): 396 | if a == 0: 397 | _cur += 1 398 | elif a == 2: 399 | _end += 1 400 | 401 | if ((_cur - _end) >= _max) and (a == 2): 402 | s[it] = 1 403 | return s * beta, _cur / float(_k['src_max']) 404 | 405 | def AwardForget2(_max=5, beta=0.001): 406 | s = numpy.zeros((_k['steps'],)) 407 | _cur = 0 408 | _end = 0 409 | for it, a in enumerate(_k['act']): 410 | if a == 0: 411 | _cur += 1 412 | elif a == 2: 413 | _end += 1 414 | 415 | if a == 2: 416 | s[it] = (_cur - _end - _max) * 2 417 | return s * beta 418 | 419 | 420 | 421 | # ----------------------------------------------------------------- # 422 | # reward for quality + delay 423 | def Q2D1(alpha=0.5): 424 | # q = LogLikelihood() 425 | q = NormLogLikelihood() 426 | d = NormalizedDelay() 427 | 428 | r = (q ** alpha) * ((1 - d) ** (1 - alpha)) 429 | R = r[::-1].cumsum()[::-1] 430 | return R, q[-1], d[-1], r[-1] 431 | 432 | def Q2D2(alpha=0.5): 433 | # q = LogLikelihood() 434 | q = BLEU() 435 | d = NormalizedDelaywithPenalty() 436 | 437 | r = (q * alpha) + ((1 - d) * (1 - alpha)) 438 | R = r[::-1].cumsum()[::-1] 439 | return R, q[-1], d[-1], r[-1] 440 | 441 | def Q2D3(alpha=0.5): 442 | # q = LogLikelihood() 443 | q = BLEU() 444 | d = NormalizedDelay() 445 | 446 | r = q # (q * alpha) + ((1 - d) * (1 - alpha)) 447 | R = r[::-1].cumsum()[::-1] 448 | return R, q[-1], d[-1], r[-1] 449 | 450 | def Q2D4(alpha=0.5): 451 | # q = LogLikelihood() 452 | q = BLEU() 453 | d = NormalizedDelay() 454 | d0 = d[-1] 455 | d[-1] = numpy.exp(-max(d0 - 0.7, 0)) 456 | r = q * d # (q * alpha) + ((1 - d) * (1 - alpha)) 457 | R = r[::-1].cumsum()[::-1] 458 | return R, q[-1], d0, r[-1] 459 | 460 | 461 | # ---------------------------------------------------------------- # 462 | # user defined target delay \tau* 463 | def QualityDelay(tau = 0.5, gamma=3): 464 | q = LatencyBLEUex(return_quality=True) 465 | d = 
NormalizedDelay() 466 | 467 | # just bleu 468 | bleu = q[-1] 469 | 470 | # just delay 471 | delay = d[-1] 472 | 473 | r = q - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward 474 | R = r[::-1].cumsum()[::-1] 475 | return R, bleu, delay, r 476 | 477 | def FullQualityDelay(tau = 0.5, gamma=10): 478 | q = LatencyBLEUex(return_quality=True) 479 | d = NormalizedDelay() 480 | d1 = SilceDelay() 481 | 482 | # just bleu 483 | bleu = q[-1] 484 | 485 | # just delay 486 | delay = d[-1] 487 | 488 | r = q + d1 - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward 489 | R = r[::-1].cumsum()[::-1] 490 | return R, bleu, delay, r 491 | 492 | # UPDATE: July 11, 2016: we have several varisions:: 493 | def ReturnA(): 494 | # params 495 | gamma = _k['gamma'] 496 | beta = 0.1 497 | 498 | q0 = LatencyBLEUex(return_quality=True) 499 | d0 = NormalizedDelay() 500 | 501 | # just bleu 502 | bleu = q0[-1] 503 | 504 | # just delay 505 | delay = d0[-1] 506 | 507 | # use moving-delay + latency bleu (without final BLEU) 508 | q = q0 509 | q[-1] = 0. 510 | d = MovingDelay(beta=beta) 511 | 512 | r = q + gamma * d 513 | R = r[::-1].cumsum()[::-1] 514 | return R, bleu, delay, r 515 | 516 | def ReturnB(): 517 | # params 518 | gamma = _k['gamma'] 519 | beta = 0.1 520 | 521 | q0 = LatencyBLEUex(return_quality=True) 522 | d0 = NormalizedDelay() 523 | 524 | # just bleu 525 | bleu = q0[-1] 526 | 527 | # just delay 528 | delay = d0[-1] 529 | 530 | # use maximum-delay + latency bleu (without final BLEU) 531 | q = q0 532 | q[-1] = 0. 533 | d = MaximumDelay(_max=4, beta=beta) 534 | 535 | r = q + gamma * d 536 | R = r[::-1].cumsum()[::-1] 537 | return R, bleu, delay, r 538 | 539 | def ReturnC(): 540 | # params 541 | gamma = _k['gamma'] 542 | beta = 0.1 543 | 544 | q0 = LatencyBLEUex(return_quality=True) 545 | d0 = NormalizedDelay() 546 | 547 | # just bleu 548 | bleu = q0[-1] 549 | 550 | # just delay 551 | delay = d0[-1] 552 | 553 | # use maximum-delay + latency bleu (with final BLEU) 554 | q = q0 555 | d = MaximumDelay(_max=5, beta=beta) 556 | 557 | r = q + gamma * d 558 | R = r[::-1].cumsum()[::-1] 559 | return R, bleu, delay, r 560 | 561 | def ReturnD(): 562 | # params 563 | gamma = _k['gamma'] 564 | beta = 0.1 565 | 566 | q0 = LatencyBLEUex(return_quality=True) 567 | d0 = NormalizedDelay() 568 | 569 | # just bleu 570 | bleu = q0[-1] 571 | 572 | # just delay 573 | delay = d0[-1] 574 | 575 | # use moving-delay + latency bleu (with final BLEU) 576 | q = q0 577 | d = MovingDelay(beta=beta) 578 | 579 | r = q + gamma * d 580 | R = r[::-1].cumsum()[::-1] 581 | return R, bleu, delay, r 582 | 583 | def ReturnE(): 584 | # params 585 | gamma = _k['gamma'] 586 | beta = 0.1 587 | tau = _k['target'] 588 | 589 | q0 = LatencyBLEUex(return_quality=True) 590 | d0 = NormalizedDelay() 591 | 592 | # just bleu 593 | bleu = q0[-1] 594 | 595 | # just delay 596 | delay = d0[-1] 597 | 598 | # use maximum-delay + latency bleu (without final BLEU) + global delay 599 | q = q0 600 | q[-1] = 0. 
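        # The terminal entry of q is zeroed, so quality enters only through the
        # incremental latency-BLEU gains earned at each WRITE.  The lines that follow
        # add the local waiting penalty (MaximumDelay charges -beta for every
        # consecutive READ beyond the 4th) and, at the very last step, subtract the
        # amount by which the normalized delay overshoots the target tau; both delay
        # terms are scaled by gamma before being added to q.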
601 | d = MaximumDelay(_max=4, beta=beta) 602 | d[-1]-= numpy.maximum(delay - tau, 0) 603 | 604 | r = q + gamma * d 605 | R = r[::-1].cumsum()[::-1] 606 | return R, bleu, delay, r 607 | 608 | def ReturnF(): 609 | # params 610 | gamma = _k['gamma'] 611 | beta = 0.1 612 | tau = _k['target'] 613 | 614 | q0 = LatencyBLEUex(return_quality=True) 615 | d0 = NormalizedDelay() 616 | 617 | # just bleu 618 | bleu = q0[-1] 619 | 620 | # just delay 621 | delay = d0[-1] 622 | 623 | # use maximum-delay + latency bleu (with final BLEU) + global delay 624 | q = q0 625 | d = MaximumDelay(_max=5, beta=beta) 626 | d[-1] -= numpy.maximum(delay - tau, 0) * gamma 627 | 628 | r = q + d 629 | R = r[::-1].cumsum()[::-1] 630 | return R, bleu, delay, r 631 | 632 | # ---------------------------------------------------------------- # 633 | def ReturnG(): 634 | # params 635 | discount = _k['discount'] ## 0.95 here gamma is the discounting factor 636 | beta = 0.1 637 | 638 | q0 = LatencyBLEUwithForget(return_quality=True) 639 | d0 = NormalizedDelay() 640 | 641 | # just bleu 642 | bleu = q0[-1] 643 | 644 | # just delay 645 | delay = d0[-1] 646 | 647 | # use maximum-delay + latency bleu (with final BLEU) 648 | q = q0 649 | d = MaximumDelay(_max=4, beta=beta) 650 | s = MaximumSource(_max=7, beta=0.01) 651 | 652 | if discount == 1: 653 | r = q + d + s 654 | R = r[::-1].cumsum()[::-1] 655 | else: 656 | raise NotImplementedError 657 | 658 | return R, bleu, delay, r 659 | 660 | def ReturnH(): 661 | # params 662 | discount = _k['discount'] ## 0.95 here gamma is the discounting factor 663 | beta = 0.1 664 | 665 | q0 = LatencyBLEUwithForget(return_quality=True) 666 | d0 = NormalizedDelay() 667 | 668 | # just bleu 669 | bleu = q0[-1] 670 | 671 | # just delay 672 | delay = d0[-1] 673 | 674 | # use maximum-delay + latency bleu (with final BLEU) 675 | q = q0 676 | d = MaximumDelay(_max=4, beta=beta) 677 | s = MovingSource(beta=0.02) 678 | 679 | if discount == 1: 680 | r = q + d + s 681 | R = r[::-1].cumsum()[::-1] 682 | else: 683 | raise NotImplementedError 684 | 685 | return R, bleu, delay, r 686 | 687 | def ReturnI(): 688 | # params 689 | 690 | discount = _k['gamma'] ## 0.95 here gamma is the discounting factor 691 | maxsrc = _k['maxsrc'] 692 | beta = 0.1 693 | 694 | q0 = LatencyBLEUwithForget(return_quality=True) 695 | d0 = NormalizedDelay() 696 | 697 | # global reward signal :::>>> 698 | # just bleu 699 | bleu = q0[-1] 700 | 701 | # just delay 702 | delay = d0[-1] 703 | 704 | # local reward signal :::>>>> 705 | # use maximum-delay + latency bleu (with final BLEU) 706 | q = q0 707 | q[-1] = 0 708 | d = MaximumDelay(_max=5, beta=beta) 709 | s, _ = AwardForget(_max=maxsrc, beta=0.01) 710 | # s = AwardForget2(_max=maxsrc, beta=0.001) 711 | 712 | r0 = q + d + s 713 | rg = bleu # it is a global reward, will not be discounted. 
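        # Return computation (see the block below): with discount == 1 the return R is
        # a plain reversed cumulative sum of the local rewards r0, with the
        # sentence-level BLEU rg added at the final step; otherwise R follows the
        # backward recursion R[t] = discount * R[t + 1] + r0[t], and rg is then added
        # to every step without discounting.  For example, discount = 0.9 and
        # r0 = [0, 0, 1] (with rg = 0) gives R = [0.81, 0.9, 1.0].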
714 | 
715 |         if discount == 1:
716 |             r = r0
717 |             r[-1] += rg
718 |             R = r[::-1].cumsum()[::-1]
719 |         else:
720 |             R = numpy.zeros_like(r0)
721 |             R[-1] = r0[-1]
722 |             for it in range(_k['steps'] - 2, -1, -1):
723 |                 R[it] = discount * R[it + 1] + r0[it]
724 |             R += rg # add a global signal (without a discount factor)
725 | 
726 |         return R, bleu, delay, r0
727 | 
728 |     def ReturnJ():
729 |         # params
730 | 
731 |         discount = _k['gamma']  # _k['gamma'] is used as the discount factor here (e.g. 0.95)
732 |         beta = 0.1
733 | 
734 |         q0 = LatencyBLEUwithForget(return_quality=True)
735 |         d0 = NormalizedDelay()
736 | 
737 |         # global reward signal :::>>>
738 |         # just bleu
739 |         bleu = q0[-1]
740 | 
741 |         # just delay
742 |         delay = d0[-1]
743 | 
744 |         # local reward signal :::>>>>
745 |         # use maximum-delay + latency bleu (with final BLEU)
746 |         q = q0
747 |         q[-1] = 0
748 |         d = MaximumDelay(_max=5, beta=beta)
749 |         # s, m = AwardForget(_max=5, beta=0.01)
750 | 
751 |         r0 = q + d # + s
752 |         rg = bleu # * m # it is a global reward, will not be discounted.
753 | 
754 |         if discount == 1:
755 |             r = r0
756 |             r[-1] += rg
757 |             R = r[::-1].cumsum()[::-1]
758 |         else:
759 |             R = numpy.zeros_like(r0)
760 |             R[-1] = r0[-1]
761 |             for it in range(_k['steps'] - 2, -1, -1):
762 |                 R[it] = discount * R[it + 1] + r0[it]
763 |             R += rg # add a global signal (without a discount factor)
764 | 
765 |         return R, bleu, delay, r0
766 | 
767 | 
768 |     # **------------------------------------------------ **#
769 |     # when the morning glories fall
770 | 
771 |     def Q2Ds():
772 |         q = NormLogLikelihood()
773 |         d = NormalizedDelay()
774 |         return q, d
775 | 
776 |     gamma = _k['gamma']
777 |     type = _k['Rtype']
778 | 
779 |     funcs = [ReturnA, ReturnB, ReturnC, ReturnD, ReturnE, ReturnF, ReturnG, ReturnH, ReturnI, ReturnJ]
780 |     return funcs[type]()
781 | 
782 |     # return FullQualityDelay(tau, gamma)
783 |     # return QualityDelay(tau=tau, gamma=gamma)
784 | 
785 |     # return LatencyBLEUex()
786 |     # return Q2D4(0.2)
787 |     # return Q2Ds()
788 | 
--------------------------------------------------------------------------------
/run_eval.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_evaluation.py --sinit 1 --target 0.5 --sample 64 --batchsize 1 --Rtype $1 --gamma 1 --id $3 --recurrent True 2>&1 | tee .images/$4.log
2 | 
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_training.py --sample 32 --batchsize 1 --target $1 --gamma $3 --recurrent True 2>&1 | tee .log/$4.log
2 | 
--------------------------------------------------------------------------------
/translate.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | model=".pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
4 | dict="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
5 | dict_rev="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
6 | source="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe"
7 | saveto=".translate/standard.trans.1"
8 | 
9 | THEANO_FLAGS="floatX=float32, device=cpu" python translate_uni.py -k 1 $model $dict $dict_rev $source $saveto
10 | 
--------------------------------------------------------------------------------
/translate_uni.py:
--------------------------------------------------------------------------------
1 | 
''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import theano 5 | import argparse 6 | 7 | import numpy 8 | import cPickle as pkl 9 | 10 | from nmt_uni import (build_model, build_sampler, gen_sample, load_params, 11 | init_params, init_tparams, prepare_data) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def translate_model(queue, rqueue, pid, model, options, k, normalize, kp, sigma): 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | trng = RandomStreams(1234) 20 | 21 | # allocate model parameters 22 | params = init_params(options) 23 | 24 | # load model parameters and set theano shared variables 25 | params = load_params(model, params) 26 | tparams = init_tparams(params) 27 | 28 | trng, use_noise, \ 29 | x, x_mask, y, y_mask, \ 30 | opt_ret, \ 31 | cost = \ 32 | build_model(tparams, options) 33 | inps = [x, x_mask, y, y_mask] 34 | 35 | f_log_probs = theano.function(inps, cost) 36 | 37 | # word index 38 | f_init, f_next = build_sampler(tparams, options, trng) 39 | 40 | def _translate(idx, seq): 41 | all_samples = [] 42 | all_scores = [] 43 | 44 | for kidx in xrange(kp): 45 | if kidx == 0: 46 | ss = -1. 47 | else: 48 | ss = sigma 49 | # sample given an input sequence and obtain scores 50 | sample, score = gen_sample(tparams, f_init, f_next, 51 | numpy.array(seq).reshape([len(seq), 1]), 52 | options, trng=trng, k=k, maxlen=200, 53 | stochastic=False, argmax=False, sigma=ss) 54 | 55 | # normalize scores according to sequence lengths 56 | if normalize: 57 | lengths = numpy.array([len(s) for s in sample]) 58 | score = score / lengths 59 | #print idx, score 60 | sidx = numpy.argmin(score) 61 | all_samples.append(sample[sidx]) 62 | all_scores.append(score[sidx]) 63 | 64 | source_list = [seq] * kp 65 | x, x_mask, y, y_mask = prepare_data(source_list, all_samples, maxlen=None) 66 | all_scores = f_log_probs(x, x_mask, y, y_mask) 67 | if normalize: 68 | lengths = numpy.array([len(s) for s in all_samples]) 69 | all_scores = all_scores / lengths 70 | 71 | print idx, all_scores 72 | sidx = numpy.argmin(all_scores) 73 | return all_samples[sidx] 74 | 75 | while True: 76 | req = queue.get() 77 | if req is None: 78 | break 79 | 80 | idx, x = req[0], req[1] 81 | print pid, '-', idx 82 | seq = _translate(idx, x) 83 | 84 | rqueue.put((idx, seq)) 85 | 86 | return 87 | 88 | 89 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5, 90 | normalize=False, n_process=5, chr_level=False, 91 | options_file=None, sigma=-1., kp=1): 92 | 93 | # load model model_options 94 | if options_file is not None: 95 | with open(options_file, 'rb') as f: 96 | options = pkl.load(f) 97 | else: 98 | with open('%s.pkl' % model, 'rb') as f: 99 | options = pkl.load(f) 100 | 101 | # load source dictionary and invert 102 | with open(dictionary, 'rb') as f: 103 | word_dict = pkl.load(f) 104 | word_idict = dict() 105 | for kk, vv in word_dict.iteritems(): 106 | word_idict[vv] = kk 107 | word_idict[0] = '' 108 | word_idict[1] = 'UNK' 109 | 110 | # load target dictionary and invert 111 | with open(dictionary_target, 'rb') as f: 112 | word_dict_trg = pkl.load(f) 113 | word_idict_trg = dict() 114 | for kk, vv in word_dict_trg.iteritems(): 115 | word_idict_trg[vv] = kk 116 | word_idict_trg[0] = '' 117 | word_idict_trg[1] = 'UNK' 118 | 119 | # create input and output queues for processes 120 | queue = Queue() 121 | rqueue = Queue() 122 | processes = [None] * n_process 123 | for midx in xrange(n_process): 124 | processes[midx] = Process( 125 | 
target=translate_model, 126 | args=(queue, rqueue, midx, model, options, k, normalize, kp, sigma)) 127 | processes[midx].start() 128 | 129 | # utility function 130 | def _seqs2words(caps): 131 | capsw = [] 132 | for cc in caps: 133 | ww = [] 134 | for w in cc: 135 | if w == 0: 136 | break 137 | ww.append(word_idict_trg[w]) 138 | capsw.append(' '.join(ww)) 139 | return capsw 140 | 141 | def _send_jobs(fname): 142 | with open(fname, 'r') as f: 143 | for idx, line in enumerate(f): 144 | if chr_level: 145 | words = list(line.decode('utf-8').strip()) 146 | else: 147 | words = line.strip().split() 148 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 149 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 150 | x += [0] 151 | queue.put((idx, x)) 152 | return idx+1 153 | 154 | def _finish_processes(): 155 | for midx in xrange(n_process): 156 | queue.put(None) 157 | 158 | def _retrieve_jobs(n_samples): 159 | trans = [None] * n_samples 160 | for idx in xrange(n_samples): 161 | resp = rqueue.get() 162 | trans[resp[0]] = resp[1] 163 | if numpy.mod(idx, 10) == 0: 164 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 165 | return trans 166 | 167 | print 'Translating ', source_file, '...' 168 | n_samples = _send_jobs(source_file) 169 | trans = _seqs2words(_retrieve_jobs(n_samples)) 170 | _finish_processes() 171 | with open(saveto, 'w') as f: 172 | print >>f, '\n'.join(trans) 173 | print 'Done' 174 | 175 | 176 | if __name__ == "__main__": 177 | parser = argparse.ArgumentParser() 178 | parser.add_argument('-k', type=int, default=5) 179 | parser.add_argument('-kp', type=int, default=1) 180 | parser.add_argument('-p', type=int, default=5) 181 | parser.add_argument('-n', action="store_true", default=False) 182 | parser.add_argument('-c', action="store_true", default=False) 183 | parser.add_argument('-o', type=str, default=None) 184 | parser.add_argument('-s', type=float, default=-1.) 
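    # Optional flags: -k beam width, -kp number of repeated (noisy) decoding passes,
    # -p number of worker processes, -n length-normalize scores, -c character-level
    # input, -o load model options from a separate pickle, -s noise level sigma passed
    # to gen_sample.  The positional arguments that follow are the model .npz file,
    # the pickled source and target dictionaries, the source text file, and the path
    # for the output translations.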
185 | parser.add_argument('model', type=str) 186 | parser.add_argument('dictionary', type=str) 187 | parser.add_argument('dictionary_target', type=str) 188 | parser.add_argument('source', type=str) 189 | parser.add_argument('saveto', type=str) 190 | 191 | args = parser.parse_args() 192 | 193 | main(args.model, args.dictionary, args.dictionary_target, args.source, 194 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 195 | chr_level=args.c, options_file=args.o, kp=args.kp, sigma=args.s) 196 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is for functions to help the translation 3 | """ 4 | import numpy as np 5 | import time 6 | import sys 7 | import json 8 | 9 | class Monitor(object): 10 | def __init__(self, root='http://localhost:9000'): 11 | self.root = root 12 | 13 | def display(self, batch, logs={}): 14 | import requests 15 | send = {} 16 | send['epoch'] = batch 17 | for k, v in logs.items(): 18 | send[k] = v 19 | 20 | try: 21 | requests.post(self.root + '/publish/epoch/end/', 22 | {'data': json.dumps(send)}) 23 | except: 24 | print('Warning: could not reach RemoteMonitor ' 25 | 'root server at ' + str(self.root)) 26 | 27 | 28 | 29 | class Progbar(object): 30 | def __init__(self, target, width=30, verbose=1, with_history=True): 31 | ''' 32 | @param target: total number of steps expected 33 | ''' 34 | self.width = width 35 | self.target = target 36 | self.sum_values = {} 37 | self.unique_values = [] 38 | self.start = time.time() 39 | self.total_width = 0 40 | self.seen_so_far = 0 41 | self.verbose = verbose 42 | self.with_history = with_history 43 | 44 | def update(self, current, values=[]): 45 | ''' 46 | @param current: index of current step 47 | @param values: list of tuples (name, value_for_last_step). 48 | The progress bar will display averages for these values. 49 | ''' 50 | if not self.with_history: 51 | self.sum_values = {} 52 | self.unique_values = [] 53 | 54 | for k, v in values: 55 | if k not in self.sum_values: 56 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 57 | self.unique_values.append(k) 58 | else: 59 | self.sum_values[k][0] += v * (current - self.seen_so_far) 60 | self.sum_values[k][1] += (current - self.seen_so_far) 61 | self.seen_so_far = current 62 | 63 | now = time.time() 64 | if self.verbose == 1: 65 | prev_total_width = self.total_width 66 | sys.stdout.write("\b" * prev_total_width) 67 | sys.stdout.write("\r") 68 | 69 | numdigits = int(np.floor(np.log10(self.target))) + 1 70 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 71 | bar = barstr % (current, self.target) 72 | prog = float(current)/self.target 73 | prog_width = int(self.width*prog) 74 | if prog_width > 0: 75 | bar += ('.'*(prog_width-1)) 76 | if current < self.target: 77 | bar += '(-w-)' 78 | else: 79 | bar += '(-v-)!!' 
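        # '(-w-)' marks a bar that is still in progress and '(-v-)!!' a finished one;
        # the remaining width is padded with '~' before the bar is closed with ']'.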
80 | bar += ('~' * (self.width-prog_width)) 81 | bar += ']' 82 | sys.stdout.write(bar) 83 | self.total_width = len(bar) 84 | 85 | if current: 86 | time_per_unit = (now - self.start) / current 87 | else: 88 | time_per_unit = 0 89 | eta = time_per_unit*(self.target - current) 90 | info = '' 91 | if current < self.target: 92 | info += ' - ETA: %ds' % eta 93 | else: 94 | info += ' - %ds' % (now - self.start) 95 | for k in self.unique_values: 96 | if k == 'perplexity' or k == 'PPL': 97 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1]))) 98 | else: 99 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 100 | 101 | self.total_width += len(info) 102 | if prev_total_width > self.total_width: 103 | info += ((prev_total_width-self.total_width) * " ") 104 | 105 | sys.stdout.write(info) 106 | sys.stdout.flush() 107 | 108 | if current >= self.target: 109 | sys.stdout.write("\n") 110 | 111 | if self.verbose == 2: 112 | if current >= self.target: 113 | info = '%ds' % (now - self.start) 114 | for k in self.unique_values: 115 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | sys.stdout.write(info + "\n") 117 | 118 | def add(self, n, values=[]): 119 | self.update(self.seen_so_far + n, values) 120 | 121 | def clear(self): 122 | self.sum_values = {} 123 | self.unique_values = [] 124 | self.total_width = 0 125 | self.seen_so_far = 0 126 | --------------------------------------------------------------------------------
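A minimal usage sketch for the Progbar utility above, assuming a hypothetical training loop (the step count and the reported quantities below are made up for illustration):

from utils import Progbar
import time

progress = Progbar(target=100)        # expect 100 steps in total
for step in range(1, 101):
    time.sleep(0.01)                  # stand-in for one real training step
    # report running averages of named quantities; 'PPL' is displayed as exp(mean)
    progress.update(step, values=[('cost', 1.0 / step), ('PPL', 2.0)])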