├── LICENSE
├── RDPG.py
├── README.md
├── bleu.py
├── data_iterator.py
├── insepection.py
├── layers.py
├── mteval.sh
├── nmt_uni.py
├── noisy_translator.py
├── noisytrans_training.py
├── optimizer.py
├── policy.py
├── pretrain_uni.py
├── reward.py
├── run_eval.sh
├── run_train.sh
├── translate.sh
├── translate_uni.py
└── utils.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Jiatao Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/RDPG.py:
--------------------------------------------------------------------------------
1 | """
2 | -- Recurrent Deterministic Policy Gradient
3 | """
4 |
5 | from nmt_uni import *
6 |
7 | import os
8 | import time, datetime
9 | import cPickle as pkl
10 |
11 |
12 | class RDPG(object):
13 |
14 | def __init__(self,
15 | trng, options, policy, config,
16 | n_in=None, n_out=None,
17 | recurrent=False, id=None):
18 |
19 | self.trng = trng
20 | self.options = options
21 | self.policy = policy
22 | self.recurrent = recurrent
23 |
24 | self.n_hidden = 512
25 | self.n_in = n_in
26 | self.n_out = n_out
27 |
28 | self.rec = 'lngru'
29 | if not n_in:
30 | self.n_in = options['readout_dim']
31 |
32 | # ------------------------------------------------------------------------------
33 | print 'policy network initialization'
34 |
35 | params = OrderedDict()
36 | if not self.recurrent:
37 | print 'building a feed-forward controller'
38 | params = get_layer('ff')[0](options, params, prefix='policy_net_in',
39 | nin=self.n_in, nout=self.n_hidden, scale=0.001)
40 | else:
41 | print 'building a recurrent controller'
42 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in',
43 | nin=self.n_in, dim=self.n_hidden, scale=0.001)
44 |
45 | params = get_layer('ff')[0](options, params, prefix='policy_net_out',
46 | nin=self.n_hidden,
47 | nout=self.n_out,
48 | scale=0.001)
49 |
50 | # --------------------------------------------------------------------------------
51 | print 'critic network initialization (RNN)'
52 | params_b = OrderedDict()
53 | params_b = get_layer(self.rec)[0](options, params_b, prefix='critic_net_in',
54 | nin=self.n_in + self.n_out,
55 | dim=self.n_hidden, scale=0.001)
56 | params_b = get_layer('ff')[0](options, params_b, prefix='critic_net_out',
57 | nin=self.n_hidden,
58 | nout=1,
59 | scale=0.001)
60 | if id is not None:
61 | print 'reload the saved model: {}'.format(id)
62 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params)
63 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b)
64 | else:
65 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S')
66 | print 'start from a new model: {}'.format(id)
67 |
68 | with open('.config/conf.{}.txt'.format(id), 'w') as f:
69 | f.write('[config]\n')
70 |
71 | for c in config:
72 | f.write('{}: {}\n'.format(c, config[c]))
73 | f.write('\n')
74 |
75 | f.write('[policy]\n')
76 |
77 | for c in policy:
78 | f.write('{}: {}\n'.format(c, policy[c]))
79 |
80 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w'))
81 | print 'save the config file'
82 |
83 | self.id = id
84 | self.model = '.policy/{}-{}'.format(id, self.policy['base'])
85 |
86 | # theano shared params
87 | self.tparams = init_tparams(params)
88 | self.tparams_b = init_tparams(params_b)
89 |
90 | # build the policy network
91 | self.build_actor(options=options)
92 | self.build_discriminator(options=options)
93 |
94 | def build_actor(self, options):
95 | # ============================================================================= #
96 | # Actor from Policy Network
97 | # ============================================================================= #
98 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1)
99 | prev_hidden = tensor.matrix('p_hidden', dtype='float32')
100 |
101 | if not self.recurrent:
102 | hiddens = get_layer('ff')[1](self.tparams, observation,
103 | options, prefix='policy_net_in',
104 | activ='tanh')
105 | else:
106 | hiddens = get_layer(self.rec)[1](self.tparams, observation,
107 | options, prefix='policy_net_in', mask=None,
108 | one_step=True, _init_state=prev_hidden)[0]
109 |
110 | act_inps = [observation, prev_hidden]
111 | act_outs = get_layer('ff')[1](self.tparams, hiddens, options,
112 | prefix='policy_net_out',
113 | activ='tanh'
114 | )
115 | print 'build action function [Deterministic]'
116 | self.f_action = theano.function(act_inps, act_outs,
117 | on_unused_input='ignore') # action/dist/hiddens
118 | print 'done.'
119 |
120 |
121 | def build_discriminator(self, options):
122 | # ============================================================================= #
123 |         # Build for End-to-End learning
124 | # ============================================================================= #
125 | observations = tensor.tensor3('observations', dtype='float32')
126 | mask = tensor.matrix('mask', dtype='float32')
127 | targets = tensor.vector('targets', dtype='float32')
128 |
129 | print 'build actor'
130 | if not self.recurrent:
131 | hiddens = get_layer('ff')[1](self.tparams, observations,
132 | options, prefix='policy_net_in',
133 | activ='tanh')
134 | else:
135 | hiddens = get_layer(self.rec)[1](self.tparams, observations,
136 | options, prefix='policy_net_in', mask=mask)[0]
137 | actions = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out',
138 | activ='tanh') # seq_steps x batch_size x n_out
139 |
140 | print 'build critic'
141 | state_action = concatenate([observations, actions], axis=-1)
142 | hiddens_b = get_layer(self.rec)[1](self.tparams_b, state_action,
143 | options, prefix='critic_net_in', mask=mask)[0]
144 | values = get_layer('ff')[1](self.tparams_b, hiddens_b, options,
145 | prefix='critic_net_out',
146 | activ='tanh')[-1, :, 0] # (batch_size, )
147 |
148 | # =============================================================================== #
149 | # Build Deterministic Policy Gradient [Actor Parts]
150 | # =============================================================================== #
151 | inps_A = [observations, mask]
152 | loss_A = -tensor.mean(values)
153 | grad_A = tensor.grad(loss_A, wrt=itemlist(self.tparams))
154 | grad_A = grad_clip(grad_A)
155 | outs_A = [loss_A, actions]
156 |
157 | # optimizer: Adam
158 | lr = tensor.scalar(name='lr')
159 | f_A, f_Aup = adam(lr, self.tparams, grad_A, inps_A, outs_A)
160 |
161 | # =============================================================================== #
162 | # Build Deterministic Policy Gradient [Critic Parts]
163 | # =============================================================================== #
164 | inps_B = [observations, mask, actions, targets]
165 | loss_B = tensor.mean((values - targets) ** 2)
166 | grad_B = tensor.grad(loss_B, wrt=itemlist(self.tparams_b))
167 | grad_B = grad_clip(grad_B)
168 | outs_B = [loss_B]
169 |
170 | # optimizer: Adam
171 | lr = tensor.scalar(name='lr')
172 | f_B, f_Bup = adam(lr, self.tparams_b, grad_B, inps_B, outs_B)
173 |
174 | self.f_learner = [f_A, f_Aup, f_B, f_Bup]
175 | print 'done.'
176 |
177 |
178 |
179 |
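180 | # --------------------------------------------------------------------------------
181 | # Illustrative usage sketch (not part of the original training loop). It assumes
182 | # the dl4mt-style adam() optimizer, whose first returned function computes the
183 | # outputs and accumulates gradients while the second applies the update for a
184 | # given learning rate; `observations`, `mask`, `acts`, `returns` and `lrate` are
185 | # hypothetical placeholders.
186 | #
187 | #   agent = RDPG(trng, options, policy, config, recurrent=True)
188 | #   f_A, f_Aup, f_B, f_Bup = agent.f_learner
189 | #
190 | #   # critic step: regress the predicted value towards the observed returns
191 | #   loss_B = f_B(observations, mask, acts, returns)
192 | #   f_Bup(lrate)
193 | #
194 | #   # actor step: deterministic policy gradient, i.e. maximize the critic's value
195 | #   loss_A, acts = f_A(observations, mask)
196 | #   f_Aup(lrate)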
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NMT-RDPG
2 | Neural machine translation with Recurrent Deterministic Policy Gradient
3 |
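4 | ## How it works (sketch)
5 |
6 | A rough summary of the objectives as implemented in `RDPG.py`, in informal
7 | notation (`Q(s, a)` here denotes the critic's value of the actor's action `a`
8 | given observation `s`):
9 |
10 | * actor (policy) loss: `loss_A = -mean(Q(s, a))`, i.e. the deterministic policy
11 |   gradient pushes the actor towards actions the critic values highly;
12 | * critic loss: `loss_B = mean((Q(s, a) - target)^2)`, a squared error against
13 |   the target returns.
14 |
15 | The actor is a feed-forward or layer-normalized GRU controller over the NMT
16 | readout, and the critic is a GRU over the concatenated observation and action
17 | (see `RDPG.py`). The shell scripts in the repository root (`run_train.sh`,
18 | `run_eval.sh`, `translate.sh`) appear to be the entry points for training,
19 | evaluation and translation.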
--------------------------------------------------------------------------------
/bleu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Natural Language Toolkit: BLEU Score
3 | #
4 | # Copyright (C) 2001-2016 NLTK Project
5 | # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
6 | # Contributors: Dmitrijs Milajevs, Liling Tan
7 | # URL:
8 | # For license information, see LICENSE.TXT
9 |
10 | """BLEU score implementation."""
11 | from __future__ import division
12 |
13 | import math
14 | import fractions
15 | from collections import Counter
16 |
17 | from nltk.util import ngrams
18 |
19 | try:
20 | fractions.Fraction(0, 1000, _normalize=False)
21 | from fractions import Fraction
22 | except TypeError:
23 | from nltk.compat import Fraction
24 |
25 |
26 | def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
27 | smoothing_function=None):
28 | """
29 | Calculate BLEU score (Bilingual Evaluation Understudy) from
30 | Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
31 | "BLEU: a method for automatic evaluation of machine translation."
32 | In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
33 |
34 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
35 | ... 'ensures', 'that', 'the', 'military', 'always',
36 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
37 |
38 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
39 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
40 | ... 'that', 'party', 'direct']
41 |
42 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
43 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
44 | ... 'heed', 'Party', 'commands']
45 |
46 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
47 | ... 'guarantees', 'the', 'military', 'forces', 'always',
48 | ... 'being', 'under', 'the', 'command', 'of', 'the',
49 | ... 'Party']
50 |
51 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
52 | ... 'army', 'always', 'to', 'heed', 'the', 'directions',
53 | ... 'of', 'the', 'party']
54 |
55 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
56 | 0.5045...
57 |
58 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
59 | 0.3969...
60 |
61 |     The default BLEU calculates a score for up to 4-grams using uniform
62 |     weights. To evaluate your translations with higher/lower order ngrams,
63 |     use customized weights. E.g. when accounting for up to 5-grams with uniform
64 |     weights:
65 |
66 | >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
67 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights)
68 | 0.45838627164939455
69 |
70 | :param references: reference sentences
71 | :type references: list(list(str))
72 | :param hypothesis: a hypothesis sentence
73 | :type hypothesis: list(str)
74 | :param weights: weights for unigrams, bigrams, trigrams and so on
75 | :type weights: list(float)
76 | :return: The sentence-level BLEU score.
77 | :rtype: float
78 | """
79 | return corpus_bleu([references], [hypothesis], weights, smoothing_function)
80 |
81 |
82 | def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
83 | smoothing_function=None):
84 | """
85 | Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
86 | the hypotheses and their respective references.
87 |
88 |     Instead of averaging the sentence-level BLEU scores (i.e. macro-average
89 |     precision), the original BLEU metric (Papineni et al. 2002) accounts for
90 |     the micro-average precision (i.e. summing the numerators and denominators
91 |     for each hypothesis-reference(s) pair before the division).
92 |
93 | >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
94 | ... 'ensures', 'that', 'the', 'military', 'always',
95 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
96 | >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
97 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
98 | ... 'heed', 'Party', 'commands']
99 | >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
100 | ... 'guarantees', 'the', 'military', 'forces', 'always',
101 | ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
102 | >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
103 | ... 'army', 'always', 'to', 'heed', 'the', 'directions',
104 | ... 'of', 'the', 'party']
105 |
106 | >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
107 | ... 'interested', 'in', 'world', 'history']
108 | >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
109 | ... 'because', 'he', 'read', 'the', 'book']
110 |
111 | >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
112 | >>> hypotheses = [hyp1, hyp2]
113 | >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
114 | 0.5920...
115 |
116 |     The example below shows that corpus_bleu() is different from averaging
117 |     sentence_bleu() over the hypotheses.
118 |
119 | >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
120 | >>> score2 = sentence_bleu([ref2a], hyp2)
121 | >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
122 | 0.6223...
123 |
124 | :param references: a corpus of lists of reference sentences, w.r.t. hypotheses
125 | :type references: list(list(list(str)))
126 | :param hypotheses: a list of hypothesis sentences
127 | :type hypotheses: list(list(str))
128 | :param weights: weights for unigrams, bigrams, trigrams and so on
129 | :type weights: list(float)
130 | :return: The corpus-level BLEU score.
131 | :rtype: float
132 | """
133 | # Before proceeding to compute BLEU, perform sanity checks.
134 |
135 | p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
136 | p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
137 | hyp_lengths, ref_lengths = 0, 0
138 |
139 | assert len(list_of_references) == len(
140 | hypotheses), "The number of hypotheses and their reference(s) should be the same"
141 |
142 | # Iterate through each hypothesis and their corresponding references.
143 | for references, hypothesis in zip(list_of_references, hypotheses):
144 | # For each order of ngram, calculate the numerator and
145 | # denominator for the corpus-level modified precision.
146 | for i, _ in enumerate(weights, start=1):
147 | p_i = modified_precision(references, hypothesis, i)
148 | p_numerators[i] += p_i.numerator
149 | p_denominators[i] += p_i.denominator
150 |
151 | # Calculate the hypothesis length and the closest reference length.
152 | # Adds them to the corpus-level hypothesis and reference counts.
153 | hyp_len = len(hypothesis)
154 | hyp_lengths += hyp_len
155 | ref_lengths += closest_ref_length(references, hyp_len)
156 |
157 | # Calculate corpus-level brevity penalty.
158 | bp = brevity_penalty(ref_lengths, hyp_lengths)
159 |
160 | # Collects the various precision values for the different ngram orders.
161 | p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
162 | for i, _ in enumerate(weights, start=1)]
163 |
164 |     # Returns (0, 0) if there are no matching n-grams.
165 |     # We only need to check p_numerators[1] == 0, since if there are no
166 |     # unigram matches, there won't be any higher-order ngram matches either.
167 | if p_numerators[1] == 0:
168 | return 0, 0
169 |
170 |     # Smooth the modified precisions.
171 |     # Note: the smoothing function may convert the Fraction values into floats.
172 | if not smoothing_function:
173 | smoothing_function = SmoothingFunction().method0
174 | p_n = smoothing_function(p_n, references=references,
175 | hypothesis=hypothesis, hyp_len=hyp_len)
176 |
177 | # Calculates the overall modified precision for all ngrams.
178 | # By sum of the product of the weights and the respective *p_n*
179 | s = (w * math.log(p_i) for w, p_i in zip(weights, p_n)
180 | if p_i.numerator != 0)
181 |
182 | # return bp * math.exp(math.fsum(s))
183 | return math.exp(math.fsum(s)), bp * math.exp(math.fsum(s))
184 |
185 |
186 | def modified_precision(references, hypothesis, n):
187 | """
188 | Calculate modified ngram precision.
189 |
190 |     The plain precision measure can assign a very high precision to poor
191 |     translations, e.g. a translation that simply repeats a single reference
192 |     word many times.
193 |
194 | This function only returns the Fraction object that contains the numerator
195 | and denominator necessary to calculate the corpus-level precision.
196 | To calculate the modified precision for a single pair of hypothesis and
197 | references, cast the Fraction object into a float.
198 |
199 |     The famous "the the the ... " example shows that plain precision can be
200 |     inflated by duplicating high-frequency words.
201 |
202 | >>> reference1 = 'the cat is on the mat'.split()
203 | >>> reference2 = 'there is a cat on the mat'.split()
204 | >>> hypothesis1 = 'the the the the the the the'.split()
205 | >>> references = [reference1, reference2]
206 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
207 | 0.2857...
208 |
209 | In the modified n-gram precision, a reference word will be considered
210 | exhausted after a matching hypothesis word is identified, e.g.
211 |
212 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
213 | ... 'ensures', 'that', 'the', 'military', 'will',
214 | ... 'forever', 'heed', 'Party', 'commands']
215 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
216 | ... 'guarantees', 'the', 'military', 'forces', 'always',
217 | ... 'being', 'under', 'the', 'command', 'of', 'the',
218 | ... 'Party']
219 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
220 | ... 'army', 'always', 'to', 'heed', 'the', 'directions',
221 | ... 'of', 'the', 'party']
222 | >>> hypothesis = 'of the'.split()
223 | >>> references = [reference1, reference2, reference3]
224 | >>> float(modified_precision(references, hypothesis, n=1))
225 | 1.0
226 | >>> float(modified_precision(references, hypothesis, n=2))
227 | 1.0
228 |
229 | An example of a normal machine translation hypothesis:
230 |
231 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
232 | ... 'ensures', 'that', 'the', 'military', 'always',
233 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
234 |
235 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
236 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
237 | ... 'that', 'party', 'direct']
238 |
239 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
240 | ... 'ensures', 'that', 'the', 'military', 'will',
241 | ... 'forever', 'heed', 'Party', 'commands']
242 |
243 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
244 | ... 'guarantees', 'the', 'military', 'forces', 'always',
245 | ... 'being', 'under', 'the', 'command', 'of', 'the',
246 | ... 'Party']
247 |
248 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
249 | ... 'army', 'always', 'to', 'heed', 'the', 'directions',
250 | ... 'of', 'the', 'party']
251 | >>> references = [reference1, reference2, reference3]
252 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
253 | 0.9444...
254 | >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
255 | 0.5714...
256 | >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
257 | 0.5882352941176471
258 | >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
259 | 0.07692...
260 |
261 |
262 | :param references: A list of reference translations.
263 | :type references: list(list(str))
264 | :param hypothesis: A hypothesis translation.
265 | :type hypothesis: list(str)
266 | :param n: The ngram order.
267 | :type n: int
268 | :return: BLEU's modified precision for the nth order ngram.
269 | :rtype: Fraction
270 | """
271 | # Extracts all ngrams in hypothesis.
272 | counts = Counter(ngrams(hypothesis, n))
273 |
274 | # Extract a union of references' counts.
275 | ## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
276 | max_counts = {}
277 | for reference in references:
278 | reference_counts = Counter(ngrams(reference, n))
279 | for ngram in counts:
280 | max_counts[ngram] = max(max_counts.get(ngram, 0),
281 | reference_counts[ngram])
282 |
283 | # Assigns the intersection between hypothesis and references' counts.
284 | clipped_counts = {ngram: min(count, max_counts[ngram])
285 | for ngram, count in counts.items()}
286 |
287 | numerator = sum(clipped_counts.values())
288 | # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
289 | # Usually this happens when the ngram order is > len(reference).
290 | denominator = max(1, sum(counts.values()))
291 |
292 | return Fraction(numerator, denominator, _normalize=False)
293 |
294 |
295 | def closest_ref_length(references, hyp_len):
296 | """
297 | This function finds the reference that is the closest length to the
298 | hypothesis. The closest reference length is referred to as *r* variable
299 |     from the brevity penalty formula in Papineni et al. (2002).
300 |
301 | :param references: A list of reference translations.
302 | :type references: list(list(str))
303 |     :param hyp_len: The length of the hypothesis.
304 |     :type hyp_len: int
305 | :return: The length of the reference that's closest to the hypothesis.
306 | :rtype: int
307 | """
308 | ref_lens = (len(reference) for reference in references)
309 | closest_ref_len = min(ref_lens, key=lambda ref_len:
310 | (abs(ref_len - hyp_len), ref_len))
311 | return closest_ref_len
312 |
313 |
314 | def brevity_penalty(closest_ref_len, hyp_len):
315 | """
316 | Calculate brevity penalty.
317 |
318 |     Since the modified n-gram precision alone still favors overly short
319 |     hypotheses, the brevity penalty is used to scale the overall BLEU
320 |     score down for short translations.
321 |
322 |     An example from the paper: there are three references with lengths 12, 15
323 |     and 17, and a concise hypothesis of length 12. The brevity penalty is 1.
324 |
325 | >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
326 | >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
327 | >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
328 | >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
329 | >>> references = [reference1, reference2, reference3]
330 | >>> hyp_len = len(hypothesis)
331 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
332 | >>> brevity_penalty(closest_ref_len, hyp_len)
333 | 1.0
334 |
335 | In case a hypothesis translation is shorter than the references, penalty is
336 | applied.
337 |
338 | >>> references = [['a'] * 28, ['a'] * 28]
339 | >>> hypothesis = ['a'] * 12
340 | >>> hyp_len = len(hypothesis)
341 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
342 | >>> brevity_penalty(closest_ref_len, hyp_len)
343 | 0.2635971381157267
344 |
345 | The length of the closest reference is used to compute the penalty. If the
346 | length of a hypothesis is 12, and the reference lengths are 13 and 2, the
347 |     penalty is applied because the hypothesis length (12) is less than the
348 | closest reference length (13).
349 |
350 | >>> references = [['a'] * 13, ['a'] * 2]
351 | >>> hypothesis = ['a'] * 12
352 | >>> hyp_len = len(hypothesis)
353 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
354 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
355 | 0.9200...
356 |
357 | The brevity penalty doesn't depend on reference order. More importantly,
358 | when two reference sentences are at the same distance, the shortest
359 | reference sentence length is used.
360 |
361 | >>> references = [['a'] * 13, ['a'] * 11]
362 | >>> hypothesis = ['a'] * 12
363 | >>> hyp_len = len(hypothesis)
364 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
365 | >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
366 | >>> hyp_len = len(hypothesis)
367 | >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
368 | >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
369 | >>> bp1 == bp2 == 1
370 | True
371 |
372 | A test example from mteval-v13a.pl (starting from the line 705):
373 |
374 | >>> references = [['a'] * 11, ['a'] * 8]
375 | >>> hypothesis = ['a'] * 7
376 | >>> hyp_len = len(hypothesis)
377 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
378 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
379 | 0.8668...
380 |
381 | >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
382 | >>> hypothesis = ['a'] * 7
383 | >>> hyp_len = len(hypothesis)
384 | >>> closest_ref_len = closest_ref_length(references, hyp_len)
385 | >>> brevity_penalty(closest_ref_len, hyp_len)
386 | 1.0
387 |
388 | :param hyp_len: The length of the hypothesis for a single sentence OR the
389 | sum of all the hypotheses' lengths for a corpus
390 | :type hyp_len: int
391 | :param closest_ref_len: The length of the closest reference for a single
392 | hypothesis OR the sum of all the closest references for every hypotheses.
393 | :type closest_reference_len: int
394 | :return: BLEU's brevity penalty.
395 | :rtype: float
396 | """
397 | if hyp_len > closest_ref_len:
398 | return 1
399 | # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
400 | elif hyp_len == 0:
401 | return 0
402 | else:
403 | return math.exp(1 - closest_ref_len / hyp_len)
404 |
405 |
406 | class SmoothingFunction:
407 | """
408 | This is an implementation of the smoothing techniques
409 | for segment-level BLEU scores that was presented in
410 | Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
411 | Smoothing Techniques for Sentence-Level BLEU. In WMT14.
412 | http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
413 | """
414 |
415 | def __init__(self, epsilon=0.1, alpha=5, k=5):
416 | """
417 | This will initialize the parameters required for the various smoothing
418 | techniques, the default values are set to the numbers used in the
419 | experiments from Chen and Cherry (2014).
420 |
421 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
422 | ... 'that', 'the', 'military', 'always', 'obeys', 'the',
423 | ... 'commands', 'of', 'the', 'party']
424 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
425 | ... 'that', 'the', 'military', 'will', 'forever', 'heed',
426 | ... 'Party', 'commands']
427 |
428 | >>> chencherry = SmoothingFunction()
429 | >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
430 | 0.4118...
431 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
432 | 0.4118...
433 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
434 | 0.4118...
435 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
436 | 0.4489...
437 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
438 | 0.4118...
439 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
440 | 0.4118...
441 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
442 | 0.4905...
443 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
444 | 0.1801...
445 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
446 | 0.4905...
447 |
448 |         :param epsilon: the epsilon value used in method 1
449 |         :type epsilon: float
450 |         :param alpha: the alpha value used in method 6
451 |         :type alpha: int
452 |         :param k: the k value used in method 4
453 |         :type k: int
454 | """
455 | self.epsilon = epsilon
456 | self.alpha = alpha
457 | self.k = k
458 |
459 | def method0(self, p_n, *args, **kwargs):
460 | """ No smoothing. """
461 | return p_n
462 |
463 | def method1(self, p_n, *args, **kwargs):
464 | """
465 | Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
466 | """
467 | return [(p_i.numerator + self.epsilon) / p_i.denominator
468 | if p_i.numerator == 0 else p_i for p_i in p_n]
469 |
470 | def method2(self, p_n, *args, **kwargs):
471 | """
472 | Smoothing method 2: Add 1 to both numerator and denominator from
473 | Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
474 | machine translation quality using longest common subsequence and
475 | skip-bigram statistics. In ACL04.
476 | """
477 | return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n]
478 |
479 | def method3(self, p_n, *args, **kwargs):
480 | """
481 | Smoothing method 3: NIST geometric sequence smoothing
482 | The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
483 | precision score whose matching n-gram count is null.
484 |         k is 1 for the first 'n' value for which the n-gram match count is null.
485 | For example, if the text contains:
486 | - one 2-gram match
487 | - and (consequently) two 1-gram matches
488 | the n-gram count for each individual precision score would be:
489 | - n=1 => prec_count = 2 (two unigrams)
490 | - n=2 => prec_count = 1 (one bigram)
491 | - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
492 | - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
493 | """
494 | incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
495 | for i, p_i in enumerate(p_n):
496 | if p_i.numerator == 0:
497 | p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
498 | incvnt += 1
499 | return p_n
500 |
501 | def method4(self, p_n, references, hypothesis, hyp_len):
502 | """
503 | Smoothing method 4:
504 | Shorter translations may have inflated precision values due to having
505 | smaller denominators; therefore, we give them proportionally
506 | smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
507 | suggests dividing by 1/ln(len(T)), where T is the length of the translation.
508 | """
509 | incvnt = 1
510 | for i, p_i in enumerate(p_n):
511 | if p_i.numerator == 0 and hyp_len != 0:
512 | p_n[i] = incvnt * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
513 | incvnt += 1
514 | return p_n
515 |
516 | def method5(self, p_n, references, hypothesis, hyp_len):
517 | """
518 | Smoothing method 5:
519 |         The matched counts for similar values of n should be similar. To
520 |         calculate the n-gram matched count, it averages the n−1, n and n+1 gram
521 | matched counts.
522 | """
523 | m = {}
524 |         # Requires a precision value for an additional ngram order.
525 | p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
526 | m[-1] = p_n[0] + 1
527 | for i, p_i in enumerate(p_n):
528 | p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
529 | m[i] = p_n[i]
530 | return p_n
531 |
532 | def method6(self, p_n, references, hypothesis, hyp_len):
533 | """
534 | Smoothing method 6:
535 | Interpolates the maximum likelihood estimate of the precision *p_n* with
536 | a prior estimate *pi0*. The prior is estimated by assuming that the ratio
537 | between pn and pn−1 will be the same as that between pn−1 and pn−2.
538 | """
539 | for i, p_i in enumerate(p_n):
540 |             if i in [0, 1]:  # Skips the first 2 orders of ngrams.
541 | continue
542 | else:
543 | pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
544 | # No. of ngrams in translation.
545 | l = sum(1 for _ in ngrams(hypothesis, i + 1))
546 | p_n[i] = (p_i + self.alpha * pi0) / (l + self.alpha)
547 | return p_n
548 |
549 | def method7(self, p_n, references, hypothesis, hyp_len):
550 | """
551 |         Smoothing method 7:
552 |         Interpolates methods 4 and 5: first applies the method-4 smoothing to the
553 |         zero-count precisions, then the method-5 averaging over neighboring
554 |         n-gram orders.
555 | """
556 | p_n = self.method4(p_n, references, hypothesis, hyp_len)
557 | p_n = self.method5(p_n, references, hypothesis, hyp_len)
558 | return p_n
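559 |
560 | # --------------------------------------------------------------------------------
561 | # Note: unlike NLTK's corpus_bleu(), the version above returns a pair -- the
562 | # weighted geometric mean of the n-gram precisions without the brevity penalty,
563 | # and the full BLEU score with it. A minimal sketch of how it might be called,
564 | # with hypothetical token lists:
565 | #
566 | #   refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
567 | #   hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
568 | #   prec_only, full_bleu = corpus_bleu(refs, hyps)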
--------------------------------------------------------------------------------
/data_iterator.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | import cPickle as pkl
4 | import gzip
5 |
6 |
7 | def fopen(filename, mode='r'):
8 | if filename.endswith('.gz'):
9 | return gzip.open(filename, mode)
10 | return open(filename, mode)
11 |
12 |
13 | class TextIterator:
14 | """Simple Bitext iterator."""
15 | def __init__(self, source, target,
16 | source_dict, target_dict,
17 | batch_size=128,
18 | maxlen=100,
19 | n_words_source=-1,
20 | n_words_target=-1,
21 | cache=5):
22 |
23 | self.source = fopen(source, 'r')
24 | self.target = fopen(target, 'r')
25 |
26 | print 'scan the dataset.'
27 | for si, _ in enumerate(self.source):
28 | pass
29 | for ti, _ in enumerate(self.target):
30 | pass
31 |
32 | self.source.close()
33 | self.target.close()
34 |
35 |         assert si == ti, 'the source and target documents must have the same number of lines'
36 | print 'scanned {} lines'.format(si)
37 |
38 | self.source = fopen(source, 'r')
39 | self.target = fopen(target, 'r')
40 |
41 | with open(source_dict, 'rb') as f:
42 | self.source_dict = pkl.load(f)
43 | with open(target_dict, 'rb') as f:
44 | self.target_dict = pkl.load(f)
45 |
46 | self.num = si
47 | self.batch_size = batch_size
48 | self.maxlen = maxlen
49 |
50 | self.n_words_source = n_words_source
51 | self.n_words_target = n_words_target
52 |
53 | self.source_buffer = []
54 | self.target_buffer = []
55 | self.k = batch_size * cache
56 |
57 | self.end_of_data = False
58 |
59 |
60 |
61 |
62 | def __iter__(self):
63 | return self
64 |
65 | def reset(self):
66 | self.source.seek(0)
67 | self.target.seek(0)
68 |
69 | def next(self):
70 | if self.end_of_data:
71 | self.end_of_data = False
72 | self.reset()
73 | raise StopIteration
74 |
75 | source = []
76 | target = []
77 |
78 | # fill buffer, if it's empty
79 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
80 |
81 | if len(self.source_buffer) == 0:
82 | for k_ in xrange(self.k):
83 | ss = self.source.readline()
84 | if ss == "":
85 | break
86 | tt = self.target.readline()
87 | if tt == "":
88 | break
89 |
90 | self.source_buffer.append(ss.strip().split())
91 | self.target_buffer.append(tt.strip().split())
92 |
93 | # sort by target buffer
94 | tlen = numpy.array([len(t) for t in self.target_buffer])
95 | tidx = tlen.argsort()
96 |
97 | _sbuf = [self.source_buffer[i] for i in tidx]
98 | _tbuf = [self.target_buffer[i] for i in tidx]
99 |
100 | self.source_buffer = _sbuf
101 | self.target_buffer = _tbuf
102 |
103 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
104 | self.end_of_data = False
105 | self.reset()
106 | raise StopIteration
107 |
108 | try:
109 |
110 | # actual work here
111 | while True:
112 |
113 | # read from source file and map to word index
114 | try:
115 | ss = self.source_buffer.pop()
116 | except IndexError:
117 | break
118 | ss = [self.source_dict[w] if w in self.source_dict else 1
119 | for w in ss]
120 | if self.n_words_source > 0:
121 | ss = [w if w < self.n_words_source else 1 for w in ss]
122 |
123 |                 # read from target file and map to word index
124 | tt = self.target_buffer.pop()
125 | tt = [self.target_dict[w] if w in self.target_dict else 1
126 | for w in tt]
127 | if self.n_words_target > 0:
128 | tt = [w if w < self.n_words_target else 1 for w in tt]
129 |
130 | if len(ss) > self.maxlen and len(tt) > self.maxlen:
131 | continue
132 |
133 | source.append(ss)
134 | target.append(tt)
135 |
136 | if len(source) >= self.batch_size or \
137 | len(target) >= self.batch_size:
138 | break
139 | except IOError:
140 | self.end_of_data = True
141 |
142 | if len(source) <= 0 or len(target) <= 0:
143 | self.end_of_data = False
144 | self.reset()
145 | raise StopIteration
146 |
147 | return source, target
148 |
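149 | # --------------------------------------------------------------------------------
150 | # Illustrative usage sketch (the corpus files and vocabulary pickles below are
151 | # hypothetical placeholders). The iterator yields (source, target) minibatches
152 | # of word-index lists, sorted by target length within each cache of
153 | # `batch_size * cache` sentence pairs:
154 | #
155 | #   train = TextIterator('train.src', 'train.trg',
156 | #                        'vocab.src.pkl', 'vocab.trg.pkl',
157 | #                        batch_size=64, maxlen=50)
158 | #   for source, target in train:
159 | #       pass  # source/target are lists of up to 64 index sequences each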
--------------------------------------------------------------------------------
/insepection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import matplotlib
3 | # matplotlib.use('agg')
4 | import copy
5 | import numpy
6 | import os
7 | import seaborn as sns
8 | import pandas as pd
9 | sns.set(context="paper", font="monospace", style='whitegrid')
10 | from matplotlib import pyplot as plot
11 | from matplotlib import rc
12 |
13 | rc('font',**{'family':'Verdana', 'weight': 'normal'})
14 | rc('font', size=8)
15 | rc('text', usetex=True)
16 | rc('text.latex',unicode=True)
17 | rc('text.latex',preamble='\usepackage[utf8]{inputenc}')
18 | rc('text.latex',preamble='\usepackage[russian]{babel}')
19 | rc('text.latex',preamble='\usepackage[german]{babel}')
20 | rc('text.latex',preamble='\usepackage[ngerman]{babel}')
21 |
22 | matplotlib.rcParams['ytick.labelsize'] = 11
23 | matplotlib.rcParams['xtick.labelsize'] = 11
24 |
25 | def heatmap(sources, refs, trans, actions, idx, atten=None, savefig=True, name='test', info=None, show=False):
26 | source = [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
27 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||']
28 | action = actions[idx]
29 |
30 |
31 | if atten:
32 | attention = numpy.array(atten[idx])
33 |
34 | def track(acts, data, annote):
35 | x, y = 0, 0
36 | for a in acts:
37 | x += a
38 | y += 1 - a
39 | # print a, x, y, target[x].encode('utf8')
40 | data[y, x] = 1
41 | annote[y, x] = 'W' if a == 0 else 'C'
42 |
43 | return data, annote
44 | # print target
45 |
46 | data = numpy.zeros((len(source), len(target)))
47 | annote = numpy.chararray(data.shape, itemsize=8)
48 | annote[:] = ''
49 | data, annote = track(action, data, annote)
50 | data[0, 0] = 1
51 | annote[0, 0] = 'S'
52 | if atten:
53 | data[:-1, 1:] += attention.T
54 |
55 | d = pd.DataFrame(data=data, columns=target, index=source)
56 | # p = sns.diverging_palette(220, 10, as_cmap=True)
57 | f, ax = plot.subplots(figsize=(11, 11))
58 | f.set_canvas(plot.gcf().canvas)
59 | g = sns.heatmap(d, ax=ax, annot=annote, fmt='s')
60 | g.xaxis.tick_top()
61 |
62 | plot.xticks(rotation=90)
63 | plot.yticks(rotation=0)
64 | # plot.show()
65 | if savefig:
66 | if not os.path.exists('.images/C_{}'.format(name)):
67 | os.mkdir('.images/C_{}'.format(name))
68 |
69 | filename = 'Idx={}||'.format(info['index'])
70 | for w in info:
71 |             if w != 'index':
72 | filename += '.{}={:.2f}'.format(w, float(info[w]))
73 |
74 | print 'saving...'
75 | f.savefig('.images/C_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100)
76 | if show:
77 | plot.show()
78 |
79 | print 'plotting done.'
80 | plot.close()
81 |
82 | def heatmap2(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False):
83 | source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
84 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*']
85 | action = actions[idx]
86 |
87 | flag = 0
88 | if atten:
89 | attention = numpy.array(atten[idx])
90 | else:
91 | attention = None
92 |
93 | if full_atten:
94 | fullatten = numpy.array(full_atten[idx])
95 | else:
96 | fullatten = None
97 |
98 | def track(acts, data, annote):
99 | x, y, z = 0, 0, 0
100 | for a in acts:
101 | x += (a == 1)
102 | y += (a == 0)
103 | z += (a == 2)
104 |
105 | # data[y + 1, x] = 1
106 | # data[z, x + 1] = 1
107 | # annote[y, x] = 'W' if a == 0 else 'C'
108 |
109 | return data, annote
110 | # print target
111 |
112 | data = numpy.zeros((len(source), len(target)))
113 | annote = numpy.chararray(data.shape, itemsize=8)
114 | annote[:] = ''
115 | data, annote = track(action, data, annote)
116 | data[1, 0] = 1
117 |
118 | def draw(data_t, ax, attention=None):
119 |
120 | data = copy.copy(data_t)
121 | data[1:-1, 1:-1] += attention.T
122 | d = pd.DataFrame(data=data, columns=target, index=source)
123 | # p = sns.diverging_palette(220, 10, as_cmap=True)
124 | g = sns.heatmap(d, mask=(data==0), square=True, cbar=False, linewidths=0.1, ax=ax, annot=annote, fmt='s')
125 | g.xaxis.tick_top()
126 |
127 | for tick in ax.get_xticklabels():
128 | tick.set_rotation(90)
129 | for tick in ax.get_yticklabels():
130 | tick.set_rotation(0)
131 |
132 | ax.grid(True)
133 | f, [ax1, ax2] = plot.subplots(1, 2, figsize=(22, 11))
134 | f.set_canvas(plot.gcf().canvas)
135 |
136 | draw(data, ax1, attention)
137 | # plot.xticks(rotation=90)
138 | # plot.yticks(rotation=0)
139 | # plot.grid()
140 |
141 | draw(data, ax2, fullatten)
142 | # plot.xticks(rotation=90)
143 | # plot.yticks(rotation=0)
144 | # plot.grid()
145 |
146 |
147 | if savefig:
148 | if not os.path.exists('.images/M_{}'.format(name)):
149 | os.mkdir('.images/M_{}'.format(name))
150 |
151 | filename = 'Idx={}||'.format(info['index'])
152 | for w in info:
153 |             if w != 'index':
154 | filename += '.{}={:.2f}'.format(w, float(info[w]))
155 |
156 | print 'saving...'
157 | plot.savefig('.images/M_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100)
158 |
159 | if show:
160 | plot.show()
161 |
162 | print 'plotting done.'
163 | plot.close()
164 |
165 |
166 |
167 |
168 |
169 |
170 | def visualize(sources, refs, trans, aligns, idx, savefig=True, name='test', info=None):
171 |
172 | colors = ['b', 'g']
173 |
174 | fig = plot.figure(figsize=(20, 2))
175 | ax = plot.gca()
176 |
177 | # plot.hold('on')
178 |
179 | plot.xlim([0., 10.])
180 |
181 | scolors = []
182 | caidx = 0
183 | coloridx = 0
184 | for sidx in xrange(len([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + [''])):
185 | if caidx >= len(numpy.unique(aligns[idx])) or sidx >= numpy.unique(aligns[idx])[caidx]:
186 | caidx = caidx + 1
187 | coloridx = 1 - coloridx
188 | scolors.append(colors[coloridx])
189 |
190 | tcolors = []
191 | lastidx = -1
192 | coloridx = 1
193 | for tt in aligns[idx]:
194 | if tt != lastidx:
195 | lastidx = tt
196 | coloridx = 1 - coloridx
197 | tcolors.append(colors[coloridx])
198 |
199 | x, y = 0., 1.
200 | s_pos = [(x, y)]
201 | for ii, ss in enumerate([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + ['']):
202 |
203 |         ss = ss.replace('%', '\%')
204 | xx = plot.text(x, y, ss)
205 | xx.set_bbox(dict(color=scolors[ii], alpha=0.1, edgecolor=scolors[ii]))
206 | xx._renderer = fig.canvas.get_renderer()
207 | wext = xx.get_window_extent()
208 | bbox = ax.transData.inverted().transform(wext)
209 | x = bbox[1, 0] + 0.
210 | s_pos.append((x, y))
211 | s_pos.append((bbox[1, 0], y))
212 |
213 | x, y = 0., .95
214 | t_pos = []
215 | for ii, ss in enumerate([s_.decode('utf8').replace('@@', '--') for s_ in trans[idx].split()]):
216 |
217 |         ss = ss.replace('%', '\%')
218 | xx = plot.text(x, y, ss)
219 | xx._renderer = fig.canvas.get_renderer()
220 | wext = xx.get_window_extent()
221 | bbox = ax.transData.inverted().transform(wext)
222 | t_pos.append((bbox[0, 0], bbox[0, 1] + 0.03))
223 | x = bbox[1, 0] + 0.
224 | t_pos.append((bbox[1, 0], bbox[0, 1] + 0.03))
225 |
226 | lasttidx = 0
227 | lastidx = -1
228 | for tidx, sidx in enumerate(aligns[idx]):
229 | if lastidx != sidx:
230 | lastidx = sidx
231 | lasttidx = tidx
232 | sidx = numpy.minimum(sidx, len(s_pos) - 1)
233 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
234 | t_pos[tidx][0] - s_pos[sidx][0],
235 | t_pos[tidx][1] - s_pos[sidx][1],
236 | head_width=0., head_length=0.,
237 | fc=tcolors[tidx], ec=tcolors[tidx],
238 | linestyle='dotted', width=0.0001)
239 | for tt in xrange(tidx, len(aligns[idx])):
240 | if aligns[idx][tt] != sidx:
241 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
242 | t_pos[tt][0] - s_pos[sidx][0],
243 | t_pos[tt][1] - s_pos[sidx][1],
244 | head_width=0., head_length=0.,
245 | fc=tcolors[tidx], ec=tcolors[tidx],
246 | linestyle='dotted', width=0.0001)
247 | plot.fill_between([t_pos[tidx][0], s_pos[sidx][0], t_pos[tt][0]],
248 | [t_pos[tidx][1], s_pos[sidx][1], t_pos[tt][1]],
249 | facecolor=tcolors[tidx], alpha=0.1)
250 | break
251 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
252 | t_pos[-1][0] - s_pos[sidx][0],
253 | t_pos[-1][1] - s_pos[sidx][1],
254 | head_width=0., head_length=0.,
255 | fc=tcolors[-1], ec=tcolors[-1],
256 | linestyle='dotted', width=0.0001)
257 | plot.fill_between([t_pos[lasttidx][0], s_pos[sidx][0], t_pos[-1][0]],
258 | [t_pos[lasttidx][1], s_pos[sidx][1], t_pos[-1][1]],
259 | facecolor=tcolors[tidx], alpha=0.1)
260 |
261 | # plot.hold('off')
262 |
263 | plot.axis('off')
264 | plot.ylim([0.95, 1.01])
265 | plot.tight_layout()
266 |
267 | if savefig:
268 | if not os.path.exists('.images/{}'.format(name)):
269 | os.mkdir('.images/{}'.format(name))
270 |
271 | filename = 'Idx={}||'.format(info['index'])
272 | for w in info:
273 |             if w != 'index':
274 | filename += '.{}={:.2f}'.format(w, float(info[w]))
275 |
276 | plot.savefig('.images/{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=300)
277 |
278 | print 'plotting done.'
279 | plot.close()
280 | # plot.show()
281 |
282 |
283 | if __name__ == "__main__":
284 |
285 | sources = ['I cannot understand .']
286 | targets = ['Ich verstehe nicht .']
287 | actions = [[0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 1]]
288 | heatmap2(sources, targets, targets, actions, 0, savefig=False, show=True)
289 |
--------------------------------------------------------------------------------
/layers.py:
--------------------------------------------------------------------------------
1 | """
2 | Build the basic layers for neural machine translation
3 | """
4 | import warnings
5 | import os
6 | import theano
7 | import theano.tensor as tensor
8 | import numpy
9 |
10 | from collections import OrderedDict
11 |
12 | profile = False
13 | TINY = 1e-7
14 |
15 | # -------------------------------------------------------------------------#
16 | # Basic utils:
17 | # push parameters to Theano shared variables
18 | def zipp(params, tparams):
19 | for kk, vv in params.iteritems():
20 | tparams[kk].set_value(vv)
21 |
22 |
23 | # pull parameters from Theano shared variables
24 | def unzip(zipped, new_params=None):
25 | if new_params is None:
26 | new_params = OrderedDict()
27 |
28 | for kk, vv in zipped.iteritems():
29 | new_params[kk] = vv.get_value()
30 | return new_params
31 |
32 |
33 | # flatten-grad
34 | def flatcat(arrays):
35 | '''
36 | Flattens arrays and concatenates them in order.
37 | '''
38 | return tensor.concatenate([a.flatten() for a in arrays])
39 |
40 | def flatgrad(loss, vars_):
41 | return flatcat(tensor.grad(loss, wrt=itemlist(vars_)))
42 |
43 | def zipsame(*seqs):
44 | L = len(seqs[0])
45 | assert all(len(seq) == L for seq in seqs[1:])
46 | return zip(*seqs)
47 |
48 |
49 |
50 | # ------------------------------------------------------------------------#
51 | # get the list of parameters: Note that tparams must be OrderedDict
52 | def itemlist(tparams, exception=None):
53 | if not exception:
54 | return [vv for kk, vv in tparams.iteritems()]
55 |
56 | return [vv for kk, vv in tparams.iteritems() if kk not in exception]
57 |
58 | # make prefix-appended name
59 | def _p(pp, name):
60 | return '%s_%s' % (pp, name)
61 |
62 | # initialize Theano shared variables according to the initial parameters
63 | def init_tparams(params):
64 | tparams = OrderedDict()
65 | for kk, pp in params.iteritems():
66 | tparams[kk] = theano.shared(params[kk], name=kk)
67 | return tparams
68 |
69 |
70 | # load parameters
71 | def load_params(path, params):
72 | pp = numpy.load(path)
73 | for kk, vv in params.iteritems():
74 | if kk not in pp:
75 | warnings.warn('%s is not in the archive' % kk)
76 | continue
77 | print 'loading {}: {}'.format(kk, pp[kk].shape)
78 | params[kk] = pp[kk]
79 |
80 | return params
81 |
82 | # layer normalization
83 | def ln(x, b, s):
84 | _eps = 1e-5
85 | output = (x - x.mean(1)[:,None]) / tensor.sqrt((x.var(1)[:,None] + _eps))
86 | output = s[None, :] * output + b[None,:]
87 | return output
88 |
89 |
90 | # -------------------------------------------------------------------------#
91 | # Layers:
92 | # 'layer-name': ('parameter initializer', 'computational graph') -- registration
93 | layers = dict()
94 | layers['ff'] = ('param_init_fflayer', 'fflayer')
95 | layers['gru'] = ('param_init_gru', 'gru_layer')
96 | layers['gru_cond'] = ('param_init_gru_cond', 'gru_cond_layer', 'gru_cond_context', 'gru_cond_update')
97 | layers['lngru'] = ('param_init_lngru', 'lngru_layer')
98 |
99 | def get_layer(name):
100 | fns = layers[name]
101 | return (eval(fns[0]), eval(fns[1]))
102 |
103 | # some utilities
104 | def ortho_weight(ndim):
105 | W = numpy.random.randn(ndim, ndim)
106 | u, s, v = numpy.linalg.svd(W)
107 | return u.astype('float32')
108 |
109 | # norm initialization
110 | def norm_weight(nin, nout=None, scale=0.01, ortho=True):
111 | if nout is None:
112 | nout = nin
113 | if nout == nin and ortho:
114 | W = ortho_weight(nin)
115 | else:
116 | W = scale * numpy.random.randn(nin, nout)
117 |
118 | return W.astype('float32')
119 |
120 |
121 | def tanh(x):
122 | return tensor.tanh(x)
123 |
124 | def linear(x):
125 | return x
126 |
127 | def sigmoid(x):
128 | return tensor.nnet.sigmoid(x)
129 |
130 | def relu(x):
131 | return tensor.nnet.relu(x)
132 |
133 | def softmax(x):
134 | return tensor.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)
135 |
136 | def concatenate(tensor_list, axis=0):
137 | """
138 | Alternative implementation of `theano.tensor.concatenate`.
139 | This function does exactly the same thing, but contrary to Theano's own
140 | implementation, the gradient is implemented on the GPU.
141 | Backpropagating through `theano.tensor.concatenate` yields slowdowns
142 | because the inverse operation (splitting) needs to be done on the CPU.
143 | This implementation does not have that problem.
144 | :usage:
145 | >>> x, y = theano.tensor.matrices('x', 'y')
146 | >>> c = concatenate([x, y], axis=1)
147 | :parameters:
148 | - tensor_list : list
149 | list of Theano tensor expressions that should be concatenated.
150 | - axis : int
151 | the tensors will be joined along this axis.
152 | :returns:
153 | - out : tensor
154 | the concatenated tensor expression.
155 | """
156 | concat_size = sum(tt.shape[axis] for tt in tensor_list)
157 |
158 | output_shape = ()
159 | for k in range(axis):
160 | output_shape += (tensor_list[0].shape[k],)
161 | output_shape += (concat_size,)
162 | for k in range(axis + 1, tensor_list[0].ndim):
163 | output_shape += (tensor_list[0].shape[k],)
164 |
165 | out = tensor.zeros(output_shape)
166 | offset = 0
167 | for tt in tensor_list:
168 | indices = ()
169 | for k in range(axis):
170 | indices += (slice(None),)
171 | indices += (slice(offset, offset + tt.shape[axis]),)
172 | for k in range(axis + 1, tensor_list[0].ndim):
173 | indices += (slice(None),)
174 |
175 | out = tensor.set_subtensor(out[indices], tt)
176 | offset += tt.shape[axis]
177 |
178 | return out
179 |
180 | #-------------------------------------------------------------------------#
181 | # Dropout:
182 |
183 | def dropout_layer(state_before, use_noise, trng):
184 | proj = tensor.switch(
185 | use_noise,
186 | state_before * trng.binomial(state_before.shape, p=0.5, n=1,
187 | dtype=state_before.dtype),
188 | state_before * 0.5)
189 | return proj
190 |
191 |
192 | # -------------------------------------------------------------------------#
193 | # Feedforward:
194 | # affine transformation + point-wise nonlinearity
195 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
196 | ortho=True, negative=0, scale=0.01):
197 | if nin is None:
198 | nin = options['dim_proj']
199 | if nout is None:
200 | nout = options['dim_proj']
201 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=scale, ortho=ortho)
202 | if negative == 0:
203 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')
204 | else:
205 | params[_p(prefix, 'b')] = numpy.ones((nout,)).astype('float32') * negative
206 |
207 | return params
208 |
209 |
210 | def fflayer(tparams, state_below, options, prefix='rconv',
211 | activ='lambda x: tensor.tanh(x)', **kwargs):
212 | return eval(activ)(
213 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
214 | tparams[_p(prefix, 'b')])
215 |
216 |
217 | # -------------------------------------------------------------------------#
218 | # Gated Recurrent Unit:
219 | #
220 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None, scale=0.01):
221 | if nin is None:
222 | nin = options['dim_proj']
223 | if dim is None:
224 | dim = options['dim_proj']
225 |
226 | # embedding to gates transformation weights, biases
227 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale),
228 | norm_weight(nin, dim, scale=scale)], axis=1)
229 | params[_p(prefix, 'W')] = W
230 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
231 |
232 | # recurrent transformation weights for gates
233 | U = numpy.concatenate([ortho_weight(dim),
234 | ortho_weight(dim)], axis=1)
235 | params[_p(prefix, 'U')] = U
236 |
237 | # embedding to hidden state proposal weights, biases
238 | Wx = norm_weight(nin, dim, scale=scale)
239 | params[_p(prefix, 'Wx')] = Wx
240 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
241 |
242 | # recurrent transformation weights for hidden state proposal
243 | Ux = ortho_weight(dim)
244 | params[_p(prefix, 'Ux')] = Ux
245 |
246 | return params
247 |
248 |
249 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
250 | one_step=False, _init_state=None, **kwargs):
251 | if one_step:
252 | assert _init_state, 'previous state must be provided'
253 |
254 | nsteps = state_below.shape[0]
255 | if state_below.ndim == 3:
256 | n_samples = state_below.shape[1]
257 | else:
258 | n_samples = 1
259 |
260 | dim = tparams[_p(prefix, 'Ux')].shape[1]
261 |
262 | if mask is None:
263 | mask = tensor.alloc(1., state_below.shape[0], 1)
264 |
265 | # utility function to slice a tensor
266 | def _slice(_x, n, dim):
267 | if _x.ndim == 3:
268 | return _x[:, :, n*dim:(n+1)*dim]
269 | return _x[:, n*dim:(n+1)*dim]
270 |
271 | # state_below is the input word embeddings
272 | # input to the gates, concatenated
273 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
274 | tparams[_p(prefix, 'b')]
275 | # input to compute the hidden state proposal
276 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
277 | tparams[_p(prefix, 'bx')]
278 |
279 | # step function to be used by scan
280 | # arguments | sequences |outputs-info| non-seqs
281 | def _step_slice(m_, x_, xx_, h_, U, Ux):
282 | preact = tensor.dot(h_, U)
283 | preact += x_
284 |
285 | # reset and update gates
286 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
287 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
288 |
289 | # compute the hidden state proposal
290 | preactx = tensor.dot(h_, Ux)
291 | preactx = preactx * r
292 | preactx = preactx + xx_
293 |
294 | # hidden state proposal
295 | h = tensor.tanh(preactx)
296 |
297 | # leaky integrate and obtain next hidden state
298 | h = u * h_ + (1. - u) * h
299 | h = m_[:, None] * h + (1. - m_)[:, None] * h_
300 |
301 | return h
302 |
303 | # prepare scan arguments
304 | seqs = [mask, state_below_, state_belowx]
305 | init_states = [tensor.alloc(0., n_samples, dim)]
306 | _step = _step_slice
307 | shared_vars = [tparams[_p(prefix, 'U')],
308 | tparams[_p(prefix, 'Ux')]]
309 |
310 | if one_step:
311 | rval = _step(*(seqs + [_init_state] + shared_vars))
312 | else:
313 | rval, updates = theano.scan(_step,
314 | sequences=seqs,
315 | outputs_info=init_states,
316 | non_sequences=shared_vars,
317 | name=_p(prefix, '_layers'),
318 | n_steps=nsteps,
319 | profile=profile,
320 | strict=True)
321 | rval = [rval]
322 | return rval
323 |
324 | # -------------------------------------------------------------------------#
325 | # Conditional Gated Recurrent Unit with Attention (GRU_cond)
326 | #
327 | def param_init_gru_cond(options, params, prefix='gru_cond',
328 | nin=None, dim=None, dimctx=None,
329 | nin_nonlin=None, dim_nonlin=None, scale=0.01):
330 | if nin is None:
331 | nin = options['dim']
332 | if dim is None:
333 | dim = options['dim']
334 | if dimctx is None:
335 | dimctx = options['dim']
336 | if nin_nonlin is None:
337 | nin_nonlin = nin
338 | if dim_nonlin is None:
339 | dim_nonlin = dim
340 |
341 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale),
342 | norm_weight(nin, dim, scale=scale)], axis=1)
343 | params[_p(prefix, 'W')] = W
344 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
345 | U = numpy.concatenate([ortho_weight(dim_nonlin),
346 | ortho_weight(dim_nonlin)], axis=1)
347 | params[_p(prefix, 'U')] = U
348 |
349 | Wx = norm_weight(nin_nonlin, dim_nonlin, scale=scale)
350 | params[_p(prefix, 'Wx')] = Wx
351 | Ux = ortho_weight(dim_nonlin)
352 | params[_p(prefix, 'Ux')] = Ux
353 | params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32')
354 |
355 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin),
356 | ortho_weight(dim_nonlin)], axis=1)
357 | params[_p(prefix, 'U_nl')] = U_nl
358 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32')
359 |
360 | Ux_nl = ortho_weight(dim_nonlin)
361 | params[_p(prefix, 'Ux_nl')] = Ux_nl
362 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32')
363 |
364 | # context to LSTM
365 | Wc = norm_weight(dimctx, dim*2, scale=scale)
366 | params[_p(prefix, 'Wc')] = Wc
367 |
368 | Wcx = norm_weight(dimctx, dim, scale=scale)
369 | params[_p(prefix, 'Wcx')] = Wcx
370 |
371 | # attention: combined -> hidden
372 | W_comb_att = norm_weight(dim, dimctx, scale=scale)
373 | params[_p(prefix, 'W_comb_att')] = W_comb_att
374 |
375 | # attention: context -> hidden
376 | Wc_att = norm_weight(dimctx, scale=scale)
377 | params[_p(prefix, 'Wc_att')] = Wc_att
378 |
379 | # attention: hidden bias
380 | b_att = numpy.zeros((dimctx,)).astype('float32')
381 | params[_p(prefix, 'b_att')] = b_att
382 |
383 | # attention:
384 | U_att = norm_weight(dimctx, 1, scale=scale)
385 | params[_p(prefix, 'U_att')] = U_att
386 | c_att = numpy.zeros((1,)).astype('float32')
387 | params[_p(prefix, 'c_tt')] = c_att
388 |
389 | return params
390 |
391 |
392 | def gru_cond_layer(tparams, state_below, options, prefix='gru',
393 | mask=None, context=None, one_step=False,
394 | init_memory=None, init_state=None,
395 | context_mask=None,
396 | **kwargs):
397 |
398 | assert context, 'Context must be provided'
399 |
400 | if one_step:
401 | assert init_state, 'previous state must be provided'
402 |
403 | nsteps = state_below.shape[0]
404 | if state_below.ndim == 3:
405 | n_samples = state_below.shape[1]
406 | else:
407 | n_samples = 1
408 |
409 | # mask
410 | if mask is None:
411 | mask = tensor.alloc(1., state_below.shape[0], 1)
412 |
413 | dim = tparams[_p(prefix, 'Wcx')].shape[1]
414 |
415 | # initial/previous state
416 | if init_state is None:
417 | init_state = tensor.alloc(0., n_samples, dim)
418 |
419 | # projected context
420 | assert context.ndim == 3, \
421 | 'Context must be 3-d: #annotation x #sample x dim'
422 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\
423 | tparams[_p(prefix, 'b_att')]
424 |
425 | def _slice(_x, n, dim):
426 | if _x.ndim == 3:
427 | return _x[:, :, n*dim:(n+1)*dim]
428 | return _x[:, n*dim:(n+1)*dim]
429 |
430 | # projected x
431 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\
432 | tparams[_p(prefix, 'bx')]
433 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\
434 | tparams[_p(prefix, 'b')]
435 |
436 | def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_,
437 | U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
438 | U_nl, Ux_nl, b_nl, bx_nl):
439 | preact1 = tensor.dot(h_, U)
440 | preact1 += x_
441 | preact1 = tensor.nnet.sigmoid(preact1)
442 |
443 | r1 = _slice(preact1, 0, dim)
444 | u1 = _slice(preact1, 1, dim)
445 |
446 | preactx1 = tensor.dot(h_, Ux)
447 | preactx1 *= r1
448 | preactx1 += xx_
449 |
450 | h1 = tensor.tanh(preactx1)
451 |
452 | h1 = u1 * h_ + (1. - u1) * h1
453 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
454 |
455 | # attention
456 | pstate_ = tensor.dot(h1, W_comb_att)
457 | pctx__ = pctx_ + pstate_[None, :, :]
458 | #pctx__ += xc_
459 | pctx__ = tensor.tanh(pctx__)
460 | alpha = tensor.dot(pctx__, U_att)+c_tt
461 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
462 | alpha = tensor.exp(alpha)
463 |
464 | if context_mask:
465 | alpha = alpha * context_mask
466 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY)
467 |
468 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context
469 |
470 | preact2 = tensor.dot(h1, U_nl)+b_nl
471 | preact2 += tensor.dot(ctx_, Wc)
472 | preact2 = tensor.nnet.sigmoid(preact2)
473 |
474 | r2 = _slice(preact2, 0, dim)
475 | u2 = _slice(preact2, 1, dim)
476 |
477 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl
478 | preactx2 *= r2
479 | preactx2 += tensor.dot(ctx_, Wcx)
480 |
481 | h2 = tensor.tanh(preactx2)
482 |
483 | h2 = u2 * h1 + (1. - u2) * h2
484 | h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
485 |
486 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u
487 |
488 | seqs = [mask, state_below_, state_belowx]
489 | #seqs = [mask, state_below_, state_belowx, state_belowc]
490 | _step = _step_slice
491 |
492 | shared_vars = [tparams[_p(prefix, 'U')],
493 | tparams[_p(prefix, 'Wc')],
494 | tparams[_p(prefix, 'W_comb_att')],
495 | tparams[_p(prefix, 'U_att')],
496 | tparams[_p(prefix, 'c_tt')],
497 | tparams[_p(prefix, 'Ux')],
498 | tparams[_p(prefix, 'Wcx')],
499 | tparams[_p(prefix, 'U_nl')],
500 | tparams[_p(prefix, 'Ux_nl')],
501 | tparams[_p(prefix, 'b_nl')],
502 | tparams[_p(prefix, 'bx_nl')]]
503 |
504 | if one_step:
505 | rval = _step(*(seqs + [init_state, None, None, pctx_, context] +
506 | shared_vars))
507 | else:
508 | rval, updates = theano.scan(_step,
509 | sequences=seqs,
510 | outputs_info=[init_state,
511 | tensor.alloc(0., n_samples,
512 | context.shape[2]),
513 | tensor.alloc(0., n_samples,
514 | context.shape[0])],
515 | non_sequences=[pctx_, context]+shared_vars,
516 | name=_p(prefix, '_layers'),
517 | n_steps=nsteps,
518 | profile=profile,
519 | strict=True)
520 | return rval
521 |
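# A self-contained NumPy sketch of the attention weights computed inside
# _step_slice above: an MLP score over (projected context + projected decoder
# state), exponentiated, masked and normalised over the source axis.  All
# arrays are random placeholders; only the shapes and the normalisation
# mirror the layer.
import numpy as np

TINY = 1e-7
src_len, batch, dimctx = 5, 2, 4
rng = np.random.RandomState(1)

pctx = rng.randn(src_len, batch, dimctx)        # Wc_att * context + b_att
pstate = rng.randn(batch, dimctx)               # W_comb_att * h1
U_att, c_att = rng.randn(dimctx, 1), np.zeros(1)
context_mask = np.ones((src_len, batch))
context_mask[3:, 0] = 0.                        # first sample is shorter

scores = np.dot(np.tanh(pctx + pstate[None, :, :]), U_att)[:, :, 0] + c_att
alpha = np.exp(scores) * context_mask
alpha = alpha / (alpha.sum(0, keepdims=True) + TINY)
print(alpha.sum(0))                             # ~1.0 for every sample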
522 | # ================================================================================== #
523 | # Conditional GRU, split into separate context and update steps
524 |
525 | def gru_cond_context(tparams, state_below, options, prefix='gru',
526 | mask=None, context=None,
527 | init_memory=None, init_state=None,
528 | context_mask=None,
529 | **kwargs):
530 |
531 | assert context, 'Context must be provided'
532 | assert init_state, 'previous state must be provided'
533 |
534 | if state_below.ndim == 3:
535 | n_samples = state_below.shape[1]
536 | else:
537 | n_samples = 1
538 |
539 | # mask
540 | if mask is None:
541 | mask = tensor.alloc(1., state_below.shape[0], 1)
542 |
543 | dim = tparams[_p(prefix, 'Wcx')].shape[1]
544 |
545 | # initial/previous state
546 | if init_state is None:
547 | init_state = tensor.alloc(0., n_samples, dim)
548 |
549 | # projected context
550 | assert context.ndim == 3, \
551 | 'Context must be 3-d: #annotation x #sample x dim'
552 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\
553 | tparams[_p(prefix, 'b_att')]
554 |
555 | def _slice(_x, n, dim):
556 | if _x.ndim == 3:
557 | return _x[:, :, n*dim:(n+1)*dim]
558 | return _x[:, n*dim:(n+1)*dim]
559 |
560 | # projected x
561 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\
562 | tparams[_p(prefix, 'bx')]
563 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\
564 | tparams[_p(prefix, 'b')]
565 |
566 | def _step_slice(m_, x_, xx_, h_, pctx_, cc_,
567 | U, W_comb_att, U_att, c_tt, Ux):
568 | preact1 = tensor.dot(h_, U)
569 | preact1 += x_
570 | preact1 = tensor.nnet.sigmoid(preact1)
571 |
572 | r1 = _slice(preact1, 0, dim)
573 | u1 = _slice(preact1, 1, dim)
574 |
575 | preactx1 = tensor.dot(h_, Ux)
576 | preactx1 *= r1
577 | preactx1 += xx_
578 |
579 | h1 = tensor.tanh(preactx1)
580 |
581 | h1 = u1 * h_ + (1. - u1) * h1
582 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
583 |
584 | # attention
585 | pstate_ = tensor.dot(h1, W_comb_att)
586 |
587 | pctx__ = pctx_ + pstate_[None, :, :]
588 | pctx__ = tensor.tanh(pctx__)
589 |
590 | alpha = tensor.dot(pctx__, U_att)+c_tt
591 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
592 | alpha = tensor.exp(alpha)
593 |
594 | if context_mask:
595 | alpha = alpha * context_mask
596 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY)
597 |
598 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context
599 | return h1, ctx_, alpha.T # pstate_, preact, preactx, r, u
600 |
601 | seqs = [mask, state_below_, state_belowx]
602 | _step = _step_slice
603 |
604 | shared_vars = [tparams[_p(prefix, 'U')],
605 | tparams[_p(prefix, 'W_comb_att')],
606 | tparams[_p(prefix, 'U_att')],
607 | tparams[_p(prefix, 'c_tt')],
608 | tparams[_p(prefix, 'Ux')]]
609 |
610 | rval = _step(*(seqs + [init_state, pctx_, context] + shared_vars))
611 | return rval
612 |
613 |
614 | def gru_cond_update(tparams, options, prefix='gru',
615 | mask=None, cxt=None, h1=None,
616 | **kwargs):
617 |
618 | assert cxt, 'Context vector must be provided'
619 | assert h1, 'Temporal state vector must be provided'
620 |
621 | # mask
622 | if mask is None:
623 | mask = tensor.alloc(1., h1.shape[0], 1)
624 |
625 | dim = tparams[_p(prefix, 'Wcx')].shape[1]
626 |
627 |
628 | def _slice(_x, n, dim):
629 | if _x.ndim == 3:
630 | return _x[:, :, n*dim:(n+1)*dim]
631 | return _x[:, n*dim:(n+1)*dim]
632 |
633 |
634 | def _step_slice(m_, ctx_, h1,
635 | Wc, Wcx,
636 | U_nl, Ux_nl,
637 | b_nl, bx_nl):
638 |
639 | preact2 = tensor.dot(h1, U_nl)+b_nl
640 | preact2 += tensor.dot(ctx_, Wc)
641 | preact2 = tensor.nnet.sigmoid(preact2)
642 |
643 | r2 = _slice(preact2, 0, dim)
644 | u2 = _slice(preact2, 1, dim)
645 |
646 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl
647 | preactx2 *= r2
648 | preactx2 += tensor.dot(ctx_, Wcx)
649 |
650 | h2 = tensor.tanh(preactx2)
651 | h2 = u2 * h1 + (1. - u2) * h2
652 | h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
653 |
654 | return h2
655 |
656 | seqs = [mask, cxt, h1]
657 | _step = _step_slice
658 |
659 | shared_vars = [tparams[_p(prefix, 'Wc')],
660 | tparams[_p(prefix, 'Wcx')],
661 | tparams[_p(prefix, 'U_nl')],
662 | tparams[_p(prefix, 'Ux_nl')],
663 | tparams[_p(prefix, 'b_nl')],
664 | tparams[_p(prefix, 'bx_nl')]]
665 |
666 | rval = _step(*(seqs + shared_vars))
667 | return rval
668 |
669 | # ================================================================================== #
670 |
671 |
672 |
673 |
674 | # LN-GRU layer
675 | def param_init_lngru(options, params, prefix='lngru', nin=None, dim=None, scale=0.01):
676 | """
677 | Gated Recurrent Unit (GRU) with LN
678 | """
679 | if nin == None:
680 | nin = options['dim_proj']
681 | if dim == None:
682 | dim = options['dim_proj']
683 | W = numpy.concatenate([norm_weight(nin,dim, scale=scale),
684 | norm_weight(nin,dim, scale=scale)], axis=1)
685 | params[_p(prefix,'W')] = W
686 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
687 | U = numpy.concatenate([ortho_weight(dim),
688 | ortho_weight(dim)], axis=1)
689 | params[_p(prefix,'U')] = U
690 |
691 | Wx = norm_weight(nin, dim, scale=scale)
692 | params[_p(prefix,'Wx')] = Wx
693 | Ux = ortho_weight(dim)
694 | params[_p(prefix,'Ux')] = Ux
695 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')
696 |
697 | # LN parameters
698 | scale_add = 0.0
699 | scale_mul = 1.0
700 | params[_p(prefix,'b1')] = scale_add * numpy.ones((2*dim)).astype('float32')
701 | params[_p(prefix,'b2')] = scale_add * numpy.ones((1*dim)).astype('float32')
702 | params[_p(prefix,'b3')] = scale_add * numpy.ones((2*dim)).astype('float32')
703 | params[_p(prefix,'b4')] = scale_add * numpy.ones((1*dim)).astype('float32')
704 | params[_p(prefix,'s1')] = scale_mul * numpy.ones((2*dim)).astype('float32')
705 | params[_p(prefix,'s2')] = scale_mul * numpy.ones((1*dim)).astype('float32')
706 | params[_p(prefix,'s3')] = scale_mul * numpy.ones((2*dim)).astype('float32')
707 | params[_p(prefix,'s4')] = scale_mul * numpy.ones((1*dim)).astype('float32')
708 |
709 | return params
710 |
711 | def lngru_layer(tparams, state_below, options, prefix='lngru', mask=None, one_step=False, _init_state=None, **kwargs):
712 | """
713 | Feedforward pass through GRU with LN
714 | """
715 | nsteps = state_below.shape[0]
716 | if state_below.ndim == 3:
717 | n_samples = state_below.shape[1]
718 | else:
719 | n_samples = 1
720 |
721 | dim = tparams[_p(prefix,'Ux')].shape[1]
722 |
723 | if _init_state == None:
724 | _init_state = tensor.alloc(0., n_samples, dim)
725 |
726 | if mask == None:
727 | mask = tensor.alloc(1., state_below.shape[0], 1)
728 |
729 | def _slice(_x, n, dim):
730 | if _x.ndim == 3:
731 | return _x[:, :, n*dim:(n+1)*dim]
732 | return _x[:, n*dim:(n+1)*dim]
733 |
734 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
735 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
736 | U = tparams[_p(prefix, 'U')]
737 | Ux = tparams[_p(prefix, 'Ux')]
738 |
739 | def _step_slice(m_, x_, xx_, h_, U, Ux, b1, b2, b3, b4, s1, s2, s3, s4):
740 |
741 | x_ = ln(x_, b1, s1)
742 | xx_ = ln(xx_, b2, s2)
743 |
744 | preact = tensor.dot(h_, U)
745 | preact = ln(preact, b3, s3)
746 | preact += x_
747 |
748 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
749 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
750 |
751 | preactx = tensor.dot(h_, Ux)
752 | preactx = ln(preactx, b4, s4)
753 | preactx = preactx * r
754 | preactx = preactx + xx_
755 |
756 | h = tensor.tanh(preactx)
757 |
758 | h = u * h_ + (1. - u) * h
759 | h = m_[:,None] * h + (1. - m_)[:,None] * h_
760 |
761 | return h
762 |
763 | seqs = [mask, state_below_, state_belowx]
764 | _step = _step_slice
765 |
766 | non_seqs = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]]
767 | non_seqs += [tparams[_p(prefix, 'b1')], tparams[_p(prefix, 'b2')], tparams[_p(prefix, 'b3')], tparams[_p(prefix, 'b4')]]
768 | non_seqs += [tparams[_p(prefix, 's1')], tparams[_p(prefix, 's2')], tparams[_p(prefix, 's3')], tparams[_p(prefix, 's4')]]
769 |
770 | if one_step:
771 | rval = _step(*(seqs+[_init_state]+non_seqs))
772 | else:
773 | rval, updates = theano.scan(_step,
774 | sequences=seqs,
775 | outputs_info = [_init_state],
776 | non_sequences = non_seqs,
777 | name=_p(prefix, '_layers'),
778 | n_steps=nsteps,
779 | profile=False,
780 | strict=True)
781 | rval = [rval]
782 | return rval
783 |
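# `ln` above is the layer-normalisation helper defined elsewhere in this file;
# the NumPy sketch below shows the standard formulation it is assumed to
# follow: normalise over the feature axis, then apply the learned shift `b`
# (b1..b4, initialised to 0) and gain `s` (s1..s4, initialised to 1).
import numpy as np

def ln_np(x, b, s, eps=1e-5):
    mean = x.mean(-1, keepdims=True)
    std = x.std(-1, keepdims=True)
    return s * (x - mean) / (std + eps) + b

x = np.random.RandomState(2).randn(4, 6)
out = ln_np(x, b=np.zeros(6), s=np.ones(6))
print(out.mean(-1), out.std(-1))                # ~0 mean, ~1 std per row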
784 |
785 |
--------------------------------------------------------------------------------
/mteval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ref="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok"
4 | sed -i 's/@@ //g' "$1"
5 | ./data/multi-bleu.perl "$ref" < "$1"
6 |
--------------------------------------------------------------------------------
/nmt_uni.py:
--------------------------------------------------------------------------------
1 | '''
2 | Build a neural machine translation model with soft attention
3 | '''
4 | import theano
5 | import theano.tensor as tensor
6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
7 |
8 | import cPickle as pkl
9 | #import ipdb
10 | import numpy
11 | import copy
12 |
13 | import os
14 |
15 | import sys
16 | import time
17 |
18 | from collections import OrderedDict
19 | from data_iterator import TextIterator
20 | from layers import *
21 | from optimizer import *
22 |
23 | profile = False
24 | TINY = 1e-7
25 |
26 | # -----------------------------------------------------------------------------#
27 | # Build the Attention-based Neural Machine Translation
28 |
29 | # initialize all parameters
30 | def init_params(options):
31 | params = OrderedDict()
32 |
33 | # embedding
34 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
35 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word'])
36 |
37 | # encoder: uni-directional RNN
38 | params = get_layer(options['encoder'])[0](options, params,
39 | prefix='encoder',
40 | nin=options['dim_word'],
41 | dim=options['dim'])
42 |
43 | if options.get('birnn', False):
44 | params = get_layer(options['encoder'])[0](options, params,
45 | prefix='encoder_r',
46 | nin=options['dim_word'],
47 | dim=options['dim'])
48 |
49 |
50 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim']
51 |
52 | # init_state, init_cell
53 | params = get_layer('ff')[0](options, params, prefix='ff_state',
54 | nin=ctxdim, nout=options['dim'])
55 | # decoder
56 | params = get_layer(options['decoder'])[0](options, params,
57 | prefix='decoder',
58 | nin=options['dim_word'],
59 | dim=options['dim'],
60 | dimctx=ctxdim)
61 | # readout
62 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
63 | nin=options['dim'], nout=options['dim_word'],
64 | ortho=False)
65 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
66 | nin=options['dim_word'],
67 | nout=options['dim_word'], ortho=False)
68 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
69 | nin=ctxdim, nout=options['dim_word'],
70 | ortho=False)
71 | params = get_layer('ff')[0](options, params, prefix='ff_logit',
72 | nin=options['dim_word'],
73 | nout=options['n_words'])
74 | return params
75 |
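# A minimal sketch of the options dictionary consumed by init_params above and
# build_model below.  The key names come from the code in this file; the
# values are illustrative placeholders only, not the settings used for any
# reported experiment.
example_options = {
    'n_words_src': 30000,      # source vocabulary size
    'n_words': 30000,          # target vocabulary size
    'dim_word': 512,           # word-embedding dimensionality
    'dim': 1024,               # number of GRU units
    'encoder': 'gru',          # layer names registered in layers.py
    'decoder': 'gru_cond',
    'birnn': False,            # uni-directional encoder
    'use_dropout': False,
}
# params = init_params(example_options)   # -> OrderedDict of numpy arrays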
76 |
77 | def build_model(tparams, options):
78 | opt_ret = dict()
79 |
80 | trng = RandomStreams(1234)
81 | use_noise = theano.shared(numpy.float32(0.))
82 |
83 | # description string: #words x #samples
84 | x = tensor.matrix('x', dtype='int64')
85 | x_mask = tensor.matrix('x_mask', dtype='float32')
86 | y = tensor.matrix('y', dtype='int64')
87 | y_mask = tensor.matrix('y_mask', dtype='float32')
88 |
89 | # time_steps
90 | n_timesteps = x_mask.shape[0]
91 | n_timesteps_trg = y_mask.shape[0]
92 | n_samples = x_mask.shape[1]
93 |
94 | # word embedding for forward rnn (source)
95 | emb = tparams['Wemb'][x.flatten()]
96 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
97 | proj = get_layer(options['encoder'])[1](tparams, emb, options,
98 | prefix='encoder',
99 | mask=x_mask)
100 |
101 | # for reverse RNN: bi-directional RNN encoder
102 | if options.get('birnn', False):
103 | xr = x[::-1]
104 | xr_mask = x_mask[::-1]
105 |
106 | embr = tparams['Wemb'][xr.flatten()]
107 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
108 | projr = get_layer(options['encoder'])[1](tparams, embr, options,
109 | prefix='encoder_r',
110 | mask=xr_mask)
111 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
112 |
113 | else:
114 | ctx = proj[0] # context vectors
115 |
116 | # mean of the context (across time) will be used to initialize decoder rnn
117 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
118 |
119 |
120 | # initial decoder state
121 | init_state = get_layer('ff')[1](tparams, ctx_mean, options,
122 | prefix='ff_state', activ='tanh')
123 |
124 | # word embedding (target), we will shift the target sequence one time step
125 | # to the right. This is done because of the bi-gram connections in the
126 | # readout and decoder rnn. The first target will be all zeros and we will
127 | # not condition on the last output.
128 | emb = tparams['Wemb_dec'][y.flatten()]
129 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
130 | emb_shifted = tensor.zeros_like(emb)
131 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
132 | emb = emb_shifted
133 |
134 | # decoder - pass through the decoder conditional gru with attention
135 | proj = get_layer(options['decoder'])[1](tparams, emb, options,
136 | prefix='decoder',
137 | mask=y_mask, context=ctx,
138 | context_mask=x_mask,
139 | one_step=False,
140 | init_state=init_state)
141 | # hidden states of the decoder gru
142 | proj_h = proj[0]
143 |
144 | # weighted averages of context, generated by attention module
145 | ctxs = proj[1]
146 |
147 | # weights (alignment matrix)
148 | opt_ret['dec_alphas'] = proj[2] # --> to show the attention weights
149 |
150 | # compute word probabilities
151 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
152 | prefix='ff_logit_lstm', activ='linear')
153 | logit_prev = get_layer('ff')[1](tparams, emb, options,
154 | prefix='ff_logit_prev', activ='linear')
155 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
156 | prefix='ff_logit_ctx', activ='linear')
157 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
158 |
159 | # dropout (noise)
160 | if options['use_dropout']:
161 | logit = dropout_layer(logit, use_noise, trng)
162 | logit = get_layer('ff')[1](tparams, logit, options,
163 | prefix='ff_logit', activ='linear')
164 | logit_shp = logit.shape
165 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))
166 |
167 | # compute the cost (negative loglikelihood)
168 | y_flat = y.flatten()
169 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
170 |
171 | cost = -tensor.log(probs.flatten()[y_flat_idx])
172 | cost = cost.reshape([y.shape[0], y.shape[1]])
173 | cost = (cost * y_mask).sum(0)
174 |
175 | # we will build an additional function for computing costs
176 | f_cost = theano.function([ctx, x_mask, y, y_mask], cost)
177 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, f_cost
178 |
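# A small NumPy illustration of the flattened-index trick used above to pick
# each gold word's probability out of the softmax output: probs has shape
# (n_timesteps*n_samples, n_words), so row t*n_samples + b corresponds to
# position (t, b) and the gold id y[t, b] selects the column.  Toy numbers.
import numpy as np

T, B, V = 2, 3, 5                                    # timesteps, batch, vocab
probs = np.full((T * B, V), 1. / V)                  # dummy softmax output
y = np.array([[1, 2, 3],
              [0, 4, 0]])                            # gold word ids, (T, B)
y_flat = y.flatten()
y_flat_idx = np.arange(y_flat.shape[0]) * V + y_flat
cost = -np.log(probs.flatten()[y_flat_idx]).reshape(T, B)
print(cost)                                          # per-token neg. log-prob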
179 |
180 | # build a fine-tuner
181 | def build_fine(tparams, options, fullmodel=True):
182 |
183 | # ------------------- ENCODER ------------------------------------------ #
184 |
185 | opt_ret = dict()
186 |
187 | trng = RandomStreams(1234)
188 | use_noise = theano.shared(numpy.float32(0.))
189 |
190 | # description string: #words x #samples
191 | x = tensor.matrix('x', dtype='int64')
192 | x_mask = tensor.matrix('x_mask', dtype='float32')
193 | y = tensor.matrix('y', dtype='int64')
194 | y_mask = tensor.matrix('y_mask', dtype='float32')
195 |
196 | # time_steps
197 | n_timesteps = x_mask.shape[0]
198 | n_timesteps_trg = y_mask.shape[0]
199 | n_samples = x_mask.shape[1]
200 |
201 | # word embedding for forward rnn (source)
202 | emb = tparams['Wemb'][x.flatten()]
203 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
204 | proj = get_layer(options['encoder'])[1](tparams, emb, options,
205 | prefix='encoder',
206 | mask=x_mask)
207 |
208 | # for reverse RNN: bi-directional RNN encoder
209 | if options.get('birnn', False):
210 | xr = x[::-1]
211 | xr_mask = x_mask[::-1]
212 |
213 | embr = tparams['Wemb'][xr.flatten()]
214 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
215 | projr = get_layer(options['encoder'])[1](tparams, embr, options,
216 | prefix='encoder_r',
217 | mask=xr_mask)
218 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
219 |
220 | else:
221 | ctx = proj[0] # context vectors
222 |
223 | # mean of the context (across time) will be used to initialize decoder rnn
224 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
225 |
226 | # or you can use the last state of forward + backward encoder rnns
227 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)
228 |
229 | # initial decoder state
230 | init_state = get_layer('ff')[1](tparams, ctx_mean, options,
231 | prefix='ff_state', activ='tanh')
232 |
233 | print 'compile the initializer'
234 | f_init = theano.function([x, x_mask], [ctx, init_state])
235 | print 'encoder done.'
236 | # ------------------- ENCODER ------------------------------------------ #
237 |
238 |
239 | c_mask = tensor.tensor3('c_mask', dtype='float32') # seq_t x seq_s x batches
240 |
241 | emb = tparams['Wemb_dec'][y.flatten()]
242 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
243 | emb_shifted = tensor.zeros_like(emb)
244 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
245 | emb = emb_shifted
246 |
247 | # decoder - pass through the decoder conditional gru with attention
248 | def _step(_emb, _y_mask, _c_mask, _init_state, _ctx):
249 | return get_layer(options['decoder'])[1](tparams, _emb, options,
250 | prefix='decoder',
251 | mask=_y_mask, context=_ctx,
252 | context_mask=_c_mask,
253 | one_step=True,
254 | init_state=_init_state)
255 |
256 | proj, _ = theano.scan(_step,
257 | sequences=[emb, y_mask, c_mask],
258 | outputs_info=[init_state, None, None],
259 | non_sequences=[ctx])
260 |
261 |
262 | # hidden states of the decoder gru
263 | proj_h = proj[0]
264 |
265 | # weighted averages of context, generated by attention module
266 | ctxs = proj[1]
267 |
268 | # weights (alignment matrix)
269 | opt_ret['dec_alphas'] = proj[2] # --> to show the attention weights
270 |
271 | # compute word probabilities
272 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
273 | prefix='ff_logit_lstm', activ='linear')
274 | logit_prev = get_layer('ff')[1](tparams, emb, options,
275 | prefix='ff_logit_prev', activ='linear')
276 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
277 | prefix='ff_logit_ctx', activ='linear')
278 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
279 |
280 | # dropout (noise)
281 | if options['use_dropout']:
282 | logit = dropout_layer(logit, use_noise, trng)
283 | logit = get_layer('ff')[1](tparams, logit, options,
284 | prefix='ff_logit', activ='linear')
285 | logit_shp = logit.shape
286 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))
287 |
288 | # compute the cost (negative loglikelihood)
289 | y_flat = y.flatten()
290 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
291 |
292 | cost = -tensor.log(probs.flatten()[y_flat_idx] + TINY)
293 | cost = cost.reshape([y.shape[0], y.shape[1]])
294 | a_cost = tensor.mean((cost * y_mask).sum(0))
295 |
296 | # gradient clipping
297 | def _clip(grad):
298 | clip_c = 1.
299 | if clip_c > 0.:
300 | g2 = 0.
301 | for g in grad:
302 | g2 += (g ** 2).sum()
303 | new_grads = []
304 | for g in grad:
305 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g))
306 | grad = new_grads
307 | return grad
308 |
309 |
310 | lr = tensor.scalar(name='lr')
311 | if fullmodel:
312 | print 'build MLE optimizer for the whole NMT model:'
313 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams)))
314 | inps = [x, x_mask, y, y_mask, c_mask]
315 | outps = [a_cost, cost]
316 | f_cost, f_update = adam(lr, tparams, a_grad, inps, outps)
317 | else:
318 | print 'build MLE only for decoder'
319 | tparams_d = OrderedDict()
320 | for w in tparams:
321 | if ('ff_state' not in w) and ('encoder' not in w) and (w != 'Wemb'):
322 | print w, 'updated.'
323 | tparams_d[w] = tparams[w]
324 |
325 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams_d)))
326 | inps = [x, x_mask, y, y_mask, c_mask]
327 | outps = [a_cost, cost]
328 | f_cost, f_update = adam(lr, tparams_d, a_grad, inps, outps)
329 |
330 | print 'done.'
331 | return f_init, f_cost, f_update
332 |
333 |
334 | # build a sampler for NMT
335 | def build_sampler(tparams, options, trng):
336 |
337 | x = tensor.matrix('x', dtype='int64')
338 |
339 | n_timesteps = x.shape[0]
340 | n_samples = x.shape[1]
341 |
342 | # word embedding (source), forward and backward
343 | emb = tparams['Wemb'][x.flatten()]
344 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
345 |
346 | # encoder
347 | proj = get_layer(options['encoder'])[1](tparams, emb, options,
348 | prefix='encoder')
349 |
350 | # bi-rnn
351 | if options.get('birnn', False):
352 | xr = x[::-1]
353 |
354 | embr = tparams['Wemb'][xr.flatten()]
355 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
356 | projr = get_layer(options['encoder'])[1](tparams, embr, options,
357 | prefix='encoder_r')
358 |
359 | ## concatenate forward and backward rnn hidden states
360 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
361 |
362 | else:
363 | ctx = proj[0]
364 |
365 | # get the input for decoder rnn initializer mlp
366 | ctx_mean = ctx.mean(0)
367 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
368 | init_state = get_layer('ff')[1](tparams, ctx_mean, options,
369 | prefix='ff_state', activ='tanh')
370 |
371 | print 'Building f_init...',
372 | outs = [init_state, ctx]
373 | f_init = theano.function([x], outs, name='f_init', profile=profile)
374 | print 'Done.'
375 |
376 | # ..........................................................................
377 | # x: 1 x 1
378 | y = tensor.vector('y_sampler', dtype='int64')
379 | init_state = tensor.matrix('init_state', dtype='float32')
380 | use_noise = theano.shared(numpy.float32(0.))
381 |
382 |
383 | # if it's the first word, emb should be all zero and it is indicated by -1
384 | emb = tensor.switch(y[:, None] < 0,
385 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
386 | tparams['Wemb_dec'][y])
387 |
388 | # apply one step of conditional gru with attention
389 | proj = get_layer(options['decoder'])[1](tparams, emb, options,
390 | prefix='decoder',
391 | mask=None, context=ctx,
392 | one_step=True,
393 | init_state=init_state)
394 | # get the next hidden state
395 | next_state = proj[0]
396 |
397 | # get the weighted averages of context for this target word y
398 | ctxs = proj[1]
399 |
400 | logit_lstm = get_layer('ff')[1](tparams, next_state, options,
401 | prefix='ff_logit_lstm', activ='linear')
402 | logit_prev = get_layer('ff')[1](tparams, emb, options,
403 | prefix='ff_logit_prev', activ='linear')
404 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
405 | prefix='ff_logit_ctx', activ='linear')
406 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
407 |
408 | if options['use_dropout']:
409 | logit = dropout_layer(logit, use_noise, trng)
410 | logit = get_layer('ff')[1](tparams, logit, options,
411 | prefix='ff_logit', activ='linear')
412 |
413 | # compute the softmax probability
414 | next_probs = tensor.nnet.softmax(logit)
415 |
416 | # sample from softmax distribution to get the sample
417 | next_sample = trng.multinomial(pvals=next_probs).argmax(1)
418 |
419 | # compile a function to do the whole thing above, next word probability,
420 | # sampled word for the next target, next hidden state to be used
421 | print 'Building f_next..',
422 | inps = [y, ctx, init_state]
423 | outs = [next_probs, next_sample, next_state]
424 | f_next = theano.function(inps, outs, name='f_next', profile=profile)
425 | print 'Done.'
426 |
427 | return f_init, f_next
428 |
429 | def build_partial(tparams, options, trng):
430 |
431 | assert not options.get('birnn', False), 'must be used in uni-directional mode'
432 |
433 | x = tensor.matrix('x', dtype='int64')
434 | prev_state = tensor.matrix('prev_state', dtype='float32')
435 | n_timesteps = x.shape[0]
436 | n_samples = x.shape[1]
437 |
438 | # word embedding (source), forward and backward
439 | emb = tparams['Wemb'][x.flatten()]
440 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
441 |
442 | # encoder
443 | proj = get_layer(options['encoder'])[1](tparams, emb, options,
444 | one_step=True,
445 | _init_state=prev_state,
446 | prefix='encoder')
447 | next_state = proj[0]
448 |
449 |
450 | print 'Building f_partial...',
451 | outs = [next_state]
452 | f_partial = theano.function([x, prev_state], outs, name='f_partial', profile=profile)
453 | print 'Done'
454 |
455 | return f_partial
456 |
457 |
458 | def build_simultaneous_sampler(tparams, options, trng):
459 | x = tensor.matrix('x', dtype='int64')
460 |
461 | n_timesteps = x.shape[0]
462 | n_samples = x.shape[1]
463 |
464 | # word embedding (source), forward and backward
465 | emb = tparams['Wemb'][x.flatten()]
466 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
467 |
468 | # encoder
469 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder')
470 |
471 | # bi-rnn
472 | if options.get('birnn', False):
473 | xr = x[::-1]
474 |
475 | embr = tparams['Wemb'][xr.flatten()]
476 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
477 | projr = get_layer(options['encoder'])[1](tparams, embr, options,
478 | prefix='encoder_r')
479 |
480 | ## concatenate forward and backward rnn hidden states
481 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
482 |
483 | else:
484 | ctx = proj[0]
485 |
486 | # get the input for decoder rnn initializer mlp
487 | ctx_mean = ctx.mean(0)
488 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
489 | init_state = get_layer('ff')[1](tparams, ctx_mean, options,
490 | prefix='ff_state', activ='tanh')
491 |
492 | print 'Building f_ctx/init...',
493 |
494 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx')
495 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile)
496 |
497 | print 'Done.'
498 |
499 | # -------------------------------------------------------------------------------- #
500 | y = tensor.vector('y_sampler', dtype='int64')
501 | ctx = tensor.tensor3('context_vectors', dtype='float32')
502 | mask = tensor.matrix('context_mask', dtype='float32')
503 | init_state = tensor.matrix('init_state', dtype='float32')
504 | use_noise = theano.shared(numpy.float32(0.))
505 |
506 | # if it's the first word, emb should be all zero and it is indicated by -1
507 | emb = tensor.switch(y[:, None] < 0,
508 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
509 | tparams['Wemb_dec'][y])
510 |
511 | # apply one step of conditional gru with attention
512 | proj = get_layer(options['decoder'])[1](tparams, emb, options,
513 | prefix='decoder',
514 | mask=None, context=ctx,
515 | one_step=True,
516 | init_state=init_state,
517 | context_mask=mask)
518 |
519 | # get the next hidden state
520 | next_state = proj[0]
521 |
522 | # get the weighted averages of context for this target word y
523 | ctxs = proj[1]
524 | attention = proj[2]
525 |
526 | logit_lstm = get_layer('ff')[1](tparams, next_state, options,
527 | prefix='ff_logit_lstm', activ='linear')
528 | logit_prev = get_layer('ff')[1](tparams, emb, options,
529 | prefix='ff_logit_prev', activ='linear')
530 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
531 | prefix='ff_logit_ctx', activ='linear')
532 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
533 |
534 | if options['use_dropout']:
535 | logit = dropout_layer(logit, use_noise, trng)
536 |
537 | logit = get_layer('ff')[1](tparams, logit, options,
538 | prefix='ff_logit', activ='linear')
539 |
540 | # compute the softmax probability
541 | next_probs = tensor.nnet.softmax(logit)
542 |
543 | # sample from softmax distribution to get the sample
544 | next_sample = trng.multinomial(pvals=next_probs).argmax(1)
545 |
546 | # ***== special care: use additional information ====*** #
547 | # compile a function to do the whole thing above, next word probability,
548 | # sampled word for the next target, next hidden state to be used
549 | print 'Building f_sim_next..',
550 | inps = [y, ctx, mask, init_state]
551 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim']
552 |
553 | if 'pre' in options and options['pre']:
554 | assert not options.get('birnn', False), 'should not use birnn for SimulTrans'
555 |
556 | read_head = tensor.ivector('read_head')
557 | forget_head = tensor.ivector('forget_head')
558 | inps += [read_head, forget_head]
559 |
560 | def _grab(contexts, index):
561 | assert contexts.ndim == 3
562 |
563 | batch_size = contexts.shape[1]
564 | return contexts[index, tensor.arange(batch_size), :]
565 |
566 | last_ctx = _grab(ctx, read_head)
567 | first_ctx = _grab(ctx, forget_head)
568 | next_max_w = tparams['Wemb_dec'][next_probs.argmax(1)]
569 |
570 | readout = tensor.concatenate([next_state, ctxs, last_ctx, first_ctx, next_max_w], axis=-1)
571 | options['readout_dim'] = options['dim_word'] + ctxdim * 3 + options['dim']
572 |
573 | else:
574 | print 'with normal input'
575 | readout = tensor.concatenate([next_state, ctxs, last_ctx, first_ctx, next_max_w], axis=-1)  # the observation for each step
576 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim
577 |
578 | outs = [next_probs, next_sample, next_state, readout, attention]
579 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile)
580 | print 'Done.'
581 |
582 | return f_sim_ctx, f_sim_init, f_sim_next
583 |
584 | # ---------------------------------------------------------------------------- #
585 | # What we need is the part below = v = #
586 | # #
587 | # ---------> for reinforcement noisy decoding #
588 | # ---------------------------------------------------------------------------- #
589 |
590 | def build_noisy_sampler(tparams, options, trng):
591 | x = tensor.matrix('x', dtype='int64')
592 |
593 | n_timesteps = x.shape[0]
594 | n_samples = x.shape[1]
595 |
596 | # word embedding (source), forward and backward
597 | emb = tparams['Wemb'][x.flatten()]
598 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
599 |
600 | # encoder
601 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder')
602 | if options.get('birnn', False):
603 | xr = x[::-1]
604 | embr = tparams['Wemb'][xr.flatten()]
605 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
606 | projr = get_layer(options['encoder'])[1](tparams, embr, options,
607 | prefix='encoder_r')
608 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
609 |
610 | else:
611 | ctx = proj[0]
612 |
613 | # get the input for decoder rnn initializer mlp
614 | ctx_mean = ctx.mean(0)
615 | init_state = get_layer('ff')[1](tparams, ctx_mean, options,
616 | prefix='ff_state', activ='tanh')
617 |
618 | print 'Building Encoder: f_ctx/init...',
619 |
620 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx')
621 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile)
622 |
623 | print 'Done.'
624 |
625 | # -------------------------------------------------------------------------------- #
626 | y = tensor.vector('y_sampler', dtype='int64')
627 | ctx = tensor.tensor3('context_vectors', dtype='float32')
628 | mask = tensor.matrix('context_mask', dtype='float32')
629 | prev_state = tensor.matrix('prev_state', dtype='float32')
630 | use_noise = theano.shared(numpy.float32(0.))
631 |
632 | injd_noise = tensor.matrix('injected_noise', dtype='float32')
633 |
634 | # if it's the first word, emb should be all zero and it is indicated by -1
635 | emb = tensor.switch(y[:, None] < 0,
636 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
637 | tparams['Wemb_dec'][y])
638 |
639 | # inject noise
640 | init_state = prev_state + injd_noise # apply the injected noise
641 |
642 | # apply one step of conditional gru with attention
643 | proj = get_layer(options['decoder'])[1](tparams, emb, options,
644 | prefix='decoder',
645 | mask=None, context=ctx,
646 | one_step=True,
647 | init_state=init_state,
648 | context_mask=mask)
649 |
650 | # get the next hidden state
651 | next_state = proj[0]
652 |
653 | # get the weighted averages of context for this target word y
654 | ctxs = proj[1]
655 | attention = proj[2]
656 |
657 | logit_lstm = get_layer('ff')[1](tparams, next_state, options,
658 | prefix='ff_logit_lstm', activ='linear')
659 | logit_prev = get_layer('ff')[1](tparams, emb, options,
660 | prefix='ff_logit_prev', activ='linear')
661 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
662 | prefix='ff_logit_ctx', activ='linear')
663 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
664 |
665 | if options['use_dropout']:
666 | logit = dropout_layer(logit, use_noise, trng)
667 |
668 | logit = get_layer('ff')[1](tparams, logit, options,
669 | prefix='ff_logit', activ='linear')
670 |
671 | # compute the softmax probability
672 | next_probs = tensor.nnet.softmax(logit)
673 |
674 | # sample from softmax distribution to get the sample
675 | next_sample = trng.multinomial(pvals=next_probs).argmax(1)
676 |
677 | # compile the function read-out and samples
678 | print 'Building f_sim_next..',
679 |
680 | inps = [y, ctx, mask, prev_state, injd_noise]
681 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim']
682 | readout = tensor.concatenate([next_state, ctxs, emb], axis=-1) # the observation for each step
683 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim
684 |
685 | outs = [next_probs, next_sample, next_state, readout, attention]
686 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile)
687 |
688 | print 'Done.'
689 | return f_sim_ctx, f_sim_init, f_sim_next
690 |
691 |
692 | # generate sample, either with stochastic sampling or beam search. Note that,
693 | # this function iteratively calls f_init and f_next functions.
694 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, k=1, maxlen=30,
695 | stochastic=True, argmax=False, sigma=-1.):
696 |
697 | # k is the beam size we have
698 | if k > 1:
699 | assert not stochastic, \
700 | 'Beam search does not support stochastic sampling'
701 |
702 | sample = []
703 | sample_score = []
704 | if stochastic:
705 | sample_score = 0
706 |
707 | live_k = 1
708 | dead_k = 0
709 |
710 | hyp_samples = [[]] * live_k
711 | hyp_scores = numpy.zeros(live_k).astype('float32')
712 | hyp_states = []
713 |
714 | # get initial state of decoder rnn and encoder context
715 | ret = f_init(x)
716 | next_state, ctx0 = ret[0], ret[1]
717 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator
718 |
719 | for ii in xrange(maxlen):
720 | ctx = numpy.tile(ctx0, [live_k, 1])
721 |
722 | if sigma > 0.:
723 | next_state_inp = next_state + numpy.float32((sigma/(ii+1)) * numpy.random.randn(*next_state.shape))
724 | else:
725 | next_state_inp = next_state
726 |
727 | inps = [next_w, ctx, next_state_inp]
728 | ret = f_next(*inps)
729 | next_p, next_w, next_state = ret[0], ret[1], ret[2]
730 |
731 | if stochastic:
732 | if argmax:
733 | nw = next_p[0].argmax()
734 | else:
735 | nw = next_w[0]
736 | sample.append(nw)
737 | sample_score += next_p[0, nw]
738 | if nw == 0:
739 | break
740 | else:
741 | cand_scores = hyp_scores[:, None] - numpy.log(next_p)
742 | cand_flat = cand_scores.flatten()
743 | ranks_flat = cand_flat.argsort()[:(k-dead_k)]
744 |
745 | voc_size = next_p.shape[1]
746 | trans_indices = ranks_flat / voc_size
747 | word_indices = ranks_flat % voc_size
748 | costs = cand_flat[ranks_flat]
749 |
750 | new_hyp_samples = []
751 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32')
752 | new_hyp_states = []
753 |
754 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
755 | new_hyp_samples.append(hyp_samples[ti]+[wi])
756 | new_hyp_scores[idx] = copy.copy(costs[idx])
757 | new_hyp_states.append(copy.copy(next_state[ti]))
758 |
759 | # check the finished samples
760 | new_live_k = 0
761 | hyp_samples = []
762 | hyp_scores = []
763 | hyp_states = []
764 |
765 | for idx in xrange(len(new_hyp_samples)):
766 | if new_hyp_samples[idx][-1] == 0:
767 | sample.append(new_hyp_samples[idx])
768 | sample_score.append(new_hyp_scores[idx])
769 | dead_k += 1
770 | else:
771 | new_live_k += 1
772 | hyp_samples.append(new_hyp_samples[idx])
773 | hyp_scores.append(new_hyp_scores[idx])
774 | hyp_states.append(new_hyp_states[idx])
775 | hyp_scores = numpy.array(hyp_scores)
776 | live_k = new_live_k
777 |
778 | if new_live_k < 1:
779 | break
780 | if dead_k >= k:
781 | break
782 |
783 | next_w = numpy.array([w[-1] for w in hyp_samples])
784 | next_state = numpy.array(hyp_states)
785 |
786 | if not stochastic:
787 | # dump every remaining one
788 | if live_k > 0:
789 | for idx in xrange(live_k):
790 | sample.append(hyp_samples[idx])
791 | sample_score.append(hyp_scores[idx])
792 |
793 | return sample, sample_score
794 |
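# A NumPy illustration of the beam-expansion step inside gen_sample above:
# each live hypothesis is scored against every vocabulary word, the k best
# flat candidates are kept, and the flat ranks are decoded back into (which
# hypothesis, which word).  Toy numbers only.
import numpy as np

k = 2
hyp_scores = np.array([0.1, 0.5])                    # two live hypotheses
next_p = np.array([[0.7, 0.2, 0.1],                  # per-hypothesis word probs
                   [0.1, 0.8, 0.1]])
cand_flat = (hyp_scores[:, None] - np.log(next_p)).flatten()
ranks_flat = cand_flat.argsort()[:k]
voc_size = next_p.shape[1]
trans_indices = ranks_flat // voc_size               # which hypothesis to extend
word_indices = ranks_flat % voc_size                 # which word extends it
print(trans_indices, word_indices, cand_flat[ranks_flat])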
795 |
796 | # calculate the log probabilities on a given corpus using the translation model
797 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True):
798 | probs = []
799 |
800 | n_done = 0
801 |
802 | for x, y in iterator:
803 | n_done += len(x)
804 |
805 | x, x_mask, y, y_mask = prepare_data(x, y,
806 | n_words_src=options['n_words_src'],
807 | n_words=options['n_words'])
808 |
809 | pprobs = f_log_probs(x, x_mask, y, y_mask)
810 | for pp in pprobs:
811 | probs.append(pp)
812 |
813 | #if numpy.isnan(numpy.mean(probs)):
814 | # ipdb.set_trace()
815 |
816 | if verbose:
817 | print >>sys.stderr, '%d samples computed' % (n_done)
818 |
819 | return numpy.array(probs)
820 |
821 | #-----------------------------------------------------------------------------#
822 | # Batch preparation
823 |
824 | def prepare_data(seqs_x,
825 | seqs_y,
826 | maxlen=None,
827 | n_words_src=30000,
828 | n_words=30000):
829 |
830 | # x: a list of sentences
831 | lengths_x = [len(s) for s in seqs_x]
832 | lengths_y = [len(s) for s in seqs_y]
833 |
834 | if maxlen is not None:
835 | new_seqs_x = []
836 | new_seqs_y = []
837 | new_lengths_x = []
838 | new_lengths_y = []
839 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y):
840 | if l_x < maxlen and l_y < maxlen:
841 | new_seqs_x.append(s_x)
842 | new_lengths_x.append(l_x)
843 | new_seqs_y.append(s_y)
844 | new_lengths_y.append(l_y)
845 | lengths_x = new_lengths_x
846 | seqs_x = new_seqs_x
847 | lengths_y = new_lengths_y
848 | seqs_y = new_seqs_y
849 |
850 | if len(lengths_x) < 1 or len(lengths_y) < 1:
851 | return None, None, None, None
852 |
853 | n_samples = len(seqs_x)
854 | maxlen_x = numpy.max(lengths_x) + 1
855 | maxlen_y = numpy.max(lengths_y) + 1
856 |
857 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
858 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64')
859 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
860 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32')
861 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)):
862 | x[:lengths_x[idx], idx] = s_x
863 | x_mask[:lengths_x[idx]+1, idx] = 1.
864 | y[:lengths_y[idx], idx] = s_y
865 | y_mask[:lengths_y[idx]+1, idx] = 1.
866 |
867 | return x, x_mask, y, y_mask
868 |
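# A quick usage sketch of prepare_data: batches are laid out time-major
# (max_len x n_samples), zero-padded, and each mask covers one extra position
# for the end-of-sentence marker (word id 0).  The sequences are toy ids.
def _prepare_data_example():
    seqs_x = [[3, 4, 5], [6, 7]]                 # two source sentences
    seqs_y = [[8, 9], [10, 11, 12, 13]]          # two target sentences
    x, x_mask, y, y_mask = prepare_data(seqs_x, seqs_y)
    print(x.shape, y.shape)                      # (4, 2) and (5, 2)
    print(x_mask.T)                              # [[1. 1. 1. 1.] [1. 1. 1. 0.]]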
869 |
870 | #-----------------------------------------------------------------------------#
871 | # Training Function:
872 |
873 | def train(dim_word = 100, # word vector dimensionality
874 | dim = 1000, # the number of RNN units
875 | encoder = 'gru',
876 | decoder = 'gru_cond',
877 | patience = 10, # early stopping patience
878 | max_epochs = 5000,
879 | finish_after = 10000000, # finish after this many updates
880 | dispFreq = 100,
881 | decay_c = 0., # L2 regularization penalty
882 | alpha_c = 0., # alignment regularization
883 | clip_c = -1., # gradient clipping threshold
884 | lrate = 0.01, # learning rate
885 | n_words_src = 100000, # source vocabulary size
886 | n_words = 100000, # target vocabulary size
887 | maxlen = 100, # maximum sentence length
888 | optimizer = 'rmsprop',
889 | batch_size = 16,
890 | valid_batch_size = 16,
891 | saveto = 'model.npz',
892 | validFreq = 1000,
893 | saveFreq = 1000, # save the parameters after every saveFreq updates
894 | sampleFreq = 100, # generate some samples after every sampleFreq
895 | datasets =[
896 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
897 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
898 |
899 | valid_datasets=['../data/dev/newstest2011.en.tok',
900 | '../data/dev/newstest2011.fr.tok'],
901 |
902 | dictionaries=[
903 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
904 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
905 |
906 | use_dropout = False,
907 | reload_ = False,
908 | overwrite = False):
909 |
910 | # Model options
911 | model_options = locals().copy()
912 |
913 | # load dictionaries and invert them
914 | worddicts = [None] * len(dictionaries)
915 | worddicts_r = [None] * len(dictionaries)
916 | for ii, dd in enumerate(dictionaries):
917 | with open(dd, 'rb') as f:
918 | worddicts[ii] = pkl.load(f)
919 | worddicts_r[ii] = dict()
920 | for kk, vv in worddicts[ii].iteritems():
921 | worddicts_r[ii][vv] = kk
922 |
923 | # reload options
924 | if reload_ and os.path.exists(saveto):
925 | print 'Reloading model options'
926 | with open('%s.pkl' % saveto, 'rb') as f:
927 | model_options = pkl.load(f)
928 |
929 | print 'Loading data'
930 | train = TextIterator(datasets[0], datasets[1],
931 | dictionaries[0], dictionaries[1],
932 | n_words_source=n_words_src, n_words_target=n_words,
933 | batch_size=batch_size,
934 | maxlen=maxlen)
935 | valid = TextIterator(valid_datasets[0], valid_datasets[1],
936 | dictionaries[0], dictionaries[1],
937 | n_words_source=n_words_src, n_words_target=n_words,
938 | batch_size=valid_batch_size,
939 | maxlen=maxlen)
940 |
941 | print 'Building model'
942 | params = init_params(model_options)
943 | # reload parameters
944 | if reload_ and os.path.exists(saveto):
945 | print 'Reloading model parameters'
946 | params = load_params(saveto, params)
947 |
948 | tparams = init_tparams(params)
949 |
950 | trng, use_noise, \
951 | x, x_mask, y, y_mask, \
952 | opt_ret, \
953 | cost, f_cost = \
954 | build_model(tparams, model_options)
955 | inps = [x, x_mask, y, y_mask]
956 |
957 | print 'Building sampler'
958 | f_init, f_next = build_sampler(tparams, model_options, trng)
959 |
960 | # before any regularizer
961 | print 'Building f_log_probs...',
962 | f_log_probs = theano.function(inps, cost, profile=profile)
963 | print 'Done'
964 |
965 | cost = cost.mean()
966 |
967 | # apply L2 regularization on weights
968 | if decay_c > 0.:
969 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
970 | weight_decay = 0.
971 | for kk, vv in tparams.iteritems():
972 | weight_decay += (vv ** 2).sum()
973 | weight_decay *= decay_c
974 | cost += weight_decay
975 |
976 | # regularize the alpha weights
977 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
978 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
979 | alpha_reg = alpha_c * (
980 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
981 | opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
982 | cost += alpha_reg
983 |
984 | # after all regularizers - compile the computational graph for cost
985 | print 'Building f_cost...',
986 | f_cost = theano.function(inps, cost, profile=profile)
987 | print 'Done'
988 |
989 | print 'Computing gradient...',
990 | grads = tensor.grad(cost, wrt=itemlist(tparams))
991 | print 'Done'
992 |
993 | # apply gradient clipping here
994 | if clip_c > 0.:
995 | g2 = 0.
996 | for g in grads:
997 | g2 += (g**2).sum()
998 | new_grads = []
999 | for g in grads:
1000 | new_grads.append(tensor.switch(g2 > (clip_c**2),
1001 | g / tensor.sqrt(g2) * clip_c,
1002 | g))
1003 | grads = new_grads
1004 |
1005 | # compile the optimizer, the actual computational graph is compiled here
1006 | lr = tensor.scalar(name='lr')
1007 | print 'Building optimizers...',
1008 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
1009 | print 'Done'
1010 |
1011 | print 'Optimization'
1012 |
1013 | best_p = None
1014 | bad_counter = 0
1015 | uidx = 0
1016 | estop = False
1017 | history_errs = []
1018 | # reload history
1019 | if reload_ and os.path.exists(saveto):
1020 | rmodel = numpy.load(saveto)
1021 | history_errs = list(rmodel['history_errs'])
1022 | if 'uidx' in rmodel:
1023 | uidx = rmodel['uidx']
1024 |
1025 | if validFreq == -1:
1026 | validFreq = len(train[0])/batch_size
1027 | if saveFreq == -1:
1028 | saveFreq = len(train[0])/batch_size
1029 | if sampleFreq == -1:
1030 | sampleFreq = len(train[0])/batch_size
1031 |
1032 | for eidx in xrange(max_epochs):
1033 | n_samples = 0
1034 |
1035 | for x, y in train:
1036 | n_samples += len(x)
1037 | uidx += 1
1038 | use_noise.set_value(1.)
1039 |
1040 | x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
1041 | n_words_src=n_words_src,
1042 | n_words=n_words)
1043 |
1044 | if x is None:
1045 | print 'Minibatch with zero sample under length ', maxlen
1046 | uidx -= 1
1047 | continue
1048 |
1049 | ud_start = time.time()
1050 |
1051 | # compute cost, grads and copy grads to shared variables
1052 | cost = f_grad_shared(x, x_mask, y, y_mask)
1053 |
1054 | # do the update on parameters
1055 | f_update(lrate)
1056 |
1057 | ud = time.time() - ud_start
1058 |
1059 | # check for bad numbers, usually we remove non-finite elements
1060 | # and continue training - but not done here
1061 | if numpy.isnan(cost) or numpy.isinf(cost):
1062 | print 'NaN detected'
1063 | return 1., 1., 1.
1064 |
1065 | # verbose
1066 | if numpy.mod(uidx, dispFreq) == 0:
1067 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
1068 |
1069 | # save the best model so far, in addition, save the latest model
1070 | # into a separate file with the iteration number for external eval
1071 | if numpy.mod(uidx, saveFreq) == 0:
1072 | print 'Saving the best model...',
1073 | if best_p is not None:
1074 | params = best_p
1075 | else:
1076 | params = unzip(tparams)
1077 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
1078 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
1079 | params = unzip(tparams)
1080 | numpy.savez('%s.current'%(saveto), history_errs=history_errs, **params)
1081 | pkl.dump(model_options, open('%s.current.pkl' % saveto, 'wb'))
1082 | print 'Done'
1083 |
1084 | # save with uidx
1085 | if not overwrite:
1086 | print 'Saving the model at iteration {}...'.format(uidx),
1087 | saveto_uidx = '{}.iter{}.npz'.format(
1088 | os.path.splitext(saveto)[0], uidx)
1089 | numpy.savez(saveto_uidx, history_errs=history_errs,
1090 | uidx=uidx, **unzip(tparams))
1091 | print 'Done'
1092 |
1093 |
1094 | # generate some samples with the model and display them
1095 | if numpy.mod(uidx, sampleFreq) == 0:
1096 | # FIXME: random selection?
1097 | for jj in xrange(numpy.minimum(5, x.shape[1])):
1098 | stochastic = False
1099 | sample, score = gen_sample(tparams, f_init, f_next,
1100 | x[:, jj][:, None],
1101 | model_options, trng=trng, k=1,
1102 | maxlen=30,
1103 | stochastic=stochastic,
1104 | argmax=True)
1105 | print 'Source ', jj, ': ',
1106 | ss = []
1107 | for vv in x[:, jj]:
1108 | if vv == 0:
1109 | break
1110 | if vv in worddicts_r[0]:
1111 | ss.append(worddicts_r[0][vv])
1112 | else:
1113 | ss.append('UNK')
1114 | print ' '.join(ss).replace('@@ ', '')
1115 | print 'Truth ', jj, ' : ',
1116 | ss = []
1117 | for vv in y[:, jj]:
1118 | if vv == 0:
1119 | break
1120 | if vv in worddicts_r[1]:
1121 | ss.append(worddicts_r[1][vv])
1122 | else:
1123 | ss.append('UNK')
1124 | print ' '.join(ss).replace('@@ ', '')
1125 | print 'Sample ', jj, ': ',
1126 | tt = []
1127 | score = score / numpy.array([len(s) for s in sample])
1128 | ss = sample[score.argmin()]
1129 | for vv in ss:
1130 | if vv == 0:
1131 | break
1132 | if vv in worddicts_r[1]:
1133 | tt.append(worddicts_r[1][vv])
1134 | else:
1135 | tt.append('UNK')
1136 | print ' '.join(tt).replace('@@ ', '')
1137 |
1138 | # validate model on validation set and early stop if necessary
1139 | if numpy.mod(uidx, validFreq) == 0:
1140 | use_noise.set_value(0.)
1141 | valid_errs = pred_probs(f_log_probs, prepare_data,
1142 | model_options, valid)
1143 | valid_err = valid_errs.mean()
1144 | history_errs.append(valid_err)
1145 |
1146 | if uidx == 0 or valid_err <= numpy.array(history_errs).min():
1147 | best_p = unzip(tparams)
1148 | bad_counter = 0
1149 | if len(history_errs) > patience and valid_err >= \
1150 | numpy.array(history_errs)[:-patience].min():
1151 | bad_counter += 1
1152 | if bad_counter > patience:
1153 | print 'Early Stop!'
1154 | estop = True
1155 | break
1156 |
1157 | #if numpy.isnan(valid_err):
1158 | # ipdb.set_trace()
1159 |
1160 | print 'Valid ', valid_err
1161 |
1162 | # finish after this many updates
1163 | if uidx >= finish_after:
1164 | print 'Finishing after %d iterations!' % uidx
1165 | estop = True
1166 | break
1167 |
1168 | print 'Seen %d samples' % n_samples
1169 |
1170 | if estop:
1171 | break
1172 |
1173 | if best_p is not None:
1174 | zipp(best_p, tparams)
1175 |
1176 | use_noise.set_value(0.)
1177 | valid_err = pred_probs(f_log_probs, prepare_data,
1178 | model_options, valid).mean()
1179 |
1180 | print 'Valid ', valid_err
1181 |
1182 | params = copy.copy(best_p)
1183 | numpy.savez(saveto, zipped_params=best_p,
1184 | history_errs=history_errs,
1185 | uidx=uidx,
1186 | **params)
1187 |
1188 | return valid_err
1189 |
1190 | def grad_clip(dJ, clip_c=1):
1191 | clip_c = float(clip_c)
1192 | if clip_c > 0.:
1193 | g2 = 0.
1194 | for g in dJ:
1195 | g2 += (g ** 2).sum()
1196 | new_grads = []
1197 | for g in dJ:
1198 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g))
1199 | dJ = new_grads
1200 | return dJ
1201 |
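# A NumPy sketch of the global-norm clipping implemented by _clip/grad_clip
# above: if the joint L2 norm of all gradients exceeds clip_c, every gradient
# is rescaled by clip_c / norm so that the joint norm becomes exactly clip_c.
import numpy as np

def grad_clip_np(grads, clip_c=1.):
    g2 = sum((g ** 2).sum() for g in grads)
    if g2 > clip_c ** 2:
        grads = [g / np.sqrt(g2) * clip_c for g in grads]
    return grads

gs = [np.array([3., 4.]), np.array([12.])]           # joint norm = 13
clipped = grad_clip_np(gs, clip_c=1.)
print(np.sqrt(sum((g ** 2).sum() for g in clipped))) # -> 1.0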
1202 | if __name__ == '__main__':
1203 | pass
1204 |
--------------------------------------------------------------------------------
/noisy_translator.py:
--------------------------------------------------------------------------------
1 | """
2 | Neural Machine Translation with Reinforcement Bias
3 | """
4 |
5 | from nmt_uni import *
6 | from reward import translation_cost
7 | import time
8 |
9 | time = time.time
10 |
11 | # utility functions
12 | def _seqs2words(caps, idict):
13 | capsw = []
14 | for cc in caps:
15 | ww = []
16 | for w in cc:
17 | if w == 0:
18 | break
19 | ww.append(idict[w])
20 | capsw.append(' '.join(ww))
21 | return capsw
22 |
23 | def _bpe2words(capsw):
24 | capw = []
25 | for cc in capsw:
26 | capw += [cc.replace('@@ ', '')]
27 | return capw
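# Example sketch with a hypothetical index-to-word dictionary:
#   idict = {1: 'UNK', 2: 'der@@', 3: 'Hund'}
#   _seqs2words([[2, 3, 0]], idict)  ->  ['der@@ Hund']   # stops at EOS (0)
#   _bpe2words(['der@@ Hund'])       ->  ['derHund']      # BPE markers merged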
28 |
29 | def _action2delay(src, actions):
30 | delays = []
31 | X = len(src)
32 | for act in actions:
33 | A = numpy.array(act, dtype='float32')
34 | Y = numpy.sum(act)
35 | S = numpy.sum(numpy.cumsum(1 - A) * A)
36 |
37 | assert (X > 0) and (Y > 0), 'avoid NAN {}, {}'.format(X, Y)
38 |
39 | tau = S / (Y * X)
40 | delays.append([tau, X, Y, S])
41 |
42 | return delays
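# Worked example (illustration only): for a source of length X = 3 and one
# action sequence act = [0, 1, 0, 1, 1] (0 = wait/read, 1 = commit/write):
#   A = [0, 1, 0, 1, 1], cumsum(1 - A) = [1, 1, 2, 2, 2]
#   S = sum(cumsum(1 - A) * A) = 1 + 2 + 2 = 5, and Y = 3 commits
#   tau = S / (Y * X) = 5 / 9 ~ 0.56
# i.e. tau is the average fraction of the source read before each emitted word.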
43 |
44 |
45 | # padding for computing policy gradient
46 | def _padding(arrays, shape, dtype='float32', return_mask=False, sidx=0):
47 | B = numpy.zeros(shape, dtype=dtype)
48 |
49 | if return_mask:
50 | M = numpy.zeros((shape[0], shape[1]), dtype='float32')
51 |
52 | for it, arr in enumerate(arrays):
53 | arr = numpy.asarray(arr, dtype=dtype)
54 | # print arr.shape
55 |
56 | steps = arr.shape[0]
57 |
58 | if arr.ndim < 2:
59 | B[sidx: steps + sidx, it] = arr
60 | else:
61 | steps2 = arr.shape[1]
62 | B[sidx: steps + sidx, it, : steps2] = arr
63 |
64 | if return_mask:
65 | M[sidx: steps + sidx, it] = 1.
66 |
67 | if return_mask:
68 | return B, M
69 | return B
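# Usage sketch (numbers are illustrative): pad two observation sequences of
# lengths 2 and 3 into one (max_steps, batch, dim) tensor plus a step mask:
#   obs = [numpy.ones((2, 4)), numpy.ones((3, 4))]
#   p_obs, p_mask = _padding(obs, shape=(3, 2, 4), return_mask=True)
#   # p_obs.shape == (3, 2, 4), p_mask[:, 0] == [1, 1, 0], p_mask[:, 1] == [1, 1, 1]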
70 |
71 |
72 | class PIPE(object):
73 | def __init__(self, keys=None):
74 | self.messages = OrderedDict()
75 | self.hyp_messages = OrderedDict()
76 | self.new_hyp_messages = OrderedDict()
77 | for key in keys:
78 | self.messages[key] = []
79 |
80 | def reset(self):
81 | for key in self.messages:
82 | self.messages[key] = []
83 |
84 | self.hyp_messages = OrderedDict()
85 | self.new_hyp_messages = OrderedDict()
86 |
87 | def clean_hyp(self):
88 | self.hyp_messages = OrderedDict()
89 |
90 | def clean_new_hyp(self):
91 | self.new_hyp_messages = OrderedDict()
92 |
93 | def init_hyp(self, key, live_k=None):
94 | if live_k is not None:
95 | self.hyp_messages[key] = [[] for _ in xrange(live_k)]
96 | else:
97 | self.hyp_messages[key] = []
98 |
99 | def init_new_hyp(self, key, use_copy=False):
100 | if use_copy:
101 | self.new_hyp_messages[key] = copy.copy(self.hyp_messages[key])
102 | else:
103 | self.new_hyp_messages[key] = []
104 |
105 | def append(self, key, new, idx=None, use_hyp=False):
106 | if not use_hyp:
107 | self.new_hyp_messages[key].append(new)
108 | else:
109 | self.new_hyp_messages[key].append(self.hyp_messages[key][idx] + [new])
110 |
111 | def append_new(self, key, idx, hyper=True):
112 | if hyper:
113 | self.hyp_messages[key].append(self.new_hyp_messages[key][idx])
114 | else:
115 | # print self.messages['sample']
116 | self.messages[key].append(self.new_hyp_messages[key][idx])
117 |
118 | def add(self, key, new, idx):
119 | self.new_hyp_messages[key][idx] += new
120 |
121 | def asarray(self, key, replace=False):
122 | if replace:
123 | self.hyp_messages[key] = numpy.array(self.hyp_messages[key])
124 | else:
125 | return numpy.array(self.hyp_messages[key], dtype='float32')
126 |
127 | def split(self):
128 | truth = OrderedDict()
129 | sample = OrderedDict()
130 |
131 |
132 | for key in self.messages:
133 | if key == 'source':
134 | continue
135 |
136 | truth[key] = []
137 | sample[key] = []
138 |
139 | if key == 'mask':
140 | for idx in xrange(len(self.messages['source'])):
141 | if self.messages['source'][idx] < 0:
142 | sample[key].append(self.messages[key][:, idx])
143 | else:
144 | truth[key].append(self.messages[key][:, idx])
145 | else:
146 | for idx in xrange(len(self.messages['source'])):
147 | if self.messages['source'][idx] < 0:
148 | sample[key].append(self.messages[key][idx])
149 | else:
150 | truth[key].append(self.messages[key][idx])
151 |
152 | self.messages = sample
153 | return truth
154 |
155 |
156 |
157 | # ============================================================================ #
158 | # Noisy Decoding in Batch-Mode
159 | # ============================================================================ #
160 | def noisy_decoding(f_sim_ctx,
161 | f_sim_init,
162 | f_sim_next,
163 | f_cost,
164 | srcs, # source sentences
165 | trgs, # target sentences
166 | t_idict=None,
167 | _policy=None,
168 | n_samples=10,
169 | maxlen=200,
170 | reward_config=None,
171 | train=False):
172 | """
173 | :param f_init: initializer using the first "sidx" words.
174 | :param f_sim_next:
175 | :param f_partial:
176 | :param src: the original input needed to be translated (just for the speed)
177 | :param step: step_size for each wait
178 | :param peek:
179 | hidden0 = _policy.init_hidden()
180 | :param sidx: pre-read sidx words from the source
181 | :return:
182 | """
183 | Statistcs = OrderedDict()
184 | n_sentences = len(srcs)
185 | max_steps = -1
186 |
187 | # ======================================================================== #
188 | # Generating Trajectories based on Current Policy
189 | # ======================================================================== #
190 |
191 | live_k = n_samples * n_sentences
192 | live_all = live_k
193 |
194 | x, ctx0, z0, secs0 = [], [], [], []
195 | # data initialization
196 | for id, (src, trg) in enumerate(zip(srcs, trgs)):
197 |
198 | _x = numpy.array(src, dtype='int64')[:, None]
199 | _ctx0 = f_sim_ctx(_x)
200 | _z0 = f_sim_init(_ctx0[:sidx, :])
201 |
202 | x.append(_x[:, 0])
203 | ctx0.append(_ctx0[:, 0, :])
204 | z0.append(_z0.flatten())
205 | secs0.append([id, len(src), 0]) # word id / source length / correctness
206 |
207 | # pad the results
208 | x, x_mask = _padding(x, (src_max, n_sentences), dtype='int64', return_mask=True)
209 | ctx = _padding(ctx0, (src_max, n_sentences, ctx0[0].shape[-1]))
210 | z0 = numpy.asarray(z0)
211 | mask = x_mask
212 |
213 | # initial actions and hidden states
214 | action0, _, _, hidden0 = _policy.init_action(n_samples=n_samples)
215 |
216 | x_mask = numpy.ones_like(x, dtype='float32')
217 | mask0 = x_mask
218 |
219 | # if we have multiple samples for one input sentence
220 | mask = numpy.tile(mask0, [1, n_samples])
221 | z0 = numpy.tile(z0, [n_samples, 1])
222 | ctx = numpy.tile(ctx, [1, n_samples, 1])
223 |
224 | hidden0 = numpy.tile(hidden0, [live_k, 1])
225 | action0 = numpy.tile(action0, [live_k, 1])
226 |
227 | secs = []
228 | for _ in xrange(live_k / n_sentences):
229 | secs += copy.deepcopy(secs0)
230 |
231 | # PIPE for message passing
232 | pipe = PIPE(['sample', 'score', 'action', 'obs', 'attentions','secs'])
233 |
234 | # Build for the temporal results: hyp-message
235 | for key in ['sample', 'obs', 'attentions', 'hidden', 'action']:
236 | pipe.init_hyp(key, live_k)
237 |
238 | # special care
239 | pipe.hyp_messages['score'] = numpy.zeros(live_k).astype('float32')
240 | pipe.hyp_messages['secs'] = secs
241 | pipe.hyp_messages['states'] = z0
242 | pipe.hyp_messages['mask'] = mask
243 | pipe.hyp_messages['ctx'] = ctx
244 |
245 | # these are inputs that needs to be updated
246 | prev_w = -1 * numpy.ones((live_k, )).astype('int64')
247 | prev_z = z0
248 | prev_hid = hidden0
249 | prev_noise = action0
250 | step = 0
251 |
252 | # ROLLOUT: Iteration until all the samples over.
253 | # Action space:
254 | # =======================================================================
255 | while live_k > 0:
256 |
257 | step += 1
258 |
259 | # compute one step
260 | inps = [prev_w, ctx, mask, prev_z, prev_noise]
261 | next_p, _, next_z, next_o, next_a = f_sim_next(*inps)
262 |
263 | # obtain the candidate and the accumulated score.
264 | _cand = next_p.argmax(axis=-1) # live_k
265 | _score = next_p[range(live_k), _cand]
266 |
267 | # new place-holders for temporal results: new-hyp-message
268 | pipe.clean_new_hyp()
269 |
270 | for key in ['sample', 'score', 'attentions', 'secs', 'mask', 'ctx', 'states']:
271 | pipe.init_new_hyp(key, use_copy=True)
272 |
273 | for key in ['action', 'obs', 'hidden']:
274 | pipe.init_new_hyp(key, use_copy=False)
275 |
276 |
277 | # Rollout the action.
278 | _actions, _mean, _logstd, _hidden = _policy.action(next_o, prev_hid) # input the current observation
279 |
280 |
281 | # check each candidate
282 | for idx, wi in enumerate(_cand):
283 |
284 | # collect the action
285 | a = _actions[idx] # 1024-D Gaussian Vector
286 |
287 | # message appending
288 | pipe.append('obs', next_o[idx], idx=idx, use_hyp=True)
289 | pipe.append('action', a, idx=idx, use_hyp=True) # collect action.
290 | pipe.append('hidden', _hidden[idx])
291 |
292 | # for commit:
293 | # update new_hyp_message
294 | pipe.add('sample', [wi], idx)
295 | pipe.add('score', _score[idx], idx)
296 | pipe.add('attentions', [next_a[idx]], idx)
297 |
298 | # *** special care
299 | pipe.new_hyp_messages['states'][idx] = next_z[idx]
300 |
301 |
302 | # kill the completed samples, so I need to build new hyp-messages
303 | pipe.clean_hyp()
304 |
305 | for key in ['sample', 'score', 'states',
306 | 'action', 'obs', 'attentions', 'hidden',
307 | 'ctx', 'secs', 'mask']:
308 | pipe.init_hyp(key)
309 |
310 |
311 | # print new_hyp_sample
312 | for idx in xrange(len(pipe.new_hyp_messages['sample'])):
313 | # check if it reaches the end
314 |
315 | if (len(pipe.new_hyp_messages['sample'][idx]) >= maxlen) or \
316 | (pipe.new_hyp_messages['sample'][idx][-1] == 0):
317 |
318 | for key in ['sample', 'score', 'action', 'obs', 'attentions']:
319 | pipe.append_new(key, idx, hyper=False)
320 |
321 | live_k -= 1
322 |
323 | else:
324 |
325 | for key in ['sample', 'score', 'states', 'action',
326 | 'obs', 'attentions', 'hidden']:
327 | pipe.append_new(key, idx, hyper=True)
328 |
329 | # *** special care ***
330 | pipe.hyp_messages['secs'].append(pipe.new_hyp_messages['secs'][idx])
331 | pipe.hyp_messages['mask'].append(pipe.new_hyp_messages['mask'][:, idx])
332 | pipe.hyp_messages['ctx'].append(pipe.new_hyp_messages['ctx'][:, idx])
333 |
334 |
335 |
336 | # make it numpy array
337 | for key in ['score', 'mask', 'ctx', 'states', 'hidden']:
338 | pipe.asarray(key, True)
339 |
340 | pipe.hyp_messages['mask'] = pipe.hyp_messages['mask'].T
341 | if pipe.hyp_messages['ctx'].ndim == 3:
342 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'].transpose(1, 0, 2)
343 | elif pipe.hyp_messages['ctx'].ndim == 2:
344 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'][:, None, :]
345 |
346 | prev_z = pipe.hyp_messages['states']
347 | prev_hid = pipe.hyp_messages['hidden']
348 | mask = pipe.hyp_messages['mask']
349 | ctx = pipe.hyp_messages['ctx']
350 |
351 | prev_w = numpy.array([w[-1] if len(w) > 0
352 | else -1 for w in pipe.hyp_messages['sample']],
353 | dtype='int64')
354 |
355 | mask = numpy.tile(mask0, [1, live_k])
356 |
357 | prev_noise = numpy.array([a[-1] for a in pipe.hyp_messages['action']], dtype='float32')
358 | # prev_noise = numpy.concatenate(pipe.hyp_messages['action'], axis=0)
359 |
360 |
361 | # =======================================================================
362 | # Collecting Rewards.
363 | # =======================================================================
364 | # print 'collect reward'
365 | R = []
366 | track = []
367 | reference = [_bpe2words(_seqs2words([trg], t_idict))[0].split()]
368 | for k in xrange(n_samples):
369 | sp, sc, act = [pipe.messages[key][k] for key in ['sample', 'score', 'action']]
370 | y = numpy.asarray(sp, dtype='int64')[:, None]
371 | y_mask = numpy.ones_like(y, dtype='float32')
372 | steps = len(act)
373 |
374 | # turn back to sentence level
375 | words = _seqs2words([sp], t_idict)[0]
376 | decoded = _bpe2words([words])[0].split()
377 |
378 | # -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
379 | # reward configs
380 | keys = {"steps": steps, "y": y,
381 | "y_mask": y_mask,
382 | "x_mask": x_mask,
383 | "f_cost": f_cost,
384 | "sample": decoded,
385 | "reference": reference,
386 | "words": words}
387 |
388 | ret = translation_cost(**keys)
389 | Rk, bleu = ret
390 |
391 | R += [Rk]
392 | track += [bleu]
393 |
394 | pipe.messages['R'] = R
395 | pipe.messages['track'] = track
396 |
397 | # --------------------------------------------------- #
398 | # add to global lists.
399 | keywords = ['sample', 'action', 'obs', 'secs',
400 | 'attentions', 'score', 'track', 'R']
401 | for k in keywords:
402 | if k not in Statistcs:
403 | Statistcs[k] = pipe.messages[k]
404 | else:
405 | Statistcs[k] += pipe.messages[k]
406 |
407 |
408 | # If not train, End here
409 | if not train:
410 | return Statistcs
411 |
412 | # ================================================================================================= #
413 | # Policy Gradient over Trajectories
414 | # ================================================================================================= #
415 |
416 | p_obs, p_mask \
417 | = _padding(Observations,
418 | shape=(max_steps, n_samples * n_sentences, _policy.n_in),
419 | return_mask=True)
420 | p_r = _padding(Rewards,
421 | shape=(max_steps, n_samples * n_sentences))
422 | p_act = _padding(Actions,
423 | shape=(max_steps, n_samples * n_sentences, _policy.n_out))
424 |
425 |
426 | # print 'learning policy gradient'
427 | # learning
428 | info = _policy.get_learner()([p_obs, p_mask], p_act, p_r)
429 |
430 | # add the reward statistics
431 | q = Tracks
432 | info['Q'] = numpy.mean(q)
433 | info['A'] = numpy.mean(p_act)
434 |
435 | return Samples, Scores, Actions, Rewards, info
436 |
437 |
438 |
--------------------------------------------------------------------------------
/noisytrans_training.py:
--------------------------------------------------------------------------------
1 | """
2 | Training the Noisy Translator with Policy Gradients
3 | """
4 | import argparse
5 | import os
6 | import cPickle as pkl
7 |
8 | from nmt_uni import *
9 | from policy import Controller as Policy
10 | from utils import Progbar, Monitor
11 | from noisy_translator import noisy_decoding
12 | from noisy_translator import _seqs2words, _bpe2words, _action2delay
13 |
14 | import time
15 |
16 |
17 | numpy.random.seed(19920206)
18 | timer = time.time
19 |
20 | # check hidden folders
21 | def check_env():
22 | paths = ['.policy', '.pretrained', '.log',
23 | '.config', '.images', '.translate']
24 | for p in paths:
25 | if not os.path.exists(p):
26 | os.mkdir(p)
27 |
28 |
29 | # run training function:: >>>
30 | def run_simultrans(model,
31 | options_file=None,
32 | config=None,
33 | policy=None,
34 | id=None,
35 | remote=False):
36 | # check environments
37 | check_env()
38 | if id is not None:
39 | fcon = '.config/{}.conf'.format(id)
40 | if os.path.exists(fcon):
41 | print 'load config files'
42 | policy, config = pkl.load(open(fcon, 'r'))
43 |
44 | # ======================================================================= #
45 | # load model model_options
46 | # ======================================================================= #
47 | _model = model
48 | model = '.pretrained/{}'.format(model)
49 |
50 | if options_file is not None:
51 | with open(options_file, 'rb') as f:
52 | options = pkl.load(f)
53 | else:
54 | with open('%s.pkl' % model, 'rb') as f:
55 | options = pkl.load(f)
56 | options['birnn'] = True
57 |
58 | print 'load options...'
59 | for w, p in sorted(options.items(), key=lambda x:x[0]):
60 | print '{}: {}'.format(w, p)
61 |
62 | # load detail settings from option file:
63 | dictionary, dictionary_target = options['dictionaries']
64 |
65 | def _iter(fname):
66 | with open(fname, 'r') as f:
67 | for line in f:
68 | words = line.strip().split()
69 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
70 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
71 | x += [0]
72 | yield x
73 |
74 | def _check_length(fname):
75 | f = open(fname, 'r')
76 | count = 0
77 | for _ in f:
78 | count += 1
79 | f.close()
80 |
81 | return count
82 |
83 | # load source dictionary and invert
84 | with open(dictionary, 'rb') as f:
85 | word_dict = pkl.load(f)
86 | word_idict = dict()
87 | for kk, vv in word_dict.iteritems():
88 | word_idict[vv] = kk
89 | word_idict[0] = ''
90 | word_idict[1] = 'UNK'
91 |
92 | # load target dictionary and invert
93 | with open(dictionary_target, 'rb') as f:
94 | word_dict_trg = pkl.load(f)
95 | word_idict_trg = dict()
96 | for kk, vv in word_dict_trg.iteritems():
97 | word_idict_trg[vv] = kk
98 | word_idict_trg[0] = ''
99 | word_idict_trg[1] = 'UNK'
100 |
101 | # ======================================================================== #
102 | # Build a Translator
103 | # ======================================================================== #
104 |
105 | # allocate model parameters
106 | params = init_params(options)
107 | params = load_params(model, params)
108 | tparams = init_tparams(params)
109 |
110 | # print 'build the model for computing cost (full source sentence).'
111 | trng, use_noise, \
112 | _x, _x_mask, _y, _y_mask, \
113 | opt_ret, \
114 | cost, f_cost = build_model(tparams, options)
115 | print 'done.'
116 |
117 | # functions for sampler
118 | # f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)
119 | f_sim_ctx, f_sim_init, f_sim_next = build_noisy_sampler(tparams, options, trng)
120 | print 'build sampler done.'
121 |
122 | # check the ID:
123 | policy['base'] = _model
124 | _policy = Policy(trng, options, policy, config,
125 | n_out=options['dim'],
126 | recurrent=True, id=id)
127 |
128 |
129 | # DATASET
130 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
131 | options['dictionaries'][0], options['dictionaries'][1],
132 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
133 | batch_size=config['batchsize'],
134 | maxlen=options['maxlen'])
135 |
136 | train_num = trainIter.num
137 |
138 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
139 | options['dictionaries'][0], options['dictionaries'][1],
140 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
141 | batch_size=1,
142 | maxlen=options['maxlen'])
143 |
144 | valid_num = validIter.num
145 |
146 | valid_ = options['valid_datasets'][0]
147 | valid_num = _check_length(valid_)
148 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num)
149 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65))
150 |
151 | # Translator model
152 | def _translate(src, trg, train=False, samples=80):
153 | ret = noisy_decoding(
154 | f_sim_ctx, f_sim_init,
155 | f_sim_next, f_cost,
156 | src, trg, word_idict_trg, n_samples=samples,
157 | train=train,
158 | _policy=_policy)
159 |
160 | if not train:
161 | sample, score, actions, R, tracks, attentions = ret
162 | return sample, score, actions, R, tracks
163 | else:
164 | sample, score, actions, R, info = ret
165 | return sample, score, actions, R, info
166 |
167 |
168 | # ======================================================================== #
169 | # Main Loop: Run
170 | # ======================================================================== #
171 | print 'Start Simultaneous Translator...'
172 | probar = Progbar(train_num / config['batchsize'], with_history=False)
173 |
174 | # freqs
175 | save_freq = 2000
176 | sample_freq = 10
177 | valid_freq = 1000
178 | valid_size = 200
179 | display_freq = 50
180 |
181 | history, last_it = _policy.load()
182 | time0 = timer()
183 |
184 | for it, (srcs, trgs) in enumerate(trainIter): # only one sentence each iteration
185 | if it < last_it: # go over the scanned lines.
186 | continue
187 |
188 | samples, scores, actions, rewards, info = _translate(srcs, trgs, train=True)
189 | if it % sample_freq == 0:
190 |
191 | print '\nModel has been trained for {} seconds'.format(timer() - time0)
192 | print 'source: ', _bpe2words(_seqs2words([srcs[0]], word_idict))[0]
193 | print 'target: ', _bpe2words(_seqs2words([trgs[0]], word_idict_trg))[0]
194 |
195 | # obtain the translation results
196 | samples = _bpe2words(_seqs2words(samples, word_idict_trg))
197 |
198 | print '---'
199 | print 'sample: ', samples[40]
200 | print 'sample: ', samples[60]
201 |
202 | values = [(w, info[w]) for w in info]
203 | probar.update(it + 1, values=values)
204 |
205 | # NaN detector
206 | for w in info:
207 | if numpy.isnan(info[w]) or numpy.isinf(info[w]):
208 | raise RuntimeError, 'NaN/INF is detected!! {} : ID={}'.format(w, id)
209 |
210 |
211 |
212 | if __name__ == "__main__":
213 | parser = argparse.ArgumentParser()
214 | parser.add_argument('-m', '--model',
215 | default='model_wmt15_bpe2k_basic_cs-en.npz')
216 | parser.add_argument('--id', type=str, default=None)
217 | parser.add_argument('-o', type=str, default=None)
218 |
219 | args = parser.parse_args()
220 | print args
221 |
222 | policy = OrderedDict()
223 | policy['layernorm'] = True
224 | policy['upper'] = False
225 | policy['updater'] = 'REINFORCE'
226 | policy['type'] = 'gaussian'
227 |
228 | config = OrderedDict()
229 | config['batchsize'] = 1
230 | config['Rtype'] = 8
231 |
232 | run_simultrans(args.model,
233 | options_file=args.o,
234 | config=config,
235 | policy=policy,
236 | id=args.id,
237 | remote=False)
238 |
239 |
240 |
--------------------------------------------------------------------------------
/optimizer.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as tensor
3 | import numpy
4 |
5 | from layers import *
6 | profile = False
7 |
8 | # optimizers
9 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
10 |
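# Usage sketch (not invoked in this file): all optimizers below share the same
# two-function protocol -- compile once, then call per minibatch:
#
#   lr = tensor.scalar(name='lr')
#   grads = tensor.grad(cost, wrt=itemlist(tparams))
#   f_grad_shared, f_update = adam(lr, tparams, grads, inps, cost)
#
#   cost_val = f_grad_shared(*inp_values)  # forward/backward, stash gradients
#   f_update(0.0002)                       # apply the update with this learning rate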
11 | """
12 | First order optimizer
13 | """
14 | def adam(lr, tparams, grads, inp, cost):
15 | gshared = [theano.shared(p.get_value() * 0.,
16 | name='%s_grad' % k)
17 | for k, p in tparams.iteritems()]
18 | gsup = [(gs, g) for gs, g in zip(gshared, grads)]
19 |
20 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile, on_unused_input='ignore')
21 |
22 | lr0 = lr # 0.0002
23 | b1 = 0.1
24 | b2 = 0.001
25 | e = 1e-8
26 |
27 | updates = []
28 |
29 | i = theano.shared(numpy.float32(0.))
30 | i_t = i + 1.
31 | fix1 = 1. - b1**(i_t)
32 | fix2 = 1. - b2**(i_t)
33 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1)
34 |
35 | for p, g in zip(tparams.values(), gshared):
36 | m = theano.shared(p.get_value() * 0.)
37 | v = theano.shared(p.get_value() * 0.)
38 | m_t = (b1 * g) + ((1. - b1) * m)
39 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
40 | g_t = m_t / (tensor.sqrt(v_t) + e)
41 | p_t = p - (lr_t * g_t)
42 | updates.append((m, m_t))
43 | updates.append((v, v_t))
44 | updates.append((p, p_t))
45 | updates.append((i, i_t))
46 |
47 | print 'build optimizer with Adam'
48 | f_update = theano.function([lr], [], updates=updates,
49 | on_unused_input='ignore', profile=profile)
50 |
51 | return f_grad_shared, f_update
52 |
53 |
54 | def adadelta(lr, tparams, grads, inp, cost):
55 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
56 | name='%s_grad' % k)
57 | for k, p in tparams.iteritems()]
58 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
59 | name='%s_rup2' % k)
60 | for k, p in tparams.iteritems()]
61 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
62 | name='%s_rgrad2' % k)
63 | for k, p in tparams.iteritems()]
64 |
65 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
66 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
67 | for rg2, g in zip(running_grads2, grads)]
68 |
69 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
70 | profile=profile)
71 |
72 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
73 | for zg, ru2, rg2 in zip(zipped_grads, running_up2,
74 | running_grads2)]
75 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
76 | for ru2, ud in zip(running_up2, updir)]
77 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
78 |
79 | f_update = theano.function([lr], [], updates=ru2up+param_up,
80 | on_unused_input='ignore', profile=profile)
81 |
82 | print 'build optimizer with Adadelta'
83 | return f_grad_shared, f_update
84 |
85 |
86 | def rmsprop(lr, tparams, grads, inp, cost):
87 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
88 | name='%s_grad' % k)
89 | for k, p in tparams.iteritems()]
90 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
91 | name='%s_rgrad' % k)
92 | for k, p in tparams.iteritems()]
93 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
94 | name='%s_rgrad2' % k)
95 | for k, p in tparams.iteritems()]
96 |
97 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
98 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
99 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
100 | for rg2, g in zip(running_grads2, grads)]
101 |
102 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
103 | on_unused_input='ignore', profile=profile)
104 |
105 | updir = [theano.shared(p.get_value() * numpy.float32(0.),
106 | name='%s_updir' % k)
107 | for k, p in tparams.iteritems()]
108 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
109 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
110 | running_grads2)]
111 | param_up = [(p, p + udn[1])
112 | for p, udn in zip(itemlist(tparams), updir_new)]
113 | f_update = theano.function([lr], [], updates=updir_new+param_up,
114 | on_unused_input='ignore', profile=profile)
115 |
116 | print 'build optimizer with Rmsprop'
117 | return f_grad_shared, f_update
118 |
119 |
120 | def sgd(lr, tparams, grads, x, mask, y, cost):
121 | gshared = [theano.shared(p.get_value() * 0.,
122 | name='%s_grad' % k)
123 | for k, p in tparams.iteritems()]
124 | gsup = [(gs, g) for gs, g in zip(gshared, grads)]
125 |
126 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
127 | profile=profile)
128 |
129 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
130 | f_update = theano.function([lr], [], updates=pup, profile=profile)
131 |
132 | print 'build optimizer with SGD'
133 | return f_grad_shared, f_update
134 |
135 |
136 | """
137 | Beyond first-order optimizer
138 | """
139 | def conjugate(lr, tparams, grads, inps, cost):
140 | """
141 | Performs constrained optimization via line search.
142 | The search direction is computed using a conjugate gradient algorithm,
143 | which gives x = A^{-1}g, where A is a second order approximation of the constraint and g is the gradient
144 | of the loss function.
145 | """
146 | pass
147 |
--------------------------------------------------------------------------------
/policy.py:
--------------------------------------------------------------------------------
1 | """
2 | -- Policy Network for decision making [more general]
3 | """
4 | from nmt_uni import *
5 | from layers import _p
6 |
7 | import os
8 | import time, datetime
9 | import cPickle as pkl
10 |
11 | # hyper params
12 | TINY = 1e-7
13 | PI = numpy.pi
14 | E = numpy.e
15 | A = 0.2
16 | B = 1
17 |
18 | class Controller(object):
19 |
20 | def __init__(self, trng,
21 | options,
22 | policy,
23 | config,
24 | n_in=None, n_out=None,
25 | recurrent=False, id=None):
26 |
27 | self.trng = trng
28 | self.options = options
29 | self.policy = policy
30 | self.recurrent = recurrent
31 | self.type = self.policy.get('type', 'categorical')
32 |
33 | self.n_hidden = 512
34 | self.n_in = n_in
35 | self.n_out = n_out
36 |
37 | if self.policy.get('layernorm', True):
38 | self.rec = 'lngru'
39 | else:
40 | self.rec = 'gru'
41 |
42 | if not n_in:
43 | self.n_in = options['readout_dim']
44 |
45 | if not n_out:
46 | if self.type == 'categorical':
47 | self.n_out = 2 # initially it is a WAIT/COMMIT action.
48 | elif self.type == 'gaussian':
49 | self.n_out = 100
50 | else:
51 | raise NotImplementedError
52 |
53 | # build the policy network
54 | print 'parameter initialization'
55 |
56 | params = OrderedDict()
57 |
58 | if not self.recurrent:
59 | print 'building a feedforward controller'
60 | params = get_layer('ff')[0](options, params, prefix='policy_net_in',
61 | nin=self.n_in, nout=self.n_hidden, scale=0.001)
62 | else:
63 | print 'building a recurrent controller'
64 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in',
65 | nin=self.n_in, dim=self.n_hidden, scale=0.001)
66 |
67 | params = get_layer('ff')[0](options, params, prefix='policy_net_out',
68 | nin=self.n_hidden,
69 | nout=self.n_out if self.type == 'categorical' else self.n_out * 2,
70 | scale=0.001)
71 |
72 | # bias the forget probability
73 | # if self.n_out == 3:
74 | # params[_p('policy_net_out', 'b')][-1] = -2
75 |
76 |
77 | # for the baseline network.
78 | params_b = OrderedDict()
79 |
80 | # using a scalar baseline [**]
81 | # params_b['b0'] = numpy.array(numpy.random.rand() * 0.0, dtype='float32')
82 |
83 | # using a MLP as a baseline
84 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_in',
85 | nin=self.n_in, nout=128, scale=0.001)
86 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_out',
87 | nin=128, nout=1, scale=0.001)
88 |
89 | if id is not None:
90 | print 'reload the saved model: {}'.format(id)
91 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params)
92 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b)
93 | else:
94 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S')
95 | print 'start from a new model: {}'.format(id)
96 |
97 | with open('.config/conf.{}.txt'.format(id), 'w') as f:
98 | f.write('[config]\n')
99 |
100 | for c in config:
101 | f.write('{}: {}\n'.format(c, config[c]))
102 | f.write('\n')
103 |
104 | f.write('[policy]\n')
105 |
106 | for c in policy:
107 | f.write('{}: {}\n'.format(c, policy[c]))
108 |
109 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w'))
110 | print 'save the config file'
111 |
112 | self.id = id
113 | self.model = '.policy/{}-{}'.format(id, self.policy['base'])
114 |
115 | # theano shared params
116 | tparams = init_tparams(params)
117 | tparams_b = init_tparams(params_b)
118 |
119 | if ('bn' in policy) and policy['bn']:
120 | # params for input-batch normalization
121 | self.gamma = theano.shared(numpy.asarray(numpy.random.uniform(
122 | low=-1.0 / numpy.sqrt(self.n_in),
123 | high=1.0 / numpy.sqrt(self.n_in),
124 | size=(self.n_in)), dtype=theano.config.floatX), name='policy_gamma', borrow=True)
125 | self.beta = theano.shared(numpy.zeros(
126 | (self.n_in), dtype=theano.config.floatX), name='policy_beta', borrow=True)
127 |
128 | self.mean = theano.shared(numpy.zeros((self.n_in), dtype=theano.config.floatX), name='mean', borrow=True)
129 | self.var = theano.shared(numpy.ones((self.n_in), dtype=theano.config.floatX), name='var', borrow=True)
130 | tparams['gamma'] = self.gamma
131 | tparams['beta'] = self.beta
132 |
133 | self.tparams = tparams
134 | self.tparams_b = tparams_b
135 |
136 | # build the policy network
137 | self.build_sampler(options=options)
138 | self.build_discriminator(options=options)
139 |
140 |
141 | def build_batchnorm(self, observation, mask=None):
142 | raise NotImplementedError
143 |
144 |
145 | def build_sampler(self, options):
146 |
147 | # ==================================================================================== #
148 | # Build Action function: samplers
149 | # ==================================================================================== #
150 |
151 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1)
152 | prev_hidden = tensor.matrix('p_hidden', dtype='float32')
153 |
154 | if not self.recurrent:
155 | hiddens = get_layer('ff')[1](self.tparams, observation,
156 | options, prefix='policy_net_in',
157 | activ='tanh')
158 | else:
159 | hiddens = get_layer(self.rec)[1](self.tparams, observation,
160 | options, prefix='policy_net_in', mask=None,
161 | one_step=True, _init_state=prev_hidden)[0]
162 |
163 | act_inps = [observation, prev_hidden]
164 | if self.type == 'categorical':
165 | act_prob = get_layer('ff')[1](self.tparams, hiddens, options,
166 | prefix='policy_net_out',
167 | activ='softmax'
168 | ) # batch_size x n_out
169 |
170 | # add action mask
171 | if self.policy.get('act_mask', False):
172 | act_mask = tensor.matrix('act_mask', dtype='float32')
173 | act_inps += [act_mask]
174 | act_prob *= act_mask
175 | act_prob /= (act_prob.sum(axis=-1, keepdims=True) + TINY)
176 | act_prob *= act_mask
177 |
178 | act_prob2 = tensor.clip(act_prob, TINY, 1 - TINY)
179 |
180 | # testing upper bound
181 | # if self.policy['upper']:
182 | # act_prob *= 0.0
183 |
184 | # compiling the sampling function for action
185 | # action = self.trng.binomial(size=act_prop.shape, p=act_prop)
186 | action = self.trng.multinomial(pvals=act_prob).argmax(1) # 0, 1, ...
187 |
188 | print 'build action sampling function [Discrete]'
189 | self.f_action = theano.function(act_inps, [action, act_prob, hiddens, act_prob2],
190 | on_unused_input='ignore') # action/dist/hiddens
191 |
192 | elif self.type == 'gaussian':
193 | _temp = get_layer('ff')[1](self.tparams, hiddens, options,
194 | prefix='policy_net_out',
195 | activ='linear'
196 | ) # batch_size x n_out
197 | mean, log_std = _temp[:, :self.n_out], _temp[:, self.n_out:]
198 | mean, log_std = -A * tanh(mean), -B-relu(log_std)
199 |
200 | action0 = self.trng.normal(size=mean.shape, dtype='float32')
201 | action = action0 * tensor.exp(log_std) + mean
202 |
203 |
204 | print 'build action sampling function [Gaussian]'
205 | self.f_action = theano.function(act_inps, [action, mean, log_std, hiddens],
206 | on_unused_input='ignore') # action/dist/hiddens
207 | else:
208 | raise NotImplementedError
209 |
210 |
211 | def build_discriminator(self, options):
212 | # ==================================================================================== #
213 | # Build Action Discriminator
214 | # ==================================================================================== #
215 |
216 | observations = tensor.tensor3('observations', dtype='float32')
217 | mask = tensor.matrix('mask', dtype='float32')
218 | if self.type == 'categorical':
219 | actions = tensor.matrix('actions', dtype='int64')
220 | elif self.type == 'gaussian':
221 | actions = tensor.tensor3('actions', dtype='float32')
222 | else:
223 | raise NotImplementedError
224 |
225 |
226 | if not self.recurrent:
227 | hiddens = get_layer('ff')[1](self.tparams, observations,
228 | options, prefix='policy_net_in',
229 | activ='tanh')
230 | else:
231 | hiddens = get_layer(self.rec)[1](self.tparams, observations,
232 | options, prefix='policy_net_in', mask=mask)[0]
233 |
234 | act_inputs = [observations, mask]
235 | if self.type == 'categorical':
236 | act_probs = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out',
237 | activ='softmax') # seq_steps x batch_size x n_out
238 |
239 | if 'act_mask' in self.policy and self.policy['act_mask']:
240 | act_masks = tensor.tensor3('act_masks', dtype='float32')
241 | act_inputs += [act_masks]
242 | act_probs *= act_masks
243 | act_probs /= (act_probs.sum(axis=-1, keepdims=True) + TINY)
244 | act_probs *= act_masks
245 |
246 | act_probs = tensor.clip(act_probs, TINY, 1 - TINY)
247 |
248 | print 'build action distribution'
249 | self.f_probs = theano.function(act_inputs, act_probs,
250 | on_unused_input='ignore') # get the action probabilities
251 | elif self.type == 'gaussian':
252 | _temps = get_layer('ff')[1](self.tparams, hiddens, options,
253 | prefix='policy_net_out',
254 | activ='linear'
255 | ) # batch_size x n_out
256 | means, log_stds = _temps[:, :, :self.n_out], _temps[:, :, self.n_out:]
257 | means, log_stds = -A * tanh(means), -B-relu(log_stds)
258 |
259 | act_probs = [means, log_stds]
260 |
261 | print 'build Gaussian PDF'
262 | self.f_pdf = theano.function(act_inputs, [means, log_stds],
263 | on_unused_input='ignore') # get the action probabilities
264 | else:
265 | raise NotImplementedError
266 |
267 |
268 | # ==================================================================================== #
269 | # Build Baseline Network (Input-dependent Value Function) & Advantages
270 | # ==================================================================================== #
271 |
272 | print 'setup the advantages & baseline network'
273 | reward = tensor.matrix('reward') # seq_steps x batch_size :: rewards for each steps
274 |
275 | # baseline is estimated with a 2-layer neural network.
276 | hiddens_b = get_layer('ff')[1](self.tparams_b, observations, options,
277 | prefix='baseline_net_in',
278 | activ='tanh')
279 | baseline = get_layer('ff')[1](self.tparams_b, hiddens_b, options,
280 | prefix='baseline_net_out',
281 | activ='linear')[:, :, 0] # seq_steps x batch_size or batch_size
282 | advantages = self.build_advantages(act_inputs, reward, baseline, normalize=True)
283 |
284 |
285 | # ==================================================================================== #
286 | # Build Policy Gradient (here we provide two options)
287 | # ==================================================================================== #
288 | if self.policy['updater'] == 'REINFORCE':
289 | print 'build REINFORCE.'
290 | self.build_reinforce(act_inputs, act_probs, actions, advantages)
291 |
292 | elif self.policy['updater'] == 'TRPO':
293 | print 'build TRPO'
294 | self.build_trpo(act_inputs, act_probs, actions, advantages)
295 | else:
296 | raise NotImplementedError
297 |
298 | # ==================================================================================== #
299 | # Controller Actions
300 | # ==================================================================================== #
301 | def random(self, states, p=0.5):
302 | live_k = states.shape[0]
303 | return (numpy.random.random(live_k) > p).astype('int64'), \
304 | numpy.ones(live_k) * p
305 |
306 | def action(self, states, prevhidden, act_mask=None):
307 | if act_mask is None:
308 | return self.f_action(states, prevhidden)
309 | else:
310 | return self.f_action(states, prevhidden, act_mask)
311 |
312 |
313 | def init_hidden(self, n_samples=1):
314 | return numpy.zeros((n_samples, self.n_hidden), dtype='float32')
315 |
316 | def init_action(self, n_samples=1):
317 | states0 = numpy.zeros((n_samples, self.n_in), dtype='float32')
318 | return self.f_action(states0, self.init_hidden(n_samples))
319 |
320 |
321 | def get_learner(self):
322 | if self.policy['updater'] == 'REINFORCE':
323 | return self.run_reinforce
324 | elif self.policy['updater'] == 'TRPO':
325 | return self.run_trpo
326 | else:
327 | raise NotImplementedError
328 |
329 | @staticmethod
330 | def kl(prob0, prob1):
331 | p1 = (prob0 + TINY) / (prob1 + TINY)
332 | # p2 = (1 - prob0 + TINY) / (1 - prob1 + TINY)
333 | return tensor.sum(prob0 * tensor.log(p1), axis=-1)
334 |
335 |
336 | @staticmethod
337 | def _grab_prob(probs, X):
338 | assert probs.ndim == 3
339 |
340 | batch_size = probs.shape[1]
341 | max_len = probs.shape[0]
342 | vocab_size = probs.shape[2]
343 |
344 | probs = probs.reshape((batch_size * max_len, vocab_size))
345 | return probs[tensor.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing
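# Illustration: with probs of shape (steps, batch, vocab) and integer actions
# X of shape (steps, batch), _grab_prob returns a (steps, batch) matrix whose
# [t, b] entry is probs[t, b, X[t, b]], i.e. the probability the policy
# assigned to the action that was actually taken at that step.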
346 |
347 | def cross(self, probs, actions):
348 | # return tensor.log(probs) * actions + tensor.log(1 - probs) * (1 - actions)
349 | return self._grab_prob(tensor.log(probs), actions)
350 |
351 | def build_advantages(self, act_inputs, reward, baseline, normalize=True):
352 | # TODO: maybe we need a discount factor gamma for advantages.
353 | # TODO: we can also rewrite advantages with value functions (GAE)
354 |
355 | # Advantages and Normalization the return
356 | reward_adv = reward - baseline
357 | mask = act_inputs[1]
358 |
359 | if normalize:
360 | reward_mean = tensor.sum(mask * reward_adv) / (tensor.sum(mask) + TINY)
361 | reward_mean2 = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY)
362 | reward_std = tensor.sqrt(tensor.maximum(reward_mean2 - reward_mean ** 2, TINY))
363 | # reward_std = tensor.maximum(reward_std, 1)
364 | reward_c = reward_adv - reward_mean # independent mean
365 | advantages = reward_c / (reward_std + TINY)
366 | else:
367 | advantages = reward_adv
368 |
369 | print 'build advantages and baseline gradient'
370 | L = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY)
371 | dL = tensor.grad(L, wrt=itemlist(self.tparams_b))
372 | lr = tensor.scalar(name='lr')
373 |
374 | inps_b = act_inputs + [reward]
375 | oups_b = [L, advantages]
376 | f_adv, f_update_b = adam(lr, self.tparams_b, dL, inps_b, oups_b)
377 | # f_adv, f_update_b = rmsprop(lr, self.tparams_b, dL, inps_b, oups_b)
378 |
379 | self.f_adv = f_adv
380 | self.f_update_b = f_update_b
381 |
382 | return advantages
383 |
384 |
385 | # ===================================================================
386 | # Policy Gradient: REINFORCE with Adam
387 | # ===================================================================
388 | def build_reinforce(self, act_inputs, act_probs, actions, advantages):
389 |
390 | mask = act_inputs[1]
391 |
392 | if self.type == 'categorical':
393 | if self.policy.get('act_mask', False):
394 | act_masks = act_inputs[2]
395 | negEntropy = tensor.sum(tensor.log(act_probs) * (act_probs * act_masks), axis=-1)
396 | else:
397 | negEntropy = tensor.sum(tensor.log(act_probs) * act_probs, axis=-1)
398 |
399 | logLikelihood = self.cross(act_probs, actions)
400 |
401 | elif self.type == 'gaussian':
402 | means, log_stds = act_probs
403 | negEntropy = -tensor.sum(log_stds + tensor.log(tensor.sqrt(2 * PI * E)), axis=-1)
404 |
405 | actions0 = (actions - means) / tensor.exp(log_stds)
406 | logLikelihood = -tensor.sum(log_stds, axis=-1) - \
407 | 0.5 * tensor.sum(tensor.sqr(actions0), axis=-1) - \
408 | 0.5 * means.shape[-1] * tensor.log(2 * PI)
409 |
410 | else:
411 | raise NotImplementedError
412 |
413 | # tensor.log(act_probs) * actions + tensor.log(1 - act_probs) * (1 - actions)
414 |
415 | H = tensor.sum(mask * negEntropy, axis=0).mean() * 0.01 # entropy penalty
416 | J = tensor.sum(mask * -logLikelihood * advantages, axis=0).mean() + H
417 | dJ = tensor.grad(J, wrt=itemlist(self.tparams))
418 |
419 | # clip the policy gradient to 1 (to avoid gradient exploding)
420 | clip_c = 1.
421 | if clip_c > 0.:
422 | g2 = 0.
423 | for g in dJ:
424 | g2 += (g ** 2).sum()
425 | new_grads = []
426 | for g in dJ:
427 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g))
428 | dJ = new_grads
429 |
430 | print 'build REINFORCE optimizer'
431 | lr = tensor.scalar(name='lr')
432 |
433 | inps = act_inputs + [actions, advantages]
434 | outps = [J, H]
435 | if self.type == 'gaussian':
436 | outps += [actions0.mean(), actions.mean()]
437 |
438 | f_cost, f_update = adam(lr, self.tparams, dJ, inps, outps)
439 | # f_cost, f_update = rmsprop(lr, self.tparams, dJ, inps, outps)
440 |
441 | self.f_cost = f_cost
442 | self.f_update = f_update
443 | print 'done'
444 |
445 |
446 | def run_reinforce(self, act_inputs, actions, reward, update=True, lr=0.0001):
447 |
448 | # sub baseline
449 | inps_adv = act_inputs + [reward]
450 | L, advantages = self.f_adv(*inps_adv)
451 |
452 | inps_reinforce = act_inputs + [actions, advantages]
453 | if self.type == 'gaussian':
454 | J, H, m, s = self.f_cost(*inps_reinforce)
455 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean(), 'm': m, 's': s}
456 | else:
457 | J, H = self.f_cost(*inps_reinforce)
458 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean()}
459 |
460 |
461 | if update: # update the parameters
462 | self.f_update_b(lr)
463 | self.f_update(lr)
464 |
465 | return info
466 |
467 |
468 | # ==================================================================================== #
469 | # Trust Region Policy Optimization
470 | # ==================================================================================== #
471 | def build_trpo(self, act_inputs, act_probs, actions, advantages):
472 |
473 | assert self.type == 'categorical', 'TRPO only supports categorical policies at this stage'
474 |
475 | # probability distribution
476 | mask = act_inputs[1]
477 | probs = act_probs
478 | probs_old = tensor.matrix(dtype='float32')
479 |
480 | logp = self.cross(probs, actions)
481 | logp_old = self.cross(probs_old, actions)
482 |
483 | # policy gradient
484 | J = tensor.sum(mask * -tensor.exp(logp - logp_old) * advantages, axis=0).mean()
485 | dJ = flatgrad(J, self.tparams)
486 | probs_fix = theano.gradient.disconnected_grad(probs)
487 |
488 | kl_fix = tensor.sum(mask * self.kl(probs_fix, probs), axis=0).mean()
489 | kl_grads = tensor.grad(kl_fix, wrt=itemlist(self.tparams))
490 | ftangents = tensor.fvector(name='flat_tan')
491 | shapes = [self.tparams[var].get_value(borrow=True).shape for var in self.tparams]
492 | start = 0
493 | tangents = []
494 | for shape in shapes:
495 | size = numpy.prod(shape)
496 | tangents.append(tensor.reshape(ftangents[start:start + size], shape))
497 | start += size
498 | gvp = tensor.add(*[tensor.sum(g * t) for (g, t) in zipsame(kl_grads, tangents)])
499 |
500 | # Fisher-vector product
501 | fvp = flatgrad(gvp, self.tparams)
502 | entropy = tensor.sum(mask * -self.cross(probs, probs), axis=0).mean()
503 | kl = tensor.sum(mask * self.kl(probs_old, probs), axis=0).mean()
504 |
505 | print 'compile the functions'
506 | inps = act_inputs + [actions, advantages, probs_old]
507 | loss = [J, kl, entropy]
508 | self.f_pg = theano.function(inps, dJ)
509 | self.f_loss = theano.function(inps, loss)
510 | self.f_fisher = theano.function([ftangents] + inps, fvp, on_unused_input='ignore')
511 |
512 | # get/set flatten params
513 | print 'compiling flat updater'
514 | self.get_flat = theano.function([], tensor.concatenate([self.tparams[v].flatten() for v in self.tparams]))
515 | theta = tensor.vector()
516 | start = 0
517 | updates = []
518 | for v in self.tparams:
519 | p = self.tparams[v]
520 | shape = p.shape
521 | size = tensor.prod(shape)
522 | updates.append((p, theta[start:start + size].reshape(shape)))
523 | start += size
524 | self.set_flat = theano.function([theta], [], updates=updates)
525 |
526 |
527 | def run_trpo(self, act_inputs, actions, reward,
528 | update=True, cg_damping=1e-3, max_kl=1e-2, lr=0.0002):
529 |
530 | # sub baseline
531 | inps_adv = act_inputs + [reward]
532 | L, advantages = self.f_adv(*inps_adv)
533 | self.f_update_b(lr)
534 |
535 | # get current action distributions
536 | probs = self.f_probs(*act_inputs)
537 | inps = act_inputs + [actions, advantages, probs]
538 | thprev = self.get_flat()
539 |
540 | def fisher_vector_product(p):
541 | return self.f_fisher(p, *inps) + cg_damping * p
542 |
543 | g = self.f_pg(*inps)
544 | losses_before = self.f_loss(*inps)
545 |
546 | if numpy.allclose(g, 0):
547 | print 'zero gradient, not updating'
548 | else:
549 | stepdir = self.cg(fisher_vector_product, -g)
550 | shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
551 | lm = numpy.sqrt(shs / max_kl)
552 |
553 | print "\nlagrange multiplier:", lm, "gnorm:", numpy.linalg.norm(g)
554 | fullstep = stepdir / lm
555 | neggdotstepdir = -g.dot(stepdir)
556 |
557 | def loss(th):
558 | self.set_flat(th)
559 | return self.f_loss(*inps)[0]
560 |
561 | print 'do line search'
562 | success, theta = self.linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
563 |
564 | print "success", success
565 | self.set_flat(theta)
566 |
567 | losses_after = self.f_loss(*inps)
568 |
569 | info = OrderedDict()
570 | for (lname, lbefore, lafter) in zipsame(['J', 'KL', 'entropy'], losses_before, losses_after):
571 | info[lname + "_before"] = lbefore
572 | info[lname + "_after"] = lafter
573 |
574 | # add the baseline loss into full information
575 | info['B_loss'] = L
576 | return info
577 |
578 |
579 | @staticmethod
580 | def linesearch(f, x, fullstep, expected_improve_rate, max_backtracks=10, accept_ratio=.1):
581 | """
582 | Backtracking linesearch, where expected_improve_rate is the slope dy/dx at the initial point
583 | """
584 | fval = f(x)
585 | print "fval before", fval
586 | for (_n_backtracks, stepfrac) in enumerate(.5 ** numpy.arange(max_backtracks)):
587 | xnew = x + stepfrac * fullstep
588 | newfval = f(xnew)
589 | actual_improve = fval - newfval
590 | expected_improve = expected_improve_rate * stepfrac
591 | ratio = actual_improve / expected_improve
592 | print "a/e/r", actual_improve, expected_improve, ratio
593 | if ratio > accept_ratio and actual_improve > 0:
594 | print "fval after", newfval
595 | return True, xnew
596 | return False, x
597 |
598 | @staticmethod
599 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
600 | """
601 | Conjugate Gradient
602 | """
603 | p = b.copy()
604 | r = b.copy()
605 | x = numpy.zeros_like(b)
606 | rdotr = r.dot(r)
607 |
608 | fmtstr = "%10i %10.3g %10.3g"
609 | titlestr = "%10s %10s %10s"
610 | if verbose: print titlestr % ("iter", "residual norm", "soln norm")
611 |
612 | for i in xrange(cg_iters):
613 | if callback is not None:
614 | callback(x)
615 | if verbose: print fmtstr % (i, rdotr, numpy.linalg.norm(x))
616 | z = f_Ax(p)
617 | v = rdotr / p.dot(z)
618 | x += v * p
619 | r -= v * z
620 | newrdotr = r.dot(r)
621 | mu = newrdotr / rdotr
622 | p = r + mu * p
623 |
624 | rdotr = newrdotr
625 | if rdotr < residual_tol:
626 | break
627 |
628 | if callback is not None:
629 | callback(x)
630 | if verbose: print fmtstr % (i + 1, rdotr, numpy.linalg.norm(x))
631 | return x
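# Usage sketch (illustrative values): cg solves A x = b for a symmetric
# positive-definite A that is only available through matrix-vector products:
#
#   A = numpy.array([[4., 1.], [1., 3.]])
#   b = numpy.array([1., 2.])
#   x = Controller.cg(lambda v: A.dot(v), b)  # x ~= numpy.linalg.solve(A, b)
#
# run_trpo calls it with the Fisher-vector product to obtain the natural
# gradient step direction.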
632 |
633 |
634 | # ====================================================================== #
635 | # Save & Load
636 | # ====================================================================== #
637 |
638 | def save(self, history, it):
639 | _params = OrderedDict()
640 | _params = unzip(self.tparams, _params)
641 | _params = unzip(self.tparams_b, _params)
642 |
643 | print 'save the policy network >> {}'.format(self.model)
644 | numpy.savez('%s.current' % (self.model),
645 | history=history,
646 | it=it,
647 | **_params)
648 |
649 | def load(self):
650 | if os.path.exists(self.model):
651 | print 'loading from the existing model (current)'
652 |
653 | rmodel = numpy.load(self.model)
654 | history = rmodel['history']
655 | it = rmodel['it']
656 |
657 | self.params = load_params(rmodel, self.params)
658 | self.params_b = load_params(rmodel, self.params_b)
659 | self.tparams = init_tparams(self.params)
660 | self.tparams_b = init_tparams(self.params_b)
661 |
662 | print 'the dataset needs to go over {} lines'.format(it)
663 | return history, it
664 | else:
665 | return [], -1
666 |
667 |
668 |
669 |
670 |
--------------------------------------------------------------------------------
/pretrain_uni.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import os
3 |
4 | from nmt_uni import train
5 |
6 | def main(job_id, params):
7 | print params
8 | validerr = train(saveto=params['model'][0],
9 | reload_=params['reload'][0],
10 | dim_word=params['dim_word'][0],
11 | dim=params['dim'][0],
12 | n_words=params['n-words'][0],
13 | n_words_src=params['n-words'][0],
14 | decay_c=params['decay-c'][0],
15 | clip_c=params['clip-c'][0],
16 | lrate=params['learning-rate'][0],
17 | optimizer=params['optimizer'][0],
18 | patience=1000,
19 | maxlen=50,
20 | batch_size=64,
21 | valid_batch_size=64,
22 | validFreq=1000,
23 | dispFreq=50,
24 | saveFreq=1000,
25 | sampleFreq=99,
26 | datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe',
27 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe'],
28 | valid_datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe',
29 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok.bpe'],
30 | dictionaries=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl',
31 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl'],
32 | use_dropout=params['use-dropout'][0])
33 | return validerr
34 |
35 | if __name__ == '__main__':
36 | main(0, {
37 | 'model': ['models/model_wmt15_bpe2k_uni_en-ru.npz'],
38 | 'dim_word': [512],
39 | 'dim': [1028],
40 | 'n-words': [20000],
41 | 'optimizer': ['adadelta'],
42 | 'decay-c': [0.],
43 | 'clip-c': [1.],
44 | 'use-dropout': [False],
45 | 'learning-rate': [0.0001],
46 | 'reload': [False]})
47 |
48 |
49 |
--------------------------------------------------------------------------------
/reward.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Collection of reward functions for Simultaneous Machine Translation
5 | """
6 | import numpy
7 | from bleu import *
8 |
9 |
10 | # computing the discounting matrix
11 | gamma = 0.9
12 | maxlen = 100
13 |
14 |
15 | def compute_discount(gamma, maxlen):
16 | c = numpy.ones((maxlen,)) * gamma
17 | c[0] = 1.
18 | c = c.cumprod()
19 |
20 | C = numpy.triu(numpy.repeat(c[None, :], repeats=maxlen, axis=0))
21 | C /= c[:, None]
22 | return C
23 |
24 |
25 | GAMMA = compute_discount(gamma, maxlen) # precomputed
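# Example: for gamma = 0.9 and maxlen = 3, compute_discount gives
#   [[1.  , 0.9 , 0.81],
#    [0.  , 1.  , 0.9 ],
#    [0.  , 0.  , 1.  ]]
# so that GAMMA[:L, :L].dot(q) yields the discounted return
# R[i] = sum_{j >= i} gamma**(j - i) * q[j], as used in return_reward below.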
26 |
27 | def translation_cost(**_k):
28 |
29 | def BLEU():
30 | q = numpy.zeros((_k['steps'],))
31 | s = _k['sample']
32 | r = _k['reference']
33 | chencherry = SmoothingFunction()
34 | b = sentence_bleu(r, s, smoothing_function=chencherry.method5)
35 | q[-1] = b[1]
36 | return q, b
37 |
38 |
39 | return BLEU()
40 |
41 |
42 |
43 |
44 | # The general function for rewards (for simultrans):
45 | def return_reward(**_k):
46 |
47 | # ----------------------------------------------------------------- #
48 | # reward for quality
49 | # use negative-loglikelihood as the reward (full sentence)
50 | # we can also use BLEU for quality, but let's try the simplest one
51 | #
52 | # helper to merge BPE segments back into words
53 | def _bpe2words(capsw):
54 | capw = []
55 | for cc in capsw:
56 | capw += [cc.replace('@@ ', '')]
57 | return capw
58 |
59 |
60 | def LogLikelihood():
61 | q = numpy.zeros((_k['steps'],))
62 | q[-1] = _k['f_cost'](
63 | _k['ctx0'], _k['x_mask'], _k['y'], _k['y_mask']
64 | )
65 | return q
66 |
67 | def StepLogLikelihood():
68 | pass
69 |
70 |
71 | def NormLogLikelihood():
72 | q = LogLikelihood()
73 | length = _k['y'].shape[0]
74 | return q / float(length)
75 |
76 | def BLEU():
77 | q = numpy.zeros((_k['steps'],))
78 | s = _k['sample']
79 | r = _k['reference']
80 | chencherry = SmoothingFunction()
81 | q[-1] = sentence_bleu(r, s, smoothing_function=chencherry.method5)
82 | return q
83 |
84 | def LatencyBLEUwithForget(beta=None, discount=1., return_quality=False):
85 |
86 | # init
87 | words = _k['words'].split() # end-of-sentence is treated as a word
88 | ref = _k['reference']
89 |
90 | q0 = numpy.zeros((_k['steps'],))
91 |
92 | # check 0, 1
93 | maps = [(it, a) for it, a in enumerate(_k['act']) if a < 2]
94 | kmap = len(maps)
95 | lb = numpy.zeros((kmap,))
96 | ts = numpy.zeros((kmap,))
97 | q = numpy.zeros((kmap,))
98 |
99 | if not beta:
100 | beta = kmap
101 |
102 | beta = 1. / float(beta)
103 |
104 | chencherry = SmoothingFunction()
105 |
106 | # compute BLEU for each Yt
107 | Y = []
108 | bleus = []
109 | truebleus = []
110 | for t in xrange(len(words)):
111 | if len(Y) > 0:
112 | _temp = Y[-1] + ' ' + words[t]
113 | _temp = _temp.replace('@@ ', '')
114 | Y = Y[:-1] + _temp.split()
115 | else:
116 | Y = [words[t]]
117 |
118 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5)
119 |
120 | bleus.append(bb[0])
121 | truebleus.append(bb[1])
122 |
123 | bleus.reverse()
124 | truebleus.reverse()
125 |
126 | # compute the Latency-Bleu
127 | T = 0
128 | Prev = 0
129 | for i, (it, a) in enumerate(maps):
130 | # print 'Prev', Prev
131 | if a == 0: # WAIT
132 | T += 1
133 | if i == 0:
134 | lb[i] = 0
135 | else:
136 | lb[i] = lb[i - 1] + Prev
137 | elif a == 1:
138 | if i < kmap - 1:
139 | lb[i] = lb[i - 1] - Prev
140 |
141 | Prev = bleus.pop()
142 | lb[i] += Prev
143 | else:
144 | lb[i] = lb[i - 2]
145 | else:
146 | lb[i] = 0
147 |
148 | ts[i] = T
149 |
150 | # average the score
151 | # print 'Unnormalized BLEU', lb
152 | lbn = lb / ts
153 |
154 | # print 'Latency BLEU', lbn
155 | q[1:] = lbn[1:] - lbn[:-1]
156 | # print 'instant reward', q
157 |
158 | # add the whole sentence balance on it
159 | q[-1] = Prev # the last BLEU
160 | # print 'instant reward', q
161 |
162 | for i, (it, a) in enumerate(maps):
163 | q0[it] = q[i]
164 |
165 | return q0
166 |
167 |
168 | def LatencyBLEUex(beta=None, discount=1., return_quality=False):
169 |
170 | # init
171 | words = _k['words'].split() # end-of-sentence is treated as a word
172 | ref = _k['reference']
173 |
174 | q = numpy.zeros((_k['steps'],))
175 | lb = numpy.zeros((_k['steps'],))
176 | ts = numpy.zeros((_k['steps'],))
177 |
178 | if not beta:
179 | beta = _k['steps']
180 |
181 | beta = 1. / float(beta)
182 |
183 | chencherry = SmoothingFunction()
184 |
185 | # compute BLEU for each Yt
186 | Y = []
187 | bleus = []
188 | truebleus = []
189 | for t in xrange(len(words)):
190 | if len(Y) > 0:
191 | _temp = Y[-1] + ' ' + words[t]
192 | _temp = _temp.replace('@@ ', '')
193 | Y = Y[:-1] + _temp.split()
194 | else:
195 | Y = [words[t]]
196 |
197 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5)
198 |
199 | bleus.append(bb[0])
200 | truebleus.append(bb[1])
201 |
202 | bleus.reverse()
203 | truebleus.reverse()
204 | # print bleus
205 |
206 | # compute the Latency-Bleu
207 | T = 0
208 | Prev = 0
209 | for i, a in enumerate(_k['act']):
210 | # print 'Prev', Prev
211 | if a == 0: # WAIT
212 | T += 1
213 | if i == 0:
214 | lb[i] = 0
215 | else:
216 | lb[i] = lb[i - 1] + Prev
217 |             elif a == 1:  # COMMIT
218 | if i < len(_k['act']) - 1:
219 | lb[i] = lb[i - 1] - Prev
220 |
221 | Prev = bleus.pop()
222 | lb[i] += Prev
223 | else:
224 | lb[i] = lb[i - 2]
225 |             else:  # FORGET (a == 2)
226 | lb[i] = 0
227 |
228 | ts[i] = T
229 |
230 | # average the score
231 | # print 'Unnormalized BLEU', lb
232 | lbn = lb / ts
233 |
234 | # print 'Latency BLEU', lbn
235 | q[1:] = lbn[1:] - lbn[:-1]
236 | # print 'instant reward', q
237 |
238 | # add the whole sentence balance on it
239 | q[-1] = Prev # the last BLEU
240 | # print 'instant reward', q
241 |
242 | if return_quality: # instant reward sequence (Latency BLEU)
243 | return q
244 |
245 |
246 |         # cumulative future reward (with a discounting factor)
247 |
248 |         if discount == 1:
249 |             R = q[::-1].cumsum()[::-1]
250 |
251 |         else:
252 |             # compute the discounted return directly
253 |             # (the same recursion as used in ReturnI / ReturnJ below)
254 |             R = numpy.zeros_like(q)
255 |             R[-1] = q[-1]
256 |             for it in range(_k['steps'] - 2, -1, -1):
257 |                 R[it] = discount * R[it + 1] + q[it]
271 | # print q # collect all instant reward
272 |
273 | d = NormalizedDelay()
274 | return R, q[-1], d[-1], lbn[-1] + q[-1]
275 |
276 |
277 | # ----------------------------------------------------------------- #
278 | # reward for delay
279 | # several options:
280 | # 1. the total delay, which is computed at the last step
281 | def NormalizedDelay():
282 | d = numpy.zeros((_k['steps'],))
283 | # print a
284 | _src = 0
285 | _trg = 0
286 | _sum = 0
287 | for it, a in enumerate(_k['act']):
288 | if a == 0:
289 | _src += 1
290 | elif a == 1:
291 | _trg += 1
292 | _sum += _src
293 | d[-1] = _sum / (_src * _trg + 1e-6)
294 | return d
295 |
296 | # do not use this
297 | def NormalizedDelaywithPenalty():
298 | d = numpy.zeros((_k['steps'],))
299 | a = numpy.array(_k['act'], dtype='float32')
300 | # print a
301 | d[-1] = numpy.sum(numpy.cumsum(1 - a) * a) / (_k['src_max'] * numpy.sum(a)) * numpy.exp(-3. / _k['src_max'])
302 | return d
303 |
304 |     # unfinished stub: a consecutive-waiting penalty, currently unused
305 |     def ConsectiveWaiting():
306 |         d = numpy.zeros((_k['steps'],))
307 |         a = numpy.array(_k['act'], dtype='float32')
308 |
309 | def StepDeley():
310 | d = numpy.array(_k['act'], dtype='float32') - 1.
311 | return d
312 |
313 |
314 |     def SilceDelay(win=5):
315 |         d0 = numpy.array(_k['act'], dtype='float32') - 1.
316 |
317 |         def shift(m):
318 |             d = d0.copy()  # work on a copy so d0 is not overwritten in place
319 |             d[m:] = d0[:-m]
320 |             return d
321 |
322 |         dd = numpy.mean([d0] + [shift(w) for w in range(1, win)], axis=0)
323 |         return dd
324 |
325 |     # negative reward (penalty) for delay
326 | def MovingDelay(beta=0.1):
327 | d = numpy.zeros((_k['steps'],))
328 | _max = 0
329 | _cur = 0
330 |
331 | for it, a in enumerate(_k['act']):
332 | if a == 0:
333 | _cur += 1
334 | if _cur > _max:
335 | _max += 1
336 | d[it] = -1
337 | else:
338 | _cur = 0
339 |
340 | return d * beta
341 |
342 |
343 | def MaximumDelay(_max=5, beta=0.1):
344 | d = numpy.zeros((_k['steps'],))
345 | _cur = 0
346 | for it, a in enumerate(_k['act']):
347 | if a == 0:
348 | _cur += 1
349 | if _cur > _max:
350 | d[it] = -1
351 | pass
352 |             elif a == 1:  # reset the counter on each new commit
353 | _cur = 0
354 |
355 | return d * beta
356 |
357 | # ----------------------------------------------------------------- #
358 | def MaximumSource(_max=7, beta=0.1):
359 | s = numpy.zeros((_k['steps'], ))
360 | _cur = 0
361 | _end = 0
362 | for it, a in enumerate(_k['act']):
363 | if a == 0:
364 | _cur += 1
365 | elif a == 2:
366 | _end += 1
367 |
368 | if (_cur - _end) > _max:
369 | s[it] = -1
370 | return s * beta
371 |
372 | def MovingSource(beta=0.1):
373 | s = numpy.zeros((_k['steps'],))
374 | _max = 0
375 | _cur = 0
376 | _end = 0
377 |
378 | for it, a in enumerate(_k['act']):
379 | if a == 0:
380 | _cur += 1
381 | elif a == 2:
382 | _end += 1
383 |
384 | temp = _cur - _end
385 | if temp > _max:
386 | s[it] = -1
387 | _max = temp
388 |
389 | return s * beta
390 |
391 | def AwardForget(_max=5, beta=0.1):
392 | s = numpy.zeros((_k['steps'],))
393 | _cur = 0
394 | _end = 0
395 | for it, a in enumerate(_k['act']):
396 | if a == 0:
397 | _cur += 1
398 | elif a == 2:
399 | _end += 1
400 |
401 | if ((_cur - _end) >= _max) and (a == 2):
402 | s[it] = 1
403 | return s * beta, _cur / float(_k['src_max'])
404 |
405 | def AwardForget2(_max=5, beta=0.001):
406 | s = numpy.zeros((_k['steps'],))
407 | _cur = 0
408 | _end = 0
409 | for it, a in enumerate(_k['act']):
410 | if a == 0:
411 | _cur += 1
412 | elif a == 2:
413 | _end += 1
414 |
415 | if a == 2:
416 | s[it] = (_cur - _end - _max) * 2
417 | return s * beta
418 |
419 |
420 |
421 | # ----------------------------------------------------------------- #
422 | # reward for quality + delay
423 | def Q2D1(alpha=0.5):
424 | # q = LogLikelihood()
425 | q = NormLogLikelihood()
426 | d = NormalizedDelay()
427 |
428 | r = (q ** alpha) * ((1 - d) ** (1 - alpha))
429 | R = r[::-1].cumsum()[::-1]
430 | return R, q[-1], d[-1], r[-1]
431 |
432 | def Q2D2(alpha=0.5):
433 | # q = LogLikelihood()
434 | q = BLEU()
435 | d = NormalizedDelaywithPenalty()
436 |
437 | r = (q * alpha) + ((1 - d) * (1 - alpha))
438 | R = r[::-1].cumsum()[::-1]
439 | return R, q[-1], d[-1], r[-1]
440 |
441 | def Q2D3(alpha=0.5):
442 | # q = LogLikelihood()
443 | q = BLEU()
444 | d = NormalizedDelay()
445 |
446 | r = q # (q * alpha) + ((1 - d) * (1 - alpha))
447 | R = r[::-1].cumsum()[::-1]
448 | return R, q[-1], d[-1], r[-1]
449 |
450 | def Q2D4(alpha=0.5):
451 | # q = LogLikelihood()
452 | q = BLEU()
453 | d = NormalizedDelay()
454 | d0 = d[-1]
455 | d[-1] = numpy.exp(-max(d0 - 0.7, 0))
456 | r = q * d # (q * alpha) + ((1 - d) * (1 - alpha))
457 | R = r[::-1].cumsum()[::-1]
458 | return R, q[-1], d0, r[-1]
459 |
460 |
461 | # ---------------------------------------------------------------- #
462 | # user defined target delay \tau*
463 | def QualityDelay(tau = 0.5, gamma=3):
464 | q = LatencyBLEUex(return_quality=True)
465 | d = NormalizedDelay()
466 |
467 | # just bleu
468 | bleu = q[-1]
469 |
470 | # just delay
471 | delay = d[-1]
472 |
473 | r = q - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward
474 | R = r[::-1].cumsum()[::-1]
475 | return R, bleu, delay, r
476 |
477 | def FullQualityDelay(tau = 0.5, gamma=10):
478 | q = LatencyBLEUex(return_quality=True)
479 | d = NormalizedDelay()
480 | d1 = SilceDelay()
481 |
482 | # just bleu
483 | bleu = q[-1]
484 |
485 | # just delay
486 | delay = d[-1]
487 |
488 | r = q + d1 - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward
489 | R = r[::-1].cumsum()[::-1]
490 | return R, bleu, delay, r
491 |
492 |     # UPDATE: July 11, 2016: we have several variations:
493 | def ReturnA():
494 | # params
495 | gamma = _k['gamma']
496 | beta = 0.1
497 |
498 | q0 = LatencyBLEUex(return_quality=True)
499 | d0 = NormalizedDelay()
500 |
501 | # just bleu
502 | bleu = q0[-1]
503 |
504 | # just delay
505 | delay = d0[-1]
506 |
507 | # use moving-delay + latency bleu (without final BLEU)
508 | q = q0
509 | q[-1] = 0.
510 | d = MovingDelay(beta=beta)
511 |
512 | r = q + gamma * d
513 | R = r[::-1].cumsum()[::-1]
514 | return R, bleu, delay, r
515 |
516 | def ReturnB():
517 | # params
518 | gamma = _k['gamma']
519 | beta = 0.1
520 |
521 | q0 = LatencyBLEUex(return_quality=True)
522 | d0 = NormalizedDelay()
523 |
524 | # just bleu
525 | bleu = q0[-1]
526 |
527 | # just delay
528 | delay = d0[-1]
529 |
530 | # use maximum-delay + latency bleu (without final BLEU)
531 | q = q0
532 | q[-1] = 0.
533 | d = MaximumDelay(_max=4, beta=beta)
534 |
535 | r = q + gamma * d
536 | R = r[::-1].cumsum()[::-1]
537 | return R, bleu, delay, r
538 |
539 | def ReturnC():
540 | # params
541 | gamma = _k['gamma']
542 | beta = 0.1
543 |
544 | q0 = LatencyBLEUex(return_quality=True)
545 | d0 = NormalizedDelay()
546 |
547 | # just bleu
548 | bleu = q0[-1]
549 |
550 | # just delay
551 | delay = d0[-1]
552 |
553 | # use maximum-delay + latency bleu (with final BLEU)
554 | q = q0
555 | d = MaximumDelay(_max=5, beta=beta)
556 |
557 | r = q + gamma * d
558 | R = r[::-1].cumsum()[::-1]
559 | return R, bleu, delay, r
560 |
561 | def ReturnD():
562 | # params
563 | gamma = _k['gamma']
564 | beta = 0.1
565 |
566 | q0 = LatencyBLEUex(return_quality=True)
567 | d0 = NormalizedDelay()
568 |
569 | # just bleu
570 | bleu = q0[-1]
571 |
572 | # just delay
573 | delay = d0[-1]
574 |
575 | # use moving-delay + latency bleu (with final BLEU)
576 | q = q0
577 | d = MovingDelay(beta=beta)
578 |
579 | r = q + gamma * d
580 | R = r[::-1].cumsum()[::-1]
581 | return R, bleu, delay, r
582 |
583 | def ReturnE():
584 | # params
585 | gamma = _k['gamma']
586 | beta = 0.1
587 | tau = _k['target']
588 |
589 | q0 = LatencyBLEUex(return_quality=True)
590 | d0 = NormalizedDelay()
591 |
592 | # just bleu
593 | bleu = q0[-1]
594 |
595 | # just delay
596 | delay = d0[-1]
597 |
598 | # use maximum-delay + latency bleu (without final BLEU) + global delay
599 | q = q0
600 | q[-1] = 0.
601 | d = MaximumDelay(_max=4, beta=beta)
602 |         d[-1] -= numpy.maximum(delay - tau, 0)
603 |
604 | r = q + gamma * d
605 | R = r[::-1].cumsum()[::-1]
606 | return R, bleu, delay, r
607 |
608 | def ReturnF():
609 | # params
610 | gamma = _k['gamma']
611 | beta = 0.1
612 | tau = _k['target']
613 |
614 | q0 = LatencyBLEUex(return_quality=True)
615 | d0 = NormalizedDelay()
616 |
617 | # just bleu
618 | bleu = q0[-1]
619 |
620 | # just delay
621 | delay = d0[-1]
622 |
623 | # use maximum-delay + latency bleu (with final BLEU) + global delay
624 | q = q0
625 | d = MaximumDelay(_max=5, beta=beta)
626 | d[-1] -= numpy.maximum(delay - tau, 0) * gamma
627 |
628 | r = q + d
629 | R = r[::-1].cumsum()[::-1]
630 | return R, bleu, delay, r
631 |
632 | # ---------------------------------------------------------------- #
633 | def ReturnG():
634 | # params
635 |         discount = _k['discount']   # the discounting factor, e.g. 0.95
636 | beta = 0.1
637 |
638 | q0 = LatencyBLEUwithForget(return_quality=True)
639 | d0 = NormalizedDelay()
640 |
641 | # just bleu
642 | bleu = q0[-1]
643 |
644 | # just delay
645 | delay = d0[-1]
646 |
647 | # use maximum-delay + latency bleu (with final BLEU)
648 | q = q0
649 | d = MaximumDelay(_max=4, beta=beta)
650 | s = MaximumSource(_max=7, beta=0.01)
651 |
652 | if discount == 1:
653 | r = q + d + s
654 | R = r[::-1].cumsum()[::-1]
655 | else:
656 | raise NotImplementedError
657 |
658 | return R, bleu, delay, r
659 |
660 | def ReturnH():
661 | # params
662 |         discount = _k['discount']   # the discounting factor, e.g. 0.95
663 | beta = 0.1
664 |
665 | q0 = LatencyBLEUwithForget(return_quality=True)
666 | d0 = NormalizedDelay()
667 |
668 | # just bleu
669 | bleu = q0[-1]
670 |
671 | # just delay
672 | delay = d0[-1]
673 |
674 | # use maximum-delay + latency bleu (with final BLEU)
675 | q = q0
676 | d = MaximumDelay(_max=4, beta=beta)
677 | s = MovingSource(beta=0.02)
678 |
679 | if discount == 1:
680 | r = q + d + s
681 | R = r[::-1].cumsum()[::-1]
682 | else:
683 | raise NotImplementedError
684 |
685 | return R, bleu, delay, r
686 |
687 | def ReturnI():
688 | # params
689 |
690 |         discount = _k['gamma']   # 'gamma' is reused as the discounting factor here, e.g. 0.95
691 | maxsrc = _k['maxsrc']
692 | beta = 0.1
693 |
694 | q0 = LatencyBLEUwithForget(return_quality=True)
695 | d0 = NormalizedDelay()
696 |
697 | # global reward signal :::>>>
698 | # just bleu
699 | bleu = q0[-1]
700 |
701 | # just delay
702 | delay = d0[-1]
703 |
704 | # local reward signal :::>>>>
705 | # use maximum-delay + latency bleu (with final BLEU)
706 | q = q0
707 | q[-1] = 0
708 | d = MaximumDelay(_max=5, beta=beta)
709 | s, _ = AwardForget(_max=maxsrc, beta=0.01)
710 | # s = AwardForget2(_max=maxsrc, beta=0.001)
711 |
712 | r0 = q + d + s
713 | rg = bleu # it is a global reward, will not be discounted.
714 |
715 | if discount == 1:
716 | r = r0
717 | r[-1] += rg
718 | R = r[::-1].cumsum()[::-1]
719 | else:
720 | R = numpy.zeros_like(r0)
721 | R[-1] = r0[-1]
722 | for it in range(_k['steps'] - 2, -1, -1):
723 | R[it] = discount * R[it + 1] + r0[it]
724 | R += rg # add a global signal (without a discount factor)
725 |
726 | return R, bleu, delay, r0
727 |
728 | def ReturnJ():
729 | # params
730 |
731 |         discount = _k['gamma']   # 'gamma' is reused as the discounting factor here, e.g. 0.95
732 | beta = 0.1
733 |
734 | q0 = LatencyBLEUwithForget(return_quality=True)
735 | d0 = NormalizedDelay()
736 |
737 | # global reward signal :::>>>
738 | # just bleu
739 | bleu = q0[-1]
740 |
741 | # just delay
742 | delay = d0[-1]
743 |
744 | # local reward signal :::>>>>
745 | # use maximum-delay + latency bleu (with final BLEU)
746 | q = q0
747 | q[-1] = 0
748 | d = MaximumDelay(_max=5, beta=beta)
749 | # s, m = AwardForget(_max=5, beta=0.01)
750 |
751 | r0 = q + d # + s
752 | rg = bleu # * m # it is a global reward, will not be discounted.
753 |
754 | if discount == 1:
755 | r = r0
756 | r[-1] += rg
757 | R = r[::-1].cumsum()[::-1]
758 | else:
759 | R = numpy.zeros_like(r0)
760 | R[-1] = r0[-1]
761 | for it in range(_k['steps'] - 2, -1, -1):
762 | R[it] = discount * R[it + 1] + r0[it]
763 | R += rg # add a global signal (without a discount factor)
764 |
765 | return R, bleu, delay, r0
766 |
767 |
768 | # **------------------------------------------------ **#
769 | # アサガオの散る頃に
770 |
771 | def Q2Ds():
772 | q = NormLogLikelihood()
773 | d = NormalizedDelay()
774 | return q, d
775 |
776 | gamma = _k['gamma']
777 |     rtype = _k['Rtype']  # index into the list of reward variants below
778 |
779 |     funcs = [ReturnA, ReturnB, ReturnC, ReturnD, ReturnE, ReturnF, ReturnG, ReturnH, ReturnI, ReturnJ]
780 |     return funcs[rtype]()
781 |
782 | # return FullQualityDelay(tau, gamma)
783 | # return QualityDelay(tau=tau, gamma=gamma)
784 |
785 | # return LatencyBLEUex()
786 | # return Q2D4(0.2)
787 | # return Q2Ds()
788 |
--------------------------------------------------------------------------------
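Note on the rewards above: most of the Return* variants combine the same ingredients, namely a per-step quality signal from the latency-BLEU functions, a local delay penalty such as MaximumDelay or MovingDelay, and the global delay statistic from NormalizedDelay, folded into a cumulative (optionally discounted) return. The following is a minimal, self-contained numpy sketch of that composition for a toy action sequence (0 = read/wait, 1 = write/commit); the helper names mirror reward.py, but the action sequence and the stand-in quality numbers are purely illustrative, and both the forget action (a == 2) and the BPE-level BLEU computation are omitted.

import numpy

act = [0, 0, 1, 0, 1, 1]             # toy sequence: 0 = read/wait, 1 = write/commit
steps = len(act)

def normalized_delay(act):
    # final-step delay: source prefix length summed at every commit,
    # normalised by |src| * |trg| (cf. NormalizedDelay above)
    d = numpy.zeros(len(act))
    src, trg, acc = 0, 0, 0
    for a in act:
        if a == 0:
            src += 1
        elif a == 1:
            trg += 1
            acc += src
    d[-1] = acc / (src * trg + 1e-6)
    return d

def maximum_delay(act, _max=1, beta=0.1):
    # -beta for every consecutive wait beyond _max (cf. MaximumDelay above)
    d = numpy.zeros(len(act))
    cur = 0
    for it, a in enumerate(act):
        if a == 0:
            cur += 1
            if cur > _max:
                d[it] = -1
        elif a == 1:
            cur = 0
    return d * beta

# stand-in for the incremental latency-BLEU signal q (illustrative numbers only)
q = numpy.array([0., 0., 0.2, 0., 0.15, 0.25])

r = q + maximum_delay(act)           # instant reward, as in ReturnB / ReturnC
R = r[::-1].cumsum()[::-1]           # undiscounted return (discount == 1)

gamma = 0.95                         # discounted variant, as in ReturnI / ReturnJ
Rg = numpy.zeros_like(r)
Rg[-1] = r[-1]
for it in range(len(act) - 2, -1, -1):
    Rg[it] = gamma * Rg[it + 1] + r[it]

print R, Rg, normalized_delay(act)[-1]   # the final delay here is 8 / 9 ~= 0.89
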
/run_eval.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_evaluation.py --sinit 1 --target 0.5 --sample 64 --batchsize 1 --Rtype $1 --gamma 1 --id $3 --recurrent True 2>&1 | tee .images/$4.log
2 |
--------------------------------------------------------------------------------
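As written, run_eval.sh takes four positional arguments: $1 is passed to --Rtype (presumably an index into the funcs = [ReturnA, ..., ReturnJ] list at the bottom of reward.py), $2 selects the GPU through THEANO_FLAGS=device=gpu$2, $3 is the model id passed to --id, and $4 names the log file written under .images/.
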
/run_train.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_training.py --sample 32 --batchsize 1 --target $1 --gamma $3 --recurrent True 2>&1 | tee .log/$4.log
2 |
--------------------------------------------------------------------------------
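run_train.sh is analogous: $1 is passed to --target (presumably the target delay used by rewards such as ReturnE / ReturnF), $2 selects the GPU, $3 is passed to --gamma (the weight on the delay penalty), and $4 names the log file written under .log/.
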
/translate.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | model=".pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
4 | dict="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
5 | dict_rev="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
6 | source="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe"
7 | saveto=".translate/standard.trans.1"
8 |
9 | THEANO_FLAGS="floatX=float32, device=cpu" python translate_uni.py -k 1 $model $dict $dict_rev $source $saveto
10 |
--------------------------------------------------------------------------------
/translate_uni.py:
--------------------------------------------------------------------------------
1 | '''
2 | Translates a source file using a translation model.
3 | '''
4 | import theano
5 | import argparse
6 |
7 | import numpy
8 | import cPickle as pkl
9 |
10 | from nmt_uni import (build_model, build_sampler, gen_sample, load_params,
11 | init_params, init_tparams, prepare_data)
12 |
13 | from multiprocessing import Process, Queue
14 |
15 |
16 | def translate_model(queue, rqueue, pid, model, options, k, normalize, kp, sigma):
17 |
18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
19 | trng = RandomStreams(1234)
20 |
21 | # allocate model parameters
22 | params = init_params(options)
23 |
24 | # load model parameters and set theano shared variables
25 | params = load_params(model, params)
26 | tparams = init_tparams(params)
27 |
28 | trng, use_noise, \
29 | x, x_mask, y, y_mask, \
30 | opt_ret, \
31 | cost = \
32 | build_model(tparams, options)
33 | inps = [x, x_mask, y, y_mask]
34 |
35 | f_log_probs = theano.function(inps, cost)
36 |
37 | # word index
38 | f_init, f_next = build_sampler(tparams, options, trng)
39 |
40 | def _translate(idx, seq):
41 | all_samples = []
42 | all_scores = []
43 |
44 | for kidx in xrange(kp):
45 | if kidx == 0:
46 | ss = -1.
47 | else:
48 | ss = sigma
49 | # sample given an input sequence and obtain scores
50 | sample, score = gen_sample(tparams, f_init, f_next,
51 | numpy.array(seq).reshape([len(seq), 1]),
52 | options, trng=trng, k=k, maxlen=200,
53 | stochastic=False, argmax=False, sigma=ss)
54 |
55 | # normalize scores according to sequence lengths
56 | if normalize:
57 | lengths = numpy.array([len(s) for s in sample])
58 | score = score / lengths
59 | #print idx, score
60 | sidx = numpy.argmin(score)
61 | all_samples.append(sample[sidx])
62 | all_scores.append(score[sidx])
63 |
64 | source_list = [seq] * kp
65 | x, x_mask, y, y_mask = prepare_data(source_list, all_samples, maxlen=None)
66 | all_scores = f_log_probs(x, x_mask, y, y_mask)
67 | if normalize:
68 | lengths = numpy.array([len(s) for s in all_samples])
69 | all_scores = all_scores / lengths
70 |
71 | print idx, all_scores
72 | sidx = numpy.argmin(all_scores)
73 | return all_samples[sidx]
74 |
75 | while True:
76 | req = queue.get()
77 | if req is None:
78 | break
79 |
80 | idx, x = req[0], req[1]
81 | print pid, '-', idx
82 | seq = _translate(idx, x)
83 |
84 | rqueue.put((idx, seq))
85 |
86 | return
87 |
88 |
89 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
90 | normalize=False, n_process=5, chr_level=False,
91 | options_file=None, sigma=-1., kp=1):
92 |
93 | # load model model_options
94 | if options_file is not None:
95 | with open(options_file, 'rb') as f:
96 | options = pkl.load(f)
97 | else:
98 | with open('%s.pkl' % model, 'rb') as f:
99 | options = pkl.load(f)
100 |
101 | # load source dictionary and invert
102 | with open(dictionary, 'rb') as f:
103 | word_dict = pkl.load(f)
104 | word_idict = dict()
105 | for kk, vv in word_dict.iteritems():
106 | word_idict[vv] = kk
107 | word_idict[0] = ''
108 | word_idict[1] = 'UNK'
109 |
110 | # load target dictionary and invert
111 | with open(dictionary_target, 'rb') as f:
112 | word_dict_trg = pkl.load(f)
113 | word_idict_trg = dict()
114 | for kk, vv in word_dict_trg.iteritems():
115 | word_idict_trg[vv] = kk
116 | word_idict_trg[0] = ''
117 | word_idict_trg[1] = 'UNK'
118 |
119 | # create input and output queues for processes
120 | queue = Queue()
121 | rqueue = Queue()
122 | processes = [None] * n_process
123 | for midx in xrange(n_process):
124 | processes[midx] = Process(
125 | target=translate_model,
126 | args=(queue, rqueue, midx, model, options, k, normalize, kp, sigma))
127 | processes[midx].start()
128 |
129 | # utility function
130 | def _seqs2words(caps):
131 | capsw = []
132 | for cc in caps:
133 | ww = []
134 | for w in cc:
135 | if w == 0:
136 | break
137 | ww.append(word_idict_trg[w])
138 | capsw.append(' '.join(ww))
139 | return capsw
140 |
141 | def _send_jobs(fname):
142 | with open(fname, 'r') as f:
143 | for idx, line in enumerate(f):
144 | if chr_level:
145 | words = list(line.decode('utf-8').strip())
146 | else:
147 | words = line.strip().split()
148 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
149 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
150 | x += [0]
151 | queue.put((idx, x))
152 | return idx+1
153 |
154 | def _finish_processes():
155 | for midx in xrange(n_process):
156 | queue.put(None)
157 |
158 | def _retrieve_jobs(n_samples):
159 | trans = [None] * n_samples
160 | for idx in xrange(n_samples):
161 | resp = rqueue.get()
162 | trans[resp[0]] = resp[1]
163 | if numpy.mod(idx, 10) == 0:
164 | print 'Sample ', (idx+1), '/', n_samples, ' Done'
165 | return trans
166 |
167 | print 'Translating ', source_file, '...'
168 | n_samples = _send_jobs(source_file)
169 | trans = _seqs2words(_retrieve_jobs(n_samples))
170 | _finish_processes()
171 | with open(saveto, 'w') as f:
172 | print >>f, '\n'.join(trans)
173 | print 'Done'
174 |
175 |
176 | if __name__ == "__main__":
177 | parser = argparse.ArgumentParser()
178 | parser.add_argument('-k', type=int, default=5)
179 | parser.add_argument('-kp', type=int, default=1)
180 | parser.add_argument('-p', type=int, default=5)
181 | parser.add_argument('-n', action="store_true", default=False)
182 | parser.add_argument('-c', action="store_true", default=False)
183 | parser.add_argument('-o', type=str, default=None)
184 | parser.add_argument('-s', type=float, default=-1.)
185 | parser.add_argument('model', type=str)
186 | parser.add_argument('dictionary', type=str)
187 | parser.add_argument('dictionary_target', type=str)
188 | parser.add_argument('source', type=str)
189 | parser.add_argument('saveto', type=str)
190 |
191 | args = parser.parse_args()
192 |
193 | main(args.model, args.dictionary, args.dictionary_target, args.source,
194 | args.saveto, k=args.k, normalize=args.n, n_process=args.p,
195 | chr_level=args.c, options_file=args.o, kp=args.kp, sigma=args.s)
196 |
--------------------------------------------------------------------------------
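Besides the command-line entry point above (and the translate.sh wrapper), main() can also be called directly from Python. A minimal sketch follows; every path is a placeholder, and note that unless options_file is given the code expects the model options pickle at '%s.pkl' % model (e.g. model.npz.pkl):

from translate_uni import main

# all paths below are placeholders for a trained model and its data
main('model.npz',          # model parameters (model.npz.pkl must sit next to it)
     'src_vocab.pkl',      # source dictionary (word -> index)
     'trg_vocab.pkl',      # target dictionary (word -> index)
     'newstest.src.bpe',   # tokenised, BPE-segmented source file
     'out.trans',          # file the translations are written to
     k=5,                  # beam size
     normalize=True,       # length-normalise beam scores before ranking
     n_process=2)          # number of translator worker processes
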
/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is for functions to help the translation
3 | """
4 | import numpy as np
5 | import time
6 | import sys
7 | import json
8 |
9 | class Monitor(object):
10 | def __init__(self, root='http://localhost:9000'):
11 | self.root = root
12 |
13 | def display(self, batch, logs={}):
14 | import requests
15 | send = {}
16 | send['epoch'] = batch
17 | for k, v in logs.items():
18 | send[k] = v
19 |
20 | try:
21 | requests.post(self.root + '/publish/epoch/end/',
22 | {'data': json.dumps(send)})
23 | except:
24 | print('Warning: could not reach RemoteMonitor '
25 | 'root server at ' + str(self.root))
26 |
27 |
28 |
29 | class Progbar(object):
30 | def __init__(self, target, width=30, verbose=1, with_history=True):
31 | '''
32 | @param target: total number of steps expected
33 | '''
34 | self.width = width
35 | self.target = target
36 | self.sum_values = {}
37 | self.unique_values = []
38 | self.start = time.time()
39 | self.total_width = 0
40 | self.seen_so_far = 0
41 | self.verbose = verbose
42 | self.with_history = with_history
43 |
44 | def update(self, current, values=[]):
45 | '''
46 | @param current: index of current step
47 | @param values: list of tuples (name, value_for_last_step).
48 | The progress bar will display averages for these values.
49 | '''
50 | if not self.with_history:
51 | self.sum_values = {}
52 | self.unique_values = []
53 |
54 | for k, v in values:
55 | if k not in self.sum_values:
56 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
57 | self.unique_values.append(k)
58 | else:
59 | self.sum_values[k][0] += v * (current - self.seen_so_far)
60 | self.sum_values[k][1] += (current - self.seen_so_far)
61 | self.seen_so_far = current
62 |
63 | now = time.time()
64 | if self.verbose == 1:
65 | prev_total_width = self.total_width
66 | sys.stdout.write("\b" * prev_total_width)
67 | sys.stdout.write("\r")
68 |
69 | numdigits = int(np.floor(np.log10(self.target))) + 1
70 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
71 | bar = barstr % (current, self.target)
72 | prog = float(current)/self.target
73 | prog_width = int(self.width*prog)
74 | if prog_width > 0:
75 | bar += ('.'*(prog_width-1))
76 | if current < self.target:
77 | bar += '(-w-)'
78 | else:
79 | bar += '(-v-)!!'
80 | bar += ('~' * (self.width-prog_width))
81 | bar += ']'
82 | sys.stdout.write(bar)
83 | self.total_width = len(bar)
84 |
85 | if current:
86 | time_per_unit = (now - self.start) / current
87 | else:
88 | time_per_unit = 0
89 | eta = time_per_unit*(self.target - current)
90 | info = ''
91 | if current < self.target:
92 | info += ' - ETA: %ds' % eta
93 | else:
94 | info += ' - %ds' % (now - self.start)
95 | for k in self.unique_values:
96 | if k == 'perplexity' or k == 'PPL':
97 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1])))
98 | else:
99 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
100 |
101 | self.total_width += len(info)
102 | if prev_total_width > self.total_width:
103 | info += ((prev_total_width-self.total_width) * " ")
104 |
105 | sys.stdout.write(info)
106 | sys.stdout.flush()
107 |
108 | if current >= self.target:
109 | sys.stdout.write("\n")
110 |
111 | if self.verbose == 2:
112 | if current >= self.target:
113 | info = '%ds' % (now - self.start)
114 | for k in self.unique_values:
115 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
116 | sys.stdout.write(info + "\n")
117 |
118 | def add(self, n, values=[]):
119 | self.update(self.seen_so_far + n, values)
120 |
121 | def clear(self):
122 | self.sum_values = {}
123 | self.unique_values = []
124 | self.total_width = 0
125 | self.seen_so_far = 0
126 |
--------------------------------------------------------------------------------
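Usage note for Progbar: it is driven with the current step index plus (name, value) pairs, exactly as the update() docstring describes. A minimal sketch of a training loop (the batch count and the cost values are purely illustrative):

from utils import Progbar

n_batches = 100
progbar = Progbar(n_batches, with_history=False)  # with_history=False shows per-step values

for i in range(1, n_batches + 1):
    cost = 10. / i                                # stand-in for the real per-batch cost
    progbar.update(i, values=[('cost', cost)])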